├── BLASTn_NT_parser.pl ├── BLASTn_RefGenome_parser.pl ├── README.md ├── SequenceQualityControl.pl ├── VirusScan.pl ├── assignment_report_virus_gi.pl ├── assignment_summary_gi.pl ├── blast_summary.pl ├── check_Blast_parsed_file.pl ├── check_SequenceQualityControl.pl ├── check_split_BN.pl ├── check_split_RefG.pl ├── check_split_cdhit.pl ├── generate_final_report_gi.pl ├── get_fasta_from_bam_filter.pl ├── import_gi_taxid_nucl.sql ├── send_email.pl ├── split_fasta.pl └── trim_readid.pl /BLASTn_NT_parser.pl: --------------------------------------------------------------------------------

#!/usr/bin/perl -w

use strict;
use Bio::SearchIO;
use Bio::Taxon;
use Bio::DB::Taxonomy;
use Bio::Tree::Tree;
use DBI();

# Parses one blastn-vs-NT report: every query whose best hit is at or below
# the E-value cutoff is phylotyped and written to <name>.blastn.parsed; all
# other queries are remembered in @keep_for_tblastx for the next pipeline step.
my $Usage = '
This script accepts a blastn output file and parse the information

perl script <dir> <blast output file>
<dir> = directory that blast output file resides in, without last "/"
<blast output file> = name of the blastn output file

';

die $Usage unless scalar @ARGV == 2;
my ($dir, $blastout) = @ARGV;

###################################################################################
# This section needs to be modified to use local configuration
my $database_dir = "/gscmnt/gc3027/info/medseq/taxdump_2014_01_08";

# open a connection to mysql database
# FIXME(security): the database credentials are hard-coded in the source;
# they should come from a protected config file or the environment.
my $dbh_mysql = DBI->connect("DBI:mysql:database=scao_taxondb;host=mysql1","scao", "asdf1234",{'RaiseError'=>1}) or die "Unable to connect $DBI::errstr\n";

###################################################################################
# Everything below should not need modification
my $HOME = $ENV{HOME};

my %assignment = ();

# cutoff value for having a good hit
my $E_cutoff = 1e-10;
# set to 1 by PhyloType when a hit's lineage contains one of the virus
# families it checks for; reset for every query
my $havedefinedHit = 0;

# create output file
my $outFile = $blastout;
$outFile =~ s/blastn\.out/blastn.parsed/;
$outFile = $dir."/".$outFile;
open (my $out_fh, ">", $outFile) or die "can not open file $outFile!\n";

# create the taxonomy index directory in the home directory if it does not exist
if (! -d $HOME."/taxo") {
    mkdir($HOME."/taxo") or die "can not create directory $HOME/taxo: $!\n";
}

# get a Taxon from a Bio::DB::Taxonomy object
my $dbh = Bio::DB::Taxonomy->new(-source => 'flatfile',
        -directory=> "$HOME/taxo",
        -nodesfile=> "$database_dir/nodes.dmp",
        -namesfile=> "$database_dir/names.dmp",
        );

my @keep_for_tblastx = (); # query should be kept for further analysis
my @known = (); # queries that are significantly similar to known sequences
my $total_records = 0;

print "parsing blast output files...\n\n";

my $input_file = $dir."/".$blastout;
my $report = Bio::SearchIO->new(-format => 'blast', -file => $input_file, -report_type => 'blastn');

# The gi comes from the BLAST hit name, i.e. from file content, so it is bound
# as a placeholder instead of being interpolated into the SQL text. Preparing
# once also avoids re-parsing the statement for every hit.
my $sth_taxid = $dbh_mysql->prepare("SELECT * FROM gi_taxid_nucl where gi = ?");

# Go through BLAST reports one by one
while (my $result = $report->next_result) { # next query output
    $total_records++;
    my $keep_for_tblastx = 1;
    %assignment = ();

    # only take the best hits
    my $best_e = 100;
    my $hit_count = 0;
    $havedefinedHit = 0;
    while (my $hit = $result->next_hit) {
        # from hit name get hit gi number
        my $hit_name = $hit->name; # gi|num|database|accessionNum|
        my @temp_arr = split(/\|/, $hit_name);
        my $gi = $temp_arr[1];
        if (defined $temp_arr[2] && $temp_arr[2] eq "pdb") { # skip data from pdb database
            next;
        }
        $hit_count++;
        if ($hit_count == 1) {
            $best_e = $hit->significance;
        }

        # check whether the hit should be kept
        if ($best_e <= $E_cutoff) { # similar to known, need Phylotyped
            $keep_for_tblastx = 0;

            # only hits tied with the best e value are used, plus any hit under
            # the cutoff once PhyloType has raised $havedefinedHit
            if ($hit->significance == $best_e || ($hit->significance <= $E_cutoff && $havedefinedHit==1)) {
                # from gi get taxonomy lineage
                $sth_taxid->execute($gi);
                my $ref = $sth_taxid->fetchrow_hashref();
                $sth_taxid->finish();

                # some gi don't have a record in gi_taxid_nucl
                my $taxID = $ref ? $ref->{'tax_id'} : undef;
                if ($taxID) {
                    my $taxon_obj = $dbh->get_taxon(-taxonid => $taxID);

                    if (!(defined $taxon_obj)) {
                        my $description = "undefined taxon ".$hit->description."\t".$hit->name."\t".$hit->significance;
                        $assignment{"other"} = $description;
                    }
                    else {
                        my $tree_function = Bio::Tree::Tree->new();
                        # each lineage node is a Bio::Tree::NodeI object
                        my @lineage = $tree_function->get_lineage_nodes($taxon_obj);

                        if (scalar @lineage) {
                            PhyloType(\@lineage, $hit, $best_e, $dbh_mysql, $dbh, \%assignment);
                        }
                    }
                }
                else { # for situations that gi does not have corresponding taxid
                    my $desc = $hit->description."\t".$hit->name."\t".$hit->significance;
                    $assignment{"other"} = $desc;
                }
            }
            else {
                last;
            }
        } # finish phylotype for given hit
    } # finish all hits

    # consolidate assignment
    # If a query is assigned both Homo and Primates, it will be reported as Homo only
    # If a query is assigned a real taxon name and "other" for reason like "other sequences;
    # artificial sequences", or no taxon id in taxon database it will be reported only as
    # the real taxon name
    my $num_assignment = keys %assignment;
    if ($num_assignment > 1) { # have multiple assignment
        # handle the situation that assigned both a specific category and "other"
        # only specific category will be saved. Both flags are computed BEFORE
        # the "Ambiguous" rewrite below, as in the original.
        my $has_specific = scalar(grep { defined $assignment{$_} } qw(Bacteria Artificial Fungi Homo Mus Phage Viruses)) ? 1 : 0;
        my $has_other = (defined $assignment{"other"}) ? 1 : 0;

        #################################################################
        # If a sequence hits virus and any other species with the same e value,
        # the sequence is assigned to "Ambiguous" category. cai added 12/2010
        # remove human since we have done extensive filtering for human sequence 10/12/2014
        if ((defined $assignment{"Viruses"}) && scalar(grep { defined $assignment{$_} } qw(Bacteria Fungi Mus Phage other))) {
            $assignment{"Ambiguous"} = $assignment{"Viruses"};
            delete $assignment{"Viruses"};
        }
        if ((defined $assignment{"Bacteria"}) && scalar(grep { defined $assignment{$_} } qw(Viruses Fungi Mus Phage other))) {
            $assignment{"Ambiguous"} = $assignment{"Bacteria"};
            delete $assignment{"Bacteria"};
        }
        #################################
        if ($has_specific && $has_other) {
            delete $assignment{"other"};
        }
    }

    # print out assignment for this query
    foreach my $assign (keys %assignment) {
        print {$out_fh} $result->query_name, "\t", $result->query_length, "\t", $assign, "\t", $assignment{$assign}, "\n";
    }

    if ($keep_for_tblastx) {
        push @keep_for_tblastx, $result->query_name;
    }
    else {
        push @known, $result->query_name;
    }
}

# An empty report has zero records; report a ratio of 0 instead of dying on
# a division by zero.
my $kept_count = scalar @keep_for_tblastx;
my $kept_ratio = $total_records ? $kept_count/$total_records : 0;
print {$out_fh} "# Summary: ", $kept_count, " out of $total_records ", $kept_ratio, " is saved for next step analysis.\n";

close $out_fh or die "can not close file $outFile: $!\n";

# generate a fasta file that contains all the sequences that will be kept for further analysis
# read in blast input sequences
my $file = $blastout;
$file =~ s/\.blastn\.out//;
$file = $dir."/".$file.".fa";
my %seq = read_FASTA_data($file);

$outFile = $blastout;
$outFile =~ s/\.blastn\.out//;
$outFile = $dir."/".$outFile.".BNfiltered.fa";
open (my $filtered_fh, ">", $outFile) or die "can not open file $outFile!\n";
foreach my $seq_name (@keep_for_tblastx) {
    print {$filtered_fh} ">$seq_name\n";
    print {$filtered_fh} $seq{$seq_name}, "\n";
}
close $filtered_fh or die "can not close file $outFile: $!\n";

$dbh_mysql->disconnect();

exit;


############################################################################
# Read a FASTA file into a hash.
#   argument: path of the FASTA file
#   returns : a hash (as a list) mapping the first whitespace-delimited token
#             of each header to the sequence with all whitespace removed
#   dies    : when the file cannot be opened
sub read_FASTA_data {
    my $fastaFile = shift @_;

    # Read one FASTA record per iteration. "local" restores the previous
    # record separator automatically, even if we die half-way through.
    local $/ = ">";

    my %fastaSeq;
    open (my $fasta_fh, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile";

    while (my $line = <$fasta_fh>) {
        next if $line =~ /^\s*$/;   # discard blank records
        next if $line =~ /^\s*#/;   # discard comment records
        next if $line eq ">";       # the first read yields only the leading ">"

        chomp $line;                # strips the trailing ">" (current $/)
        my @rows = split (/\s/, $line);
        my $contigName = shift @rows;
        my $contigSeq = join("", @rows);
        $contigSeq =~ s/\s//g;      # remove white space
        $fastaSeq{$contigName} = $contigSeq;
    }

    close $fasta_fh;

    return %fastaSeq;
}


###############################################################################
# Determine the category of one BLAST hit from its taxonomy lineage and record
# it in the assignment hash.
#   $lineage_ref    array ref of lineage nodes (root first), must be non-empty
#   $hit_ref        the BLAST hit object (name and significance are reported)
#   $best_e         unused, kept for interface compatibility
#   $dbh_mysql      unused, kept for interface compatibility
#   $dbh_taxonomy   Bio::DB::Taxonomy handle used to resolve node names
#   $assignment_ref hash ref category => description; only the FIRST hit of a
#                   category is stored, later ones leave the entry untouched
# Categories are tested in this order: Homo / Mus / other (for Metazoa),
# Artificial, Bacteria, Phage, Viruses, Fungi, other.
# Side effect: sets the file-level $havedefinedHit to 1 when the lineage
# contains Mimiviridae, Phycodnaviridae, marseillevirus or Iridoviridae.
# Returns 1 (a category is always assigned).
sub PhyloType {
    my ($lineage_ref, $hit_ref, $best_e, $dbh_mysql, $dbh_taxonomy, $assignment_ref) = @_;

    # Resolve every lineage node exactly once; the original looked the same
    # nodes up again in each category test (and once through the global
    # taxonomy handle instead of the one passed in).
    my @names = map { $dbh_taxonomy->get_taxon(-taxonid => $_->id)->scientific_name } @{$lineage_ref};
    my $Lineage = join("", map { $_.";" } @names);

    if ($Lineage =~ /Mimiviridae/i || $Lineage =~ /Phycodnaviridae/i || $Lineage =~ /marseillevirus/i || $Lineage =~ /Iridoviridae/i) {
        $havedefinedHit = 1;
    }

    my $hit_info = $hit_ref->name."\t".$hit_ref->significance;
    my $category = "other";
    my $description = $Lineage."\t".$hit_info;

    if (@names >= 4 && $names[3] eq "Metazoa") {
        # animal sequence: human first, then mouse, everything else is "other"
        if (grep { $_ eq "Homo" } @names) {
            $category = "Homo";
            $description = "Homo\t".$hit_info;
        }
        elsif (grep { $_ eq "Mus" } @names) {
            $category = "Mus";
            $description = "Mus\t".$hit_info;
        }
    }
    elsif (@names >= 2 && $names[1] =~ /artificial sequences/i) {
        $category = "Artificial";
    }
    elsif (@names >= 2 && $names[1] eq "Bacteria") {
        $category = "Bacteria";
    }
    elsif (@names && $names[0] eq "Viruses") {
        # a virus is a phage when any lineage node is a known phage taxon
        my %phage_taxon = map { $_ => 1 } qw(Lipothrixviridae Caudovirales Corticoviridae Cystoviridae Inoviridae Leviviridae Microviridae Tectiviridae);
        my $is_phage = grep { $phage_taxon{$_} || $_ =~ /phage/i } @names;
        $category = $is_phage ? "Phage" : "Viruses";
    }
    elsif (@names >= 4 && $names[3] eq "Fungi") {
        $category = "Fungi";
    }

    # only keep the first best hit description of each category
    if (!defined $assignment_ref->{$category}) {
        $assignment_ref->{$category} = $description;
    }

    return 1;
}



-------------------------------------------------------------------------------- /BLASTn_RefGenome_parser.pl: --------------------------------------------------------------------------------

#!/usr/bin/perl -w

use strict;
use Bio::SearchIO;

# Parses one blastn-vs-reference-genome report. A query whose first hit is at
# or below the E-value cutoff is written to <name>.RefGblast.parsed; every
# other query is written to <name>.RefGfiltered.fa for further analysis.
my $Usage = '
This script accepts a BLASTn output file that were blasted against Reference
genome, find out whether the best hit has a e value lower than the cutoff. If
yes, output query information. If no, the sequence will be kept for further analysis.

perl script <dir> <blast output file> <Ref genome taxonomy>
<dir> = directory that blast output file resides in, without last "/"
<blast output file> = name of the blastn output file
<Ref genome taxonomy> = Bacteria, Homo, Phage, Fungi, Mus, other

';

die $Usage unless scalar @ARGV == 3;
my ($dir, $blastout, $RefGenomeTaxonomy) = @ARGV;

# cutoff value for having a good hit, 1e-10 is a value that gives reasonable confidence
my $E_cutoff = 1e-10;

# create output file
my $outFile = $blastout;
$outFile =~ s/RefGblast\.out/RefGblast.parsed/;
$outFile = $dir."/".$outFile;
open (my $out_fh, ">", $outFile) or die "can not open file $outFile!\n";

my @keep = ();  # query should be kept for further analysis
my @known = (); # queries that are significantly similar to Reference sequences
my $total_records = 0;

my $input_file = $dir."/".$blastout;
my $report = Bio::SearchIO->new(-format => 'blast', -file => $input_file, -report_type => 'blastn');

# Go through BLAST reports one by one
while (my $result = $report->next_result) { # next query output
    # deals with situation where blast 1st report is empty
    next if $result->query_name eq "";

    $total_records++;

    # only the first hit matters; a query without any hit is kept as well
    my $hit = $result->next_hit;
    if ($hit && $hit->significance <= $E_cutoff) {
        print {$out_fh} $result->query_name, "\t", $result->query_length, "\t$RefGenomeTaxonomy\t$RefGenomeTaxonomy\t".$hit->name."\t".$hit->significance,"\n";
        push @known, $result->query_name;
    }
    else {
        push @keep, $result->query_name;
    }
}

# An empty report has zero records; report a ratio of 0 instead of dying on
# a division by zero.
my $kept_count = scalar @keep;
my $kept_ratio = $total_records ? $kept_count/$total_records : 0;
print {$out_fh} "# Summary: ", $kept_count, " out of $total_records ", $kept_ratio, " is saved for BLASTN analysis.\n";

close $out_fh or die "can not close file $outFile: $!\n";

# generate a fasta file that contains all the non-Reference sequences
# read in blastn input sequences
my $file = $blastout;
$file =~ s/\.RefGblast\.out//;
$file = $dir."/".$file.".fa";
my %seq = read_FASTA_data($file);

$outFile = $blastout;
$outFile =~ s/\.RefGblast\.out//;
$outFile = $dir."/".$outFile.".RefGfiltered.fa";
open (my $filtered_fh, ">", $outFile) or die "can not open file $outFile!\n";
foreach my $seq_name (@keep) {
    # deals with situation where blast 1st report is empty
    next if $seq_name eq "";

    print {$filtered_fh} ">$seq_name\n";
    print {$filtered_fh} $seq{$seq_name}, "\n";
}
close $filtered_fh or die "can not close file $outFile: $!\n";


exit;


############################################################################
# subroutines

# Read a FASTA file into a hash.
#   argument: path of the FASTA file
#   returns : a hash (as a list) mapping the first whitespace-delimited token
#             of each header to the sequence with all whitespace removed
#   dies    : when the file cannot be opened
sub read_FASTA_data {
    my $fastaFile = shift @_;

    # Read one FASTA record per iteration. "local" restores the previous
    # record separator automatically, even if we die half-way through.
    local $/ = ">";

    my %fastaSeq;
    open (my $fasta_fh, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile";

    while (my $line = <$fasta_fh>) {
        next if $line =~ /^\s*$/;   # discard blank records
        next if $line =~ /^\s*#/;   # discard comment records
        next if $line eq ">";       # the first read yields only the leading ">"

        chomp $line;                # strips the trailing ">" (current $/)
        my @rows = split (/\s/, $line);
        my $contigName = shift @rows;
        my $contigSeq = join("", @rows);
        $contigSeq =~ s/\s//g;      # remove white space
        $fastaSeq{$contigName} = $contigSeq;
    }

    close $fasta_fh;
    return %fastaSeq;
}


-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #### VirusScan version 1.1 #### 2 | 3 | Author: Song Cao 4 | 5 | Contact: scao@wustl.edu 6 | 7 | Released on Apr 25, 2016 8 | 9 | Please cite the following paper for VirusScan pipeline: 10 | 11 | Song Cao, Michael C. Wendl, Matthew A. Wyczalkowski, Kristine Wylie, Kai Ye, Reyka Jayasinghe, Mingchao Xie, Song Wu, Beifang Niu, Robert Grubb III, Kimberly J. Johnson, Hiram Gay, Ken Chen, Janet S. Rader, John F. Dipersio, Feng Chen, and Li Ding, Divergent viral presentation among human tumors and adjacent normal tissues, Scientific Reports, 2016, 6:28294. 12 | 13 | VirusScan pipeline is a fully automated and modular software package designed for the fast 14 | and accurate detection of known viruses from NGS data. It works on LSF job scheduler. 15 | 16 | It was developed from VirusHunter pipeline, which focuses on identification of novel viruses for 454 reads. 
17 | Compared to the VirusHunter pipeline, VirusScan can work on Illumina WGS, WES and RNA-Seq data and quickly return 18 | the discovery result of known viruses. 19 | 20 | ###Dependencies:### 21 | 22 | 23 | 24 | 1. RepeatMasker: Download and install RepeatMasker from http://www.repeatmasker.org/RMDownload.html. 25 | 26 | 2. BLAST Module: Download and install BLAST from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/. 27 | 28 | 3. MySQL DBI: See http://search.cpan.org/dist/DBI/. DBI may be included with your Linux distribution by default. This is a Perl module that allows Perl to interact directly with a MySQL database. 29 | 30 | 4. BioPerl: See http://bioperl.org/. BioPerl is used for parsing BLAST output files, and to construct the taxonomy lineage tree from a taxonomy ID. 31 | 32 | 5. NCBI nt database: Download NT database from ftp://ftp.ncbi.nih.gov/blast/db/. 33 | 34 | 6. Viral nt database: Download viral nt database from https://drive.google.com/open?id=0B-teklYT0wbDMEh6ZlhzMVo2QlE. 35 | 36 | 7. NCBI taxonomy database: Download NCBI taxonomy database from ftp://ftp.ncbi.nih.gov/pub/taxonomy/. 37 | 38 | 7.1. Create a directory to hold the taxonomy files, e.g. taxdump_2016_06_20. Download the taxdump.tar.gz file to the directory and type "tar -xzf taxdump.tar.gz" to untar the file. 39 | 40 | 7.2. Create MySQL Database for the taxonomy information 41 | 42 | Ask your MySQL database administrator to create a MySQL database for taxonomy information, and grant privileges on this database to a suitable username. 43 | For example, ask your MySQL database administrator to use the following commands to create a database named "vs_taxondb" and grant all privileges to the user "vs_taxonUser" with the password "vs_password". 44 | $ mysql -u root -p 45 | CREATE DATABASE vs_taxondb; 46 | GRANT ALL ON vs_taxondb.* TO 'vs_taxonUser'@'localhost' IDENTIFIED BY 'vs_password'; 47 | GRANT ALL ON vs_taxondb.* TO 'vs_taxonUser'@'%' IDENTIFIED BY 'vs_password'; 48 | QUIT; 49 | 50 | 7.3. 
Load the gi-taxid mapping into the database for nucleotide sequences: 51 | Download gi_taxid_nucl.dmp.gz to the directory 52 | Unzip the file 53 | 54 | Modify the script "import_gi_taxid_nucl.sql" to replace the full path to the gi_taxid_nucl.dmp file with the actual full path on your local system at the line "LOAD DATA LOCAL INFILE" in the script. 55 | The LOAD DATA INFILE statement reads rows from a text file into a table at a very high speed. The file name must be given as a literal string. 56 | 57 | Load the gi_taxid_nucl.dmp content into the MySQL database using the script "import_gi_taxid_nucl.sql" with the following command: 58 | cat import_gi_taxid_nucl.sql | mysql -h hostname --user=username databaseName --password=password & 59 | 60 | Warning: This can take a very long time. It is better to run it as a background task. 61 | 62 | ###Usage:### 63 | 64 | git clone https://github.com/ding-lab/VirusScan.git 65 | 66 | perl VirusScan.pl < run_folder > < step_number > 67 | 68 | run_folder: A folder that contains one sub-directory per sample, each holding the bam file of that sample: 69 | 70 | For example: 71 | 72 | work/sample1/sample1.bam 73 | 74 | work/sample2/sample2.bam 75 | 76 | Warning: The prefix of the name of the bam file should be the same as the name of the sample directory. 
77 | 78 | step_number: Integer from 1 to 14 (run a single step) or from 22 to 33 (run a range of steps), as listed below: 79 | 80 | [1] Extract unmapped non-human reads from the aligned bam file and map the extracted reads to the viral database 81 | 82 | [2] Split files for running RepeatMasker 83 | 84 | [3] Submit RepeatMasker job array 85 | 86 | [4] Sequence Quality Control 87 | 88 | [5] Split files for Blast Human Genome 89 | 90 | [6] Submit Blast Human Genome job array 91 | 92 | [7] Parse Human Genome Blast result 93 | 94 | [8] Pool and split files for BlastN 95 | 96 | [9] Submit BlastN job array 97 | 98 | [10] Parse BlastN result 99 | 100 | [11] Generate summary result for blastn output 101 | 102 | [12] Assignment report for each sample 103 | 104 | [13] Assignment summary for each sample 105 | 106 | [14] Generate report for the run 107 | 108 | [22] Run steps from 2 to 14 109 | 110 | [23] Run steps from 3 to 14 111 | 112 | [24] Run steps from 4 to 14 113 | 114 | [25] Run steps from 5 to 14 115 | 116 | [26] Run steps from 6 to 14 117 | 118 | [27] Run steps from 7 to 14 119 | 120 | [28] Run steps from 8 to 14 121 | 122 | [29] Run steps from 9 to 14 123 | 124 | [30] Run steps from 10 to 14 125 | 126 | [31] Run steps from 11 to 14 127 | 128 | [32] Run steps from 12 to 14 129 | 130 | [33] Run steps from 13 to 14 131 | 132 | -------------------------------------------------------------------------------- /SequenceQualityControl.pl: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/perl 3 | use strict; 4 | 5 | my $usage = ' 6 | This script will check each .masked file in the given directory. 7 | Some sequences have only/lots of Ns because masked by RepeatMasker. 
1) Sequences that do not have greater than 40 nt of consecutive
sequence without N will be put into file .fa.cdhit_out.masked.badSeq
2) Sequences with >= 40% of total length of being masked will be put
into file .fa.cdhit_out.masked.RepeatLowComplexSeq

perl script <sample dir>
<sample dir> = full path of the folder holding files for this sample
without last "/"

';
die $usage unless scalar @ARGV == 1;
my ( $dir ) = @ARGV;
my $percent_masked_cutoff = 0.4;

# the sample (library) name is the last component of the directory path
my @fields = split(/\//, $dir);
my $libName = $fields[$#fields];

my $total_seq = 0;
my $good_seq = 0;
my $bad_seq = 0;
my $RepeatLowComplex_seq = 0;
my $OutFile1 = $dir."/".$libName.".fa.cdhit_out.masked.goodSeq";
my $OutFile2 = $dir."/".$libName.".fa.cdhit_out.masked.badSeq";
my $OutFile3 = $dir."/".$libName.".fa.cdhit_out.masked.RepeatLowComplexSeq";

open (my $good_fh, ">", $OutFile1) or die "can not open $OutFile1\n";
open (my $bad_fh, ">", $OutFile2) or die "can not open $OutFile2\n";
open (my $repeat_fh, ">", $OutFile3) or die "can not open $OutFile3\n";

opendir(my $sample_dh, $dir) or die "Can not open dir $dir!\n";
foreach my $name (readdir $sample_dh) {
    # only RepeatMasker output directories are of interest
    next unless $name =~ /\.cdhit_out_RepeatMasker$/;

    my $full_path = $dir."/".$name;
    opendir(my $masker_dh, $full_path) or die "can not open dir $full_path!\n";
    foreach my $file (readdir $masker_dh) {
        next unless $file =~ /\.masked$/; # masked sequence

        my $maskedFile = $full_path."/".$file;
        my %seq = ();
        print $maskedFile,"\n";
        read_FASTA_data($maskedFile, \%seq);

        foreach my $read_id (keys %seq) {
            $total_seq++;
            my $seq_temp = $seq{$read_id};

            # a usable read needs at least 40 contiguous unmasked bases
            if ($seq_temp !~ /[ACTG]{40,}/) {
                print {$bad_fh} ">$read_id\n";
                print {$bad_fh} "$seq{$read_id}\n";
                $bad_seq++;
                next;
            }

            # tr/N/N/ counts the masked positions without modifying the read;
            # the length cannot be 0 here because the match above succeeded
            my $length_masked = ($seq_temp =~ tr/N/N/);
            my $length_total = length $seq_temp;
            my $percent_masked = $length_masked/$length_total;

            if ($percent_masked >= $percent_masked_cutoff) {
                print {$repeat_fh} ">$read_id\n";
                print {$repeat_fh} $seq{$read_id}, "\n";
                $RepeatLowComplex_seq++;
            }
            else {
                print {$good_fh} ">$read_id\n";
                print {$good_fh} $seq{$read_id}, "\n";
                $good_seq++;
            }
        }
    }
    closedir $masker_dh;
}
closedir $sample_dh;

# the run statistics are appended to the badSeq and RepeatLowComplexSeq files
print {$bad_fh} "total unique seq = $total_seq\n";
print {$bad_fh} "good seq = $good_seq\n";
print {$bad_fh} "bad seq = $bad_seq\n";
print {$bad_fh} "Repeat and Low complexicity seq = $RepeatLowComplex_seq\n";


print {$repeat_fh} "total unique seq = $total_seq\n";
print {$repeat_fh} "Repeat and Low complexicity seq = $RepeatLowComplex_seq\n";

close($good_fh) or die "can not close $OutFile1: $!\n";
close($bad_fh) or die "can not close $OutFile2: $!\n";
close($repeat_fh) or die "can not close $OutFile3: $!\n";

exit;

############################################################################
# Read a FASTA file into the hash referenced by the second argument.
#   $fastaFile  path of the FASTA file
#   $hash_ref   hash ref filled with: first whitespace-delimited token of the
#               header line => sequence with all whitespace removed
# Dies when the file cannot be opened. Returns nothing.
sub read_FASTA_data {
    my ($fastaFile, $hash_ref) = @_;

    # Read one FASTA record per iteration. "local" restores the previous
    # record separator automatically, even if we die half-way through.
    local $/ = ">";

    open (my $fasta_fh, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile";
    while (my $line = <$fasta_fh>) {
        next if $line =~ /^\s*$/;   # discard blank records
        # Discard comment records. The pattern is anchored like in the other
        # parsers of this pipeline: the unanchored /#/ silently dropped every
        # read whose header merely contained a "#".
        next if $line =~ /^\s*#/;
        next if $line eq ">";       # the first read yields only the leading ">"

        chomp $line;                # strips the trailing ">" (current $/)
        my @rows = split (/\n/m, $line);
        my $seqName = shift @rows;
        my @temp = split (/\s/, $seqName);
        $seqName = shift @temp;     # keep only the read id of the header line
        my $Seq = join("", @rows);
        $Seq =~ s/\s//g;            # remove white space
        $hash_ref->{$seqName} = $Seq;
    }

    close $fasta_fh;
    return;
}
-------------------------------------------------------------------------------- /VirusScan.pl: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/perl 3 | use strict; 4 | use warnings; 5 | #use POSIX; 6 | 7 | #color code 8 | my $red = "\e[31m"; 9 | my $gray = "\e[37m"; 10 | my $yellow = "\e[33m"; 11 | my $green = "\e[32m"; 12 | my $purple = "\e[35m"; 13 | my $cyan = "\e[36m"; 14 | my $normal = "\e[0m"; 15 | 16 | #usage information 17 | (my $usage = < $normal 21 | 22 | = full path of the folder holding files for this sequence run 23 | 24 | run this pipeline step by step. (running the whole pipeline if step number is 0) 25 | 26 | $green [1] Run bwa 27 | $red [2, or <=22] Split files for RepeatMasker 28 | [3 or <=23] Submit RepeatMasker job array 29 | $yellow [4 or <=24] Sequence Qulity Control 30 | $green [5 or <=25] Split files for Blast Reference Genome 31 | [6 or <=26] Submit Blast Reference Genome job array 32 | [7 or <=27] Parse Reference Genome Blast result 33 | $gray [8 or <=28] Pool and split files for BlastN 34 | [9 or <=29] Submit BlastN job array 35 | [10 or <=30] Parse BlastN result 36 | [11 or <=31] Get summary of BlastN 37 | $purple [12 or <=32] Assignment report for each sample 38 | [13 or <=33] Assignment summary for each sample 39 | [14 or <=34] Generate report for the run 40 | $normal 41 | OUT 42 | 43 | die $usage unless @ARGV == 2; 44 | my ( $run_dir, $step_number ) = @ARGV; 45 | if ($run_dir =~/(.+)\/$/) { 46 | $run_dir = $1; 47 | } 48 | 49 | die $usage unless ($step_number >=0)&&(($step_number <= 17) || ($step_number >= 22)); 50 | 51 | 52 | ##################################################################################### 53 | # values need to be modified to adapt to 
local environment 54 | my $email = "scao\@wustl\.edu"; 55 | 56 | # software path 57 | #my $cd_hit = "/gscuser/mboolcha/software/cdhit/cd-hit-est"; 58 | my $repeat_masker = "RepeatMasker"; 59 | my $blastn = "/gscuser/scao/tools/ncbi-blast+/bin/blastn"; 60 | #my $blastx = "/gscuser/scao/tools/software/ncbi-blast+/bin/blastx"; 61 | 62 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 63 | # path and name of databases 64 | #my $db_BN = "/gscuser/scao/gc3027/nt/nt"; 65 | #my $db_BX = "/gscuser/scao/gc3027/nr/nr"; 66 | #my $bwa_ref = "/gscuser/scao/gc3027/fasta/virus/virusdb_082414.fa"; 67 | 68 | my $db_BN = "/gscmnt/gc3027/dinglab/medseq/nt/nt"; 69 | my $db_BX = "/gscmnt/gc3027/dinglab/medseq/nr/nr"; 70 | my $bwa_ref = "/gscmnt/gc3027/dinglab/medseq/fasta/nt012414_RE_Split/nt012414_virus_abbr_cdhit98.fa"; 71 | 72 | # reference genome taxonomy classification and database location. 73 | # It's better to change $refrence_genome_taxonomy and $reference_genome based on the data being analyzed. 74 | my $refrence_genome_taxonomy = ""; 75 | my $reference_genome = ""; 76 | 77 | #if ($ref_genome_choice == 1) { 78 | # $refrence_genome_taxonomy = "Homo"; # use Bacteria, Homo, Phage, Fungi, Mus, other 79 | 80 | # path to the reference genome 81 | # $reference_genome = "/gscmnt/gc3027/dinglab/medseq/human70.37/humandnacdna.fa"; 82 | #} 83 | 84 | $refrence_genome_taxonomy = "Homo"; 85 | 86 | $reference_genome = "/gscmnt/gc3027/dinglab/medseq/human70.37/humandnacdna.fa"; 87 | 88 | ##################################################################################### 89 | # everything else below should be automated 90 | my $HOME = $ENV{HOME}; 91 | my $working_name= (split(/\//,$run_dir))[-2]; 92 | 93 | # To run jobs faster, split large fasta files to small ones. 
Split to specific number of 94 | # files instead of specific sequences in each small file, because the number of job array 95 | # cannot be determined if spliting to specific number of sequences in each file. Job 96 | # number is required by qsub ${SGE_TASK_ID}. The minimum size of each file is 4kb. 97 | # The number of files should be determined accourding to CPUs available in the computer 98 | # cluster. 99 | 100 | # The number of small fasta files to split to from a large file for RepeatMasker 101 | my $file_number_of_RepeatMasker = 100; #default 102 | # the number of small fasta files to split to from a large file for Blast_Reference_Genome 103 | my $file_number_of_Blast_Ref_Genome = 100; #default 104 | # the number of small fasta files to split to from a large file for Blast_N 105 | my $file_number_of_Blast_N = 100; #default 106 | # the number of small fasta files to split to from a large file for Blast_X 107 | #my $file_number_of_Blast_X = 200; #default 108 | 109 | #store job files here 110 | my $HOME1="/gscmnt/gc2524/dinglab"; 111 | 112 | #store job files here 113 | if (! -d $HOME1."/tmp") { 114 | `mkdir $HOME1"/tmp"`; 115 | } 116 | my $job_files_dir = $HOME1."/tmp"; 117 | 118 | #store SGE output and error files here 119 | if (! 
-d $HOME1."/SGE_DIR") { 120 | `mkdir $HOME1"/SGE_DIR"`; 121 | } 122 | my $lsf_file_dir = $HOME1."/SGE_DIR"; 123 | 124 | # obtain script path 125 | my $run_script_path = `dirname $0`; 126 | chomp $run_script_path; 127 | $run_script_path = "/usr/bin/perl ".$run_script_path."/"; 128 | 129 | my $hold_RM_job = "norm"; 130 | my $current_job_file = "";#cannot be empty 131 | my $hold_job_file = ""; 132 | my $bsub_com = ""; 133 | my $sample_full_path = ""; 134 | my $sample_name = ""; 135 | 136 | #directory suffix constants 137 | my $REPEAT_MASKER_DIR_SUFFIX = "fa.cdhit_out_RepeatMasker"; 138 | my $BLAST_RefG_DIR_SUFFIX = "fa.cdhit_out.masked.goodSeq_RefGblast"; 139 | my $BLAST_NT_DIR_SUFFIX = "RefGfiltered_BLASTN"; 140 | my $BLASTX_NR_DIR_SUFFIX = "BNFiltered_BLASTX_NR"; 141 | 142 | # get sample list in the run, name should not contain "." 143 | opendir(DH, $run_dir) or die "Cannot open dir $run_dir: $!\n"; 144 | my @sample_dir_list = readdir DH; 145 | close DH; 146 | 147 | # check to make sure the input directory has correct structure 148 | &check_input_dir($run_dir); 149 | 150 | # start data processsing 151 | if ($step_number < 14 || $step_number>=22) { 152 | #begin to process each sample 153 | for (my $i=0;$i<@sample_dir_list;$i++) {#use the for loop instead. 
the foreach loop has some problem to pass the global variable $sample_name to the sub functions 154 | $sample_name = $sample_dir_list[$i]; 155 | if (!($sample_name =~ /\./)) { 156 | $sample_full_path = $run_dir."/".$sample_name; 157 | if (-d $sample_full_path) { # is a full path directory containing a sample 158 | print $yellow, "\nSubmitting jobs for the sample ",$sample_name, "...",$normal, "\n"; 159 | $current_job_file=""; 160 | if ($step_number == 0 || $step_number>=22) {#run the whole pipeline 161 | ###################################################################### 162 | #cd-hit 163 | if($step_number==0) 164 | { &bsub_bwa();} 165 | 166 | ###################################################################### 167 | #RepeatMasker 168 | #split file for RepeatMasker 169 | #my $f_fa=$sample_full_path.".fa"; 170 | #if(! -f $f_fa) { next; } 171 | 172 | if($step_number<=22) 173 | { 174 | &split_for_RepeatMasker(); } 175 | 176 | #submit RepeatMasker job array 177 | if($step_number<=23) 178 | { 179 | &submit_job_array_RM(); 180 | $hold_RM_job=$current_job_file; # to limit number repeatmasker jobs run in the cluster at the same time. Can be removed if the cluster is able to handle the volumn of data input/output. 
181 | } 182 | ###################################################################### 183 | #Sequence Quality Control 184 | if($step_number<=24) 185 | { &seq_QC();} 186 | 187 | ###################################################################### 188 | #BLASTn against Reference Genome 189 | if($step_number<=25) 190 | { 191 | &split_for_blast_RefG();} 192 | 193 | #submit Blast RefG job array 194 | if($step_number<=26) 195 | { 196 | &submit_job_array_blast_RefG();} 197 | 198 | if($step_number<=27) 199 | { 200 | #parser Blast RefG file 201 | &parse_blast_RefG();} 202 | 203 | 204 | ###################################################################### 205 | #BLASTn against nt 206 | #pool and split files for BLASTn 207 | if($step_number<=28) 208 | { 209 | &pool_split_for_blast_N();} 210 | 211 | #submit BLASTn job array 212 | if($step_number<=29) 213 | { 214 | &submit_job_array_blast_N();} 215 | 216 | #parser BLASTn output file 217 | if($step_number<=30) 218 | { 219 | &parse_blast_N();} 220 | 221 | if($step_number<=31) 222 | { 223 | &blast_S();} 224 | 225 | if($step_number<=32){ 226 | &report_for_each_sample();} 227 | 228 | #Assignment summary for each sample 229 | if($step_number<=33) { 230 | &summary_for_each_sample();} 231 | 232 | ###################################################################### 233 | #run the pipeline step by step 234 | }elsif ($step_number == 1) { 235 | &bsub_bwa(); 236 | }elsif ($step_number == 2) { 237 | &split_for_RepeatMasker(1); 238 | }elsif ($step_number == 3) { 239 | &submit_job_array_RM(1); 240 | $hold_RM_job=$current_job_file; # to limit number of repeatmasker jobs 241 | }elsif ($step_number == 4) { 242 | &seq_QC(1); 243 | }elsif ($step_number == 5) { 244 | &split_for_blast_RefG(1); 245 | }elsif ($step_number == 6) { 246 | &submit_job_array_blast_RefG(1); 247 | }elsif ($step_number == 7) { 248 | &parse_blast_RefG(1); 249 | }elsif ($step_number == 8) { 250 | &pool_split_for_blast_N(1); 251 | }elsif ($step_number == 9) { 252 | 
&submit_job_array_blast_N(1); 253 | }elsif ($step_number == 10) { 254 | &parse_blast_N(1); 255 | }elsif ($step_number == 11) { 256 | &blast_S(1); 257 | }elsif ($step_number == 12) { 258 | &report_for_each_sample(1); 259 | }elsif ($step_number == 13) { 260 | &summary_for_each_sample(1); 261 | } 262 | } 263 | } 264 | } 265 | } 266 | 267 | ########################################################################################## 268 | # generate report for the run 269 | if (($step_number == 0) || ($step_number == 14) || ($step_number>=22)) { 270 | 271 | print $yellow, "Submitting jobs for generating the report for the run ....",$normal, "\n"; 272 | $hold_job_file=$current_job_file; 273 | $current_job_file = "Run_report_".$$.".sh"; 274 | open(REPRUN, ">$job_files_dir/$current_job_file") or die $!; 275 | print REPRUN "#!/bin/bash\n"; 276 | print REPRUN "#BSUB -n 1\n"; 277 | print REPRUN "#BSUB -R \"rusage[mem=40000]\"","\n"; 278 | print REPRUN "#BSUB -M 40000000\n"; 279 | #print REPRUN "#BSUB -q ding-lab\n"; 280 | print REPRUN "#BSUB -o $lsf_file_dir","/","$current_job_file.out\n"; 281 | print REPRUN "#BSUB -e $lsf_file_dir","/","$current_job_file.err\n"; 282 | print REPRUN "#BSUB -J $current_job_file\n"; 283 | print REPRUN "#BSUB -w \"$hold_job_file\"","\n"; 284 | 285 | print REPRUN "BAD_SEQ=fa.cdhit_out.masked.badSeq\n"; #output of RepeatMasker 286 | print REPRUN "BAD_SEQ=fa.cdhit_out.masked.badSeq\n"; #output of RepeatMasker 287 | 288 | print REPRUN "OUTPUT=".$run_dir."/Analysis_Report_gi_".$working_name."\n"; 289 | 290 | print REPRUN 'if [ -f $OUTPUT ] ',"\n"; # file exist 291 | print REPRUN "then\n"; 292 | print REPRUN ' grep "# Finished" ${OUTPUT}',"\n"; 293 | print REPRUN ' CHECK=$?',"\n"; 294 | print REPRUN ' while [ ${CHECK} -eq 1 ] ',"\n"; # grep unsuccessful, file not finish 295 | print REPRUN " do\n"; 296 | print REPRUN " ".$run_script_path."generate_final_report_gi.pl ".$run_dir." 
".$version,"\n"; 297 | print REPRUN ' grep "# Finished" ${OUTPUT}',"\n"; 298 | print REPRUN ' CHECK=$?',"\n"; 299 | print REPRUN " done\n"; 300 | print REPRUN "else\n"; # file does not exist 301 | print REPRUN " ".$run_script_path."generate_final_report_gi.pl ".$run_dir." ".$version,"\n"; 302 | print REPRUN ' grep "# Finished" ${OUTPUT}',"\n"; 303 | print REPRUN ' CHECK=$?',"\n"; 304 | print REPRUN ' while [ ${CHECK} -eq 1 ] ',"\n"; # grep unsuccessful, file not finish 305 | print REPRUN " do\n"; 306 | print REPRUN " ".$run_script_path."generate_final_report_gi.pl ".$run_dir." ".$version,"\n"; 307 | print REPRUN ' grep "# Finished" ${OUTPUT}',"\n"; 308 | print REPRUN ' CHECK=$?',"\n"; 309 | print REPRUN " done\n"; 310 | print REPRUN "fi\n"; 311 | close REPRUN; 312 | close REPRUN; 313 | $bsub_com = "bsub < $job_files_dir/$current_job_file\n"; 314 | #$bsub_com = "qsub -V -P long -hold_jid $working_name -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n"; 315 | system ($bsub_com); 316 | 317 | } 318 | 319 | ####################################################################### 320 | # send email to notify the finish of the analysis 321 | if (($step_number == 0) || ($step_number == 15) || ($step_number>=22)) { 322 | print $yellow, "Submitting the job for sending an email when the run finishes ",$sample_name, "...",$normal, "\n"; 323 | $hold_job_file = $current_job_file; 324 | $current_job_file = "Email_run_".$$.".sh"; 325 | open(EMAIL, ">$job_files_dir/$current_job_file") or die $!; 326 | print EMAIL "#!/bin/bash\n"; 327 | print EMAIL "#BSUB -n 1\n"; 328 | print EMAIL "#BSUB -o $lsf_file_dir","\n"; 329 | print EMAIL "#BSUB -e $lsf_file_dir","\n"; 330 | print EMAIL "#BSUB -J $current_job_file\n"; 331 | print EMAIL "#BSUB -w \"$hold_job_file\"","\n"; 332 | print EMAIL $run_script_path."send_email.pl ".$run_dir." 
".$email."\n"; 333 | close EMAIL; 334 | $bsub_com = "bsub < $job_files_dir/$current_job_file\n"; 335 | #$bsub_com = "qsub -V -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n"; 336 | system ($bsub_com); 337 | } 338 | ####################################################################### 339 | if ($step_number == 0) { 340 | print $green, "All jobs are submitted! You will get email notification when this run is completed.\n",$normal; 341 | } 342 | 343 | exit; 344 | 345 | 346 | ######################################################################## 347 | # subroutines 348 | 349 | sub check_input_dir { 350 | my ($input_dir) = @_; 351 | my $have_input_sample = 0; 352 | 353 | # get sample list in the run, name should not contain "." 354 | opendir(DH, $input_dir) or die "Cannot open dir $input_dir: $!\n"; 355 | my @sample_list = readdir DH; 356 | close DH; 357 | 358 | for (my $i=0;$i<@sample_list;$i++) {#use the for loop instead. the foreach loop has some problem to pass the global variable $sample_name to the sub functions 359 | $sample_name = $sample_list[$i]; 360 | if (!($sample_name =~ /\./)&&!($sample_name =~/Analysis_/)) { 361 | $have_input_sample = 1; 362 | $sample_full_path = $input_dir."/".$sample_name; 363 | if (-d $sample_full_path) { # is a full path directory containing a sample 364 | my $input_file = $input_dir."/".$sample_name."/".$sample_name.".bam"; 365 | if (!(-e $input_file)) { # input file does not exist 366 | print $red, "Do not have appropriate input directory structure. Please check your command line argument!", $normal, "\n\n"; 367 | die; 368 | } 369 | } 370 | else { # input sample directory does not exist 371 | print $red, "Do not have appropriate input directory structure. 
Please check your command line argument!", $normal, "\n\n";
                die;
            }
        }
    }

    if (!($have_input_sample)) { # does not have any input sample directory
        print $red, "Do not have appropriate input directory structure. Please check your command line argument!", $normal, "\n\n";
        die;
    }

}

########################################################################
########################################################################
# Private helpers shared by the job-submitting subroutines below.
# They read and write the same file-level variables the original code used
# ($current_job_file, $hold_job_file, $bsub_com, $job_files_dir, $lsf_file_dir).

# Name of the job the next script has to wait for: none when a single step is
# run on its own, otherwise the job that was submitted just before it.
# Must be called before $current_job_file is reassigned.
sub _hold_job_for_step {
    my ($step_by_step) = @_;
    return $step_by_step ? "" : $current_job_file;
}

# Open a new job script under $job_files_dir for writing and return its handle.
sub _open_job_script {
    my ($file_name) = @_;
    my $path = "$job_files_dir/$file_name";
    open(my $fh, '>', $path) or die "Cannot write job script $path: $!\n";
    return $fh;
}

# Shell text for the "#!/bin/bash" line plus the #BSUB directives of a job.
# Options: mem (rusage value; the -M limit is mem*1000), single_host (adds
# span[hosts=1]), array_size (turns the job into an array 1..N) and
# wait_for_hold (adds a -w dependency on $hold_job_file).
sub _job_header {
    my (%opt) = @_;
    my $mem      = $opt{mem};
    my $resource = ($opt{single_host} ? "span[hosts=1] " : "") . "rusage[mem=$mem]";
    my $job_name = $current_job_file . ($opt{array_size} ? "[1-$opt{array_size}]" : "");

    my $header = "#!/bin/bash\n"
        . "#BSUB -n 1\n"
        . "#BSUB -R \"$resource\"\n"
        . "#BSUB -M ${mem}000\n"
        . "#BSUB -o $lsf_file_dir/$current_job_file.out\n"
        . "#BSUB -e $lsf_file_dir/$current_job_file.err\n"
        . "#BSUB -J $job_name\n";
    # NOTE(review): in step-by-step mode $hold_job_file is "" and this emits
    # an empty dependency (-w ""), exactly as the original did - confirm that
    # the local LSF accepts it.
    $header .= "#BSUB -w \"$hold_job_file\"\n" if $opt{wait_for_hold};
    return $header;
}

# Shell text that runs $check_cmd once and then repeats $run_cmd followed by
# $check_cmd for as long as the check exits with $fail_code.
sub _check_and_retry {
    my ($indent, $run_cmd, $check_cmd, $fail_code) = @_;
    return <<"END_RETRY";
${indent}$check_cmd
${indent}CHECK=\$?
${indent}while [ \${CHECK} -eq $fail_code ]
${indent}do
${indent}    $run_cmd
${indent}    $check_cmd
${indent}    CHECK=\$?
${indent}done
END_RETRY
}

# Finish writing the current job script and feed it to bsub on standard input.
# A failed submission is reported instead of being silently ignored.
sub _close_and_submit {
    my ($fh) = @_;
    close($fh) or die "Cannot finish writing $job_files_dir/$current_job_file: $!\n";
    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    system($bsub_com) == 0
        or warn "bsub did not accept $job_files_dir/$current_job_file (wait status $?)\n";
    return;
}

########################################################################
########################################################################
# Step 1: write and submit the job that pulls candidate reads out of
# <sample>.bam, aligns them against $bwa_ref with bwa aln/samse, keeps the reads
# mapped to "gi..." references in <sample>.mapped.reads, and converts those to
# <sample>.fa and <sample>.fa.cdhit_out.  When <sample>.mapped.reads already
# exists and is non-empty only the two conversion scripts are rerun.
# Dies when the input bam file is missing or empty.
sub bsub_bwa{

    $current_job_file = "j1_bwa_".$sample_name.$$.".sh";

    my $sample_prefix = $sample_full_path."/".$sample_name;
    my $IN_bam        = $sample_prefix.".bam";
    my $banner        = "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n";

    if (! -e $IN_bam) { # make sure there is an input bam file
        print $red, $banner;
        print "Warning: Died because there is no input bam file for bwa:\n";
        print "File $IN_bam does not exist!\n";
        die "Please check command line argument!", $normal, "\n\n";
    }
    if (! -s $IN_bam) { # make sure the input bam file is not empty
        print $red, $banner;
        die "Warning: Died because $IN_bam is empty!", $normal, "\n\n";
    }

    # Read filter applied to the bam file, used once for "bwa aln" and once for
    # "bwa samse" because the fastq data travels through a named pipe.
    # Kept are header lines plus primary, non-hard-clipped records that are
    # unmapped (0x4), soft-clipped (S) or already aligned to a "gi" reference.
    #   0x100: secondary alignment   0x800: supplementary alignment
    #   H: hard clipping             S: soft clipping
    my $extract_reads = "samtools view -h \${BWA_IN} | perl -ne \'\$line=\$_; \@ss=split(\"\\t\",\$line); \$flag=\$ss[1]; \$cigar=\$ss[5]; if(\$ss[0]=~/^\@/ || (!((\$flag & 0x100) || (\$flag & 0x800) || (\$cigar=~/H/)) && ((\$flag & 0x4) || (\$cigar=~/S/))) || (!((\$flag & 0x100) || (\$flag & 0x800) || (\$cigar=~/H/)) && (\$ss[2]=~/^gi/))) { print \$line;}\' | samtools view -Sb - | bamtools convert -format fastq";

    # Keep only the alignments whose reference name starts with "gi".
    my $keep_gi_hits = "grep -v \@SQ | perl -ne \'\$line=\$_; \@ss=split(\"\\t\",\$line); if(\$ss[2]=~/^gi/) { print \$line; }\'";

    my $to_fasta = "    ".$run_script_path."get_fasta_from_bam_filter.pl \${BWA_mapped} \${BWA_fa}\n"
                 . "    ".$run_script_path."trim_readid.pl \${BWA_fa} \${BWA_fa}.cdhit_out\n";

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(mem => 20000);
    print {$job}
        "BWA_IN=$sample_prefix.bam\n",
        "BWA_fq=$sample_prefix.fq\n",
        "BWA_sai=$sample_prefix.sai\n",
        "BWA_mapped=$sample_prefix.mapped.reads\n",
        "BWA_fa=$sample_prefix.fa\n";
    print {$job}
        'if [ ! -s $BWA_mapped ]', "\n",
        "then\n",
        # -f: on a first run neither leftover exists and plain rm only adds noise
        "    rm -f \${BWA_sai} \${BWA_fq}\n",
        "    mkfifo \${BWA_fq}\n",
        "    $extract_reads > \${BWA_fq} &\n",
        "    bwa aln $bwa_ref \${BWA_fq} > \${BWA_sai}\n",
        "    rm \${BWA_fq}\n",
        "    mkfifo \${BWA_fq}\n",
        "    $extract_reads > \${BWA_fq} &\n",
        "    bwa samse $bwa_ref \${BWA_sai} \${BWA_fq} | $keep_gi_hits > \${BWA_mapped}\n",
        $to_fasta,
        "    rm \${BWA_sai}\n",
        "    rm \${BWA_fq}\n",
        "else\n",
        $to_fasta,
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################
# Step 2: write and submit the job that splits <sample>.fa.cdhit_out into
# $file_number_of_RepeatMasker pieces inside the RepeatMasker directory.  The
# split is repeated while check_split_cdhit.pl exits with 10.
# $step_by_step: true when the step is run on its own (no job dependency).
sub split_for_RepeatMasker {
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j2_".$sample_name."_RM_split_".$$.".sh";

    my $split_cmd = $run_script_path."split_fasta.pl -i ".$sample_full_path."/".$sample_name.".fa.cdhit_out -o \${RM_DIR} -n $file_number_of_RepeatMasker -p ".$sample_name.".fa.cdhit_out_file";
    my $check_cmd = $run_script_path."check_split_cdhit.pl \${SAMPLE_DIR}";

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(mem => 10000, wait_for_hold => 1);
    # RMSPLIT_IN used to be written in front of "#BSUB -w".  It now follows the
    # directive block, as in every other job script of this file, so that the
    # dependency cannot be lost behind a shell statement.
    print {$job}
        "RMSPLIT_IN=".$sample_full_path."/".$sample_name.".fa\n",
        "RM_DIR=".$sample_full_path."/".$sample_name.".$REPEAT_MASKER_DIR_SUFFIX\n",
        "SAMPLE_DIR=".$sample_full_path."\n\n";
    print {$job}
        "if [ ! -d \${RM_DIR} ]\n",
        "then\n",
        "    mkdir \${RM_DIR}\n",
        "    $split_cmd\n",
        _check_and_retry("    ", $split_cmd, $check_cmd, 10),
        "else\n", # directory already exists: only verify, split again if needed
        _check_and_retry("    ", $split_cmd, $check_cmd, 10),
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################
# Step 3: write and submit the RepeatMasker job array, one array element per
# split file (selected with ${LSB_JOBINDEX}).
# $step_by_step: true when the step is run on its own (no job dependency).
sub submit_job_array_RM {
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j3_".$sample_name."_RM_".$$.".sh";

    my $chunk = "\${RM_dir}/".$sample_name.".fa.cdhit_out_file\${LSB_JOBINDEX}.fa";

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(
        mem           => 10000,
        single_host   => 1,
        array_size    => $file_number_of_RepeatMasker,
        wait_for_hold => 1,
    );
    print {$job}
        "RM_IN=".$sample_full_path."/".$sample_name.".fa\n",
        "RM_dir=".$sample_full_path."/".$sample_name.".$REPEAT_MASKER_DIR_SUFFIX\n",
        "RMOUT=$chunk.masked\n",
        "RMIN=$chunk\n",
        "RMOTHER=$chunk.out\n\n";
    # Generated logic, only when the split file exists:
    #  1. no non-empty .masked file yet  -> run RepeatMasker once
    #  2. no .out file yet               -> rerun RepeatMasker until there is one
    #  3. still no .masked file          -> RepeatMasker found nothing to mask,
    #                                       so the input is copied unchanged
    print {$job} <<"END_RM";
if [ -f \$RMIN ]
then
    if [ ! -s \$RMOUT ]
    then
        $repeat_masker -pa 4 \$RMIN
    fi

    if [ ! -f \$RMOTHER ]
    then
        while [ ! -f \$RMOTHER ]
        do
            $repeat_masker -pa 4 \$RMIN
        done
    fi

    if [ ! -f \$RMOUT ]
    then
        cp \${RMIN} \${RMOUT}
    fi
fi
END_RM
    _close_and_submit($job);
}

#####################################################################################
# Step 4: write and submit the sequence quality control job, which runs
# SequenceQualityControl.pl on the sample directory and verifies the result with
# check_SequenceQualityControl.pl (exit code 10 means "not completed").
# $step_by_step: true when the step is run on its own (no job dependency).
sub seq_QC {
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j4_".$sample_name."_QC_".$$.".sh";

    my $qc_cmd    = $run_script_path."SequenceQualityControl.pl ".$sample_full_path;
    my $check_cmd = $run_script_path."check_SequenceQualityControl.pl \${SAMPLE_DIR}";

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(mem => 10000, wait_for_hold => 1);
    print {$job}
        "SAMPLE_DIR=".$sample_full_path."\n",
        "QC_OUT=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq\n\n",
        "f_fa=".$sample_full_path."/".$sample_name.".fa\n";
    print {$job}
        # The closing "]" needs a blank in front of it.  Without it "[" fails
        # with "missing ]" and this branch could never be taken.
        'if [ ! -f $QC_OUT ] && [ -s $f_fa ]', "\n",
        "then\n",
        "    $qc_cmd\n",
        _check_and_retry("    ", $qc_cmd, $check_cmd, 10),
        "else\n",
        # Output exists already (or there is no input): verify and rerun the
        # QC at most once - CHECK=1 ends the loop whatever the check says.
        "    $check_cmd\n",
        '    CHECK=$?', "\n",
        '    while [ ${CHECK} -eq 10 ]', "\n",
        "    do\n",
        "        $qc_cmd\n",
        "        $check_cmd\n",
        '        CHECK=$?', "\n",
        '        CHECK=1', "\n",
        "    done\n",
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################
# Step 5: write and submit the job that splits the QC-passed sequences
# (<sample>.fa.cdhit_out.masked.goodSeq) into $file_number_of_Blast_Ref_Genome
# pieces for the reference genome BLAST.  The split is repeated while
# check_split_RefG.pl exits with 10.
# $step_by_step: true when the step is run on its own (no job dependency).
sub split_for_blast_RefG{
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j5_".$sample_name."_RefG_split_".$$.".sh";

    my $split_cmd = $run_script_path."split_fasta.pl -i ".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq -o \${RefG_DIR} -n $file_number_of_Blast_Ref_Genome -p ".$sample_name.".fa.cdhit_out.masked.goodSeq_file";
    my $check_cmd = $run_script_path."check_split_RefG.pl \${SAMPLE_DIR}";

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(mem => 10000, wait_for_hold => 1);
    print {$job}
        "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n",
        "SAMPLE_DIR=".$sample_full_path."\n\n";
    print {$job}
        'if [ ! -d $RefG_DIR ]', "\n",
        "then\n",
        "    mkdir \${RefG_DIR}\n",
        "    $split_cmd\n",
        _check_and_retry("    ", $split_cmd, $check_cmd, 10),
        "else\n", # directory already exists: only verify, split again if needed
        _check_and_retry("    ", $split_cmd, $check_cmd, 10),
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################
# Step 6: write and submit the job array that BLASTs every split file against
# $reference_genome.  An output file counts as complete when "Matrix" shows up
# in its last 10 lines; blastn is rerun until that is the case.
# $step_by_step: true when the step is run on its own (no job dependency).
sub submit_job_array_blast_RefG{
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j6_".$sample_name."_BRefG_".$$.".sh";

    my $chunk     = "\${RefG_DIR}/".$sample_name.".fa.cdhit_out.masked.goodSeq_file\${LSB_JOBINDEX}";
    my $blast_cmd = "$blastn -evalue 1e-9 -show_gis -num_threads 4 -num_descriptions 2 -num_alignments 2 -query \${QUERY} -out \${BlastRefGOUT} -db $reference_genome";
    my $check_cmd = 'tail -10 ${BlastRefGOUT}|grep Matrix';

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(
        mem           => 20000,
        single_host   => 1,
        array_size    => $file_number_of_Blast_Ref_Genome,
        wait_for_hold => 1,
    );
    print {$job}
        "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n",
        "BlastRefGOUT=$chunk.RefGblast.out\n",
        "QUERY=$chunk.fa\n\n";
    print {$job}
        'if [ -s $QUERY ]', "\n", # skip empty split files
        "then\n",
        '    if [ ! -f $BlastRefGOUT ]', "\n",
        "    then\n", # no output yet: blast, then verify
        "        $blast_cmd\n",
        _check_and_retry("        ", $blast_cmd, $check_cmd, 1),
        "    else\n", # output exists: verify only, blast again if incomplete
        _check_and_retry("        ", $blast_cmd, $check_cmd, 1),
        "    fi\n",
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################
# Step 7: write and submit the job array that parses every reference genome
# BLAST output with BLASTn_RefGenome_parser.pl.  A parsed file counts as
# complete when "Summary" shows up in its last 5 lines; the parser is rerun
# until that is the case.
# $step_by_step: true when the step is run on its own (no job dependency).
sub parse_blast_RefG{
    my ($step_by_step) = @_;
    $hold_job_file    = _hold_job_for_step($step_by_step);
    $current_job_file = "j7_".$sample_name."_PRefG_".$$.".sh";

    my $chunk_name = $sample_name.".fa.cdhit_out.masked.goodSeq_file\${LSB_JOBINDEX}";
    my $parse_cmd  = $run_script_path."BLASTn_RefGenome_parser.pl \${RefG_DIR} \${BlastRefGOUT} $refrence_genome_taxonomy";
    my $check_cmd  = 'tail -5 ${PARSED}|grep Summary';

    my $job = _open_job_script($current_job_file);
    print {$job} _job_header(
        mem           => 10000,
        array_size    => $file_number_of_Blast_Ref_Genome,
        wait_for_hold => 1,
    );
    print {$job}
        "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n",
        "BlastRefGOUT=$chunk_name.RefGblast.out\n",               # name only, not full path
        "BlastRefGIN=\${RefG_DIR}/$chunk_name.fa\n",              # full path
        "PARSED=\${RefG_DIR}/$chunk_name.RefGblast.parsed\n\n";   # full path
    print {$job}
        'if [ -s $BlastRefGIN ]', "\n", # skip empty split files
        "then\n",
        '    if [ ! -f $PARSED ]', "\n",
        "    then\n", # nothing parsed yet: parse, then verify
        "        $parse_cmd\n",
        _check_and_retry("        ", $parse_cmd, $check_cmd, 1),
        "    else\n", # parsed file exists: verify only, parse again if incomplete
        _check_and_retry("        ", $parse_cmd, $check_cmd, 1),
        "    fi\n",
        "fi\n";
    _close_and_submit($job);
}

#####################################################################################

# _submit_job_script($resource, $memory_limit, $array_size, @body)
#
# Writes an LSF job script to $job_files_dir/$current_job_file and submits it
# with "bsub < file".
#
#   $resource     - value of the "#BSUB -R" resource requirement string
#   $memory_limit - value of the "#BSUB -M" memory limit
#   $array_size   - number of job-array elements, or undef for a single job
#   @body         - remaining script lines, without line terminators
#
# Lines are joined with "\n" and nothing is appended after the last one;
# callers that want a final newline pass "" as the last element.
# Dies if the job file cannot be written; warns (without dying) when the
# submission command reports failure, so the caller keeps running as before.
sub _submit_job_script {
    my ($resource, $memory_limit, $array_size, @body) = @_;

    # The array size is written as a literal number: it has to be known when
    # the job file is generated, not when the job is submitted.
    my $job_name = defined $array_size
                 ? "$current_job_file\[1-$array_size\]"
                 : $current_job_file;

    my $job_path = "$job_files_dir/$current_job_file";
    my @script = (
        "#!/bin/bash",
        "#BSUB -n 1",
        "#BSUB -R \"$resource\"",
        "#BSUB -M $memory_limit",
        "#BSUB -o $lsf_file_dir/$current_job_file.out",
        "#BSUB -e $lsf_file_dir/$current_job_file.err",
        "#BSUB -J $job_name",
        # NOTE(review): in step-by-step mode this becomes -w "" (empty
        # dependency expression) - confirm the scheduler accepts it.
        "#BSUB -w \"$hold_job_file\"",
        @body,
    );

    open(my $job_fh, '>', $job_path)
        or die "Cannot open $job_path for writing: $!";
    print {$job_fh} join("\n", @script);
    # Buffered write errors only surface at close; a truncated job script
    # must not be submitted.
    close($job_fh) or die "Cannot close $job_path: $!";

    $bsub_com = "bsub < $job_path";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "Job submission failed (wait status $?): $bsub_com\n";
    return;
}

#####################################################################################

# pool_split_for_blast_N($step_by_step)
#
# Generates and submits job "j8": concatenates all *.RefGfiltered.fa files of
# the reference-genome blast directory into one file, then splits that file
# into $file_number_of_Blast_N pieces for the blastn-against-NT job array.
# The split is repeated while check_split_BN.pl exits with code 10.
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub pool_split_for_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j8_".$sample_name."_BN_split_".$$.".sh";

    my $check_cmd = $run_script_path."check_split_BN.pl \${SAMPLE_DIR}";

    _submit_job_script(
        "rusage[mem=10000]", 10000000, undef,
        "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX",
        "SAMPLE_DIR=".$sample_full_path,
        "RefGFiltered_fa=".$sample_full_path."/".$sample_name.".RefGfiltered.fa",
        "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX",
        "",
        'if [ ! -d $BN_DIR ] ',
        "then",
        " mkdir \${BN_DIR}",
        "fi",
        # start from a fresh pooled file, the cat below appends
        'if [ -f $RefGFiltered_fa ] ',
        "then",
        " rm \${RefGFiltered_fa}",
        "fi",
        "cat \${RefG_DIR}/*.RefGfiltered.fa >> \${RefGFiltered_fa}",
        $check_cmd,
        'CHECK=$?',
        # 10 is the exit code of check_split_BN.pl that triggers a re-split
        'while [ ${CHECK} -eq 10 ]',
        "do",
        # -n must be consistent with the size of the blastn job array
        " ".$run_script_path."split_fasta.pl -i \${RefGFiltered_fa} -o \${BN_DIR} -n $file_number_of_Blast_N -p ".$sample_name.".RefGfiltered.fa_file",
        " ".$check_cmd,
        ' CHECK=$?',
        "done",
        "",                             # the script ends with a newline
    );
    return;
}

#####################################################################################

# submit_job_array_blast_N($step_by_step)
#
# Generates and submits job "j9": a job array that runs blastn of every
# non-empty split query file against $db_BN.  The generated script runs
# blastn when the output file is missing, then re-runs it for as long as
# "Matrix" is not found in the last 5 lines of the output, or the output
# contains "no longer exists in database" (see the note at the end of this
# script).
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub submit_job_array_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j9_".$sample_name."_BN_".$$.".sh";

    # Per-array-element file stem; ${LSB_JOBINDEX} is expanded by the shell.
    my $split_file = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';
    my $blast_cmd  = " $blastn -evalue 1e-9 -show_gis -num_threads 4 -query \${QUERY} -out \${BlastNOUT} -db $db_BN";

    my @check = (
        ' tail -5 ${BlastNOUT}|grep Matrix',
        ' CHECK1=$?',
        ' grep "no longer exists in database" ${BlastNOUT}',
        ' CHECK2=$?',
    );
    my @rerun_until_complete = (
        ' while [ ${CHECK1} -eq 1 ] || [ ${CHECK2} -eq 0 ]',
        " do",
        $blast_cmd,
        @check,
        " done",
    );

    _submit_job_script(
        "span[hosts=1] rusage[mem=40000]", 40000000, $file_number_of_Blast_N,
        "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX",
        'BlastNOUT=${BN_DIR}/'.$split_file.".blastn.out",       # full path
        'QUERY=${BN_DIR}/'.$split_file.".fa",
        "",
        'if [ -s $QUERY ]',             # skip empty query files
        "then",
        # output file missing: run blast, then verify completeness
        ' if [ ! -f $BlastNOUT ]',
        " then",
        $blast_cmd,
        @check,
        @rerun_until_complete,
        # output file present: only verify completeness
        " else",
        @check,
        @rerun_until_complete,
        " fi",
        "fi",
    );
    return;
}

#####################################################################################

# parse_blast_N($step_by_step)
#
# Generates and submits job "j10": a job array that runs BLASTn_NT_parser.pl
# on every blastn output whose query file is non-empty.  The generated script
# runs the parser when the .blastn.parsed file is missing, then re-runs it
# while check_Blast_parsed_file.pl exits with code 10.
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub parse_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j10_".$sample_name."_PBN_".$$.".sh";

    my $split_file = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';
    my $parse_cmd  = " ".$run_script_path."BLASTn_NT_parser.pl ".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX \${BlastNOUT}";

    # 10 is the exit code check_Blast_parsed_file.pl uses for an incomplete file.
    my @check = (
        " ".$run_script_path."check_Blast_parsed_file.pl \${PARSED}",
        ' CHECK=$?',
    );
    my @rerun_until_complete = (
        ' while [ ${CHECK} -eq 10 ]',
        " do",
        $parse_cmd,
        @check,
        " done",
    );

    _submit_job_script(
        "rusage[mem=10000]", 10000000, $file_number_of_Blast_N,
        "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX",
        "BlastNOUT=".$split_file.".blastn.out",                 # name only, not full path
        'BlastNIN=${BN_DIR}/'.$split_file.".fa",                # full path
        'PARSED=${BN_DIR}/'.$split_file.".blastn.parsed",
        "",
        'if [ -s $BlastNIN ]',
        "then",
        # parsed file missing: run the parser, then verify completeness
        ' if [ ! -f $PARSED ]',
        " then",
        $parse_cmd,
        @check,
        @rerun_until_complete,
        # parsed file present: only verify completeness
        " else",
        @check,
        @rerun_until_complete,
        " fi",
        "fi",
    );
    return;
}

#####################################################################################

# blast_S($step_by_step)
#
# Generates and submits job "j11": a job array that runs blast_summary.pl on
# every parsed blastn file whose query file is non-empty.  The generated
# script runs the summary when the .blastn.summary file is missing, then
# re-runs it for as long as "Finished summary" is not found in that file
# (grep exit code 1).
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub blast_S{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j11_".$sample_name."_blastS_".$$.".sh";

    my $split_file  = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';
    my $summary_cmd = " ".$run_script_path."blast_summary.pl ".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX \${BlastNparsed}";

    my @check = (
        ' grep "Finished summary" ${OUTPUT}',
        ' CHECK=$?',
    );
    my @rerun_until_complete = (
        ' while [ ${CHECK} -eq 1 ]',
        " do",
        $summary_cmd,
        @check,
        " done",
    );

    _submit_job_script(
        "rusage[mem=10000]", 10000000, $file_number_of_Blast_N,
        "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX",
        "BlastNparsed=".$split_file.".blastn.parsed",           # name only, not full path
        'BlastNIN=${BN_DIR}/'.$split_file.".fa",                # full path
        'OUTPUT=${BN_DIR}/'.$split_file.".blastn.summary",
        "",
        'if [ -s $BlastNIN ]',
        "then",
        # summary file missing: create it, then verify completeness
        ' if [ ! -f $OUTPUT ]',
        " then",
        $summary_cmd,
        @check,
        @rerun_until_complete,
        # summary file present: only verify completeness
        " else",
        @check,
        @rerun_until_complete,
        " fi",
        "fi",
    );
    return;
}


#####################################################################################

# report_for_each_sample($step_by_step)
#
# Generates and submits job "j12": runs assignment_report_virus_gi.pl for the
# sample and repeats it for as long as the marker line
# "# Finished Assignment Report" is missing from the .gi.AssignmentReport
# file.
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub report_for_each_sample{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j12_".$sample_name."_Rep_".$$.".sh";

    my $report_cmd = " ".$run_script_path."assignment_report_virus_gi.pl ".$sample_full_path." \${INPUT} $refrence_genome_taxonomy ";

    my @check = (
        ' grep "# Finished Assignment Report" ${REPORT}',
        ' CHECK=$?',
    );
    my @rerun_until_complete = (
        ' while [ ${CHECK} -eq 1 ] ',   # grep unsuccessful, report not finished
        " do",
        $report_cmd,
        @check,
        " done",
    );

    _submit_job_script(
        "rusage[mem=40000]", 40000000, undef,
        "INPUT=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq",   # RepeatMasker QC output
        "REPORT=".$sample_full_path."/".$sample_name.".gi.AssignmentReport",
        'if [ -f $REPORT ] ',           # report file exists
        "then",
        @check,
        @rerun_until_complete,
        "else",                         # report file does not exist
        $report_cmd,
        @check,
        @rerun_until_complete,
        "fi",
        "",                             # the script ends with a newline
    );
    return;
}

#####################################################################################

# summary_for_each_sample($step_by_step)
#
# Generates and submits job "j13": runs assignment_summary_gi.pl for the
# sample and repeats it for as long as the marker line
# "# Finished Assignment Summary" is missing from the .gi.AssignmentSummary
# file.
#
#   $step_by_step - true: no dependency on the previously generated job;
#                   false: the new job waits for the previous job file name.
#
# Updates the file-level $hold_job_file, $current_job_file and $bsub_com.
sub summary_for_each_sample{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j13_".$sample_name."_Sum_".$$.".sh";

    my $summary_cmd = " ".$run_script_path."assignment_summary_gi.pl ".$sample_full_path." \${BAD_SEQ}";

    my @check = (
        ' grep "# Finished Assignment Summary" ${OUTPUT}',
        ' CHECK=$?',
    );
    my @rerun_until_complete = (
        ' while [ ${CHECK} -eq 1 ] ',   # grep unsuccessful, file not finished
        " do",
        $summary_cmd,
        @check,
        " done",
    );

    _submit_job_script(
        "rusage[mem=40000]", 40000000, undef,
        "OUTPUT=".$sample_full_path."/".$sample_name.".gi.AssignmentSummary",
        "BAD_SEQ=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.badSeq",  # output of RepeatMasker
        "",
        'if [ -f $OUTPUT ] ',           # summary file exists
        "then",
        @check,
        @rerun_until_complete,
        "else",                         # summary file does not exist
        $summary_cmd,
        @check,
        @rerun_until_complete,
        "fi",
        "",                             # the script ends with a newline
    );
    return;
}

=add
possible blast error
Sequence with id 224967180 no longer exists in database...alignment skipped
Sequence with id 224967180 no longer exists in database...alignment skipped
=cut

--------------------------------------------------------------------------------
/assignment_report_virus_gi.pl:
--------------------------------------------------------------------------------

#!/usr/bin/perl
use strict;
use Switch;
use Bio::SearchIO;

my $usage = '
This script will read corresponding files in the given director and
generate a report. It will report in each library, for each category,
how many total sequence were assigned to this category, how many were
assigned by BLASTN, how many were assigned by TBLASTX, the range of
percent identity.
It will also generate four fasta format files which 13 | contain viral reads from blastn, tblastx, all viral reads and reads 14 | that can not be assigned to any category. 15 | 16 | perl script 17 | = full path to the directory holding files for the given 18 | library 19 | e.g. .../S21_Rota_other 20 | = 1. Human 21 | 2. Mouse 22 | 3. Worm (C. elegans, C. briggsae) 23 | 4. Mouse lemur (Microcebus_murinus) 24 | 5. sand fly (Lutzomyia longipalpis) 25 | 26 | 27 | '; 28 | die $usage unless scalar @ARGV == 3; 29 | my ( $dir, $input_good_seq_fasta_file, $ref_genome_choice ) = @ARGV; 30 | 31 | # get all the viral read sequences 32 | my %viral_reads_blastn = (); 33 | my %viral_reads_blastx = (); 34 | 35 | my %best_e_blastn = (); # viral_read_ID => best_e value for this read in blastn 36 | my %best_e_blastx = (); # viral_read_ID => best_e value for this read in blastx 37 | 38 | my @blast_files_blastn = (); # all blastn.out files 39 | my @blast_files_blastx = (); # all blastx.out files 40 | 41 | my @unassigned_reads = (); 42 | #################################### 43 | my @ambiguous_reads = (); #cai added 12/2010 44 | #################################### 45 | 46 | # read in original sequences 47 | my @temp = split("\/", $dir); 48 | my $lib_name = pop @temp; 49 | # print "lib is $lib_name\n"; 50 | #my $fasta_file = $dir."/".$lib_name.".fa.cdhit_out.masked.goodSeq"; 51 | my $fasta_file = $input_good_seq_fasta_file; #cai changed, added segmasker 52 | 53 | my %seq = &read_FASTA_data($fasta_file); 54 | 55 | my $out1 = $dir."/".$lib_name.".gi.AssignmentReport"; 56 | open (OUT1, ">$out1") or die "can not open file $out1!\n"; 57 | my $OUT2 = $dir."/".$lib_name.".gi.ViralReads_all.fa"; 58 | open (OUT2, ">$OUT2") or die "can not open file $OUT2!\n"; 59 | my $OUT3 = $dir."/".$lib_name.".gi.unassigned.fa"; 60 | open (OUT3, ">$OUT3") or die "can not open file $OUT3!\n"; 61 | ##################################cai added 12/2010 62 | my $out4 = $dir."/".$lib_name.".gi.AmbiguousReads_all.fa"; 
63 | open (OUT4, ">$out4") or die "can not open file $out4!\n"; 64 | ################################## 65 | 66 | # category => num of sequence assigned to this category by blastn 67 | my %blastn = ( 68 | "Bacteria" => 0, 69 | "Fungi" => 0, 70 | "Homo" => 0, 71 | "Mus" => 0, 72 | "Phage" => 0, 73 | "Viruses" => 0, 74 | "other" => 0, 75 | "unassigned" => 0, 76 | ##################################cai added 12/2010 77 | "Ambiguous" => 0, 78 | ##################################cai added 79 | ); 80 | 81 | # category => num of sequence assigned to this category by blastn of Reference genome 82 | my %blastn_RefG = (); 83 | foreach my $key (keys %blastn) { 84 | $blastn_RefG{$key} = 0; 85 | } 86 | 87 | # category => num of sequence assigned to this category by tblastx of viral genome 88 | my %blastx = (); 89 | foreach my $key (keys %blastn) { 90 | $blastx{$key} = 0; 91 | } 92 | 93 | # viral_lineage => number of reads assigned to this lineage in the library 94 | my %num_reads = (); 95 | my %blast_readinfo =(); # readID => information about this read 96 | my %lineage_blastn = (); # lineage => [read ID] 97 | my %lineage_gi = (); 98 | my %lineage_blastx = (); # lineage => [read ID] 99 | 100 | opendir(DH, $dir) or die "Can not open dir $dir!\n"; 101 | foreach my $name (readdir DH) { 102 | # name is either file name or directory for splited files 103 | my $full_path = $dir."/".$name; 104 | 105 | # full_path= dir/goodSeq_RefGblast 106 | if ($name =~ /goodSeq_RefGblast$/) { # Reference genome blast result 107 | # enter subdirectory where blastn results resides 108 | opendir (RefGDIR, $full_path) or die "can not open dir $full_path!\n"; 109 | foreach my $blast_file (readdir RefGDIR) { 110 | if ($blast_file =~ /RefGblast\.parsed$/) { 111 | my $parsed = $full_path."/".$blast_file; 112 | open (IN, $parsed) or die "can not open file $parsed!\n"; 113 | while () { 114 | if ($_ =~ /#/) { # skip comment line 115 | next; 116 | } 117 | chomp; 118 | my ($read_ID, $length, $category, $lineage, 
$hit_name, $e_value) = split("\t", $_); 119 | # print "readID = $read_ID, length = $length, category = $category, lineage = $lineage, hit name = $hit_name, e = $e_value\n"; 120 | $blastn_RefG{$ref_genome_choice}++; 121 | } 122 | close IN; 123 | } 124 | } 125 | closedir RefGDIR; 126 | } # finish .RefGblast.parsed 127 | 128 | # full_path= dir/RefGfiltered_BLASTN 129 | 130 | if ($name =~ /RefGfiltered_BLASTN$/) { 131 | # enter subdirectory where blastx results resides 132 | opendir (BNDIR, $full_path) or die "can not open dir $full_path!\n"; 133 | foreach my $blast_file (readdir BNDIR) { 134 | if ($blast_file =~ /blastn\.parsed$/) { 135 | # print "blastn parsed file $blast_file\n"; 136 | my $blast_out = $blast_file; 137 | $blast_out =~ s/\.blastn\.parsed/\.blastn\.out/; 138 | $blast_out = $full_path."/".$blast_out; 139 | my $blast_s = $blast_file; 140 | $blast_s =~ s/\.blastn\.parsed/\.blastn\.summary/; 141 | $blast_s = $full_path."/".$blast_s; 142 | push @blast_files_blastn, $blast_s; 143 | my $parsed = $full_path."/".$blast_file; 144 | #print $parsed,"\n"; 145 | ##################################cai changed 12/2010 146 | &collect_information($parsed, \%blastn, \%viral_reads_blastn, \%best_e_blastn, \%lineage_blastn, \%lineage_gi, \%num_reads, \@unassigned_reads, \@ambiguous_reads); 147 | ################################## 148 | } 149 | } 150 | closedir BNDIR; 151 | } # finish .blastn.parsed 152 | 153 | 154 | # full_path= dir/RefGfiltered_BLASTN 155 | #if ($name =~ /BLASTX_NR$/i) { 156 | # enter subdirectory where blastx results resides 157 | # opendir (BXDIR, $full_path) or die "can not open dir $full_path!\n"; 158 | # foreach my $blast_file (readdir BXDIR) { 159 | # if ($blast_file =~ /blastx\.parsed$/) { 160 | # print "blastn parsed file $blast_file\n"; 161 | # my $blast_out = $blast_file; 162 | # $blast_out =~ s/\.blastx\.parsed/\.blastx\.out/; 163 | # $blast_out = $full_path."/".$blast_out; 164 | # push @blast_files_blastx, $blast_out; 165 | # my $parsed = 
$full_path."/".$blast_file; 166 | ##################################cai changed 12/2010 167 | # &collect_information($parsed, \%blastx, \%viral_reads_blastx, \%best_e_blastx, \%lineage_blastx, \%num_reads, \@unassigned_reads, \@ambiguous_reads); 168 | ################################## 169 | # } 170 | # } 171 | # closedir BXDIR; 172 | #} # finish .blastx.parsed 173 | 174 | } 175 | 176 | close DH; 177 | 178 | # get detailed information about each viral read 179 | &get_viral_read_info(\@blast_files_blastn, \%blast_readinfo); 180 | 181 | #&get_viral_read_info( \@blast_files_blastx, "blastx", \%viral_reads_blastx, \%best_e_blastx, \%blast_readinfo); 182 | # print out report for this library 183 | 184 | print OUT1 $dir, "\n"; 185 | printf OUT1 "%12s\t%7s\t%7s\t%7s\t%7s\t%7s\t%7s\t%7s\n", "category", "total", "BN_RefG", "BN", "BX_NR"; 186 | 187 | foreach my $key (sort {$a cmp $b } keys %blastx) { 188 | printf OUT1 "%12s\t%7d\t%7d\t%7d\t%7d\n", $key, $blastn_RefG{$key}+$blastn{$key}+$blastx{$key},$blastn_RefG{$key}, $blastn{$key}, $blastx{$key}; 189 | } 190 | 191 | print OUT1 "\n###########################################################\n\n"; 192 | 193 | foreach my $gi (sort {$num_reads{$a} <=> $num_reads{$b}} keys %num_reads) { 194 | 195 | print OUT1 $gi, "\t", $lineage_gi{$gi}, "\ttotal number of reads: ", $num_reads{$gi}, "\n\n"; 196 | 197 | print OUT1 "QueryName\tQuerylength\t HitName \tHitLen\t HitDesc \tAlnLen\t%ID\tHitStart\tHitEnd\te\n"; 198 | 199 | if (defined $lineage_blastn{$gi}) { 200 | if (scalar @{$lineage_blastn{$gi}}) { 201 | print OUT1 "reads from blastn:\n"; 202 | foreach my $read (sort {$a cmp $b} @{$lineage_blastn{$gi}}) { 203 | print OUT1 $blast_readinfo{$read}; 204 | } 205 | } 206 | } 207 | 208 | #if (defined $lineage_blastx{$lineage}) { 209 | # if (scalar @{$lineage_blastx{$lineage}}) { 210 | # print OUT1 "reads from blastx:\n"; 211 | # foreach my $read (sort {$a cmp $b} @{$lineage_blastx{$lineage}}) { 212 | # print OUT1 $blast_readinfo{$read}; 213 
#####################################################################################
# collect_information
#
# Read one tab-separated ".parsed" file and accumulate statistics into the
# data structures supplied by the caller.
#
# Each data line is expected to hold, in order:
#   read ID, read length, category, lineage, hit name, e-value
# Any line containing "#" is treated as a comment/summary line and skipped.
#
# Parameters (all but the first are references that are updated in place):
#   $infile                   - path of the parsed file to read
#   $category_hash_ref        - category => number of reads in that category
#   $viral_reads_hash_ref     - read ID => 1 for every viral read with a gi
#   $best_e_hash_ref          - read ID => e-value recorded for that read
#   $lineage_hash_ref         - gi => [read IDs assigned to that gi]
#   $lineage_hash_gi          - gi => lineage string
#   $num_reads_hash_ref       - gi => number of viral reads assigned to it
#   $unassigned_reads_arr_ref - list of read IDs whose category is "unassigned"
#   $ambiguous_reads_arr_ref  - list of read IDs whose category is "Ambiguous"
#
# Dies if $infile can not be opened. Returns nothing.
sub collect_information {
    my ($infile, $category_hash_ref, $viral_reads_hash_ref, $best_e_hash_ref,
        $lineage_hash_ref, $lineage_hash_gi, $num_reads_hash_ref,
        $unassigned_reads_arr_ref, $ambiguous_reads_arr_ref) = @_;

    # Only these labels are tallied; any other category is left uncounted.
    # A plain lookup replaces the Switch source filter for this counter.
    my %is_counted_category = map { $_ => 1 }
        qw(Bacteria Fungi Homo Mus Phage Viruses other unassigned Ambiguous);

    # Lexical handle + 3-arg open: no clash with other users of a global
    # handle and no mode injection through the file name.
    open(my $in_fh, '<', $infile) or die "can not open file $infile!\n";

    while (my $line = <$in_fh>) {
        next if $line =~ /#/;    # skip comment line
        chomp $line;

        my ($read_ID, $length, $category, $lineage, $hit_name, $e_value) = split(/\t/, $line);

        # Blank or truncated lines carry no category and contribute nothing.
        next unless defined $category;

        # gi number of the hit; 0 means "no gi found in the hit name".
        my $gid = 0;
        if (defined $hit_name && $hit_name =~ /gi\|(\d+)\|/) {
            $gid = $1;
            $lineage_hash_gi->{$gid} = $lineage;
        }

        $category_hash_ref->{$category}++ if $is_counted_category{$category};

        if ($category eq "Viruses" && $gid != 0) {
            $viral_reads_hash_ref->{$read_ID} = 1;
            $best_e_hash_ref->{$read_ID}      = $e_value;
            push @{ $lineage_hash_ref->{$gid} }, $read_ID;
            $num_reads_hash_ref->{$gid}++;
        }
        elsif ($category eq "Ambiguous") {
            push @{$ambiguous_reads_arr_ref}, $read_ID;
        }
        elsif ($category eq "unassigned") {
            push @{$unassigned_reads_arr_ref}, $read_ID;
        }
    }
    close $in_fh;

    return;
}
#############################################################################
# get_viral_read_info
#
# Load the per-read summary lines written by the blast summary step.
#
# Parameters:
#   $report_file_ref         - reference to a list of summary file paths
#   $blast_readinfo_hash_ref - hash reference filled in place:
#                              first tab-separated field => whole line
#                              (the line keeps its trailing newline)
#
# Lines containing "Finished summary" are skipped. A file that can not be
# opened is reported with a warning and skipped, so the remaining files are
# still loaded. Returns nothing.
sub get_viral_read_info {
    my ($report_file_ref, $blast_readinfo_hash_ref) = @_;

    foreach my $file (@{$report_file_ref}) {
        # Read the file directly instead of going through `cat $file`:
        # no shell is involved, so odd characters in the path are harmless.
        my $summary_fh;
        unless (open($summary_fh, '<', $file)) {
            warn "can not open file $file: $!\n";
            next;
        }

        while (my $line = <$summary_fh>) {
            next if $line =~ /Finished summary/;    # end-of-file marker line

            my ($read_id) = split(/\t/, $line);
            $blast_readinfo_hash_ref->{$read_id} = $line;
        }
        close $summary_fh;
    }

    return;
}
16 | 17 | perl script 18 | = full path to the folder holding files for a given sample 19 | 20 | '; 21 | 22 | die $usage unless scalar @ARGV == 2; 23 | my ( $dir, $bad_seq ) = @ARGV; 24 | 25 | # cutoff for sequences to be interesting, we choose to report everything. 26 | my $percentID_cutoff = 100; 27 | 28 | my @temp = split("\/", $dir); 29 | my $lib_name = pop @temp; 30 | 31 | my $out = $dir."/".$lib_name.".gi.AssignmentSummary"; 32 | open (OUT, ">$out") or die "can not open file $out!\n"; 33 | my $out2 = $dir."/".$lib_name.".gi.InterestingReads"; 34 | open (OUT2, ">$out2") or die "can not open file $out2!\n"; 35 | 36 | my $seq_file = $dir."/".$lib_name.".fa"; 37 | my %sequences = &read_FASTA_data($seq_file); # read_ID => sequence 38 | 39 | my %ID_low = (); # lineage => lowest percent identity to hits 40 | my %ID_high = (); # lineage => highest percent identity to hits 41 | my %num_reads = (); 42 | my $C = "##############################################\n\n"; 43 | 44 | print OUT "$dir\n\n"; 45 | # get sequence statistics 46 | my @nums = &get_SequenceInfo_OneSample($dir); 47 | #print "@nums\n"; 48 | 49 | print OUT "#total\tuniq\ttotal\%\tFiltered\ttotal\%\tLowComplex\ttotal\%\tgood\ttotal\%\tBNRefG\ttotal\%\tBNNT\ttotal\%\tBXNR\ttotal\%\n"; 50 | printf OUT ("%d\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\n", $nums[0], $nums[1], $nums[1]*100/$nums[0], $nums[2], $nums[2]*100/$nums[0], $nums[3], $nums[3]*100/$nums[0], $nums[4], $nums[4]*100/$nums[0], $nums[5], $nums[5]*100/$nums[0], $nums[6], $nums[6]*100/$nums[0], $nums[7], $nums[7]*100/$nums[0]); 51 | print OUT "\n\n"; 52 | 53 | 54 | my $oldSeperator = $/; 55 | $/ = "###########\n"; 56 | my $AssignmentReport_file = $dir."/".$lib_name.".gi.AssignmentReport"; 57 | open (IN, $AssignmentReport_file) or die "can not open file $AssignmentReport_file!\n"; 58 | my $line = ; 59 | $line =~ s/#//g; 60 | my @temps = split("\n", $line); 61 | shift @temps; 62 | foreach my $temp (@temps) { 63 | 
print OUT $temp, "\n"; 64 | } 65 | print OUT "\n\n"; 66 | 67 | while () { 68 | if ($_ =~ /^\s*$/) { # skip blank line 69 | next; 70 | } 71 | elsif ($_ =~ /Finished Assignment Report/) { next; } 72 | 73 | my @lines = split("\n", $_); 74 | my $lineage = shift @lines; 75 | $lineage = shift @lines; 76 | #$lineage = shift @lines; 77 | #print $lineage,"\n"; 78 | #; 79 | my $high = 0; 80 | my $low = 100; 81 | my %readID_Identity = (); # readID => percent ID 82 | my %readID_desc = (); # readID => description of the read 83 | foreach my $l (@lines) { 84 | if ($l =~ /^\s*$/) { next; } 85 | elsif ($l =~ /QueryName/) { next; } 86 | elsif ($l =~ /reads from/) { next; } 87 | elsif ($l =~ /#+/) { next; } 88 | my ($read_ID, $Qlength, $hitName, $hitLen, $hitDesc, $alnLen, $ID, $hitS, $hitE, $e) = split("\t", $l); 89 | if($ID > $high) { $high = $ID;} 90 | if($ID < $low) { $low = $ID;} 91 | 92 | if (defined ($readID_Identity{$read_ID})) { 93 | if ($ID > $readID_Identity{$read_ID}) { 94 | $readID_Identity{$read_ID} = $ID; 95 | $readID_desc{$read_ID} = $l; 96 | } 97 | } 98 | else { 99 | $readID_Identity{$read_ID} = $ID; 100 | $readID_desc{$read_ID} = $l; 101 | } 102 | } 103 | if ($high == 0) { 104 | $high = 100; 105 | } 106 | 107 | $ID_low{$lineage} = $low; 108 | $ID_high{$lineage} = $high; 109 | 110 | if($lineage=~/total number of reads: (\d+)/) { $num_reads{$lineage}=$1; } 111 | 112 | if ($low <= $percentID_cutoff) { 113 | print OUT2 $lineage, "\t[$low, $high]\n\n"; 114 | foreach my $key (sort {$readID_Identity{$a} <=> $readID_Identity{$b}} keys %readID_Identity) { 115 | print OUT2 $readID_desc{$key}, "\n"; 116 | } 117 | print OUT2 "\n"; 118 | foreach my $key (sort {$readID_Identity{$a} <=> $readID_Identity{$b}} keys %readID_Identity) { 119 | print OUT2 ">$key\n"; 120 | print OUT2 "$sequences{$key}\n\n"; 121 | } 122 | } 123 | } 124 | close IN; 125 | 126 | 127 | foreach my $key (sort {$num_reads{$b} <=> $num_reads{$a}} keys %num_reads) { 128 | printf OUT ("%s\t[%4.1f, %4.1f]%\n", 
#####################################################################
# read_FASTA_data
#
# Read a FASTA file into a hash of sequence name => sequence.
#
# The name is the first whitespace-delimited word of the header line; the
# sequence is every following line of the record joined together with all
# white space removed. Records that are blank or start with "#" are skipped.
#
# Parameter: path of the FASTA file.
# Returns:   a hash (as a list) of name => sequence.
# Dies if the file can not be opened.
sub read_FASTA_data {
    my $fastaFile = shift @_;

    # Read one record per ">"; "local" restores the separator on every exit
    # path, including die.
    local $/ = ">";

    my %fastaSeq;
    open(my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";

    while (my $record = <$fasta_fh>) {
        next if $record =~ /^\s*$/;     # discard blank records
        next if $record =~ /^\s*#/;     # discard comment records
        next if $record eq ">";         # the first read holds only ">"

        chomp $record;                  # removes the trailing ">"
        my @rows       = split(/\n/, $record);
        my $header     = shift @rows;
        my ($seq_name) = split(/\s/, $header);
        my $sequence   = join("", @rows);
        $sequence =~ s/\s//g;           # remove white space
        $fastaSeq{$seq_name} = $sequence;
    }
    close $fasta_fh;

    return %fastaSeq;
}


##########################################################################
# get_SequenceInfo_OneSample
#
# Collect the sequence counts of one sample directory.
#
# Parameter: $dir - full path of the sample directory; its last path
#                   component is used as the library name.
# Also reads the file-level variable $bad_seq, the path of the QC summary
# file holding the "good seq", "bad seq" and "Repeat and Low complexicity
# seq" counts.
#
# Returns the list:
#   (total, unique, filtered, repeat/low-complexity, good,
#    assigned by BLAST reference genome, assigned by BLASTN,
#    assigned by BLASTX NR)
# A count whose source file does not exist is reported as 0. Dies if the
# library FASTA file can not be opened.
sub get_SequenceInfo_OneSample {
    my ( $dir ) = @_;

    my $good_seq             = 0;
    my $filtered_seq         = 0;
    my $RepeatLowComplex_seq = 0;

    # get directory path
    my @fields  = split(/\//, $dir);
    my $libName = $fields[$#fields];
    my $prefix  = $dir."/".$libName;

    # Count the sequences of a FASTA file, or 0 when the file is absent.
    my $count_if_present = sub {
        my ($path) = @_;
        return -e $path ? count_num_of_seq($path) : 0;
    };

    # total number of sequences in the sample (this file is mandatory)
    my $total_seq = count_num_of_seq($prefix.".fa");

    # number of unique sequences in the sample
    my $unique_seq = $count_if_present->($prefix.".fa.cdhit_out");

    # number of filtered and good sequences; the QC file is only read when
    # it exists, so a missing file simply leaves the three counts at 0
    if (defined $bad_seq && -e $bad_seq) {
        open(my $qc_fh, '<', $bad_seq) or die "can not open file $bad_seq!\n";
        while (my $line = <$qc_fh>) {
            if ($line =~ /good seq = (\d+)/) {
                $good_seq = $1;
            }
            if ($line =~ /bad seq = (\d+)/) {
                $filtered_seq = $1;
            }
            if ($line =~ /Repeat and Low complexicity seq = (\d+)/) {
                $RepeatLowComplex_seq = $1;
            }
        }
        close $qc_fh;
    }

    # Each stage's output holds the reads that stage could NOT assign, so
    # the number assigned is the difference between consecutive stages.
    my $RefGfiltered   = $count_if_present->($prefix.".RefGfiltered.fa");
    my $BNFiltered     = $count_if_present->($prefix.".BNFiltered.fa");
    my $unassigned_num = $count_if_present->($prefix.".unassigned.fa");

    my $blast_RefG_assigned = $good_seq - $RefGfiltered;
    my $blastn_assigned     = $RefGfiltered - $BNFiltered;
    my $blastx_NR_assigned  = $BNFiltered - $unassigned_num;

    return ($total_seq, $unique_seq, $filtered_seq, $RepeatLowComplex_seq, $good_seq, $blast_RefG_assigned, $blastn_assigned, $blastx_NR_assigned);
}

############################################################################
# count_num_of_seq
#
# Count the lines of a file that contain ">", i.e. the FASTA header lines.
#
# Parameter: path of the FASTA file.
# Returns:   the number of such lines.
# Dies if the file can not be opened.
sub count_num_of_seq {
    my ($fastaFile) = @_;
    my $count = 0;

    # Count line by line whatever record separator the caller has set.
    local $/ = "\n";

    open(my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
    while (my $line = <$fasta_fh>) {
        $count++ if $line =~ />/;
    }
    close $fasta_fh;

    return $count;
}
$dir."/".$lib_name.".fa.cdhit_out.masked.goodSeq"; 31 | #my $fasta_file = $input_good_seq_fasta_file; #cai changed, added segmasker 32 | 33 | #my %seq = &read_FASTA_data($fasta_file); 34 | 35 | #my $out1 = $dir."/".$lib_name.".gi.AssignmentReport"; 36 | #open (OUT1, ">$out1") or die "can not open file $out1!\n"; 37 | #my $OUT2 = $dir."/".$lib_name.".gi.ViralReads_all.fa"; 38 | #open (OUT2, ">$OUT2") or die "can not open file $OUT2!\n"; 39 | #my $OUT3 = $dir."/".$lib_name.".gi.unassigned.fa"; 40 | #open (OUT3, ">$OUT3") or die "can not open file $OUT3!\n"; 41 | 42 | ##################################cai added 12/2010 43 | #my $out4 = $dir."/".$lib_name.".gi.AmbiguousReads_all.fa"; 44 | #open (OUT4, ">$out4") or die "can not open file $out4!\n"; 45 | ################################## 46 | 47 | 48 | # category => num of sequence assigned to this category by blastn 49 | my %blastn = ( 50 | "Bacteria" => 0, 51 | "Fungi" => 0, 52 | "Homo" => 0, 53 | "Mus" => 0, 54 | "Phage" => 0, 55 | "Viruses" => 0, 56 | "other" => 0, 57 | "unassigned" => 0, 58 | ##################################cai added 12/2010 59 | "Ambiguous" => 0, 60 | ##################################cai added 61 | ); 62 | 63 | # category => num of sequence assigned to this category by blastn of Reference genome 64 | my %blastn_RefG = (); 65 | foreach my $key (keys %blastn) { 66 | $blastn_RefG{$key} = 0; 67 | } 68 | 69 | # category => num of sequence assigned to this category by tblastx of viral genome 70 | my %blastx = (); 71 | foreach my $key (keys %blastn) { 72 | $blastx{$key} = 0; 73 | } 74 | 75 | # viral_lineage => number of reads assigned to this lineage in the library 76 | my %num_reads = (); 77 | my %blast_readinfo =(); # readID => information about this read 78 | my %lineage_blastn = (); # lineage => [read ID] 79 | my %lineage_gi = (); 80 | my %lineage_blastx = (); # lineage => [read ID] 81 | #if ($blast_file =~ /blastn\.parsed$/) { 82 | my $blast_out = $blast_file; 83 | $blast_out =~ 
#####################################################################################
# collect_information
#
# Read one tab-separated ".parsed" file and accumulate statistics into the
# data structures supplied by the caller.
#
# Each data line is expected to hold, in order:
#   read ID, read length, category, lineage, hit name, e-value
# Any line containing "#" is treated as a comment/summary line and skipped.
#
# Unlike the assignment-report variant of this routine, reads of category
# "Bacteria" are recorded together with the "Viruses" reads here.
#
# Parameters (all but the first are references that are updated in place):
#   $infile                   - path of the parsed file to read
#   $category_hash_ref        - category => number of reads in that category
#   $viral_reads_hash_ref     - read ID => 1 for every recorded read
#   $best_e_hash_ref          - read ID => e-value recorded for that read
#   $lineage_hash_ref         - gi => [read IDs assigned to that gi]
#   $lineage_hash_gi          - gi => lineage string
#   $num_reads_hash_ref       - gi => number of recorded reads assigned to it
#   $unassigned_reads_arr_ref - list of read IDs whose category is "unassigned"
#   $ambiguous_reads_arr_ref  - list of read IDs whose category is "Ambiguous"
#
# Dies if $infile can not be opened. Returns nothing.
sub collect_information {
    my ($infile, $category_hash_ref, $viral_reads_hash_ref, $best_e_hash_ref,
        $lineage_hash_ref, $lineage_hash_gi, $num_reads_hash_ref,
        $unassigned_reads_arr_ref, $ambiguous_reads_arr_ref) = @_;

    # Only these labels are tallied; any other category is left uncounted.
    my %is_counted_category = map { $_ => 1 }
        qw(Bacteria Fungi Homo Mus Phage Viruses other unassigned Ambiguous);

    open(my $in_fh, '<', $infile) or die "can not open file $infile!\n";

    while (my $line = <$in_fh>) {
        next if $line =~ /#/;    # skip comment line
        chomp $line;

        my ($read_ID, $length, $category, $lineage, $hit_name, $e_value) = split(/\t/, $line);

        # Blank or truncated lines carry no category and contribute nothing.
        next unless defined $category;

        # gi number of the hit; 0 means "no gi found in the hit name".
        my $gid = 0;
        if (defined $hit_name && $hit_name =~ /gi\|(\d+)\|/) {
            $gid = $1;
            $lineage_hash_gi->{$gid} = $lineage;
        }

        $category_hash_ref->{$category}++ if $is_counted_category{$category};

        if (($category eq "Viruses" || $category eq "Bacteria") && $gid != 0) {
            $viral_reads_hash_ref->{$read_ID} = 1;
            $best_e_hash_ref->{$read_ID}      = $e_value;
            push @{ $lineage_hash_ref->{$gid} }, $read_ID;
            $num_reads_hash_ref->{$gid}++;
        }
        elsif ($category eq "Ambiguous") {
            push @{$ambiguous_reads_arr_ref}, $read_ID;
        }
        elsif ($category eq "unassigned") {
            push @{$unassigned_reads_arr_ref}, $read_ID;
        }
    }
    close $in_fh;

    return;
}

#############################################################################
# get_viral_read_info
#
# Build a one-line description for every recorded read from the raw BLAST
# output files.
#
# Parameters:
#   $report_file_ref         - reference to a list of BLAST output files
#   $report_type             - report type handed to Bio::SearchIO
#   $viral_reads_hash_ref    - read ID => 1; only these reads are described
#   $best_e_hash_ref         - read ID => e-value of the read's chosen hit
#   $blast_readinfo_hash_ref - filled in place: read ID => description
#
# For each read the FIRST hit whose significance equals the read's recorded
# e-value is described, using that hit's first HSP. The description holds,
# tab separated: query name, query length, hit name, hit length, hit
# description, HSP length on the hit, percent identity, [hit start, hit
# end], e-value. A read with no such hit gets an empty string.
# Returns nothing.
sub get_viral_read_info {
    my ($report_file_ref, $report_type, $viral_reads_hash_ref, $best_e_hash_ref, $blast_readinfo_hash_ref) = @_;

    foreach my $file (@{$report_file_ref}) {
        # blast report object
        my $report = Bio::SearchIO->new(-format => 'blast', -file => $file, -report_type => $report_type);

        # Go through BLAST reports one by one
        while (my $result = $report->next_result) {    # next query output
            my $read_ID = $result->query_name;
            next unless defined $viral_reads_hash_ref->{$read_ID};

            my $desc = "";
            while (my $hit = $result->next_hit()) {
                next unless $hit->significance() == $best_e_hash_ref->{$read_ID};

                $desc .= $result->query_name()."\t";
                $desc .= $result->query_length()."\t";
                $desc .= $hit->name()."\t";
                $desc .= $hit->length()."\t";
                $desc .= $hit->description(60)."\t";

                # only the first HSP of the hit is reported
                if (my $hsp = $hit->next_hsp()) {
                    $desc .= $hsp->length('hit')."\t";
                    my $percent_id = sprintf("%4.1f", $hsp->percent_identity());
                    $desc .= $percent_id."\%\t[";
                    $desc .= $hsp->start('hit')."\t";
                    $desc .= $hsp->end('hit')."]\t";
                    $desc .= $hsp->evalue()."\n";
                }

                # even for reads with hundreds of equally good hits, only
                # the first one is kept
                last;
            }
            $blast_readinfo_hash_ref->{$read_ID} = $desc;
        }
    }

    return;
}
9 | 10 | perl $0 11 | "; 12 | 13 | exit( 10 ) unless scalar @ARGV == 1; 14 | my ( $PARSED ) = @ARGV; 15 | my $HOME = $ENV{HOME}; 16 | 17 | my $finished = &check_blastnParsed_output($PARSED); 18 | 19 | exit ($finished); 20 | 21 | sub check_blastnParsed_output { 22 | my ( $in_file ) = @_; 23 | my $have_summary_line = 0; 24 | my $line_count = 0; 25 | my $total_seq = 0; 26 | my $saved_seq = 0; 27 | my $num_undefined_taxon = 0; 28 | 29 | open (TEMP, "<$in_file") or return 10; 30 | while (my $line = ) { 31 | $line_count++; 32 | if ($line =~ /# Summary: (\d+) out of (\d+)/) { 33 | $saved_seq = $1; 34 | $total_seq = $2; 35 | $have_summary_line = 1; 36 | } 37 | if ($line =~ /undefined taxon/) { 38 | $num_undefined_taxon++; 39 | } 40 | } 41 | close TEMP; 42 | 43 | if (!$have_summary_line) { 44 | return 10; 45 | } 46 | 47 | # taxonomy record has to be equal or greater than the number of sequences get 48 | # successful phylotyped because some sequence could be assigned multiple taxonomy 49 | # categories. Should have at least $num_phylotyped + 1 lines 50 | my $num_phylotyped = $total_seq - $saved_seq; 51 | if ( $num_phylotyped == 0 ) { # every sequence is unassigned 52 | #print "every sequence is unassigned\n"; 53 | return 1; 54 | } 55 | # deal with situation where all records showed as undefined taxon and relative 56 | # to humber of phylotyped sequences 57 | elsif ( $num_phylotyped <= $num_undefined_taxon) { 58 | # print "every sequence is undefined taxon\n"; 59 | return 10; #changed from 0 to 10, the system default $? 
##########################################################################
# check_QC_read_number
#
# Check that sequence quality control finished for one sample by comparing
# two counts found in the sample directory:
#   - the number of lines containing ">" in every file named *.cdhit_out
#   - the "total unique seq = N" value in the file(s) named *.badSeq
#     (the last such value read wins)
#
# Parameter: $dir - full path of the sample directory.
# Prints both counts to STDOUT, and a line holding "1" on success.
# Returns 1 when the two counts agree, 10 otherwise. 10 is also returned
# when the directory or one of the files can not be opened, or when no
# CD-HIT sequence was counted at all.
sub check_QC_read_number {
    my ( $dir ) = @_;
    my $tot_cdhit_seq = 0;
    my $tot_seq       = 0;

    # Read the listing up front so the directory handle is always released,
    # also when a file below can not be opened.
    opendir(my $dh, $dir) or return 10;
    my @names = readdir $dh;
    closedir $dh;

    foreach my $name (@names) {
        if ($name =~ /\.cdhit_out$/) {
            open(my $cdhit_fh, '<', $dir."/".$name) or return 10;
            while (my $line = <$cdhit_fh>) {
                $tot_cdhit_seq++ if $line =~ />/;
            }
            close $cdhit_fh;
        }
        if ($name =~ /\.badSeq$/) {
            open(my $qc_fh, '<', $dir."/".$name) or return 10;
            while (my $line = <$qc_fh>) {
                if ($line =~ /total unique seq = (\d+)/) { $tot_seq = $1; }
            }
            close $qc_fh;
        }
    }

    print "total unique seq in CD-HIT output file: $tot_cdhit_seq\n";
    print "total unique sequence in QC output file: $tot_seq\n";

    # Without any CD-HIT sequence there is nothing to compare against, and
    # the ratio below would divide by zero: report "not finished".
    return 10 if $tot_cdhit_seq == 0;

    if (abs($tot_cdhit_seq - $tot_seq) / $tot_cdhit_seq <= 0.00001) {
        print "1","\n";
        return 1;
    }

    return 10;
}
###########################################################################
# check_RefG_split
#
# Check that splitting the good sequences for the reference-genome BLAST
# step is complete, by comparing two counts of lines containing ">":
#   - in every file of $dir named *.goodSeq
#   - in every *.fa file (except those whose name contains
#     ".RefGfiltered.fa") inside every sub-directory of $dir named
#     *.goodSeq_RefGblast
#
# Parameter: $dir - full path of the sample directory, without last "/".
# Returns 1 when both counts are equal, 10 otherwise (also when a
# directory or file can not be opened). Nothing is deleted on failure.
sub check_RefG_split {
    my ( $dir ) = @_;
    my $tot_RefG_seq = 0;
    my $tot_seq      = 0;

    # Number of lines containing ">" in a file; undef if it can not be opened.
    my $count_headers = sub {
        my ($path) = @_;
        open(my $fh, '<', $path) or return;
        my $count = 0;
        while (my $line = <$fh>) {
            $count++ if $line =~ />/;
        }
        close $fh;
        return $count;
    };

    # Read each listing up front so directory handles are always released.
    opendir(my $dh, $dir) or return 10;
    my @names = readdir $dh;
    closedir $dh;

    foreach my $name (@names) {
        if ($name =~ /\.goodSeq$/) {
            my $count = $count_headers->($dir."/".$name);
            return 10 unless defined $count;
            $tot_RefG_seq += $count;
        }

        if ($name =~ /\.goodSeq_RefGblast$/) {    # Blast RefG directory
            my $full_path = $dir."/".$name;
            opendir(my $sub_dh, $full_path) or return 10;
            my @files = readdir $sub_dh;
            closedir $sub_dh;

            foreach my $file (@files) {
                next unless $file =~ /\.fa$/;
                next if $file =~ /\.RefGfiltered\.fa/;    # BLAST result, not split input

                my $count = $count_headers->($full_path."/".$file);
                return 10 unless defined $count;
                $tot_seq += $count;
            }
        }
    }

    return 1 if $tot_RefG_seq == $tot_seq;

    # Counts differ: the split is incomplete. The split files are left in
    # place (removing them was disabled in this script).
    return 10;
}
/\.cdhit_out_RepeatMasker$/) { 63 | my $full_path = $dir."/".$name; 64 | my $com1 = "rm -rf $full_path"; 65 | my $com2 ="mkdir $full_path"; 66 | # print "com is $com\n"; 67 | system ( $com1 ); 68 | system ( $com2 ); 69 | } 70 | } 71 | close DH; 72 | return 10; 73 | } 74 | } 75 | 76 | -------------------------------------------------------------------------------- /generate_final_report_gi.pl: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/perl 3 | use strict; 4 | 5 | my $usage = " 6 | This script will read corresponding files in the given director and 7 | generate a report which contains SampleDescription, SequenceReport, 8 | AssignmentSummary, InterestingReads. 9 | 10 | perl $0 11 | = full path of the folder holding files for this sequence run 12 | 13 | "; 14 | die $usage unless scalar @ARGV == 2; 15 | my ( $dir, $version ) = @ARGV; 16 | 17 | my @temp = split("/", $dir); 18 | my $run_name = pop @temp; 19 | my $outFile = $dir."/Analysis_Report_".$run_name; 20 | open (OUT, ">$outFile") or die "can not open file $outFile!\n"; 21 | 22 | my ($wkday,$month,$day,$time,$year) = split(/\s+/, localtime); 23 | print OUT "PathHit V${version}; Processing date: $day-$month-$year\n"; 24 | 25 | my $c = "**************************************************************************\n"; 26 | my $c2 = "#########################################################################\n\n"; 27 | print OUT $c; 28 | 29 | 30 | print OUT "Summary:\n\n"; 31 | &generate_SampleDescription( $dir ); 32 | print OUT "End of Summary\n\n"; 33 | #print OUT $c ; 34 | 35 | #print OUT "\n\nSequence Report\n\n"; 36 | #&generate_SequenceReport( $dir ); 37 | #print OUT "End of Sequence Report\n\n"; 38 | #print OUT $c ; 39 | 40 | #print OUT "\n\nTaxonomy Assignment:\n\n"; 41 | #&generate_AssignmentSummary( $dir ); 42 | #print OUT "End of Assignment\n\n"; 43 | #print OUT $c ; 44 | 45 | #print OUT "\n\nInteresting Reads\n\n"; 46 | #&generate_InterestingReads( $dir ); 
#print OUT "End of Interesting Reads\n\n";
#print "\n";

#print OUT "# Finished\n";

# Close the report explicitly so that a failed buffered write is reported
# instead of being lost when the process exits.
close OUT or die "can not close file $outFile: $!\n";
exit;

############################################################################
# Return the sorted names of the sample directories inside $dir.
# An entry counts as a sample when its name contains no "." and it is a
# directory.  Dies when $dir cannot be opened.
sub _sample_names {
    my ($dir) = @_;

    opendir(my $dh, $dir) or die "Can not open dir $dir!\n";
    my @names = sort { $a cmp $b }
                grep { !/\./ && -d $dir."/".$_ } readdir $dh;
    closedir $dh;

    return @names;
}

############################################################################
# Return $part as a percentage of $whole, or 0 when $whole is 0 (an empty
# sample must not abort the report with a division-by-zero error).
sub _percent {
    my ($part, $whole) = @_;

    return 0 unless $whole;
    return $part * 100 / $whole;
}

############################################################################
# Print one row of the sequence report to OUT.
#   $label - text for the first column (sample name or "total")
#   $row   - hash ref with the counts: total, unique, bad, lowComplex,
#            good, blastn, blastx
# Every percentage is relative to $row->{total}.
sub _print_sequence_row {
    my ($label, $row) = @_;
    my $total = $row->{total};

    printf OUT "%30s\t%5d\t%5d\t%5.1f\t",
        $label, $total, $row->{unique}, _percent($row->{unique}, $total);
    printf OUT "%5d\t%5.1f\t%5d\t%5.1f\t%5d\t%5.1f\t",
        $row->{bad},        _percent($row->{bad},        $total),
        $row->{lowComplex}, _percent($row->{lowComplex}, $total),
        $row->{good},       _percent($row->{good},       $total);
    printf OUT "%5d\t%9.1f\t%5d\t%5.1f\n",
        $row->{blastn}, _percent($row->{blastn}, $total),
        $row->{blastx}, _percent($row->{blastx}, $total);

    return;
}

############################################################################
# Write the "Summary" section of the report to OUT.
# For every sample directory in $dir this prints the sample name with the
# number of sequences in <sample>/<sample>.fa, followed by one line per
# entry of <sample>/<sample>.gi.AssignmentSummary that reports at least one
# read (read count, range, gi, virus name).  A missing summary file is
# reported in the output and the sample is skipped.
sub generate_SampleDescription {
    my ($dir) = @_;

    print OUT $dir,"\n";
    printf OUT "%10s\t", " ";
    printf OUT "%5s\t%15s\t%10s\t%40s\n", "NoHumanRead", "PercentIDrange", "gi", "IdentifiedNoHuman";

    foreach my $name (_sample_names($dir)) {
        my $full_path = $dir."/".$name;

        # total number of sequences in the sample
        my $total_seq = count_num_of_seq($full_path."/".$name.".fa");
        printf OUT "%30s\t%8d\n", $name, $total_seq;

        my $Summary_file = $full_path."/".$name.".gi.AssignmentSummary";
        if (!-e $Summary_file) {
            print OUT "$Summary_file does not exist!\n";
            next;
        }

        open(my $in, '<', $Summary_file) or die "can not open file $Summary_file!\n";

        # The first 17 lines of the summary file are skipped unread
        # (presumably a fixed-size header -- TODO confirm against the
        # script that writes the AssignmentSummary file).
        foreach (1..17) {
            my $skipped = <$in>;
        }

        while (my $line = <$in>) {
            next if $line =~ /^\s*$/;                          # empty line
            next if $line =~ /# Finished Assignment Summary/;  # end marker

            chomp $line;
            my @fields = split(/\t/, $line);

            # The last three tab-separated fields are, from the right:
            # range, read-count text, lineage text.  The first field is
            # printed in the gi column.
            my $range      = pop @fields;
            my $info       = pop @fields;
            my $virus_info = pop @fields;
            my $gi         = $fields[0];

            my $number_reads = 0;
            if ($info =~ /total number of reads: (\d+)/) {
                $number_reads = $1;
            }

            my $virus = "";
            if ($virus_info =~ /hit does not have taxonomy entry/) {
                # no taxonomy entry: keep the text before the first comma
                my @parts = split(",", $virus_info);
                $virus = shift @parts;
            }
            else {
                # keep the last ";"-separated element
                my @parts = split(";", $virus_info);
                $virus = pop @parts;
            }

            if ($number_reads >= 1) {
                printf OUT "%10s\t", " ";
                printf OUT "%5d\t%20s\t%10s\t%40s\n", $number_reads, $range, $gi, $virus;
            }
        }
        close $in;
    }

    return;
}

#####################################################################
# Assignment Summary
# Copy every sample's <sample>.gi.AssignmentSummary file to OUT, leaving
# out the "# Finished Assignment Summary" marker line, and print the $c2
# separator after each sample directory (whether or not it had a file).
sub generate_AssignmentSummary {
    my ( $dir ) = @_;

    foreach my $name (_sample_names($dir)) {
        my $Summary_file = $dir."/".$name."/".$name.".gi.AssignmentSummary";
        if (-e $Summary_file) {
            open(my $in, '<', $Summary_file) or die "can not open file $Summary_file!\n";
            while (my $line = <$in>) {
                next if $line =~ /# Finished Assignment Summary/;
                print OUT $line;
            }
            close $in;
        }
        print OUT $c2 ;
    }

    return;
}

##########################################################################
# Write the per-sample sequence statistics table, followed by a "total"
# row, to OUT.  Counts come from these files in each sample directory:
#   <sample>.fa                          total sequences
#   <sample>.fa.cdhit_out                unique sequences
#   <sample>.fa.cdhit_out.masked.badSeq  good / bad / low-complexity counts
#   <sample>.BNFiltered.fa               sequences left after BLASTN
#   <sample>.gi.unassigned.fa            sequences left unassigned
# When a sample has no cdhit_out file a message is written and the report
# stops there, as before.
sub generate_SequenceReport {
    my ( $dir ) = @_;

    print OUT $dir,"\n";
    printf OUT "%30s\t", "sampleName";
    print OUT "total\tuniq\t\%\t Filtered\t\%\tLowComplex\t\%\tgood\t\%\tBNassign\t\%\tBXassign\t\%\n";

    my @count_names = qw(total unique bad lowComplex good blastn blastx);
    my %run_total   = map { ($_ => 0) } @count_names;

    foreach my $name (_sample_names($dir)) {
        my $full_path = $dir."/".$name;
        my $prefix    = $full_path."/".$name;
        my %row       = map { ($_ => 0) } @count_names;

        # total number of sequences in the sample
        $row{total} = count_num_of_seq($prefix.".fa");

        # number of unique sequences in the sample
        my $cdhit_file = $prefix.".fa.cdhit_out";
        if (!-e $cdhit_file) {
            print OUT "$full_path does not have cdhit_out file!\n";
            return;
        }
        $row{unique} = count_num_of_seq($cdhit_file);
        print "total # seq = ", $row{total}, " unique # seq: ", $row{unique}, "\n";

        # number of filtered and good sequences
        ######################################################################
        # need to change here if seg masker enabled
        my $badSeq_file = $prefix.".fa.cdhit_out.masked.badSeq";
        open(my $in, '<', $badSeq_file) or die "can not open file $badSeq_file!\n";
        while (my $line = <$in>) {
            if ($line =~ /good seq = (\d+)/) {
                $row{good} = $1;
            }
            if ($line =~ /bad seq = (\d+)/) {
                $row{bad} = $1;
            }
            if ($line =~ /Repeat and Low complexicity seq = (\d+)/) {
                $row{lowComplex} = $1;
            }
        }
        close $in;

        # Sequences assigned by BLASTN = good sequences minus those still
        # present in the BNFiltered file (0 when that file is absent).
        my $BNFiltered_file = $prefix.".BNFiltered.fa";
        my $BNFiltered = (-e $BNFiltered_file) ? count_num_of_seq($BNFiltered_file) : 0;
        $row{blastn} = $row{good} - $BNFiltered;

        # Sequences assigned afterwards = BNFiltered minus those still
        # unassigned (0 when that file is absent).
        my $unassigned_file = $prefix.".gi.unassigned.fa";
        my $unassigned = (-e $unassigned_file) ? count_num_of_seq($unassigned_file) : 0;
        $row{blastx} = $BNFiltered - $unassigned;

        # print out report for this sample and add it to the run totals
        _print_sequence_row($name, \%row);
        $run_total{$_} += $row{$_} for @count_names;
    }

    # statistics for the whole run; the LowComplex column now sums the
    # low-complexity counts instead of repeating the bad-sequence total
    _print_sequence_row("total", \%run_total);

    return;
}

############################################################################
# Return the number of FASTA records (lines starting with ">") in
# $fastaFile.  Dies when the file cannot be opened.
sub count_num_of_seq {
    my ($fastaFile) = @_;
    my $count = 0;

    open(my $fasta, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile ($!)";
    while (my $line = <$fasta>) {
        $count++ if $line =~ /^>/;
    }
    close $fasta;

    return $count;
}

####################################################################################
# Interesting Reads
# For every sample directory print its name, the content of
# <sample>.gi.InterestingReads (or a note that the file is missing) and
# the $c2 separator to OUT.
sub generate_InterestingReads {
    my ( $dir ) = @_;

    foreach my $name (_sample_names($dir)) {
        print OUT $name, "\n";

        my $reads_file = $dir."/".$name."/".$name.".gi.InterestingReads";
        if ( -e $reads_file ) {
            open(my $in, '<', $reads_file) or die "can not open file $reads_file!\n";
            while (my $line = <$in>) {
                print OUT $line;
            }
            close $in;
        }
        else {
            print OUT "$name does not have .gi.InterestingReads file!\n";
        }
        print OUT $c2;
    }

    return;
}

--------------------------------------------------------------------------------
/get_fasta_from_bam_filter.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;
use warnings;

# Convert tab-separated alignment records into FASTA.
# For every input line, column 1 is written as the FASTA identifier and
# column 10 as the sequence, unless column 3 names one of the two excluded
# GI numbers.  (The column layout matches SAM text -- presumably the input
# is produced by "samtools view"; TODO confirm against the caller.)
my $usage = '
perl $file_in $file_out
';
die $usage unless scalar @ARGV == 2;
my ( $file_in, $file_out ) = @ARGV;

open(my $in,  '<', $file_in)  or die "can not open file $file_in: $!\n";
open(my $out, '>', $file_out) or die "can not open file $file_out: $!\n";

while (my $line = <$in>) {
    chomp($line);
    my @ss = split(/\t/, $line);

    # Lines with fewer than 10 columns have no sequence field; skip them
    # instead of writing an empty FASTA record.
    next if @ss < 10;

    # Skip records assigned to the excluded references.  (?!\d) keeps a
    # longer GI number that merely starts with the same digits.
    next if $ss[2] =~ /gi\|(?:548558394|9626372)(?!\d)/;

    print {$out} ">", $ss[0], "\n";
    print {$out} $ss[9], "\n";
}

close $in;
close $out or die "can not close file $file_out: $!\n";

--------------------------------------------------------------------------------
/import_gi_taxid_nucl.sql:
--------------------------------------------------------------------------------
-- Recreate the gi_taxid_nucl table and bulk-load the gi -> tax_id pairs.
-- NOTE(review): '/directory/gi_taxid_nucl.dmp' looks like a placeholder
-- path that has to be edited before running -- confirm.
\! echo 'Loading gi_taxid_nucl.dmp - this can take a very long time'
DROP TABLE IF EXISTS `gi_taxid_nucl`;
CREATE TABLE `gi_taxid_nucl` (
  `gi_taxid_nucl_id` int(10) unsigned NOT NULL auto_increment,
  `gi` int(10) unsigned default NULL,
  `tax_id` int(10) unsigned default NULL,
  PRIMARY KEY (`gi_taxid_nucl_id`),
  KEY `tax_id` (`tax_id`),
  KEY `gi` (`gi`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;


LOAD DATA LOCAL INFILE '/directory/gi_taxid_nucl.dmp'
INTO TABLE gi_taxid_nucl
FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
(gi,tax_id);

--------------------------------------------------------------------------------
/send_email.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;
use warnings;

# Send a plain-text "processing finished" notification through sendmail.
#   $dir   - name of the processed data set, quoted in the message body
#   $email - address used for both the To: and Reply-to: headers
die "usage: perl $0 <dir> <email>\n" unless scalar @ARGV >= 2;
my ( $dir, $email ) = @ARGV;

# The address is copied into mail headers; a line break inside it would
# let the caller inject additional headers.
die "email address must not contain line breaks\n" if $email =~ /[\r\n]/;

# List form: sendmail is started directly, without a shell.
my @sendmail = ("/usr/sbin/sendmail", "-t");
my $reply_to = "Reply-to: $email\n";
my $subject  = "Subject: data processing finished\n";
my $content  = "The $dir data processing has finished.\n";
my $send_to  = "To: $email\n";

open(my $mail, '|-', @sendmail) or die "Cannot open @sendmail: $!";
print {$mail} $reply_to;
print {$mail} $subject;
print {$mail} $send_to;
print {$mail} "Content-type: text/plain\n\n";
print {$mail} $content;

# close() on a pipe waits for sendmail and reports a non-zero exit status.
close $mail or die "@sendmail failed: " . ($! ? $! : "exit status $?") . "\n";
exit;

--------------------------------------------------------------------------------
/split_fasta.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
use Getopt::Long;

# Split a FASTA file into at most -n part files named
# <out_put_dir>/<prefix><index>.fa, with index starting at 1.
my %opts;
GetOptions(\%opts, "i:s", "o:s", "n=i", "p=s", "h");

# -n is required as well: it is the divisor used below.
die "usage:\t$0 <-i fasta_file> <-o out_put_dir> <-n number_of_file> <-p prefix_of_seq>[-h]\n"
    if ( defined($opts{h}) || !defined($opts{i}) || !defined($opts{o}) || !defined($opts{p})
      || !defined($opts{n}) || $opts{n} < 1 );

# get total number of sequence in input file
my $count_seq = count_num_of_seq($opts{i});

# calculate how many sequences go in each file: ceil(total / n), at least 1
my $size = int($count_seq / $opts{n});
$size++ if $count_seq % $opts{n};
$size = 1 if $size < 1;
#print "$count_seq $size\n";

# start spliting
my $count = 1;
my $count_seq_each = 0;
my $out = open_part($count);

open(my $seq, '<', $opts{i}) or die "cannot open file : $opts{i}\n";
my $seen_header = 0;
while (my $line = <$seq>) {
    if ($line =~ /^>/) {
        $seen_header = 1;

        # The current part is full: continue in the next part file.
        if ($count_seq_each >= $size) {
            close $out or die "cannot close part $count: $!\n";
            $count++;
            $count_seq_each = 0;
            $out = open_part($count);
        }
        $count_seq_each++;
    }

    # Anything in front of the first header line is dropped.
    next unless $seen_header;
    print {$out} $line;
}
close $out or die "cannot close part $count: $!\n";
close $seq;

############################################################################
# Open part file number $index for writing and return its handle.
sub open_part {
    my ($index) = @_;

    my $path = "$opts{o}/$opts{p}${index}.fa";
    open(my $fh, '>', $path) or die "cannot open file : $path ($!)\n";

    return $fh;
}

############################################################################
# Return the number of FASTA records (lines starting with ">") in
# $fastaFile.  Dies when the file cannot be opened.
sub count_num_of_seq {
    my ($fastaFile) = @_;
    my $count = 0;

    open(my $fasta, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile ($!)";
    while (my $line = <$fasta>) {
        $count++ if $line =~ /^>/;
    }
    close $fasta;

    return $count;
}

--------------------------------------------------------------------------------
/trim_readid.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my $usage = "
This script will trim the name of read id

perl $0 <input file> <output file>

";

die $usage unless scalar @ARGV == 2;
my ( $filein, $fileout ) = @ARGV;

open(my $in,  '<', $filein)  or die "can not open file $filein: $!\n";
open(my $out, '>', $fileout) or die "can not open file $fileout: $!\n";

# running number of header lines seen so far
my $cc = 0;

while (my $line = <$in>) {
    if ($line =~ /^\>/) {
        # replace the whole header line by ">read<N>"
        $cc++;
        print {$out} ">read".$cc,"\n";
    }
    else {
        print {$out} $line;
    }
}

close $in;
close $out or die "can not close file $fileout: $!\n";

--------------------------------------------------------------------------------