├── BLASTn_NT_parser.pl
├── BLASTn_RefGenome_parser.pl
├── README.md
├── SequenceQualityControl.pl
├── VirusScan.pl
├── assignment_report_virus_gi.pl
├── assignment_summary_gi.pl
├── blast_summary.pl
├── check_Blast_parsed_file.pl
├── check_SequenceQualityControl.pl
├── check_split_BN.pl
├── check_split_RefG.pl
├── check_split_cdhit.pl
├── generate_final_report_gi.pl
├── get_fasta_from_bam_filter.pl
├── import_gi_taxid_nucl.sql
├── send_email.pl
├── split_fasta.pl
└── trim_readid.pl
/BLASTn_NT_parser.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl -w
3 |
4 | use strict;
5 | use Bio::SearchIO;
6 | use Bio::Taxon;
7 | use Bio::DB::Taxonomy;
8 | use Bio::Tree::Tree;
9 | use DBI();
10 |
11 | my $Usage = '
12 | This script accepts a blastn output file and parse the information
13 |
14 | perl script <dir> <blast output file>
15 | <dir> = directory that blast output file resides in, without last "/"
16 | <blast output file> = name of the blastn output file
17 |
18 | ';
19 | # exactly two positional arguments are required; anything else prints the usage text and stops
20 | die $Usage unless scalar @ARGV == 2;
21 | my ($dir, $blastout) = @ARGV;
22 |
23 | ###################################################################################
24 | # This section needs to be modified to use local configuration
25 | my $database_dir = "/gscmnt/gc3027/info/medseq/taxdump_2014_01_08";
26 |
27 | # open a connection to mysql database (NOTE(review): the credentials are hard-coded here; prefer a private config file or environment variables)
28 | my $dbh_mysql = DBI->connect("DBI:mysql:database=scao_taxondb;host=mysql1","scao", "asdf1234",{'RaiseError'=>1}) or die "Unable to connect $DBI::errstr\n";
29 |
30 | ###################################################################################
31 | # Everything below should not need modification
32 | my $HOME =$ENV{HOME};
33 |
34 | my %assignment = ();
35 |
36 | # cutoff value for having a good hit
37 | my $E_cutoff = 1e-10;
38 | my $havedefinedHit=0; # Song added; PhyloType sets it to 1 when a hit's lineage matches one of the virus families listed there
39 | # create output file; stop when the input name lacks "blastn.out", because the output path would then equal the input path and the open below would truncate the input
40 | my $outFile = $blastout;
41 | $outFile =~ s/blastn\.out/blastn.parsed/ or die "blast output file name $blastout does not contain blastn.out\n";
42 | $outFile = $dir."/".$outFile;
43 | open (OUT, ">", $outFile) or die "can not open file $outFile: $!\n";
44 |
45 | # create the taxo directory in the home directory if it does not exist (a concurrent job may create it first, hence the second -d test)
46 | if (! -d "$HOME/taxo") {
47 | mkdir("$HOME/taxo") or -d "$HOME/taxo" or die "can not create directory $HOME/taxo: $!\n";
48 | }
49 |
50 | # get a Taxon from a Bio::DB::Taxonomy object
51 | my $dbh = Bio::DB::Taxonomy->new(-source => 'flatfile',
52 | -directory=> "$HOME/taxo",
53 | -nodesfile=> "$database_dir/nodes.dmp",
54 | -namesfile=> "$database_dir/names.dmp",
55 | );
56 |
57 | my @keep_for_tblastx = (); # query should be kept for further analysis
58 | my @known = (); # queries that are significantly similar to known sequences
59 | my $total_records = 0;
60 |
61 | print "parsing blast output files...\n\n";
62 |
63 | my $input_file = $dir."/".$blastout;
64 | my $report = Bio::SearchIO->new(-format => 'blast', -file => $input_file, -report_type => 'blastn');
65 |
66 | my $sth = $dbh_mysql->prepare("SELECT * FROM gi_taxid_nucl where gi = ?"); # prepared once; the gi is bound for each hit below
67 | while(my $result = $report->next_result) {# go through BLAST reports one by one, one query per result
68 | $total_records++;
69 | my $haveHit = 0;
70 | my $keep_for_tblastx = 1;
71 | %assignment = ();
72 |
73 | # only take the best hits
74 | my $best_e = 100;
75 | my $hit_count = 0;
76 | $havedefinedHit=0; #song added;
77 | while(my $hit = $result->next_hit) {
78 | # from hit name get hit gi number
79 | my $hit_name = $hit->name; # gi|num|database|accessionNum|
80 | my @temp_arr = split(/\|/, $hit_name);
81 | my $gi = $temp_arr[1];
82 | #print $gi,"\n";
83 | if (defined $temp_arr[2] && $temp_arr[2] eq "pdb") { # skip data from pdb database
84 | next;
85 | }
86 | $haveHit = 1;
87 | $hit_count++;
88 | if ($hit_count == 1) {
89 | $best_e = $hit->significance;
90 | }
91 |
92 | # check whether the hit should be kept
93 | if ($best_e <= $E_cutoff) { # similar to known, need Phylotyped
94 | $keep_for_tblastx = 0;
95 |
96 | # print $result->query_name, " similar to known, output information!\n\n";
97 | # print "the $hit_count hit, $best_e \n";
98 | if ($hit->significance == $best_e || ($hit->significance <= $E_cutoff && $havedefinedHit==1)) { # only get best hits #song changed
99 | # from gi get taxonomy lineage
100 | # the gi is passed as a bound parameter, so a hit name without a gi field matches no row instead of producing invalid SQL
101 | $sth->execute($gi);
102 | my $ref = $sth->fetchrow_hashref();
103 | # print "gi = $ref->{'gi'}, taxid = $ref->{'tax_id'}\n";
104 |
105 | $sth->finish();
106 | my $taxID = $ref->{'tax_id'};
107 | if ($taxID) { # some gi don't have record in gi_taxid_nucl
108 | # print "taxID is $taxID\n";
109 | my $taxon_obj = $dbh->get_taxon(-taxonid => $taxID);
110 |
111 | if (!(defined $taxon_obj)) {
112 | # die "unable to get taxon_obj object\n";
113 | my $description = "undefined taxon ".$hit->description."\t".$hit->name."\t".$hit->significance;
114 | $assignment{"other"} = $description;
115 | }
116 |
117 | else {
118 | my $tree_function = Bio::Tree::Tree->new();
119 | my @lineage = $tree_function->get_lineage_nodes($taxon_obj);
120 | # each lineage node is a Bio::Tree::NodeI object
121 |
122 | #if($gi eq "61741475") {
123 | #print "hit gi is $gi\n";
124 | #print "id is ", $taxon_obj->id, "\n";
125 | #print "rank is ", $taxon_obj->rank, "\n";
126 | #print "divison is ", $taxon_obj->division, "\n\n";
127 | #print "lineage is @lineage\n";
128 | #;
129 | #}
130 |
131 | if (scalar @lineage) {
132 | # print "PhyloTyped, don't save for further analysis\n";
133 | &PhyloType(\@lineage,$hit, $best_e, $dbh_mysql, $dbh, \%assignment);
134 | }
135 | #}
136 | }
137 | }
138 | else { # for situations that gi does not have corresponding taxid
139 | # print $result->query_name, " ", $hit->name, "\n";
140 | # print "gi = $ref->{'gi'}, taxid = $ref->{'tax_id'}\n";
141 | # print "hit gi is $gi\n";
142 | my $desc = $hit->description."\t".$hit->name."\t".$hit->significance;
143 | # print $result->query_name, "\t", $desc, "\n";
144 | $assignment{"other"} = $desc;
145 | }
146 | }
147 | else {
148 | last;
149 | }
150 | } # finish phylotype for given hit
151 | } # finish all hits
152 |
153 | # foreach my $key (keys %assignment) {
154 | # print "after parsing ", $key, "\t", $assignment{$key},"\n";
155 | # }
156 | # consolidate assignment
157 | # If a query is assigned both Homo and Primates, it will be reported as Homo only
158 | # If a query is assigned a real taxon name and "other" for reason like"other sequences;
159 | # artificial sequences", or no taxon id in taxon database it will be reported only as
160 | # the real taxon name
161 | my $num_assignment = keys %assignment;
162 | if ($num_assignment > 1) { # have multiple assignment
163 | # handle the situation that assigned both a specific category and "other"
164 | # only specific category will be save.
165 | my $has_specific = 0;
166 | my $has_other = 0;
167 | if ((defined $assignment{"Bacteria"}) || (defined $assignment{"Artificial"}) || (defined $assignment{"Fungi"}) || (defined $assignment{"Homo"}) || (defined $assignment{"Mus"}) || (defined $assignment{"Phage"}) || (defined $assignment{"Viruses"})) {
168 | $has_specific = 1;
169 | }
170 | if (defined $assignment{"other"}) {
171 | $has_other = 1;
172 | }
173 | #################################################################
174 | # If a sequence hits virus and any other species with the same e value,
175 | # the sequence is assigned to "Ambiguous" category. cai added 12/2010
176 | #remove human since we have done extensive filtering for human sequence 10/12/2014
177 |
178 | if (((defined $assignment{"Bacteria"}) || (defined $assignment{"Fungi"}) || (defined $assignment{"Mus"}) || (defined $assignment{"Phage"}) || (defined $assignment{"other"})) && (defined $assignment{"Viruses"})) {
179 | $assignment{"Ambiguous"} = $assignment{"Viruses"};
180 | delete $assignment{"Viruses"};
181 | }
182 | if (((defined $assignment{"Viruses"}) || (defined $assignment{"Fungi"}) || (defined $assignment{"Mus"}) || (defined $assignment{"Phage"}) || (defined $assignment{"other"})) && (defined $assignment{"Bacteria"})) {
183 | $assignment{"Ambiguous"} = $assignment{"Bacteria"};
184 | delete $assignment{"Bacteria"};
185 | }
186 | #################################
187 | if ($has_specific && $has_other) {
188 | delete $assignment{"other"};
189 | }
190 |
191 | }
192 |
193 | # foreach my $key (keys %assignment) {
194 | # print "after consolidateion ", $key, "\t", $assignment{$key},"\n";
195 | # }
196 |
197 | # print out assignment for this query
198 | foreach my $assign (keys %assignment) {
199 | print OUT $result->query_name, "\t", $result->query_length, "\t", $assign, "\t", $assignment{$assign}, "\n";
200 | # print $result->query_name, "\t", $result->query_length, "\t", $assign, "\t", $assignment{$assign}, "\n";
201 |
202 | }
203 |
204 | if ($keep_for_tblastx) {
205 | push @keep_for_tblastx, $result->query_name;
206 | # print $result->query_name, " keep_for_tblastx!\n\n";
207 | }
208 | else {
209 | push @known, $result->query_name;
210 | }
211 | #}
212 | }
213 | print OUT "# Summary: ", scalar @keep_for_tblastx, " out of $total_records ", ($total_records ? scalar(@keep_for_tblastx)/$total_records : 0), " is saved for next step analysis.\n";
214 | # the ratio above is reported as 0 for a report without records, so an empty input still gets its summary line
215 | close OUT or die "can not close the parsed output file: $!\n";
216 |
217 | # generate a fasta file that contains all the sequences that will be kept for further analysis
218 | # read in blast input sequences
219 | my $file = $blastout;
220 | $file =~ s/\.blastn\.out//;
221 | $file = $dir."/".$file.".fa";
222 | my %seq = &read_FASTA_data($file);
223 |
224 | $outFile = $blastout;
225 | $outFile =~ s/\.blastn\.out//;
226 | $outFile = $dir."/".$outFile.".BNfiltered.fa";
227 | open (OUT2, ">", $outFile) or die "can not open file $outFile: $!\n";
228 | foreach my $seq_name (@keep_for_tblastx) {
229 | print OUT2 ">$seq_name\n";
230 | print OUT2 $seq{$seq_name}, "\n";
231 | }
232 | close OUT2 or die "can not close file $outFile: $!\n";
233 |
234 | $dbh_mysql->disconnect();
235 |
236 | exit;
237 |
238 |
239 | ############################################################################
240 | # read_FASTA_data: load a FASTA file into a hash.
241 | # Argument: path of the FASTA file.
242 | # Returns:  a hash (as a list) of sequence name => sequence; the name is the
243 | #           first whitespace-delimited word of the header line and the
244 | #           sequence has all whitespace removed. Dies if the file cannot be opened.
245 | sub read_FASTA_data {
246 |     my $fastaFile = shift @_;
247 |
248 |     # Read one ">"-terminated record per iteration. "local" puts the previous
249 |     # input record separator back automatically when the sub returns or dies.
250 |     local $/ = ">";
251 |
252 |     my %fastaSeq;
253 |     open (FastaFile, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile: $!";
254 |
255 |     while (my $line = <FastaFile>) {
256 |         # Discard blank records
257 |         if ($line =~ /^\s*$/) {
258 |             next;
259 |         }
260 |         # discard comment records
261 |         elsif ($line =~ /^\s*#/) {
262 |             next;
263 |         }
264 |         # discard the first record which only has ">", keep the rest
265 |         elsif ($line ne ">") {
266 |             chomp $line; # removes the trailing ">" (the current record separator)
267 |             # the first line is the header, the remaining lines are the sequence
268 |             my ($header, @seqLines) = split (/\n/, $line);
269 |             # only the first word of the header is the name; description words must not leak into the sequence
270 |             my ($contigName) = split (/\s/, $header);
271 |             $contigName = "" unless defined $contigName;
272 |             my $contigSeq = join("", @seqLines);
273 |             $contigSeq =~ s/\s//g; #remove white space
274 |             $fastaSeq{$contigName} = $contigSeq;
275 |         }
276 |     }
277 |
278 |     close FastaFile;
279 |
280 |     return %fastaSeq;
281 | }
282 |
283 |
284 | ###############################################################################
285 | # PhyloType: assign one BLAST hit to a category from its taxonomy lineage.
286 | #
287 | # Arguments (in order):
288 | #   $lineage_ref    - array ref of lineage node objects; each must provide ->id
289 | #   $hit_ref        - BLAST hit object; ->name and ->significance are recorded
290 | #   $best_e         - accepted for compatibility with existing callers, not used
291 | #   $dbh_mysql      - accepted for compatibility with existing callers, not used
292 | #   $dbh_taxonomy   - taxonomy handle; ->get_taxon(-taxonid => id) resolves names
293 | #   $assignment_ref - hash ref filled as category => "label\thit name\te value"
294 | #
295 | # Categories are tried in this order: Homo / Mus / other (when the 4th lineage
296 | # node is Metazoa), Artificial or Bacteria (2nd node), Phage or Viruses (1st
297 | # node is Viruses), Fungi (4th node), and finally other. Only the first hit
298 | # seen for a category is recorded. Also sets the file-level $havedefinedHit
299 | # flag when the lineage contains one of the families matched below.
300 | # Always returns 1.
301 | sub PhyloType {
302 |     my ($lineage_ref, $hit_ref, $best_e, $dbh_mysql, $dbh_taxonomy, $assignment_ref) = @_;
303 |
304 |     # Resolve every lineage node to its scientific name once, through the
305 |     # taxonomy handle that was passed in, and reuse the names for all checks.
306 |     my @names = ();
307 |     foreach my $node (@{$lineage_ref}) {
308 |         my $node_id = $node->id;
309 |         my $taxon = $dbh_taxonomy->get_taxon(-taxonid=>$node_id);
310 |         my $taxon_name = $taxon->scientific_name;
311 |         push @names, $taxon_name;
312 |     }
313 |
314 |     # full lineage, every name followed by ";"
315 |     my $Lineage = join("", map { $_.";" } @names);
316 |     #print "linease is $Lineage\n";
317 |
318 |     if ($Lineage =~/Mimiviridae/i || $Lineage =~/Phycodnaviridae/i || $Lineage =~/marseillevirus/i || $Lineage =~/Iridoviridae/i) {
319 |         $havedefinedHit=1; #song added;
320 |     }
321 |
322 |     # taxon names that mark a virus lineage as phage (any name containing "phage" counts too)
323 |     my %phage_taxon = map { $_ => 1 } qw(Lipothrixviridae Caudovirales Corticoviridae Cystoviridae Inoviridae Leviviridae Microviridae Tectiviridae);
324 |
325 |     my $category = "other";   # key written into the assignment hash
326 |     my $label = $Lineage;     # first column of the recorded description
327 |
328 |     if (@names >= 4 && $names[3] eq "Metazoa") {
329 |         # within Metazoa only human and mouse get their own category
330 |         if (grep { $_ eq "Homo" } @names) {
331 |             $category = "Homo";
332 |             $label = "Homo";
333 |         }
334 |         elsif (grep { $_ eq "Mus" } @names) {
335 |             $category = "Mus";
336 |             $label = "Mus";
337 |         }
338 |     }
339 |     elsif (@names >= 2 && $names[1] =~ /artificial sequences/i) {
340 |         $category = "Artificial";
341 |     }
342 |     elsif (@names >= 2 && $names[1] eq "Bacteria") {
343 |         $category = "Bacteria";
344 |     }
345 |     elsif (@names >= 1 && $names[0] eq "Viruses") {
346 |         my $is_phage = grep { $phage_taxon{$_} || $_ =~ /phage/i } @names;
347 |         $category = $is_phage ? "Phage" : "Viruses";
348 |     }
349 |     elsif (@names >= 4 && $names[3] eq "Fungi") {
350 |         $category = "Fungi";
351 |     }
352 |
353 |     # only keep the first best hit description for each category
354 |     if (!defined $assignment_ref->{$category}) {
355 |         $assignment_ref->{$category} = $label."\t".$hit_ref->name."\t".$hit_ref->significance;
356 |     }
357 |
358 |     return 1;
359 | }
--------------------------------------------------------------------------------
/BLASTn_RefGenome_parser.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl -w
3 |
4 | use strict;
5 | use Bio::SearchIO;
6 |
7 | my $Usage = '
8 | This script accepts a BLASTn output file that were blasted against Reference
9 | genome, find out whether the best hit has a e value lower than the cutoff. If
10 | yes, output query information. If no, the sequence will be kept for further analysis.
11 |
12 | perl script <dir> <blast output file> <Ref genome taxonomy>
13 | <dir> = directory that blast output file resides in, without last "/"
14 | <blast output file> = name of the blastn output file
15 | <Ref genome taxonomy> = Bacteria, Homo, Phage, Fungi, Mus, other
16 |
17 | ';
18 | # exactly three positional arguments are required; anything else prints the usage text and stops
19 | die $Usage unless scalar @ARGV == 3;
20 | my ($dir, $blastout, $RefGenomeTaxonomy) = @ARGV;
21 |
22 | # cutoff value for having a good hit, 1e-10 is a value that gives reasonable confidence
23 | my $E_cutoff = 1e-10;
24 |
25 | # create output file; stop when the input name lacks "RefGblast.out", because the output path would then equal the input path and the open below would truncate the input
26 | my $outFile = $blastout;
27 | $outFile =~ s/RefGblast\.out/RefGblast.parsed/ or die "blast output file name $blastout does not contain RefGblast.out\n";
28 | $outFile = $dir."/".$outFile;
29 | open (OUT, ">", $outFile) or die "can not open file $outFile: $!\n";
30 |
31 | my @keep = (); # query should be kept for further analysis
32 | my @known = (); # queries that are significantly similar to Reference sequences
33 | my $total_records = 0;
34 |
35 | #print "parsing blast output files...\n\n";
36 |
37 | my $input_file = $dir."/".$blastout;
38 | my $report = Bio::SearchIO->new(-format => 'blast', -file => $input_file, -report_type => 'blastn');
39 |
40 | # Go through BLAST reports one by one
41 | while(my $result = $report->next_result) {# next query output
42 | # print "\\", $result->query_name, "\\ input\n\n";
43 | if ($result->query_name eq "") { # deals with situation where blast 1st report is empty
44 | next;
45 | }
46 |
47 | $total_records++;
48 | my $haveHit = 0;
49 | my $keep = 1;
50 | while(my $hit = $result->next_hit) {
51 | $haveHit = 1;
52 | # check whether the query should be kept for further analysis
53 | if ($hit->significance <= $E_cutoff) {
54 | $keep = 0;
55 | # print $result->query_name, " similar to known, output information!\n\n";
56 | print OUT $result->query_name, "\t", $result->query_length, "\t$RefGenomeTaxonomy\t$RefGenomeTaxonomy\t".$hit->name."\t".$hit->significance,"\n";
57 | }
58 | last; # only need to look at the first hit
59 | }
60 |
61 | if ($haveHit) {
62 | if ($keep) {
63 | push @keep, $result->query_name;
64 | # print $result->query_name, " keep!\n\n";
65 | }
66 | else {
67 | push @known, $result->query_name;
68 | }
69 | }
70 | else { # does not have a hit, keep for further analysis
71 | push @keep, $result->query_name;
72 | # print $result->query_name, " keep!\n\n";
73 | }
74 |
75 | }
76 | print OUT "# Summary: ", scalar @keep, " out of $total_records ", ($total_records ? scalar(@keep)/$total_records : 0), " is saved for BLASTN analysis.\n";
77 | # the ratio above is reported as 0 for a report without records, so an empty input still gets its summary line
78 | close OUT or die "can not close the parsed output file: $!\n";
79 |
80 | # generate a fasta file that contains all the non-Reference sequences
81 | # read in blastn input sequences
82 | my $file = $blastout;
83 | $file =~ s/\.RefGblast\.out//;
84 | $file = $dir."/".$file.".fa";
85 | my %seq = &read_FASTA_data($file);
86 |
87 | $outFile = $blastout;
88 | $outFile =~ s/\.RefGblast\.out//;
89 | $outFile = $dir."/".$outFile.".RefGfiltered.fa";
90 | open (OUT2, ">", $outFile) or die "can not open file $outFile: $!\n";
91 | foreach my $seq_name (@keep) {
92 | if ($seq_name eq "") { # deals with situation where blast 1st report is empty
93 | next;
94 | }
95 | print OUT2 ">$seq_name\n";
96 | print OUT2 $seq{$seq_name}, "\n";
97 | }
98 | close OUT2 or die "can not close file $outFile: $!\n";
99 |
100 |
101 | exit;
103 |
104 | ############################################################################
105 | # subroutines
106 | # read_FASTA_data: load a FASTA file into a hash.
107 | # Argument: path of the FASTA file.
108 | # Returns:  a hash (as a list) of sequence name => sequence; the name is the
109 | #           first whitespace-delimited word of the header line and the
110 | #           sequence has all whitespace removed. Dies if the file cannot be opened.
111 | sub read_FASTA_data {
112 |     my $fastaFile = shift @_;
113 |
114 |     # Read one ">"-terminated record per iteration. "local" puts the previous
115 |     # input record separator back automatically when the sub returns or dies.
116 |     local $/ = ">";
117 |
118 |     my %fastaSeq;
119 |     open (FastaFile, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile: $!";
120 |
121 |     while (my $line = <FastaFile>) {
122 |         # Discard blank records
123 |         if ($line =~ /^\s*$/) {
124 |             next;
125 |         }
126 |         # discard comment records
127 |         elsif ($line =~ /^\s*#/) {
128 |             next;
129 |         }
130 |         # discard the first record which only has ">", keep the rest
131 |         elsif ($line ne ">") {
132 |             chomp $line; # removes the trailing ">" (the current record separator)
133 |             # the first line is the header, the remaining lines are the sequence
134 |             my ($header, @seqLines) = split (/\n/, $line);
135 |             # only the first word of the header is the name; description words must not leak into the sequence
136 |             my ($contigName) = split (/\s/, $header);
137 |             $contigName = "" unless defined $contigName;
138 |             my $contigSeq = join("", @seqLines);
139 |             $contigSeq =~ s/\s//g; #remove white space
140 |             $fastaSeq{$contigName} = $contigSeq;
141 |         }
142 |     }
143 |
144 |     close FastaFile;
145 |
146 |     return %fastaSeq;
147 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #### VirusScan version 1.1 ####
2 |
3 | Author: Song Cao
4 |
5 | Contact: scao@wustl.edu
6 |
7 | Released on Apr 25, 2016
8 |
9 | Please cite the following paper for VirusScan pipeline:
10 |
11 | Song Cao, Michael C. Wendl, Matthew A. Wyczalkowski, Kristine Wylie, Kai Ye, Reyka Jayasinghe, Mingchao Xie, Song Wu, Beifang Niu, Robert Grubb III, Kimberly J. Johnson, Hiram Gay, Ken Chen, Janet S. Rader, John F. Dipersio, Feng Chen, and Li Ding, Divergent viral presentation among human tumors and adjacent normal tissues, Scientific Reports, 2016, 6:28294.
12 |
13 | VirusScan pipeline is a fully automated and modular software package designed for the fast
14 | and accurate detection of known viruses from NGS data. It runs on the LSF job scheduler.
15 |
16 | It was developed from VirusHunter pipeline, which focuses on identification of novel viruses for 454 reads.
17 | Compared to the VirusHunter pipeline, VirusScan works on Illumina WGS, WES and RNA-Seq data and quickly returns
18 | the discovery results for known viruses.
19 |
20 | ### Dependencies: ###
21 |
22 |
23 |
24 | 1. RepeatMasker: Download and install RepeatMasker from http://www.repeatmasker.org/RMDownload.html.
25 |
26 | 2. BLAST Module: Download and install BLAST from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/.
27 |
28 | 3. MySQL DBI: See http://search.cpan.org/dist/DBI/. DBI may be included with your Linux distribution by default. This is a Perl module that allows Perl to interact directly with MySQL database.
29 |
30 | 4. BioPerl: See http://bioperl.org/. BioPerl is used for parsing BLAST output files, and to construct taxonomy lineage tree from a taxonomy ID.
31 |
32 | 5. NCBI nt database: Download NT database from ftp://ftp.ncbi.nih.gov/blast/db/.
33 |
34 | 6. Viral nt database: Download viral nt database from https://drive.google.com/open?id=0B-teklYT0wbDMEh6ZlhzMVo2QlE.
35 |
36 | 7. NCBI taxonomy database: Download NCBI taxonomy database from ftp://ftp.ncbi.nih.gov/pub/taxonomy/.
37 |
38 | 7.1. Create a directory to hold taxonomy file, e.g. taxdump_2016_06_20. Download taxdump.tar.gz file to the directory and Type "tar -xzf taxdump.tar.gz" to untar the file.
39 |
40 | 7.2. Create MySQL Database for the taxonomy information
41 |
42 | Ask your MySQL database administrator to create a MySQL database for taxonomy information, and grant privileges on this database to a suitable username.
43 | For example, ask your MySQL database administrator to use following commands to create a database named "vs_taxondb" and grant all privileges to the user "vs_taxonUser" with the password "vs_password".
44 | $ mysql -u root -p
45 | CREATE DATABASE vs_taxondb;
46 | GRANT ALL ON vs_taxondb.* TO 'vs_taxonUser'@'localhost' IDENTIFIED BY 'vs_password';
47 | GRANT ALL ON vs_taxondb.* TO 'vs_taxonUser'@'%' IDENTIFIED BY 'vs_password';
48 | QUIT;
49 |
50 | 7.3. Load gi-taxid into database for nucleotide sequences:
51 | download gi_taxid_nucl.dmp.gz to the directory
52 | unzip the file
53 |
54 | Modify script "import_gi_taxid_nucl.sql " to replace the full path to the gi_taxid_nucl.dmp file with the actual full path in your local system at line " LOAD DATA LOCAL INFILE" in the script.
55 | The LOAD DATA INFILE statement reads rows from a text file into a table at a very high speed. The file name must be given as a literal string.
56 |
57 | Load the gi_taxid_nucl.dmp content to a MySQL database using script " import_gi_taxid_nucl.sql" with the following command:
58 | cat import_gi_taxid_nucl.sql | mysql -h hostname --user=username databaseName --pass=password &
59 |
60 | Warning: This can take a very long time. It is better to run it as a background task.
61 |
62 | ### Usage: ###
63 |
64 | git clone https://github.com/ding-lab/VirusScan.git
65 |
66 | perl VirusScan.pl < run_folder > < step_number >
67 |
68 | run_folder: A folder that contains one sub-directory per sample, each holding that sample's bam file:
69 |
70 | For example:
71 |
72 | work/sample1/sample1.bam
73 |
74 | work/sample2/sample2.bam
75 |
76 | Warning: The prefix of the name of the bam file should be the same as the sample directory.
77 |
78 | step_number: Integer between 1 and 33 which represents the following step:
79 |
80 | [1] Extract unmapped non-human reads from aligned bam file and map extracted reads to the viral database
81 |
82 | [2] Split files for running RepeatMasker
83 |
84 | [3] Submit RepeatMasker job array
85 |
86 | [4] Sequence Quality Control
87 |
88 | [5] Split files for Blast Human Genome
89 |
90 | [6] Submit Blast Human Genome job array
91 |
92 | [7] Parse Human Genome Blast result
93 |
94 | [8] Pool and split files for BlastN
95 |
96 | [9] Submit BlastN job array
97 |
98 | [10] Parse BlastN result
99 |
100 | [11] Generate summary result for blastn output
101 |
102 | [12] Assignment report for each sample
103 |
104 | [13] Assignment summary for each sample
105 |
106 | [14] Generate report for the run
107 |
108 | [22] Run steps from 2 to 14
109 |
110 | [23] Run steps from 3 to 14
111 |
112 | [24] Run steps from 4 to 14
113 |
114 | [25] Run steps from 5 to 14
115 |
116 | [26] Run steps from 6 to 14
117 |
118 | [27] Run steps from 7 to 14
119 |
120 | [28] Run steps from 8 to 14
121 |
122 | [29] Run steps from 9 to 14
123 |
124 | [30] Run steps from 10 to 14
125 |
126 | [31] Run steps from 11 to 14
127 |
128 | [32] Run steps from 12 to 14
129 |
130 | [33] Run steps from 13 to 14
131 |
132 |
--------------------------------------------------------------------------------
/SequenceQualityControl.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl
3 | use strict;
4 |
5 | my $usage = '
6 | This script will check each .masked file in the given directory.
7 | Some sequences have only/lots of Ns because masked by RepeatMasker.
8 | 1) Sequences that do not have at least 40 nt of consecutive
9 | sequence without N will be put into file .fa.cdhit_out.masked.badSeq
10 | 2) Sequences with >= 40% of total length of being masked will be put
11 | into file .fa.cdhit_out.masked.RepeatLowComplexSeq
12 |
13 | perl script <sample dir>
14 | <sample dir> = full path of the folder holding files for this sample
15 | without last "/"
16 |
17 | ';
18 | die $usage unless scalar @ARGV == 1;
19 | my ( $dir ) = @ARGV;
20 | my $percent_masked_cutoff = 0.4;
21 |
22 | # the library name is the last component of the sample directory path
23 | my @fields = split(/\//, $dir);
24 | my $libName = $fields[$#fields];
25 |
26 | my $total_seq = 0;
27 | my $good_seq = 0;
28 | my $bad_seq = 0;
29 | my $RepeatLowComplex_seq = 0;
30 | my $OutFile1 = $dir."/".$libName.".fa.cdhit_out.masked.goodSeq";
31 | my $OutFile2 = $dir."/".$libName.".fa.cdhit_out.masked.badSeq";
32 | my $OutFile3 = $dir."/".$libName.".fa.cdhit_out.masked.RepeatLowComplexSeq";
33 |
34 | open (OUT1, ">", $OutFile1) or die "can not open $OutFile1: $!\n";
35 | open (OUT2, ">", $OutFile2) or die "can not open $OutFile2: $!\n";
36 | open (OUT3, ">", $OutFile3) or die "can not open $OutFile3: $!\n";
37 |
38 | opendir(DH, $dir) or die "Can not open dir $dir: $!\n";
39 | foreach my $name (readdir DH) {
40 | if ($name =~ /\.cdhit_out_RepeatMasker$/) { # RepeatMasker directory
41 | my $full_path = $dir."/".$name;
42 | opendir(SubDH, $full_path) or die "can not open dir $full_path: $!\n";
43 | foreach my $file (readdir SubDH) {
44 | if ($file =~ /\.masked$/) { # masked sequence
45 | my $maskedFile = $full_path."/".$file;
46 | my %seq = ();
47 | print $maskedFile,"\n";
48 | &read_FASTA_data($maskedFile, \%seq);
49 |
50 | # check for contiguous bases >= 40 bp (non-Ns)
51 | foreach my $read_id (keys %seq) {
52 | $total_seq++;
53 | my $seq_temp = $seq{$read_id};
54 | my $goodQuality=$seq_temp=~/[ACTG]{40,}/;
55 | if($goodQuality) {
56 | my $length_masked = ($seq_temp =~ tr/N/N/);
57 | my $length_total = length $seq_temp;
58 | my $percent_masked = $length_masked/$length_total;
59 |
60 | # a read with at least the cutoff fraction of Ns is written to the RepeatLowComplexSeq file
61 | if ($percent_masked >= $percent_masked_cutoff) {
62 | print OUT3 ">$read_id\n";
63 | print OUT3 $seq{$read_id}, "\n";
64 | $RepeatLowComplex_seq++;
65 | }
66 | else {
67 | print OUT1 ">$read_id\n";
68 | print OUT1 $seq{$read_id}, "\n";
69 | $good_seq++;
70 | }
71 | }
72 | else { # no run of 40 or more consecutive A/C/T/G
73 | print OUT2 ">$read_id\n";
74 | print OUT2 "$seq{$read_id}\n";
75 | $bad_seq++;
76 | }
77 | }
78 | }
79 | }
80 | closedir(SubDH);
81 | }
82 | }
83 | closedir(DH);
84 |
85 | # the counts are appended to the badSeq file itself
86 | print OUT2 "total unique seq = $total_seq\n";
87 | print OUT2 "good seq = $good_seq\n";
88 | print OUT2 "bad seq = $bad_seq\n";
89 | print OUT2 "Repeat and Low complexicity seq = $RepeatLowComplex_seq\n";
90 |
91 | # and to the RepeatLowComplexSeq file
92 | print OUT3 "total unique seq = $total_seq\n";
93 | print OUT3 "Repeat and Low complexicity seq = $RepeatLowComplex_seq\n";
94 |
95 | # buffered write errors only surface at close, so the result is checked
96 | close(OUT1) or die "can not close $OutFile1: $!\n";
97 | close(OUT2) or die "can not close $OutFile2: $!\n";
98 | close(OUT3) or die "can not close $OutFile3: $!\n";
99 |
100 |
101 | exit;
102 |
103 | ############################################################################
104 | # read_FASTA_data: load a FASTA file into the hash behind $hash_ref.
105 | # Arguments: path of the FASTA file; hash ref that receives name => sequence.
106 | # The name is the first whitespace-delimited word of the header line and the
107 | # sequence has all whitespace removed. Dies if the file cannot be opened.
108 | sub read_FASTA_data {
109 |     my ($fastaFile, $hash_ref) = @_;
110 |
111 |     # Read one ">"-terminated record per iteration. "local" puts the previous
112 |     # input record separator back automatically when the sub returns or dies.
113 |     local $/ = ">";
114 |
115 |     open (FastaFile, "<", $fastaFile) or die "Can't Open FASTA file: $fastaFile: $!";
116 |     while (my $line = <FastaFile>) {
117 |         # Discard blank records
118 |         if ($line =~ /^\s*$/) {
119 |             next;
120 |         }
121 |         # discard comment records: only records that start with "#", so a
122 |         # read whose name merely contains "#" is no longer dropped
123 |         elsif ($line =~ /^\s*#/) {
124 |             next;
125 |         }
126 |         # discard the first record which only has ">", keep the rest
127 |         elsif ($line ne ">") {
128 |             chomp $line; # removes the trailing ">" (the current record separator)
129 |             my @rows = split (/\n/m, $line);
130 |             # first line is the header; keep only its first word as the name
131 |             my $seqName = shift @rows;
132 |             my @temp = split (/\s/, $seqName);
133 |             $seqName = shift @temp;
134 |             my $Seq = join("", @rows);
135 |             $Seq =~ s/\s//g; #remove white space
136 |             $hash_ref->{$seqName} = $Seq;
137 |         }
138 |     }
139 |
140 |     close FastaFile;
141 |     return;
142 | }
--------------------------------------------------------------------------------
/VirusScan.pl:
--------------------------------------------------------------------------------
1 |
#!/usr/bin/perl
use strict;
use warnings;
#use POSIX;

# ANSI color codes used for terminal messages
my $red = "\e[31m";
my $gray = "\e[37m";
my $yellow = "\e[33m";
my $green = "\e[32m";
my $purple = "\e[35m";
my $cyan = "\e[36m";
my $normal = "\e[0m";

# Usage information. The synopsis names the two positional arguments that are
# read from @ARGV below: the run folder and the step number.
my $usage = <<"OUT";
${yellow}Usage: perl $0 <run_folder> <step_number> $normal

<run_folder> = full path of the folder holding files for this sequence run

<step_number> run this pipeline step by step. (running the whole pipeline if step number is 0)

$green [1] Run bwa
$red [2, or <=22] Split files for RepeatMasker
 [3 or <=23] Submit RepeatMasker job array
$yellow [4 or <=24] Sequence Qulity Control
$green [5 or <=25] Split files for Blast Reference Genome
 [6 or <=26] Submit Blast Reference Genome job array
 [7 or <=27] Parse Reference Genome Blast result
$gray [8 or <=28] Pool and split files for BlastN
 [9 or <=29] Submit BlastN job array
 [10 or <=30] Parse BlastN result
 [11 or <=31] Get summary of BlastN
$purple [12 or <=32] Assignment report for each sample
 [13 or <=33] Assignment summary for each sample
 [14 or <=34] Generate report for the run
$normal
OUT

die $usage unless @ARGV == 2;
my ( $run_dir, $step_number ) = @ARGV;

# Drop a single trailing "/" from the run folder.
if ($run_dir =~/(.+)\/$/) {
    $run_dir = $1;
}

# The step must be a plain non-negative integer. Without this check a
# non-numeric argument would numify to 0 and start the whole pipeline.
die $usage unless $step_number =~ /\A\d+\z/;
die $usage unless ($step_number >=0)&&(($step_number <= 17) || ($step_number >= 22));
50 |
51 |
52 | #####################################################################################
53 | # values need to be modified to adapt to local environment
54 | my $email = "scao\@wustl\.edu";
55 |
56 | # software path
57 | #my $cd_hit = "/gscuser/mboolcha/software/cdhit/cd-hit-est";
58 | my $repeat_masker = "RepeatMasker";
59 | my $blastn = "/gscuser/scao/tools/ncbi-blast+/bin/blastn";
60 | #my $blastx = "/gscuser/scao/tools/software/ncbi-blast+/bin/blastx";
61 |
62 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
63 | # path and name of databases
64 | #my $db_BN = "/gscuser/scao/gc3027/nt/nt";
65 | #my $db_BX = "/gscuser/scao/gc3027/nr/nr";
66 | #my $bwa_ref = "/gscuser/scao/gc3027/fasta/virus/virusdb_082414.fa";
67 |
68 | my $db_BN = "/gscmnt/gc3027/dinglab/medseq/nt/nt";
69 | my $db_BX = "/gscmnt/gc3027/dinglab/medseq/nr/nr";
70 | my $bwa_ref = "/gscmnt/gc3027/dinglab/medseq/fasta/nt012414_RE_Split/nt012414_virus_abbr_cdhit98.fa";
71 |
72 | # reference genome taxonomy classification and database location.
73 | # It's better to change $refrence_genome_taxonomy and $reference_genome based on the data being analyzed.
74 | my $refrence_genome_taxonomy = "";
75 | my $reference_genome = "";
76 |
77 | #if ($ref_genome_choice == 1) {
78 | # $refrence_genome_taxonomy = "Homo"; # use Bacteria, Homo, Phage, Fungi, Mus, other
79 |
80 | # path to the reference genome
81 | # $reference_genome = "/gscmnt/gc3027/dinglab/medseq/human70.37/humandnacdna.fa";
82 | #}
83 |
84 | $refrence_genome_taxonomy = "Homo";
85 |
86 | $reference_genome = "/gscmnt/gc3027/dinglab/medseq/human70.37/humandnacdna.fa";
87 |
88 | #####################################################################################
89 | # everything else below should be automated
90 | my $HOME = $ENV{HOME};
91 | my $working_name= (split(/\//,$run_dir))[-2];
92 |
93 | # To run jobs faster, split large fasta files to small ones. Split to specific number of
94 | # files instead of specific sequences in each small file, because the number of job array
95 | # cannot be determined if spliting to specific number of sequences in each file. Job
96 | # number is required by qsub ${SGE_TASK_ID}. The minimum size of each file is 4kb.
97 | # The number of files should be determined accourding to CPUs available in the computer
98 | # cluster.
99 |
100 | # The number of small fasta files to split to from a large file for RepeatMasker
101 | my $file_number_of_RepeatMasker = 100; #default
102 | # the number of small fasta files to split to from a large file for Blast_Reference_Genome
103 | my $file_number_of_Blast_Ref_Genome = 100; #default
104 | # the number of small fasta files to split to from a large file for Blast_N
105 | my $file_number_of_Blast_N = 100; #default
106 | # the number of small fasta files to split to from a large file for Blast_X
107 | #my $file_number_of_Blast_X = 200; #default
108 |
109 | #store job files here
110 | my $HOME1="/gscmnt/gc2524/dinglab";
111 |
112 | #store job files here
113 | if (! -d $HOME1."/tmp") {
114 | `mkdir $HOME1"/tmp"`;
115 | }
116 | my $job_files_dir = $HOME1."/tmp";
117 |
118 | #store SGE output and error files here
119 | if (! -d $HOME1."/SGE_DIR") {
120 | `mkdir $HOME1"/SGE_DIR"`;
121 | }
122 | my $lsf_file_dir = $HOME1."/SGE_DIR";
123 |
124 | # obtain script path
125 | my $run_script_path = `dirname $0`;
126 | chomp $run_script_path;
127 | $run_script_path = "/usr/bin/perl ".$run_script_path."/";
128 |
129 | my $hold_RM_job = "norm";
130 | my $current_job_file = "";#cannot be empty
131 | my $hold_job_file = "";
132 | my $bsub_com = "";
133 | my $sample_full_path = "";
134 | my $sample_name = "";
135 |
136 | #directory suffix constants
137 | my $REPEAT_MASKER_DIR_SUFFIX = "fa.cdhit_out_RepeatMasker";
138 | my $BLAST_RefG_DIR_SUFFIX = "fa.cdhit_out.masked.goodSeq_RefGblast";
139 | my $BLAST_NT_DIR_SUFFIX = "RefGfiltered_BLASTN";
140 | my $BLASTX_NR_DIR_SUFFIX = "BNFiltered_BLASTX_NR";
141 |
142 | # get sample list in the run, name should not contain "."
143 | opendir(DH, $run_dir) or die "Cannot open dir $run_dir: $!\n";
144 | my @sample_dir_list = readdir DH;
145 | close DH;
146 |
# Validate the layout of the run folder before any job is submitted.
check_input_dir($run_dir);

# Per-sample processing. A step number of 0 or >= 22 chains the stages
# together; a step number of 1..13 runs exactly one stage on its own.
if ($step_number < 14 || $step_number >= 22) {
    # Stages 2..13 in pipeline order. The stage at index $k is step ($k + 2)
    # when run alone, and runs in chained mode when $step_number <= ($k + 22).
    my @stages = (
        \&split_for_RepeatMasker,        # 2 / <=22
        \&submit_job_array_RM,           # 3 / <=23
        \&seq_QC,                        # 4 / <=24
        \&split_for_blast_RefG,          # 5 / <=25
        \&submit_job_array_blast_RefG,   # 6 / <=26
        \&parse_blast_RefG,              # 7 / <=27
        \&pool_split_for_blast_N,        # 8 / <=28
        \&submit_job_array_blast_N,      # 9 / <=29
        \&parse_blast_N,                 # 10 / <=30
        \&blast_S,                       # 11 / <=31
        \&report_for_each_sample,        # 12 / <=32
        \&summary_for_each_sample,       # 13 / <=33
    );
    my $RM_STAGE = 1;    # index of submit_job_array_RM in @stages
    my $chained  = ($step_number == 0 || $step_number >= 22);

    # The subroutines read the file-level $sample_name and $sample_full_path,
    # so those are assigned explicitly instead of serving as the loop variable.
    foreach my $entry (@sample_dir_list) {
        $sample_name = $entry;
        next if $sample_name =~ /\./;

        $sample_full_path = $run_dir."/".$sample_name;
        next unless -d $sample_full_path;    # only directories hold a sample

        print $yellow, "\nSubmitting jobs for the sample ",$sample_name, "...",$normal, "\n";
        $current_job_file="";

        if ($chained) {
            # bwa is only part of the chain when starting from the very beginning
            bsub_bwa() if $step_number == 0;

            for my $k (0 .. $#stages) {
                next unless $step_number <= $k + 22;
                $stages[$k]->();
                # remembered to limit the number of RepeatMasker jobs running
                # in the cluster at the same time
                $hold_RM_job = $current_job_file if $k == $RM_STAGE;
            }
        }
        elsif ($step_number == 1) {
            bsub_bwa();
        }
        else {
            # run a single stage on its own; the argument 1 marks step-by-step mode
            my ($k) = grep { $step_number == $_ + 2 } 0 .. $#stages;
            if (defined $k) {
                $stages[$k]->(1);
                $hold_RM_job = $current_job_file if $k == $RM_STAGE;
            }
        }
    }
}
266 |
##########################################################################################
# generate report for the run
#
# Writes a job script that runs generate_final_report_gi.pl until the report
# file contains the "# Finished" marker, then submits it with bsub. The job is
# made dependent on the most recently written job file.
if (($step_number == 0) || ($step_number == 14) || ($step_number>=22)) {

    print $yellow, "Submitting jobs for generating the report for the run ....",$normal, "\n";
    $hold_job_file=$current_job_file;
    $current_job_file = "Run_report_".$$.".sh";

    # NOTE(review): no declaration of $version is visible in this script;
    # under "use strict" that is a compile error. Confirm where it is meant
    # to be defined.
    my $report_cmd  = " ".$run_script_path."generate_final_report_gi.pl ".$run_dir." ".$version."\n";
    my $check_done  = ' grep "# Finished" ${OUTPUT}'."\n".' CHECK=$?'."\n";
    # grep exits with 1 while the marker is missing, i.e. the report is unfinished
    my $retry_loop  = ' while [ ${CHECK} -eq 1 ] '."\n"
                    . " do\n"
                    . $report_cmd
                    . $check_done
                    . " done\n";

    open(my $report_fh, '>', "$job_files_dir/$current_job_file") or die $!;
    print {$report_fh} "#!/bin/bash\n";
    print {$report_fh} "#BSUB -n 1\n";
    print {$report_fh} "#BSUB -R \"rusage[mem=40000]\"","\n";
    print {$report_fh} "#BSUB -M 40000000\n";
    #print {$report_fh} "#BSUB -q ding-lab\n";
    print {$report_fh} "#BSUB -o $lsf_file_dir","/","$current_job_file.out\n";
    print {$report_fh} "#BSUB -e $lsf_file_dir","/","$current_job_file.err\n";
    print {$report_fh} "#BSUB -J $current_job_file\n";
    print {$report_fh} "#BSUB -w \"$hold_job_file\"","\n";

    print {$report_fh} "BAD_SEQ=fa.cdhit_out.masked.badSeq\n"; #output of RepeatMasker

    print {$report_fh} "OUTPUT=".$run_dir."/Analysis_Report_gi_".$working_name."\n";

    print {$report_fh} 'if [ -f $OUTPUT ] ',"\n";   # report file exists
    print {$report_fh} "then\n";
    print {$report_fh} $check_done;
    print {$report_fh} $retry_loop;
    print {$report_fh} "else\n";                    # report file does not exist
    print {$report_fh} $report_cmd;
    print {$report_fh} $check_done;
    print {$report_fh} $retry_loop;
    print {$report_fh} "fi\n";
    # buffered write errors only surface at close
    close $report_fh or die "Cannot write $job_files_dir/$current_job_file: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file\n";
    #$bsub_com = "qsub -V -P long -hold_jid $working_name -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com);

}
318 |
#######################################################################
# Submit a job that sends the notification email. It waits on the most
# recently written job file, so it runs once the analysis has finished.
my $wants_email = ($step_number == 0) || ($step_number == 15) || ($step_number>=22);
if ($wants_email) {
    print $yellow, "Submitting the job for sending an email when the run finishes ",$sample_name, "...",$normal, "\n";

    $hold_job_file = $current_job_file;
    $current_job_file = "Email_run_".$$.".sh";

    my @email_script = (
        "#!/bin/bash\n",
        "#BSUB -n 1\n",
        "#BSUB -o $lsf_file_dir"."\n",
        "#BSUB -e $lsf_file_dir"."\n",
        "#BSUB -J $current_job_file\n",
        "#BSUB -w \"$hold_job_file\""."\n",
        $run_script_path."send_email.pl ".$run_dir." ".$email."\n",
    );

    open(my $email_fh, '>', "$job_files_dir/$current_job_file") or die $!;
    print {$email_fh} @email_script;
    close $email_fh;

    $bsub_com = "bsub < $job_files_dir/$current_job_file\n";
    #$bsub_com = "qsub -V -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com);
}

#######################################################################
# Closing message, shown only when the whole pipeline was started from step 0.
print $green, "All jobs are submitted! You will get email notification when this run is completed.\n",$normal
    if $step_number == 0;

exit;
344 |
345 |
346 | ########################################################################
347 | # subroutines
348 |
# Verify that the run folder has the expected layout and die if it does not.
#
# Argument:
#   $input_dir - the run folder
#
# Every entry whose name contains neither "." nor "Analysis_" is treated as a
# sample: it must be a directory that holds a file named <sample>.bam. The run
# folder must contain at least one such entry. On any violation a message is
# printed and the script dies.
#
# Side effect: the file-level $sample_name and $sample_full_path are
# overwritten while scanning, exactly as before.
sub check_input_dir {
    my ($input_dir) = @_;
    my $have_input_sample = 0;

    # get sample list in the run, name should not contain "."
    # A lexical handle keeps this scan independent of any other open
    # directory handle, and closedir (not close) is what releases it.
    opendir(my $dir_handle, $input_dir) or die "Cannot open dir $input_dir: $!\n";
    my @sample_list = readdir $dir_handle;
    closedir $dir_handle;

    for (my $i=0;$i<@sample_list;$i++) {
        $sample_name = $sample_list[$i];
        if (!($sample_name =~ /\./)&&!($sample_name =~/Analysis_/)) {
            $have_input_sample = 1;
            $sample_full_path = $input_dir."/".$sample_name;
            if (-d $sample_full_path) { # is a full path directory containing a sample
                my $input_file = $input_dir."/".$sample_name."/".$sample_name.".bam";
                if (!(-e $input_file)) { # input file does not exist
                    print $red, "Do not have appropriate input directory structure. Please check your command line argument!", $normal, "\n\n";
                    die;
                }
            }
            else { # input sample directory does not exist
                print $red, "Do not have appropriate input directory structure. Please check your command line argument!", $normal, "\n\n";
                die;
            }
        }
    }

    if (!($have_input_sample)) { # does not have any input sample directory
        print $red, "Do not have appropriate input directory structure. Please check your command line argument!", $normal, "\n\n";
        die;
    }

}
383 |
384 | ########################################################################
385 | ########################################################################
# Write and submit the bwa job for the current sample.
#
# Reads the file-level $sample_name and $sample_full_path, sets
# $current_job_file to the name of the generated job script, and dies when
# the sample's .bam file is missing or empty.
sub bsub_bwa{

    #my $cdhitReport = $sample_full_path."/".$sample_name.".fa.cdhitReport";

    $current_job_file = "j1_bwa_".$sample_name.$$.".sh";

    my $sample_prefix = $sample_full_path."/".$sample_name;
    my $IN_bam = $sample_prefix.".bam";

    # the input bam file must exist ...
    unless (-e $IN_bam) {
        print $red, "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n";
        print "Warning: Died because there is no input bam file for bwa:\n";
        print "File $IN_bam does not exist!\n";
        die "Please check command line argument!", $normal, "\n\n";
    }
    # ... and must not be empty
    unless (-s $IN_bam) {
        print $red, "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n";
        die "Warning: Died because $IN_bam is empty!", $normal, "\n\n";
    }

    # Background command that streams selected reads of the input bam into
    # the fastq FIFO. It is needed twice because a FIFO can be read only once.
    #0x100: secondary alignment
    #0x800: supplementary alignment
    #H: Hard clipping
    #S: Soft clipping
    my $stream_reads = "samtools view -h \${BWA_IN} | perl -ne \'\$line=\$_; \@ss=split(\"\\t\",\$line); \$flag=\$ss[1]; \$cigar=\$ss[5]; if(\$ss[0]=~/^\@/ || (!((\$flag & 0x100) || (\$flag & 0x800) || (\$cigar=~/H/)) && ((\$flag & 0x4) || (\$cigar=~/S/))) || (!((\$flag & 0x100) || (\$flag & 0x800) || (\$cigar=~/H/)) && (\$ss[2]=~/^gi/))) { print \$line;}\' | samtools view -Sb - | bamtools convert -format fastq > \${BWA_fq} \&"."\n";

    # Conversion of the mapped reads to fasta; run in both branches of the job.
    my $make_fasta = " ".$run_script_path."get_fasta_from_bam_filter.pl \${BWA_mapped} \${BWA_fa}\n"
                   . " ".$run_script_path."trim_readid.pl \${BWA_fa} \${BWA_fa}.cdhit_out\n";

    my @job_script = (
        "#!/bin/bash\n",
        "#BSUB -n 1\n",
        "#BSUB -R \"rusage[mem=20000]\""."\n",
        "#BSUB -M 20000000\n",
        "#BSUB -o $lsf_file_dir"."/"."$current_job_file.out\n",
        "#BSUB -e $lsf_file_dir"."/"."$current_job_file.err\n",
        "#BSUB -J $current_job_file\n",
        "BWA_IN=".$sample_prefix.".bam\n",
        "BWA_fq=".$sample_prefix.".fq\n",
        "BWA_sai=".$sample_prefix.".sai\n",
        "BWA_mapped=".$sample_prefix.".mapped.reads\n",
        "BWA_fa=".$sample_prefix.".fa\n",
        # only align when there is no non-empty mapped-reads file yet
        'if [ ! -s $BWA_mapped ]'."\n",
        " then\n",
        'rm ${BWA_sai}'."\n",
        'rm ${BWA_fq}'."\n",
        'mkfifo ${BWA_fq}'."\n",
        $stream_reads,
        "bwa aln $bwa_ref \${BWA_fq} > \${BWA_sai}"."\n",
        'rm ${BWA_fq}'."\n",
        'mkfifo ${BWA_fq}'."\n",
        $stream_reads,
        "bwa samse $bwa_ref \${BWA_sai} \${BWA_fq} | grep -v \@SQ | perl -ne \'\$line=\$_; \@ss=split(\"\\t\",\$line); if(\$ss[2]=~/^gi/) { print \$line; }\' > \${BWA_mapped}"."\n",
        $make_fasta,
        'rm ${BWA_sai}'."\n",
        'rm ${BWA_fq}'."\n",
        "else\n",
        $make_fasta,
        " fi\n",
    );

    open(my $job_fh, '>', "$job_files_dir/$current_job_file") or die $!;
    print {$job_fh} @job_script;
    close $job_fh;

    $bsub_com = "bsub < $job_files_dir/$current_job_file\n";
    system ( $bsub_com );
}
453 |
454 | #####################################################################################
455 |
# Write and submit the job that splits the sample's .fa.cdhit_out file into
# $file_number_of_RepeatMasker pieces for RepeatMasker.
#
# Argument:
#   $step_by_step - true when the step is run on its own; the job then waits
#                   on no previous job. Otherwise it waits on the job that
#                   was written last.
#
# Sets the file-level $hold_job_file and $current_job_file.
sub split_for_RepeatMasker {
    #split file for RepeatMasker
    my ($step_by_step) = @_;
    if ($step_by_step) {
        $hold_job_file = "";
    }else{
        $hold_job_file = $current_job_file;
    }
    $current_job_file = "j2_".$sample_name."_RM_split_".$$.".sh";

    my $split_input = $sample_full_path."/".$sample_name.".fa.cdhit_out";
    # one split attempt followed by its check; CHECK receives the check's exit code
    my $split_and_check =
          " ".$run_script_path."split_fasta.pl -i ".$split_input." -o \${RM_DIR} -n $file_number_of_RepeatMasker -p ".$sample_name.".fa.cdhit_out_file\n"
        . " ".$run_script_path."check_split_cdhit.pl \${SAMPLE_DIR}\n"
        . ' CHECK=$?'."\n";
    # 10 is the error exit code of check_split_cdhit.pl: split again until it passes
    my $retry_loop =
          ' while [ ${CHECK} -eq 10 ]'."\n"
        . " do\n"
        . $split_and_check
        . " done\n";

    open(RMSPLIT, ">$job_files_dir/$current_job_file") or die $!;
    print RMSPLIT "#!/bin/bash\n";
    print RMSPLIT "#BSUB -n 1\n";
    #print RMSPLIT "#BSUB -q ding-lab\n";
    print RMSPLIT "#BSUB -R \"rusage[mem=10000]\"","\n";
    print RMSPLIT "#BSUB -M 10000000\n";
    print RMSPLIT "#BSUB -o $lsf_file_dir","/","$current_job_file.out\n";
    print RMSPLIT "#BSUB -e $lsf_file_dir","/","$current_job_file.err\n";
    print RMSPLIT "#BSUB -J $current_job_file\n";
    # Keep every #BSUB directive ahead of the first shell command: a scheduler
    # that stops scanning for embedded options at the first command would
    # otherwise miss the dependency on the previous job.
    print RMSPLIT "#BSUB -w \"$hold_job_file\"","\n";
    print RMSPLIT "RMSPLIT_IN=".$sample_full_path."/".$sample_name.".fa\n";
    #####################
    print RMSPLIT "RM_DIR=".$sample_full_path."/".$sample_name.".$REPEAT_MASKER_DIR_SUFFIX\n";
    print RMSPLIT "SAMPLE_DIR=".$sample_full_path."\n\n";
    print RMSPLIT "if [ ! -d \${RM_DIR} ]\n";
    print RMSPLIT "then\n";
    print RMSPLIT " mkdir \${RM_DIR}\n";
    print RMSPLIT $split_and_check;
    print RMSPLIT $retry_loop;
    print RMSPLIT "else\n"; # RepeatMasker directory already existed (file already split)
    print RMSPLIT " ".$run_script_path."check_split_cdhit.pl \${SAMPLE_DIR}\n";
    print RMSPLIT ' CHECK=$?',"\n";
    # if the existing split is not complete, split and check again
    print RMSPLIT $retry_loop;
    print RMSPLIT "fi\n";
    close RMSPLIT;
    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com);
}
507 |
508 | #####################################################################################
509 |
# Write and submit the RepeatMasker job array for the current sample: one
# array element per split file, $file_number_of_RepeatMasker elements in all.
#
# Argument:
#   $step_by_step - true when the step is run on its own (no job to wait on);
#                   otherwise the array waits on the job written last.
#
# Sets the file-level $hold_job_file and $current_job_file.
sub submit_job_array_RM {
    my ($step_by_step) = @_;

    $hold_job_file    = $step_by_step ? "" : $current_job_file;
    $current_job_file = "j3_".$sample_name."_RM_".$$.".sh";

    # common prefix of the per-element file names inside the RepeatMasker directory
    my $piece = '${RM_dir}'."/".$sample_name.".fa.cdhit_out_file".'${LSB_JOBINDEX}';
    my $run_repeat_masker = " $repeat_masker -pa 4 \$RMIN \n";

    my @job_script = (
        "#!/bin/bash\n",
        "#BSUB -n 1\n",
        "#BSUB -R \"span[hosts=1] rusage[mem=10000]\""."\n",
        "#BSUB -M 10000000\n",
        "#BSUB -o $lsf_file_dir"."/"."$current_job_file.out\n",
        "#BSUB -e $lsf_file_dir"."/"."$current_job_file.err\n",
        "#BSUB -J ".$current_job_file."[1-".$file_number_of_RepeatMasker."]\n",
        "#BSUB -w \"$hold_job_file\""."\n",
        "RM_IN=".$sample_full_path."/".$sample_name.".fa\n",
        "RM_dir=".$sample_full_path."/".$sample_name.".$REPEAT_MASKER_DIR_SUFFIX\n",
        "RMOUT=".$piece.".fa.masked"."\n",
        "RMIN=".$piece.".fa\n",
        "RMOTHER=".$piece.".fa.out"."\n\n",
        'if [ -f $RMIN ]'."\n",             # input file exists
        "then\n",
        ' if [ ! -s $RMOUT ]'."\n",         # no non-empty .masked file yet: run RepeatMasker once
        " then\n",
        $run_repeat_masker,
        " fi\n\n",
        ' if [ ! -f $RMOTHER ]'."\n",       # no .out file: RepeatMasker never ran or never finished
        " then\n",
        ' while [ ! -f $RMOTHER ]'."\n",
        " do\n",                            # run RepeatMasker until it finishes
        $run_repeat_masker,
        " done\n",
        " fi\n\n",
        # sometimes RepeatMasker finds no repeat in the input file; no .masked
        # file is generated then, so the input is copied in its place
        ' if [ ! -f $RMOUT ]'."\n",
        " then\n",
        ' cp ${RMIN} ${RMOUT}'."\n",
        " fi\n",
        "fi\n",
    );

    open(my $job_fh, '>', "$job_files_dir/$current_job_file") or die $!;
    print {$job_fh} @job_script;
    close $job_fh;

    $bsub_com = "bsub < $job_files_dir/$current_job_file\n";
    #$bsub_com = "qsub -V -l h_vmem=4G -hold_jid $hold_job_file,$hold_RM_job -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com)
}
564 |
565 | #####################################################################################
566 |
# Write and submit the sequence quality control job for the current sample.
#
# Argument:
#   $step_by_step - true when the step is run on its own (no job to wait on);
#                   otherwise the job waits on the job written last.
#
# The generated script runs SequenceQualityControl.pl when the .goodSeq
# output is missing and the sample's .fa file is not empty, then repeats it
# for as long as check_SequenceQualityControl.pl exits with 10. Otherwise it
# only checks the existing result and reruns the QC at most once.
#
# Sets the file-level $hold_job_file and $current_job_file.
sub seq_QC {
    my ($step_by_step) = @_;
    if ($step_by_step) {
        $hold_job_file = "";
    }else{
        $hold_job_file = $current_job_file;
    }
    $current_job_file = "j4_".$sample_name."_QC_".$$.".sh";

    my $run_qc   = " ".$run_script_path."SequenceQualityControl.pl ".$sample_full_path."\n";
    my $check_qc = " ".$run_script_path."check_SequenceQualityControl.pl \${SAMPLE_DIR}\n"
                 . ' CHECK=$?'."\n";

    open(QC, ">$job_files_dir/$current_job_file") or die $!;
    print QC "#!/bin/bash\n";
    print QC "#BSUB -n 1\n";
    print QC "#BSUB -R \"rusage[mem=10000]\"","\n";
    print QC "#BSUB -M 10000000\n";
    print QC "#BSUB -o $lsf_file_dir","/","$current_job_file.out\n";
    print QC "#BSUB -e $lsf_file_dir","/","$current_job_file.err\n";
    print QC "#BSUB -J $current_job_file\n";
    print QC "#BSUB -w \"$hold_job_file\"","\n";
    #####################
    print QC "SAMPLE_DIR=".$sample_full_path."\n";
    print QC "QC_OUT=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq\n\n";
    print QC "f_fa=".$sample_full_path."/".$sample_name.".fa\n";
    # "]" must be a separate word. Without the space before it the test
    # command fails with a syntax error and this branch can never be taken.
    print QC 'if [ ! -f $QC_OUT ] && [ -s $f_fa ]',"\n";
    print QC "then\n";
    print QC $run_qc;
    print QC $check_qc;
    print QC ' while [ ${CHECK} -eq 10 ]',"\n";#10 is the exit code of check_SequenceQualityControl.pl if it is not correctly completed.
    print QC " do\n";#run QC and check again
    print QC $run_qc;
    print QC $check_qc;
    print QC " done\n";
    print QC "else\n";
    print QC $check_qc;
    # if the existing result is not complete, run the QC again
    print QC ' while [ ${CHECK} -eq 10 ]',"\n";
    print QC " do\n";
    print QC $run_qc;
    print QC $check_qc;
    # CHECK is overwritten, so this loop body runs at most once
    print QC ' CHECK=1',"\n";
    print QC " done\n";
    print QC "fi\n";
    close QC;
    $bsub_com = "bsub < $job_files_dir/$current_job_file\n";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com);
}
616 |
617 | #####################################################################################
618 |
# Write and submit the job that splits the sample's .goodSeq file into
# $file_number_of_Blast_Ref_Genome pieces for the reference genome BLAST.
#
# Argument:
#   $step_by_step - true when the step is run on its own (no job to wait on);
#                   otherwise the job waits on the job written last.
#
# Sets the file-level $hold_job_file and $current_job_file.
sub split_for_blast_RefG{
    my ($step_by_step) = @_;

    $hold_job_file    = $step_by_step ? "" : $current_job_file;
    $current_job_file = "j5_".$sample_name."_RefG_split_".$$.".sh";

    # building blocks of the generated script
    my $check_split = " ".$run_script_path."check_split_RefG.pl \${SAMPLE_DIR}\n"
                    . ' CHECK=$?'."\n";
    my $split_and_check =
          " ".$run_script_path."split_fasta.pl -i ".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq -o \${RefG_DIR} -n $file_number_of_Blast_Ref_Genome -p ".$sample_name.".fa.cdhit_out.masked.goodSeq_file\n"
        . $check_split;
    # 10 is the error exit code of the check: split and check again until it passes
    my $retry_loop = ' while [ ${CHECK} -eq 10 ]'."\n"
                   . " do\n"
                   . $split_and_check
                   . " done\n";

    my @job_script = (
        "#!/bin/bash\n",
        "#BSUB -n 1\n",
        "#BSUB -R \"rusage[mem=10000]\""."\n",
        "#BSUB -M 10000000\n",
        "#BSUB -o $lsf_file_dir"."/"."$current_job_file.out\n",
        "#BSUB -e $lsf_file_dir"."/"."$current_job_file.err\n",
        "#BSUB -J $current_job_file\n",
        "#BSUB -w \"$hold_job_file\""."\n",
        "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n",
        "SAMPLE_DIR=".$sample_full_path."\n\n",
        'if [ ! -d $RefG_DIR ]'."\n",    # nothing has been split yet
        "then\n",
        " mkdir \${RefG_DIR}\n",
        $split_and_check,
        $retry_loop,
        "else\n",                         # directory exists: verify the earlier split
        $check_split,
        $retry_loop,
        "fi\n",
    );

    open(my $job_fh, '>', "$job_files_dir/$current_job_file") or die $!;
    print {$job_fh} @job_script;
    close $job_fh;

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system ($bsub_com);
}
669 |
670 | #####################################################################################
671 |
# Write the LSF job-array script that BLASTs each split goodSeq file against
# the reference genome database, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
#
# NOTE(review): the script requests "#BSUB -n 1" while blastn is started with
# -num_threads 4 -- confirm this is intended.
sub submit_job_array_blast_RefG{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j6_".$sample_name."_BRefG_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Shell fragments that appear more than once in the generated script.
    my $run_blast = " $blastn -evalue 1e-9 -show_gis -num_threads 4 -num_descriptions 2 -num_alignments 2 -query \${QUERY} -out \${BlastRefGOUT} -db $reference_genome\n";
    # The output counts as complete when "Matrix" shows up in its last 10 lines.
    my $check_output = ' tail -10 ${BlastRefGOUT}|grep Matrix'."\n"
                     . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 1 ]'."\n"
                             . " do\n"
                             . $run_blast
                             . $check_output
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"span[hosts=1] rusage[mem=20000]\"\n";
    print {$job_fh} "#BSUB -M 20000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    # The array size must be a literal number in the job file.
    print {$job_fh} "#BSUB -J $current_job_file\[1-$file_number_of_Blast_Ref_Genome\]\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    ####################
    print {$job_fh} "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n";
    print {$job_fh} "BlastRefGOUT=".'${RefG_DIR}'."/".$sample_name.".fa.cdhit_out.masked.goodSeq_file".'${LSB_JOBINDEX}'.".RefGblast.out\n";
    print {$job_fh} "QUERY=".'${RefG_DIR}'."/".$sample_name.".fa.cdhit_out.masked.goodSeq_file".'${LSB_JOBINDEX}'.".fa\n\n";

    # Only non-empty query files are processed.
    print {$job_fh} 'if [ -s $QUERY ]'."\n";
    print {$job_fh} "then\n";
    # No blast output yet: run blast, then verify / rerun until complete.
    print {$job_fh} ' if [ ! -f $BlastRefGOUT ]'."\n";
    print {$job_fh} " then\n";
    print {$job_fh} $run_blast.$check_output.$rerun_until_complete;
    # Blast output already exists: verify it and rerun only when incomplete.
    print {$job_fh} " else\n";
    print {$job_fh} $check_output.$rerun_until_complete;
    print {$job_fh} " fi\n";
    print {$job_fh} "fi";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -l h_vmem=10G -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
727 |
728 | #####################################################################################
729 |
# Write the LSF job-array script that parses each reference-genome BLAST
# output with BLASTn_RefGenome_parser.pl, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
#
# A disabled pre-check used to compare the number of *.out files whose tail
# contains "Matrix" with the number of *.out files and resubmit the BLAST
# array (submit_job_array_blast_RefG(1)) on a mismatch.
sub parse_blast_RefG{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j7_".$sample_name."_PRefG_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Per-array-element file name stem; ${LSB_JOBINDEX} is expanded by the shell.
    my $file_stem = $sample_name.".fa.cdhit_out.masked.goodSeq_file".'${LSB_JOBINDEX}';

    # Shell fragments that appear more than once in the generated script.
    my $run_parser = " ".$run_script_path."BLASTn_RefGenome_parser.pl \${RefG_DIR} \${BlastRefGOUT} $refrence_genome_taxonomy";
    # The parsed file counts as complete when "Summary" is in its last 5 lines.
    my $check_parsed = ' tail -5 ${PARSED}|grep Summary'."\n"
                     . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 1 ]'."\n"
                             . " do\n"
                             . $run_parser." \n"
                             . $check_parsed
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=10000]\"\n";
    print {$job_fh} "#BSUB -M 10000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    # The array size must be a literal number in the job file.
    print {$job_fh} "#BSUB -J $current_job_file\[1-$file_number_of_Blast_Ref_Genome\]\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    #################################
    print {$job_fh} "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n";
    print {$job_fh} "BlastRefGOUT=".$file_stem.".RefGblast.out\n";                   # name only, not full path
    print {$job_fh} "BlastRefGIN=".'${RefG_DIR}'."/".$file_stem.".fa\n";              # full path
    print {$job_fh} "PARSED=".'${RefG_DIR}'."/".$file_stem.".RefGblast.parsed\n\n";

    # Only non-empty input files are processed.
    print {$job_fh} 'if [ -s $BlastRefGIN ]'."\n";
    print {$job_fh} "then\n";
    # No parsed file yet: run the parser, then verify / rerun until complete.
    print {$job_fh} ' if [ ! -f $PARSED ]'."\n";
    print {$job_fh} " then\n";
    print {$job_fh} $run_parser."\n".$check_parsed.$rerun_until_complete;
    # Parsed file already exists: verify it and rerun only when incomplete.
    print {$job_fh} " else\n";
    print {$job_fh} $check_parsed.$rerun_until_complete;
    print {$job_fh} " fi\n";
    print {$job_fh} "fi";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
798 |
799 | #####################################################################################
800 |
# Write the LSF job script that pools the per-file *.RefGfiltered.fa results
# into one FASTA file and splits it for the BLASTN (NT) step, then submit it
# with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
sub pool_split_for_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j8_".$sample_name."_BN_split_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # check_split_BN.pl exits with 10 when the split is not correctly completed.
    my $check_split = $run_script_path."check_split_BN.pl \${SAMPLE_DIR}\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=10000]\"\n";
    print {$job_fh} "#BSUB -M 10000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    print {$job_fh} "#BSUB -J $current_job_file\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    ############################
    print {$job_fh} "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX\n";
    print {$job_fh} "SAMPLE_DIR=".$sample_full_path."\n";
    print {$job_fh} "RefGFiltered_fa=".$sample_full_path."/".$sample_name.".RefGfiltered.fa\n";
    print {$job_fh} "RefG_DIR=".$sample_full_path."/".$sample_name.".$BLAST_RefG_DIR_SUFFIX\n\n";

    # Create the BLASTN working directory on first use.
    print {$job_fh} 'if [ ! -d $BN_DIR ] '."\n";
    print {$job_fh} "then\n";
    print {$job_fh} " mkdir \${BN_DIR}\n";
    print {$job_fh} "fi\n";

    # Start from a fresh pooled file because the cat below appends.
    print {$job_fh} 'if [ -f $RefGFiltered_fa ] '."\n";
    print {$job_fh} "then\n";
    print {$job_fh} " rm \${RefGFiltered_fa}\n";
    print {$job_fh} "fi\n";
    print {$job_fh} "cat \${RefG_DIR}/*.RefGfiltered.fa >> \${RefGFiltered_fa}\n";

    # Check first; (re)run the split for as long as the check reports 10.
    print {$job_fh} $check_split;
    print {$job_fh} 'CHECK=$?'."\n";
    print {$job_fh} 'while [ ${CHECK} -eq 10 ]'."\n";
    print {$job_fh} "do\n";
    # split to -n number of files, this number should be consistent with
    # the number of blastn job array submitted below
    print {$job_fh} " ".$run_script_path."split_fasta.pl -i \${RefGFiltered_fa} -o \${BN_DIR} -n $file_number_of_Blast_N -p ".$sample_name.".RefGfiltered.fa_file\n";
    print {$job_fh} " ".$check_split;
    print {$job_fh} ' CHECK=$?'."\n";
    print {$job_fh} "done\n";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
848 |
849 | #####################################################################################
850 |
# Write the LSF job-array script that BLASTs each split RefGfiltered file
# against the $db_BN database, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
#
# A disabled pre-check used to submit only when the number of *.out files
# whose tail contains "Matrix" differed from the number of *.out files, or
# when fewer than 200 *.out files existed.
#
# NOTE(review): the script requests "#BSUB -n 1" while blastn is started with
# -num_threads 4 -- confirm this is intended.
sub submit_job_array_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j9_".$sample_name."_BN_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Per-array-element file name stem; ${LSB_JOBINDEX} is expanded by the shell.
    my $file_stem = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';

    # Shell fragments that appear more than once in the generated script.
    my $run_blast = " $blastn -evalue 1e-9 -show_gis -num_threads 4 -query \${QUERY} -out \${BlastNOUT} -db $db_BN\n";
    # CHECK1: "Matrix" must be in the last 5 lines of the output.
    # CHECK2: the output must not contain the "no longer exists in database"
    #         blast error message (see the end of this script).
    my $check_output = ' tail -5 ${BlastNOUT}|grep Matrix'."\n"
                     . ' CHECK1=$?'."\n"
                     . ' grep "no longer exists in database" ${BlastNOUT}'."\n"
                     . ' CHECK2=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK1} -eq 1 ] || [ ${CHECK2} -eq 0 ]'."\n"
                             . " do\n"
                             . $run_blast
                             . $check_output
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"span[hosts=1] rusage[mem=40000]\"\n";
    print {$job_fh} "#BSUB -M 40000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    # The array size must be a literal number in the job file.
    print {$job_fh} "#BSUB -J $current_job_file\[1-$file_number_of_Blast_N\]\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    #################################
    print {$job_fh} "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX\n";
    print {$job_fh} "BlastNOUT=".'${BN_DIR}'."/".$file_stem.".blastn.out\n";   # full path
    print {$job_fh} "QUERY=".'${BN_DIR}'."/".$file_stem.".fa\n\n";

    # Only non-empty query files are processed.
    print {$job_fh} 'if [ -s $QUERY ]'."\n";
    print {$job_fh} "then\n";
    # No blast output yet: run blast, then verify / rerun until complete.
    print {$job_fh} ' if [ ! -f $BlastNOUT ]'."\n";
    print {$job_fh} " then\n";
    print {$job_fh} $run_blast.$check_output.$rerun_until_complete;
    # Blast output already exists: verify it and rerun only when incomplete.
    print {$job_fh} " else\n";
    print {$job_fh} $check_output.$rerun_until_complete;
    print {$job_fh} " fi\n";
    print {$job_fh} "fi";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -l h_vmem=10G -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
924 |
925 | #####################################################################################
926 |
# Write the LSF job-array script that parses each BLASTN (NT) output with
# BLASTn_NT_parser.pl, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
#
# A disabled pre-check used to compare the number of *.out files whose tail
# contains "Matrix" with the number of *.out files and resubmit the BLAST
# array (submit_job_array_blast_N(1)) on a mismatch.
sub parse_blast_N{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j10_".$sample_name."_PBN_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Per-array-element file name stem; ${LSB_JOBINDEX} is expanded by the shell.
    my $file_stem = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';

    # Shell fragments that appear more than once in the generated script.
    my $run_parser = " ".$run_script_path."BLASTn_NT_parser.pl ".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX \${BlastNOUT}\n";
    # check_Blast_parsed_file.pl exits with 10 when the parsed file is not
    # correctly completed.
    my $check_parsed = " ".$run_script_path."check_Blast_parsed_file.pl \${PARSED}\n"
                     . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 10 ]'."\n"
                             . " do\n"
                             . $run_parser
                             . $check_parsed
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=10000]\"\n";
    print {$job_fh} "#BSUB -M 10000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    # The array size must be a literal number in the job file.
    print {$job_fh} "#BSUB -J $current_job_file\[1-$file_number_of_Blast_N\]\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    #################################
    print {$job_fh} "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX\n";
    print {$job_fh} "BlastNOUT=".$file_stem.".blastn.out\n";                      # name only, not full path
    print {$job_fh} "BlastNIN=".'${BN_DIR}'."/".$file_stem.".fa\n";                # full path
    print {$job_fh} "PARSED=".'${BN_DIR}'."/".$file_stem.".blastn.parsed\n\n";

    # Only non-empty input files are processed.
    print {$job_fh} 'if [ -s $BlastNIN ]'."\n";
    print {$job_fh} "then\n";
    # No parsed file yet: run the parser, then verify / rerun until complete.
    print {$job_fh} ' if [ ! -f $PARSED ]'."\n";
    print {$job_fh} " then\n";
    print {$job_fh} $run_parser.$check_parsed.$rerun_until_complete;
    # Parsed file already exists: verify it and rerun only when incomplete.
    print {$job_fh} " else\n";
    print {$job_fh} $check_parsed.$rerun_until_complete;
    print {$job_fh} " fi\n";
    print {$job_fh} "fi";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
993 |
994 | #####################################################################################
995 |
# Write the LSF job-array script that runs blast_summary.pl on each parsed
# BLASTN file, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
sub blast_S{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j11_".$sample_name."_blastS_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Per-array-element file name stem; ${LSB_JOBINDEX} is expanded by the shell.
    my $file_stem = $sample_name.".RefGfiltered.fa_file".'${LSB_JOBINDEX}';

    # Shell fragments that appear more than once in the generated script.
    my $run_summary = " ".$run_script_path."blast_summary.pl ".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX \${BlastNparsed}\n";
    # The summary counts as complete when it contains "Finished summary";
    # CHECK is grep's exit status (1 = pattern not found).
    # NOTE(review): grep exits with 2 when ${OUTPUT} does not exist, so a run
    # that produced no summary file at all is not retried -- confirm intended.
    my $check_summary = ' grep "Finished summary" ${OUTPUT}'."\n"
                      . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 1 ]'."\n"
                             . " do\n"
                             . $run_summary
                             . $check_summary
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=10000]\"\n";
    print {$job_fh} "#BSUB -M 10000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    # The array size must be a literal number in the job file.
    print {$job_fh} "#BSUB -J $current_job_file\[1-$file_number_of_Blast_N\]\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    #################################
    print {$job_fh} "BN_DIR=".$sample_full_path."/".$sample_name.".$BLAST_NT_DIR_SUFFIX\n";
    print {$job_fh} "BlastNparsed=".$file_stem.".blastn.parsed\n";                 # name only, not full path
    print {$job_fh} "BlastNIN=".'${BN_DIR}'."/".$file_stem.".fa\n";                # full path
    print {$job_fh} "OUTPUT=".'${BN_DIR}'."/".$file_stem.".blastn.summary\n\n";

    # Only non-empty input files are processed.
    print {$job_fh} 'if [ -s $BlastNIN ]'."\n";
    print {$job_fh} "then\n";
    # No summary yet: create it, then verify / rerun until complete.
    print {$job_fh} ' if [ ! -f $OUTPUT ]'."\n";
    print {$job_fh} " then\n";
    print {$job_fh} $run_summary.$check_summary.$rerun_until_complete;
    # Summary already exists: verify it and rerun only when incomplete.
    print {$job_fh} " else\n";
    print {$job_fh} $check_summary.$rerun_until_complete;
    print {$job_fh} " fi\n";
    print {$job_fh} "fi";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
1055 |
1056 |
1057 | #####################################################################################
1058 |
# Write the LSF job script that produces the sample's assignment report with
# assignment_report_virus_gi.pl, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
sub report_for_each_sample{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j12_".$sample_name."_Rep_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Shell fragments that appear more than once in the generated script.
    my $run_report = " ".$run_script_path."assignment_report_virus_gi.pl ".$sample_full_path." \${INPUT} $refrence_genome_taxonomy \n";
    # The report counts as finished when it contains the marker line below;
    # CHECK is grep's exit status (1 = marker not found).
    # NOTE(review): grep exits with 2 when ${REPORT} does not exist, so a run
    # that produced no report file at all is not retried -- confirm intended.
    my $check_report = ' grep "# Finished Assignment Report" ${REPORT}'."\n"
                     . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 1 ] '."\n"
                             . " do\n"
                             . $run_report
                             . $check_report
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=40000]\"\n";
    print {$job_fh} "#BSUB -M 40000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    print {$job_fh} "#BSUB -J $current_job_file\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    ############################
    print {$job_fh} "INPUT=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.goodSeq\n"; # RepeatMasker QC output
    print {$job_fh} "REPORT=".$sample_full_path."/".$sample_name.".gi.AssignmentReport\n";

    # Report file exists: verify it and rerun only when unfinished.
    print {$job_fh} 'if [ -f $REPORT ] '."\n";
    print {$job_fh} "then\n";
    print {$job_fh} $check_report.$rerun_until_complete;
    # Report file does not exist: create it, then verify / rerun.
    print {$job_fh} "else\n";
    print {$job_fh} $run_report.$check_report.$rerun_until_complete;
    print {$job_fh} "fi\n";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
1106 |
1107 | #####################################################################################
1108 |
# Write the LSF job script that produces the sample's assignment summary with
# assignment_summary_gi.pl, then submit it with bsub.
#
# $step_by_step - true:  the "#BSUB -w" dependency is written empty;
#                 false: the dependency names the previously generated job file.
#
# Updates $hold_job_file, $current_job_file and $bsub_com (declared outside
# this sub). Dies if the job script cannot be opened or completely written;
# a failed bsub call only produces a warning.
sub summary_for_each_sample{
    my ($step_by_step) = @_;
    $hold_job_file = $step_by_step ? "" : $current_job_file;

    $current_job_file = "j13_".$sample_name."_Sum_".$$.".sh";
    my $job_script = "$job_files_dir/$current_job_file";

    # Shell fragments that appear more than once in the generated script.
    my $run_summary = " ".$run_script_path."assignment_summary_gi.pl ".$sample_full_path." \${BAD_SEQ}\n";
    # The summary counts as finished when it contains the marker line below;
    # CHECK is grep's exit status (1 = marker not found).
    # NOTE(review): grep exits with 2 when ${OUTPUT} does not exist, so a run
    # that produced no summary file at all is not retried -- confirm intended.
    my $check_summary = ' grep "# Finished Assignment Summary" ${OUTPUT}'."\n"
                      . ' CHECK=$?'."\n";
    my $rerun_until_complete = ' while [ ${CHECK} -eq 1 ] '."\n"
                             . " do\n"
                             . $run_summary
                             . $check_summary
                             . " done\n";

    open(my $job_fh, '>', $job_script) or die $!;
    print {$job_fh} "#!/bin/bash\n";
    print {$job_fh} "#BSUB -n 1\n";
    print {$job_fh} "#BSUB -R \"rusage[mem=40000]\"\n";
    print {$job_fh} "#BSUB -M 40000000\n";
    print {$job_fh} "#BSUB -o $lsf_file_dir/$current_job_file.out\n";
    print {$job_fh} "#BSUB -e $lsf_file_dir/$current_job_file.err\n";
    print {$job_fh} "#BSUB -J $current_job_file\n";
    print {$job_fh} "#BSUB -w \"$hold_job_file\"\n";

    ############################
    print {$job_fh} "OUTPUT=".$sample_full_path."/".$sample_name.".gi.AssignmentSummary\n";
    print {$job_fh} "BAD_SEQ=".$sample_full_path."/".$sample_name.".fa.cdhit_out.masked.badSeq\n\n"; # output of RepeatMasker

    # Summary file exists: verify it and rerun only when unfinished.
    print {$job_fh} 'if [ -f $OUTPUT ] '."\n";
    print {$job_fh} "then\n";
    print {$job_fh} $check_summary.$rerun_until_complete;
    # Summary file does not exist: create it, then verify / rerun.
    print {$job_fh} "else\n";
    print {$job_fh} $run_summary.$check_summary.$rerun_until_complete;
    print {$job_fh} "fi\n";
    # Buffered write errors surface at close; never submit a truncated script.
    close($job_fh) or die "can not finish writing $job_script: $!\n";

    $bsub_com = "bsub < $job_files_dir/$current_job_file";
    #$bsub_com = "qsub -V -P long -N $working_name -hold_jid $hold_job_file -e $lsf_file_dir -o $lsf_file_dir $job_files_dir/$current_job_file\n";
    system($bsub_com) == 0
        or warn "job submission failed (status $?): $bsub_com\n";
}
1158 |
1159 | =add
1160 | possible blast error
1161 | Sequence with id 224967180 no longer exists in database...alignment skipped
1162 | Sequence with id 224967180 no longer exists in database...alignment skipped
1163 | =cut
1164 |
--------------------------------------------------------------------------------
/assignment_report_virus_gi.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl
3 | use strict;
4 | use Switch;
5 | use Bio::SearchIO;
6 |
7 | my $usage = '
8 | This script will read corresponding files in the given director and
9 | generate a report. It will report in each library, for each category,
10 | how many total sequence were assigned to this category, how many were
11 | assigned by BLASTN, how many were assigned by TBLASTX, the range of
12 | percent identity. It will also generate four fasta format files which
13 | contain viral reads from blastn, tblastx, all viral reads and reads
14 | that can not be assigned to any category.
15 |
16 | perl script ][
17 | = full path to the directory holding files for the given
18 | library
19 | e.g. .../S21_Rota_other
20 | ][ = 1. Human
21 | 2. Mouse
22 | 3. Worm (C. elegans, C. briggsae)
23 | 4. Mouse lemur (Microcebus_murinus)
24 | 5. sand fly (Lutzomyia longipalpis)
25 |
26 |
27 | ';
28 | die $usage unless scalar @ARGV == 3;
29 | my ( $dir, $input_good_seq_fasta_file, $ref_genome_choice ) = @ARGV;
30 |
31 | # get all the viral read sequences
32 | my %viral_reads_blastn = ();
33 | my %viral_reads_blastx = ();
34 |
35 | my %best_e_blastn = (); # viral_read_ID => best_e value for this read in blastn
36 | my %best_e_blastx = (); # viral_read_ID => best_e value for this read in blastx
37 |
38 | my @blast_files_blastn = (); # all blastn.out files
39 | my @blast_files_blastx = (); # all blastx.out files
40 |
41 | my @unassigned_reads = ();
42 | ####################################
43 | my @ambiguous_reads = (); #cai added 12/2010
44 | ####################################
45 |
46 | # read in original sequences
47 | my @temp = split("\/", $dir);
48 | my $lib_name = pop @temp;
49 | # print "lib is $lib_name\n";
50 | #my $fasta_file = $dir."/".$lib_name.".fa.cdhit_out.masked.goodSeq";
51 | my $fasta_file = $input_good_seq_fasta_file; #cai changed, added segmasker
52 |
53 | my %seq = &read_FASTA_data($fasta_file);
54 |
55 | my $out1 = $dir."/".$lib_name.".gi.AssignmentReport";
56 | open (OUT1, ">$out1") or die "can not open file $out1!\n";
57 | my $OUT2 = $dir."/".$lib_name.".gi.ViralReads_all.fa";
58 | open (OUT2, ">$OUT2") or die "can not open file $OUT2!\n";
59 | my $OUT3 = $dir."/".$lib_name.".gi.unassigned.fa";
60 | open (OUT3, ">$OUT3") or die "can not open file $OUT3!\n";
61 | ##################################cai added 12/2010
62 | my $out4 = $dir."/".$lib_name.".gi.AmbiguousReads_all.fa";
63 | open (OUT4, ">$out4") or die "can not open file $out4!\n";
64 | ##################################
65 |
66 | # category => num of sequence assigned to this category by blastn
67 | my %blastn = (
68 | "Bacteria" => 0,
69 | "Fungi" => 0,
70 | "Homo" => 0,
71 | "Mus" => 0,
72 | "Phage" => 0,
73 | "Viruses" => 0,
74 | "other" => 0,
75 | "unassigned" => 0,
76 | ##################################cai added 12/2010
77 | "Ambiguous" => 0,
78 | ##################################cai added
79 | );
80 |
81 | # category => num of sequence assigned to this category by blastn of Reference genome
82 | my %blastn_RefG = ();
83 | foreach my $key (keys %blastn) {
84 | $blastn_RefG{$key} = 0;
85 | }
86 |
87 | # category => num of sequence assigned to this category by tblastx of viral genome
88 | my %blastx = ();
89 | foreach my $key (keys %blastn) {
90 | $blastx{$key} = 0;
91 | }
92 |
93 | # viral_lineage => number of reads assigned to this lineage in the library
94 | my %num_reads = ();
95 | my %blast_readinfo =(); # readID => information about this read
96 | my %lineage_blastn = (); # lineage => [read ID]
97 | my %lineage_gi = ();
98 | my %lineage_blastx = (); # lineage => [read ID]
99 |
100 | opendir(DH, $dir) or die "Can not open dir $dir!\n";
101 | foreach my $name (readdir DH) {
102 | # name is either file name or directory for splited files
103 | my $full_path = $dir."/".$name;
104 |
105 | # full_path= dir/goodSeq_RefGblast
106 | if ($name =~ /goodSeq_RefGblast$/) { # Reference genome blast result
107 | # enter subdirectory where blastn results resides
108 | opendir (RefGDIR, $full_path) or die "can not open dir $full_path!\n";
109 | foreach my $blast_file (readdir RefGDIR) {
110 | if ($blast_file =~ /RefGblast\.parsed$/) {
111 | my $parsed = $full_path."/".$blast_file;
112 | open (IN, $parsed) or die "can not open file $parsed!\n";
113 | while () {
114 | if ($_ =~ /#/) { # skip comment line
115 | next;
116 | }
117 | chomp;
118 | my ($read_ID, $length, $category, $lineage, $hit_name, $e_value) = split("\t", $_);
119 | # print "readID = $read_ID, length = $length, category = $category, lineage = $lineage, hit name = $hit_name, e = $e_value\n";
120 | $blastn_RefG{$ref_genome_choice}++;
121 | }
122 | close IN;
123 | }
124 | }
125 | closedir RefGDIR;
126 | } # finish .RefGblast.parsed
127 |
128 | # full_path= dir/RefGfiltered_BLASTN
129 |
130 | if ($name =~ /RefGfiltered_BLASTN$/) {
131 | # enter subdirectory where blastx results resides
132 | opendir (BNDIR, $full_path) or die "can not open dir $full_path!\n";
133 | foreach my $blast_file (readdir BNDIR) {
134 | if ($blast_file =~ /blastn\.parsed$/) {
135 | # print "blastn parsed file $blast_file\n";
136 | my $blast_out = $blast_file;
137 | $blast_out =~ s/\.blastn\.parsed/\.blastn\.out/;
138 | $blast_out = $full_path."/".$blast_out;
139 | my $blast_s = $blast_file;
140 | $blast_s =~ s/\.blastn\.parsed/\.blastn\.summary/;
141 | $blast_s = $full_path."/".$blast_s;
142 | push @blast_files_blastn, $blast_s;
143 | my $parsed = $full_path."/".$blast_file;
144 | #print $parsed,"\n";
145 | ##################################cai changed 12/2010
146 | &collect_information($parsed, \%blastn, \%viral_reads_blastn, \%best_e_blastn, \%lineage_blastn, \%lineage_gi, \%num_reads, \@unassigned_reads, \@ambiguous_reads);
147 | ##################################
148 | }
149 | }
150 | closedir BNDIR;
151 | } # finish .blastn.parsed
152 |
153 |
154 | # full_path= dir/RefGfiltered_BLASTN
155 | #if ($name =~ /BLASTX_NR$/i) {
156 | # enter subdirectory where blastx results resides
157 | # opendir (BXDIR, $full_path) or die "can not open dir $full_path!\n";
158 | # foreach my $blast_file (readdir BXDIR) {
159 | # if ($blast_file =~ /blastx\.parsed$/) {
160 | # print "blastn parsed file $blast_file\n";
161 | # my $blast_out = $blast_file;
162 | # $blast_out =~ s/\.blastx\.parsed/\.blastx\.out/;
163 | # $blast_out = $full_path."/".$blast_out;
164 | # push @blast_files_blastx, $blast_out;
165 | # my $parsed = $full_path."/".$blast_file;
166 | ##################################cai changed 12/2010
167 | # &collect_information($parsed, \%blastx, \%viral_reads_blastx, \%best_e_blastx, \%lineage_blastx, \%num_reads, \@unassigned_reads, \@ambiguous_reads);
168 | ##################################
169 | # }
170 | # }
171 | # closedir BXDIR;
172 | #} # finish .blastx.parsed
173 |
174 | }
175 |
176 | close DH;    # NOTE(review): DH is opened outside this section; if it is a directory handle this should be closedir -- confirm
177 |
178 | # get detailed information about each viral read
179 | &get_viral_read_info(\@blast_files_blastn, \%blast_readinfo);
180 | #&get_viral_read_info( \@blast_files_blastx, "blastx", \%viral_reads_blastx, \%best_e_blastx, \%blast_readinfo);
181 |
182 | # print out report for this library: the directory, then one row of counts per category
183 | print OUT1 $dir, "\n";
184 | # the header declares exactly one field per column name (it used to declare eight fields for five names)
185 | printf OUT1 "%12s\t%7s\t%7s\t%7s\t%7s\n", "category", "total", "BN_RefG", "BN", "BX_NR";
186 | foreach my $key (sort {$a cmp $b } keys %blastx) {
187 |     printf OUT1 "%12s\t%7d\t%7d\t%7d\t%7d\n", $key, $blastn_RefG{$key}+$blastn{$key}+$blastx{$key},$blastn_RefG{$key}, $blastn{$key}, $blastx{$key};
188 | }
189 |
190 | print OUT1 "\n###########################################################\n\n";
191 |
192 | # one section per GI, ordered by ascending number of reads
193 | foreach my $gi (sort {$num_reads{$a} <=> $num_reads{$b}} keys %num_reads) {
194 |     print OUT1 $gi, "\t", $lineage_gi{$gi}, "\ttotal number of reads: ", $num_reads{$gi}, "\n\n";
195 |     print OUT1 "QueryName\tQuerylength\t HitName \tHitLen\t HitDesc \tAlnLen\t%ID\tHitStart\tHitEnd\te\n";
196 |
197 |     if (defined $lineage_blastn{$gi}) {
198 |         if (scalar @{$lineage_blastn{$gi}}) {
199 |             print OUT1 "reads from blastn:\n";
200 |             foreach my $read (sort {$a cmp $b} @{$lineage_blastn{$gi}}) {
201 |                 print OUT1 $blast_readinfo{$read};
202 |             }
203 |         }
204 |     }
205 |
206 |     #if (defined $lineage_blastx{$lineage}) {
207 |     # if (scalar @{$lineage_blastx{$lineage}}) {
208 |     # print OUT1 "reads from blastx:\n";
209 |     # foreach my $read (sort {$a cmp $b} @{$lineage_blastx{$lineage}}) {
210 |     # print OUT1 $blast_readinfo{$read};
211 |     # }
212 |     # }
213 |     #}
214 |     print OUT1 "\n##################################################\n\n";
215 | }
216 |
217 | # get all the viral reads and put into output file:
218 | foreach my $gi (keys %num_reads) {
219 |     foreach my $read (@{$lineage_blastn{$gi}}) {
220 |         print OUT2 ">$read\n";
221 |         print OUT2 $seq{$read}, "\n";
222 |     }
223 | }
224 | #foreach my $read (@{$lineage_blastx{$lineage}}) {
225 | # print OUT2 ">$read\n";
226 | # print OUT2 $seq{$read}, "\n";
227 | #}
228 | #}
229 |
230 | # get all unassigned reads
231 | #foreach my $read (@unassigned_reads) {
232 | # print OUT3 ">$read\n";
233 | # print OUT3 $seq{$read}, "\n";
234 | #}
235 |
236 | ######################cai added
237 | #foreach my $read (@ambiguous_reads) {
238 | # print OUT4 ">$read\n";
239 | # print OUT4 $seq{$read}, "\n";
240 | #}
241 | #####################
242 |
243 | # end marker of the report
244 | print OUT1 "# Finished Assignment Report\n";
245 |
246 | exit;
250 |
251 | #####################################################################################
252 | # collect information from the given parsed blast file
253 | sub collect_information {
254 |     # Tally one tab-separated *.parsed file into the caller's hashes and arrays.
255 |     # Columns read per line: read ID, length, category, lineage, hit name, e-value.
256 |     #   $infile                   - path of the parsed file; dies when it cannot be opened
257 |     #   $category_hash_ref        - category => read count, incremented for the nine known categories only
258 |     #   $viral_reads_hash_ref     - read ID => 1 for every "Viruses" read whose hit carries a GI
259 |     #   $best_e_hash_ref          - read ID => e-value column of that read
260 |     #   $lineage_hash_ref         - GI => [read IDs]
261 |     #   $lineage_hash_gi          - GI => lineage string (set for every hit that carries a GI)
262 |     #   $num_reads_hash_ref       - GI => number of viral reads
263 |     #   $unassigned_reads_arr_ref - collects read IDs of category "unassigned"
264 |     #   $ambiguous_reads_arr_ref  - collects read IDs of category "Ambiguous" (cai added 12/2010)
265 |     my ($infile, $category_hash_ref, $viral_reads_hash_ref, $best_e_hash_ref, $lineage_hash_ref, $lineage_hash_gi, $num_reads_hash_ref, $unassigned_reads_arr_ref, $ambiguous_reads_arr_ref) = @_;
266 |
267 |     # plain lookup instead of the Switch source filter; other categories stay uncounted
268 |     my %is_counted = map { $_ => 1 } qw(Bacteria Fungi Homo Mus Phage Viruses other unassigned Ambiguous);
269 |
270 |     # lexical handle: the bareword IN is also used by the main script
271 |     open (my $parsed_fh, '<', $infile) or die "can not open file $infile!\n";
272 |     while (my $record = <$parsed_fh>) {
273 |         next if $record =~ /#/;    # skip comment line
274 |         chomp $record;
275 |         my ($read_ID, $length, $category, $lineage, $hit_name, $e_value) = split("\t", $record);
276 |         next unless defined $category;    # blank or truncated line: nothing to count
277 |
278 |         # GI number taken from the hit name; stays 0 when the name has none
279 |         my $gid = 0;
280 |         if (defined $hit_name && $hit_name =~ /gi\|(\d+)\|/) {
281 |             $gid = $1;
282 |             $lineage_hash_gi->{$gid} = $lineage;
283 |         }
284 |
285 |         $category_hash_ref->{$category}++ if $is_counted{$category};
286 |
287 |         if ($category eq "Viruses" && $gid != 0) {
288 |             $viral_reads_hash_ref->{$read_ID} = 1;
289 |             $best_e_hash_ref->{$read_ID} = $e_value;
290 |             push @{$lineage_hash_ref->{$gid}}, $read_ID;    # the list is created on first use
291 |             $num_reads_hash_ref->{$gid}++;
292 |         }
293 |         elsif ($category eq "Ambiguous") {    # cai added 12/2010
294 |             push @{$ambiguous_reads_arr_ref}, $read_ID;
295 |         }
296 |         elsif ($category eq "unassigned") {
297 |             push @{$unassigned_reads_arr_ref}, $read_ID;
298 |         }
299 |     }
300 |     close $parsed_fh;
301 |     return;
302 | }
310 |
311 | ############################################################################
312 | sub read_FASTA_data {
313 |     # Load a FASTA file into a hash of read name => sequence.
314 |     #   $fastaFile - path of the FASTA file; dies when it cannot be opened
315 |     # Returns the hash as a flat list. The name is the first whitespace-delimited
316 |     # word of the header line; the sequence has all white space removed.
317 |     # (The former empty "()" prototype declared "no arguments" and is dropped.)
318 |     my $fastaFile = shift @_;
319 |
320 |     # read one ">"-terminated record at a time; "local" puts the previous
321 |     # separator back on every exit path, including die
322 |     local $/ = ">";
323 |
324 |     my %fastaSeq;
325 |     open (my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
326 |     while (my $record = <$fasta_fh>) {
327 |         next if $record =~ /^\s*$/;    # Discard blank records
328 |         next if $record =~ /^\s*#/;    # discard comment lines
329 |         next if $record eq ">";        # discard the first record which only has ">"
330 |
331 |         chomp $record;                 # strips the trailing ">" separator
332 |         my ($header, @seq_rows) = split (/\n/, $record);
333 |         my ($name) = split(/\s+/, $header);
334 |         my $Seq = join("", @seq_rows);
335 |         $Seq =~ s/\s//g;               # remove white space
336 |         $fastaSeq{$name} = $Seq;
337 |     }
338 |     close $fasta_fh;
339 |
340 |     return %fastaSeq;
341 | }
356 |
357 | #############################################################################
358 | # get detailed information about each viral read
359 | sub get_viral_read_info {
360 |     # Fill $blast_readinfo_hash_ref (read ID => whole summary line, newline kept) from every file listed in $report_file_ref; the read ID is the first tab-separated field.
361 |     my ($report_file_ref,$blast_readinfo_hash_ref) = @_;
362 |     foreach my $file (@{$report_file_ref}) {
363 |         # read the file directly instead of through `cat`, so the path never reaches a shell; an unreadable file is skipped with a warning
364 |         open (my $summary_fh, '<', $file) or do { warn "can not open file $file: $!\n"; next; };
365 |         while (my $line = <$summary_fh>) {
366 |             next if $line =~ /Finished summary/;    # end marker line, not a read
367 |             my ($read_id) = split("\t", $line);
368 |             $blast_readinfo_hash_ref->{$read_id} = $line;
369 |         }
370 |     }
371 | }
372 |
--------------------------------------------------------------------------------
/assignment_summary_gi.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl
3 | use strict;
4 |
5 | my $usage = '
6 | This script will read the assignment report files in the given
7 | directory and generate a summary report for a given library. It will report
8 | in each library, for each category, how many total sequence were
9 | assigned to this category, how many were assigned by BLASTN, how many
10 | were assigned by BLASTX.
11 |
12 | It will also filter the virus lineage, leave out virus that are phage.
13 | It will rank the virus lineage by range of percent ID from low to high.
14 |
15 | It will also generate a .InterestingReads report about the details of each lineage.
16 |
17 | perl script <dir> <bad_seq>
18 | <dir> = full path to the folder holding files for a given sample
19 | <bad_seq> = file holding the "good seq = N" / "bad seq = N" quality control counts
20 | ';
21 |
22 | die $usage unless scalar @ARGV == 2;
23 | my ( $dir, $bad_seq ) = @ARGV;
24 |
25 | # cutoff for sequences to be interesting, we choose to report everything.
26 | my $percentID_cutoff = 100;
27 |
28 | # the library name is the last component of the sample folder
29 | my @temp = split("\/", $dir);
30 | my $lib_name = pop @temp;
31 |
32 | my $out = $dir."/".$lib_name.".gi.AssignmentSummary";
33 | open (OUT, '>', $out) or die "can not open file $out!\n";
34 | my $out2 = $dir."/".$lib_name.".gi.InterestingReads";
35 | open (OUT2, '>', $out2) or die "can not open file $out2!\n";
36 |
37 | my $seq_file = $dir."/".$lib_name.".fa";
38 | my %sequences = &read_FASTA_data($seq_file); # read_ID => sequence
39 |
40 | my %ID_low = (); # lineage => lowest percent identity to hits
41 | my %ID_high = (); # lineage => highest percent identity to hits
42 | my %num_reads = ();
43 | my $C = "##############################################\n\n";
44 |
45 | print OUT "$dir\n\n";
46 | # get sequence statistics
47 | my @nums = &get_SequenceInfo_OneSample($dir);
48 | # each count as a percentage of the total; 0 when the sample holds no sequence (no division by zero)
49 | my @percents = map { $nums[0] ? $_ * 100 / $nums[0] : 0 } @nums;
50 | print OUT "#total\tuniq\ttotal\%\tFiltered\ttotal\%\tLowComplex\ttotal\%\tgood\ttotal\%\tBNRefG\ttotal\%\tBNNT\ttotal\%\tBXNR\ttotal\%\n";
51 | printf OUT ("%d\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\t%d\t%5.1f\n", $nums[0], map { ($nums[$_], $percents[$_]) } 1 .. 7);
52 | print OUT "\n\n";
52 |
53 |
54 | my $oldSeperator = $/;
55 | $/ = "###########\n";    # a record ends wherever eleven "#" are followed by a newline
56 | my $AssignmentReport_file = $dir."/".$lib_name.".gi.AssignmentReport";
57 | open (IN, $AssignmentReport_file) or die "can not open file $AssignmentReport_file!\n";
58 |
59 | # first record: copied to the summary without its "#" characters and without its first line
60 | my $line = <IN>;
61 | $line = "" unless defined $line;    # empty report file
62 | $line =~ s/#//g;
63 | my @temps = split("\n", $line);
64 | shift @temps;
65 | foreach my $temp (@temps) {
66 |     print OUT $temp, "\n";
67 | }
68 | print OUT "\n\n";
69 |
70 | # every following record describes one lineage
71 | while (my $record = <IN>) {
72 |     next if $record =~ /^\s*$/;    # skip blank record
73 |     next if $record =~ /Finished Assignment Report/;
74 |
75 |     my @lines = split("\n", $record);
76 |     shift @lines;                  # drop the text before the lineage line
77 |     my $lineage = shift @lines;
78 |
79 |     my $high = 0;
80 |     my $low = 100;
81 |     my %readID_Identity = (); # readID => percent ID
82 |     my %readID_desc = (); # readID => description of the read
83 |     foreach my $row (@lines) {
84 |         # only the per-read rows are of interest
85 |         next if $row =~ /^\s*$/ || $row =~ /QueryName/ || $row =~ /reads from/ || $row =~ /#+/;
86 |         my ($read_ID, $Qlength, $hitName, $hitLen, $hitDesc, $alnLen, $ID, $hitS, $hitE, $e) = split("\t", $row);
87 |         if($ID > $high) { $high = $ID;}
88 |         if($ID < $low) { $low = $ID;}
89 |
90 |         # keep, per read, the row with the highest percent identity
91 |         if (!defined ($readID_Identity{$read_ID}) || $ID > $readID_Identity{$read_ID}) {
92 |             $readID_Identity{$read_ID} = $ID;
93 |             $readID_desc{$read_ID} = $row;
94 |         }
95 |     }
96 |     if ($high == 0) {
97 |         $high = 100;
98 |     }
99 |
100 |     $ID_low{$lineage} = $low;
101 |     $ID_high{$lineage} = $high;
102 |
103 |     if($lineage=~/total number of reads: (\d+)/) { $num_reads{$lineage}=$1; }
104 |
105 |     if ($low <= $percentID_cutoff) {
106 |         print OUT2 $lineage, "\t[$low, $high]\n\n";
107 |         foreach my $key (sort {$readID_Identity{$a} <=> $readID_Identity{$b}} keys %readID_Identity) {
108 |             print OUT2 $readID_desc{$key}, "\n";
109 |         }
110 |         print OUT2 "\n";
111 |         foreach my $key (sort {$readID_Identity{$a} <=> $readID_Identity{$b}} keys %readID_Identity) {
112 |             print OUT2 ">$key\n";
113 |             print OUT2 "$sequences{$key}\n\n";
114 |         }
115 |     }
116 | }
117 | close IN;
118 |
119 | # lineages ordered by descending number of reads
120 | foreach my $key (sort {$num_reads{$b} <=> $num_reads{$a}} keys %num_reads) {
121 |     # "%%" prints the literal percent sign (a bare "%" before the newline is an invalid conversion)
122 |     printf OUT ("%s\t[%4.1f, %4.1f]%%\n", $key, $ID_low{$key}, $ID_high{$key});
123 | }
124 | print OUT "# Finished Assignment Summary\n";
125 |
126 | $/ = $oldSeperator;
127 |
128 | close OUT;
129 | close OUT2;
130 |
131 | exit;
138 |
139 | #####################################################################
140 | sub read_FASTA_data {
141 |     # Load a FASTA file into a hash of sequence name => sequence.
142 |     #   $fastaFile - path of the FASTA file; dies when it cannot be opened
143 |     # Returns the hash as a flat list. The name is the header text up to the
144 |     # first white space character; the sequence has all white space removed.
145 |     # (The former empty "()" prototype declared "no arguments" and is dropped.)
146 |     my $fastaFile = shift @_;
147 |
148 |     # read one ">"-terminated record at a time; "local" puts the previous
149 |     # separator back on every exit path, including die
150 |     local $/ = ">";
151 |
152 |     my %fastaSeq;
153 |     open (my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
154 |     while (my $record = <$fasta_fh>) {
155 |         next if $record =~ /^\s*$/;    # Discard blank records
156 |         next if $record =~ /^\s*#/;    # discard comment lines
157 |         next if $record eq ">";        # discard the first record which only has ">"
158 |
159 |         chomp $record;                 # strips the trailing ">" separator
160 |         my ($header, @seq_rows) = split (/\n/m, $record);
161 |         my ($contigName) = split(/\s/, $header);
162 |         my $contigSeq = join("", @seq_rows);
163 |         $contigSeq =~ s/\s//g;         # remove white space
164 |         $fastaSeq{$contigName} = $contigSeq;
165 |     }
166 |     close $fasta_fh;                   # the handle used to stay open
167 |
168 |     return %fastaSeq;
169 | }
184 |
185 |
186 | ##########################################################################
187 | sub get_SequenceInfo_OneSample {
188 |     # Collect the sequence counts of one sample for the summary table.
189 |     #   $dir - folder of the sample; its last path component is the library name
190 |     # Also reads the file-level $bad_seq (path of the quality control summary).
191 |     # Returns the list (total, unique, filtered, repeat/low-complexity, good,
192 |     # assigned by RefG blast, assigned by BLASTN, assigned by BLASTX NR).
193 |     # Dies when <dir>/<lib>.fa cannot be opened; any other missing file counts as 0.
194 |     my ( $dir ) = @_;
195 |
196 |     # get directory path
197 |     my @fields = split(/\//, $dir);
198 |     my $libName = $fields[$#fields];
199 |     my $prefix = $dir."/".$libName;
200 |
201 |     # number of sequences in a FASTA file, 0 when the file does not exist
202 |     my $count_if_present = sub {
203 |         my ($fasta) = @_;
204 |         return (-e $fasta) ? &count_num_of_seq($fasta) : 0;
205 |     };
206 |
207 |     # get total number of sequences in the sample
208 |     my $total_seq = &count_num_of_seq($prefix.".fa");
209 |
210 |     # get number of unique sequence in the sample
211 |     my $unique_seq = $count_if_present->($prefix.".fa.cdhit_out");
212 |
213 |     # get number of Filtered and good sequences (cai changed, added segmasker)
214 |     my $good_seq = 0;
215 |     my $filtered_seq = 0;
216 |     my $RepeatLowComplex_seq = 0;
217 |     # the file is only read when it exists; before, a missing file still
218 |     # triggered reads from whatever the bareword handle IN pointed at
219 |     if (-e $bad_seq) {
220 |         open (my $qc_fh, '<', $bad_seq) or die "can not open file $bad_seq!\n";
221 |         while (my $qc_line = <$qc_fh>) {
222 |             $good_seq = $1 if $qc_line =~ /good seq = (\d+)/;
223 |             $filtered_seq = $1 if $qc_line =~ /bad seq = (\d+)/;
224 |             $RepeatLowComplex_seq = $1 if $qc_line =~ /Repeat and Low complexicity seq = (\d+)/;
225 |         }
226 |         close $qc_fh;
227 |     }
228 |
229 |     # sequences still present after each stage; a stage's assigned count is
230 |     # the difference between its input and what is left afterwards
231 |     my $RefGfiltered = $count_if_present->($prefix.".RefGfiltered.fa");
232 |     my $BNFiltered = $count_if_present->($prefix.".BNFiltered.fa");
233 |     my $unassigned_num = $count_if_present->($prefix.".unassigned.fa");
234 |
235 |     my $blast_RefG_assigned = $good_seq - $RefGfiltered;
236 |     my $blastn_assigned = $RefGfiltered - $BNFiltered;
237 |     my $blastx_NR_assigned = $BNFiltered - $unassigned_num;
238 |
239 |     return ($total_seq, $unique_seq, $filtered_seq, $RepeatLowComplex_seq, $good_seq, $blast_RefG_assigned, $blastn_assigned, $blastx_NR_assigned);
240 | }
274 |
275 | ############################################################################
276 | sub count_num_of_seq {
277 |     # Count the lines of $fastaFile that contain ">"; dies when the file cannot be opened.
278 |     # (The former empty "()" prototype declared "no arguments" and is dropped.)
279 |     my ($fastaFile) = @_;
280 |     my $count = 0;
281 |
282 |     open (my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
283 |     while (my $line = <$fasta_fh>) {
284 |         $count++ if $line =~ />/;
285 |     }
286 |     close $fasta_fh;
287 |
288 |     return $count;
289 | }
290 |
--------------------------------------------------------------------------------
/blast_summary.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl
3 | use strict;
4 | use Switch;
5 | use Bio::SearchIO;
6 |
7 | my $usage = '
8 | perl $full_path $blast_file
9 | ';
10 |
11 | die $usage unless scalar @ARGV == 2;
12 | my ( $full_path, $blast_file) = @ARGV;
13 |
14 | # containers handed to collect_information (variables and commented-out
15 | # code that this script never used have been removed)
16 | my %viral_reads_blastn = (); # read ID => 1 for the reads collect_information keeps
17 | my %best_e_blastn = (); # viral_read_ID => best_e value for this read in blastn
18 | my @unassigned_reads = ();
19 | my @ambiguous_reads = (); #cai added 12/2010
20 | my %num_reads = (); # GI => number of reads assigned to it
21 | my %lineage_blastn = (); # GI => [read ID]
22 | my %lineage_gi = (); # GI => lineage
23 | my %blast_readinfo =(); # readID => information about this read
24 |
25 | # category => num of sequence assigned to this category by blastn
26 | my %blastn = (
27 |     "Bacteria" => 0,
28 |     "Fungi" => 0,
29 |     "Homo" => 0,
30 |     "Mus" => 0,
31 |     "Phage" => 0,
32 |     "Viruses" => 0,
33 |     "other" => 0,
34 |     "unassigned" => 0,
35 |     "Ambiguous" => 0, # cai added 12/2010
36 | );
37 |
38 | # X.blastn.parsed => X.blastn.out (blast report) and X.blastn.summary (output)
39 | my $blast_out = $blast_file;
40 | $blast_out =~ s/\.blastn\.parsed/\.blastn\.out/;
41 | $blast_out = $full_path."/".$blast_out;
42 | my $blast_s = $blast_file;
43 | print $blast_s,"\n";
44 | $blast_s =~ s/\.blastn\.parsed/\.blastn\.summary/;
45 | $blast_s = $full_path."/".$blast_s;
46 |
47 | my @blast_files_blastn = ($blast_out); # all blastn.out files
48 | my $parsed = $full_path."/".$blast_file;
49 | &collect_information($parsed, \%blastn, \%viral_reads_blastn, \%best_e_blastn, \%lineage_blastn, \%lineage_gi, \%num_reads, \@unassigned_reads, \@ambiguous_reads);
50 |
51 | &get_viral_read_info( \@blast_files_blastn, "blastn", \%viral_reads_blastn, \%best_e_blastn, \%blast_readinfo);
52 |
53 | # write one description per collected read, then the end marker; the open
54 | # is now checked, so an unwritable summary no longer passes silently
55 | open(OUT, '>', $blast_s) or die "can not open file $blast_s!\n";
56 |
57 | foreach my $id (keys %blast_readinfo)
58 | {
59 |     print OUT $blast_readinfo{$id};
60 | }
61 |
62 | print OUT "Finished summary\n";
63 | close OUT or die "can not close file $blast_s!\n";
115 | #####################################################################################
116 | # collect information from the given parsed blast file
117 | sub collect_information {
118 |     # Tally one tab-separated *.parsed file into the caller's hashes and arrays.
119 |     # Columns read per line: read ID, length, category, lineage, hit name, e-value.
120 |     #   $infile                   - path of the parsed file; dies when it cannot be opened
121 |     #   $category_hash_ref        - category => read count, incremented for the nine known categories only
122 |     #   $viral_reads_hash_ref     - read ID => 1 for every "Viruses" or "Bacteria" read whose hit carries a GI
123 |     #   $best_e_hash_ref          - read ID => e-value column of that read
124 |     #   $lineage_hash_ref         - GI => [read IDs]
125 |     #   $lineage_hash_gi          - GI => lineage string (set for every hit that carries a GI)
126 |     #   $num_reads_hash_ref       - GI => number of recorded reads
127 |     #   $unassigned_reads_arr_ref - collects read IDs of category "unassigned"
128 |     #   $ambiguous_reads_arr_ref  - collects read IDs of category "Ambiguous" (cai added 12/2010)
129 |     my ($infile, $category_hash_ref, $viral_reads_hash_ref, $best_e_hash_ref, $lineage_hash_ref, $lineage_hash_gi, $num_reads_hash_ref, $unassigned_reads_arr_ref, $ambiguous_reads_arr_ref) = @_;
130 |
131 |     # plain lookup instead of the Switch source filter; other categories stay uncounted
132 |     my %is_counted = map { $_ => 1 } qw(Bacteria Fungi Homo Mus Phage Viruses other unassigned Ambiguous);
133 |
134 |     open (my $parsed_fh, '<', $infile) or die "can not open file $infile!\n";
135 |     while (my $record = <$parsed_fh>) {
136 |         next if $record =~ /#/;    # skip comment line
137 |         chomp $record;
138 |         my ($read_ID, $length, $category, $lineage, $hit_name, $e_value) = split("\t", $record);
139 |         next unless defined $category;    # blank or truncated line: nothing to count
140 |
141 |         # GI number taken from the hit name; stays 0 when the name has none
142 |         my $gid = 0;
143 |         if (defined $hit_name && $hit_name =~ /gi\|(\d+)\|/) {
144 |             $gid = $1;
145 |             $lineage_hash_gi->{$gid} = $lineage;
146 |         }
147 |
148 |         $category_hash_ref->{$category}++ if $is_counted{$category};
149 |
150 |         if (($category eq "Viruses" || $category eq "Bacteria") && $gid != 0) {
151 |             $viral_reads_hash_ref->{$read_ID} = 1;
152 |             $best_e_hash_ref->{$read_ID} = $e_value;
153 |             push @{$lineage_hash_ref->{$gid}}, $read_ID;    # the list is created on first use
154 |             $num_reads_hash_ref->{$gid}++;
155 |         }
156 |         elsif ($category eq "Ambiguous") {    # cai added 12/2010
157 |             push @{$ambiguous_reads_arr_ref}, $read_ID;
158 |         }
159 |         elsif ($category eq "unassigned") {
160 |             push @{$unassigned_reads_arr_ref}, $read_ID;
161 |         }
162 |     }
163 |     close $parsed_fh;
164 |     return;
165 | }
174 |
175 | #############################################################################
176 | # get detailed information about each viral read
177 | sub get_viral_read_info {
178 |     # Build one tab-separated description line per collected read from the blast reports.
179 |     #   $report_file_ref         - array ref of blast output files
180 |     #   $report_type             - passed to Bio::SearchIO as -report_type
181 |     #   $viral_reads_hash_ref    - read ID => defined value for the reads to describe
182 |     #   $best_e_hash_ref         - read ID => e-value a hit must equal to be described
183 |     #   $blast_readinfo_hash_ref - filled as read ID => description ("" when no hit matches)
184 |     my ($report_file_ref, $report_type, $viral_reads_hash_ref, $best_e_hash_ref, $blast_readinfo_hash_ref) = @_;
185 |     foreach my $file (@{$report_file_ref}) {
186 |         # direct method call instead of the ambiguous indirect "new Bio::SearchIO(...)" form
187 |         my $report = Bio::SearchIO->new(-format => 'blast', -file => $file, -report_type => $report_type);
188 |         # Go through BLAST reports one by one
189 |         while(my $result = $report->next_result) {# next query output
190 |             my $read_ID = $result->query_name;
191 |             next unless defined $viral_reads_hash_ref->{$read_ID};
192 |
193 |             my $desc = "";
194 |             while (my $hit = $result->next_hit()) {
195 |                 next unless $hit->significance() == $best_e_hash_ref->{$read_ID};
196 |
197 |                 $desc .= $result->query_name()."\t";
198 |                 $desc .= $result->query_length()."\t";
199 |                 $desc .= $hit->name()."\t";
200 |                 $desc .= $hit->length()."\t";
201 |                 $desc .= $hit->description(60)."\t";
202 |                 # only the first HSP of the hit is reported
203 |                 if (my $hsp = $hit->next_hsp()) {
204 |                     $desc .= $hsp->length('hit')."\t";
205 |                     my $percent_id = sprintf("%4.1f", $hsp->percent_identity());
206 |                     $desc .= $percent_id."\%\t[";
207 |                     $desc .= $hsp->start('hit')."\t";
208 |                     $desc .= $hsp->end('hit')."]\t";
209 |                     $desc .= $hsp->evalue()."\n";
210 |                 }
211 |                 # only the first hit with the best e-value is kept (the old counter stopped at the second one)
212 |                 last;
213 |             }
214 |             $blast_readinfo_hash_ref->{$read_ID} = $desc;
215 |         }
216 |     }
217 | }
216 |
--------------------------------------------------------------------------------
/check_Blast_parsed_file.pl:
--------------------------------------------------------------------------------
1 |
2 | #!/usr/bin/perl
3 | use strict;
4 | use warnings;
5 |
6 | my $usage = "
7 | This script will check all .blastn.parsed or .blastx.parsed files
8 | to make sure parsing blast output file is finished for each file.
9 |
10 | perl $0 <parsed file>
11 | ";
12 |
13 | # exit status: 1 = parsing finished, 10 = not finished or wrong call
14 | # (the usage text is now shown on a wrong call; it goes to STDERR so STDOUT stays untouched)
15 | unless (scalar @ARGV == 1) { print STDERR $usage; exit( 10 ); }
16 | my ( $PARSED ) = @ARGV;
17 |
18 | my $finished = &check_blastnParsed_output($PARSED);
19 |
20 | exit ($finished);
20 |
21 | sub check_blastnParsed_output {
22 |     # Decide whether a *.parsed file is complete.
23 |     #   $in_file - path of the parsed file
24 |     # Returns 1 when the file looks complete and 10 otherwise, including when
25 |     # it cannot be opened (10 instead of 0 because the system default $? is 0).
26 |     my ( $in_file ) = @_;
27 |     my $have_summary_line = 0;
28 |     my $line_count = 0;
29 |     my $total_seq = 0;
30 |     my $saved_seq = 0;
31 |     my $num_undefined_taxon = 0;
32 |
33 |     open (my $parsed_fh, '<', $in_file) or return 10;
34 |     while (my $line = <$parsed_fh>) {
35 |         $line_count++;
36 |         if ($line =~ /# Summary: (\d+) out of (\d+)/) {
37 |             ($saved_seq, $total_seq) = ($1, $2);
38 |             $have_summary_line = 1;
39 |         }
40 |         $num_undefined_taxon++ if $line =~ /undefined taxon/;
41 |     }
42 |     close $parsed_fh;
43 |
44 |     return 10 unless $have_summary_line;
45 |
46 |     # taxonomy record has to be equal or greater than the number of sequences get
47 |     # successful phylotyped because some sequence could be assigned multiple taxonomy
48 |     # categories. Should have at least $num_phylotyped + 1 lines
49 |     my $num_phylotyped = $total_seq - $saved_seq;
50 |     return 1 if $num_phylotyped == 0;    # every sequence is unassigned
51 |
52 |     # all records showed as undefined taxon, relative to the number of phylotyped sequences
53 |     return 10 if $num_phylotyped <= $num_undefined_taxon;
54 |
55 |     my $num_records = $line_count - 1;   # every line except the summary line
56 |     # all records showed as undefined taxon
57 |     return 10 if $num_records == $num_undefined_taxon;
58 |
59 |     # old situation where some reads were not recorded because of no
60 |     # entry of gi-taxon record in the database
61 |     return 10 if $num_phylotyped > $num_records;
62 |
63 |     return 1;
64 | }
76 |
77 |
--------------------------------------------------------------------------------
/check_SequenceQualityControl.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 | my $usage='
4 | perl script <dir>
5 | <dir> = full path of the folder holding files for a sample
6 |
7 | ';
8 | die $usage unless scalar @ARGV == 1;
9 | my ( $dir ) = @ARGV;
10 |
11 | # the check result doubles as the exit status: 1 = counts agree, 10 = they do not
12 | my $finished = &check_QC_read_number($dir);
13 | exit ($finished);
14 |
15 | ##########################################################################
16 | sub check_QC_read_number {
17 |     # Compare the number of sequences in the CD-HIT output with the count reported by the QC step.
18 |     #   $dir - folder holding the files of one sample
19 |     # Returns 1 when both counts agree (relative difference <= 0.00001) and 10
20 |     # otherwise, including when the folder or a file cannot be opened or no
21 |     # CD-HIT sequence was found. Prints both counts to STDOUT.
22 |     my ( $dir ) = @_;
23 |     my $tot_cdhit_seq=0;    # lines containing ">" in *.cdhit_out
24 |     my $tot_seq = 0;        # "total unique seq = N" taken from *.badSeq
25 |
26 |     opendir(my $dir_h, $dir) or return 10;
27 |     my @names = readdir $dir_h;
28 |     closedir $dir_h;        # a directory handle needs closedir; close() does not release it
29 |
30 |     foreach my $name (@names) {
31 |         my $full_path = $dir."/".$name;
32 |         if ($name =~/\.cdhit_out$/) {
33 |             open (my $cd_fh, '<', $full_path) or return 10;
34 |             while (my $line = <$cd_fh>) {
35 |                 $tot_cdhit_seq++ if $line =~ />/;
36 |             }
37 |             close $cd_fh;
38 |         }
39 |         if ($name =~ /\.badSeq$/) {
40 |             open (my $qc_fh, '<', $full_path) or return 10;
41 |             while (my $line = <$qc_fh>) {
42 |                 if ($line =~ /total unique seq = (\d+)/) { $tot_seq = $1; }
43 |             }
44 |             close $qc_fh;
45 |         }
46 |     }
47 |
48 |     print "total unique seq in CD-HIT output file: $tot_cdhit_seq\n";
49 |     print "total unique sequence in QC output file: $tot_seq\n";
50 |
51 |     # no CD-HIT sequence at all: report "not finished" instead of dividing by zero
52 |     return 10 if $tot_cdhit_seq == 0;
53 |
54 |     if(abs($tot_cdhit_seq-$tot_seq)/$tot_cdhit_seq<=0.00001) { print "1","\n"; return 1; }
55 |     return 10;
56 | }
53 |
54 |
--------------------------------------------------------------------------------
/check_split_BN.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 | my $usage='
4 | perl script <dir>
5 | <dir> = full path of the folder holding files for this sample
6 | without last "/"
7 | ';
8 | die $usage unless scalar @ARGV == 1;
9 | my ( $dir ) = @ARGV;
10 |
11 | # the check result doubles as the exit status: 1 = split is complete, 10 = it is not
12 | my $finished = &check_split_output($dir);
13 | exit ($finished);
14 |
15 | ##############################################################
16 | sub check_split_output {
17 |     # Check that the split files of the BLASTN step hold as many sequences as
18 |     # the *.RefGfiltered.fa file they were split from.
19 |     #   $dir - folder holding the files of one sample
20 |     # Returns 1 when the counts agree. Otherwise deletes every file inside the
21 |     # *.RefGfiltered_BLASTN directory and returns 10. Also returns 10 when a
22 |     # directory or file cannot be opened.
23 |     my ( $dir ) = @_;
24 |     my $tot_BN_seq=0;    # sequences in *.RefGfiltered.fa
25 |     my $tot_seq = 0;     # sequences summed over the split .fa files
26 |
27 |     # number of lines containing ">" in a file, undef when it cannot be opened
28 |     my $count_headers = sub {
29 |         my ($path) = @_;
30 |         open (my $fa_fh, '<', $path) or return;
31 |         my $headers = 0;
32 |         while (my $line = <$fa_fh>) {
33 |             $headers++ if $line =~ />/;
34 |         }
35 |         close $fa_fh;
36 |         return $headers;
37 |     };
38 |
39 |     opendir(my $dir_h, $dir) or return 10;
40 |     my @names = readdir $dir_h;
41 |     closedir $dir_h;     # a directory handle needs closedir; close() does not release it
42 |
43 |     foreach my $name (@names) {
44 |         if ($name =~/\.RefGfiltered\.fa$/) {
45 |             my $headers = $count_headers->($dir."/".$name);
46 |             return 10 unless defined $headers;
47 |             $tot_BN_seq += $headers;
48 |         }
49 |         if ($name =~ /\.RefGfiltered_BLASTN$/) { # BLASTN directory
50 |             my $full_path = $dir."/".$name;
51 |             opendir(my $sub_h, $full_path) or return 10;
52 |             my @files = readdir $sub_h;
53 |             closedir $sub_h;
54 |             foreach my $file (@files) {
55 |                 next unless $file =~ /\.fa$/ && !($file=~/\.BNfiltered\.fa/);
56 |                 my $headers = $count_headers->($full_path."/".$file);
57 |                 return 10 unless defined $headers;
58 |                 $tot_seq += $headers;
59 |             }
60 |         }
61 |     }
62 |
63 |     if($tot_BN_seq==$tot_seq) { return 1; }
64 |
65 |     # counts disagree: empty the BLASTN directory
66 |     foreach my $name (@names) {
67 |         next unless $name =~ /\.RefGfiltered_BLASTN$/;
68 |         my $full_path = $dir."/".$name;
69 |         opendir(my $sub_h, $full_path) or return 10;
70 |         foreach my $file (readdir $sub_h) {
71 |             next if $file eq "." || $file eq "..";    # never hand the directory entries to unlink
72 |             unlink $full_path."/".$file;
73 |         }
74 |         closedir $sub_h;
75 |     }
76 |     return 10;
77 | }
78 |
--------------------------------------------------------------------------------
/check_split_RefG.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 | my $usage='
4 | perl script <dir>
5 | <dir> = full path of the folder holding files for this sample
6 | without last "/"
7 | ';
8 | die $usage unless scalar @ARGV == 1;
9 | my ( $dir ) = @ARGV;
10 |
11 | # the check result doubles as the exit status: 1 = split is complete, 10 = it is not
12 | my $finished = &check_RefG_split($dir);
13 | exit ($finished);
14 |
15 | ###########################################################################
# check_RefG_split($dir)
#
# Verifies that splitting the *.goodSeq file of a sample into chunks lost no
# sequences.
#
#   $dir - sample directory (no trailing "/").
#
# Counts the lines containing ">" in every "*.goodSeq" file directly under
# $dir and compares that with the number of such lines in the "*.fa" files
# (excluding "*.RefGfiltered.fa*") inside every "*.goodSeq_RefGblast"
# sub-directory.
#
# Returns 1 when both counts agree, 10 when they differ or when any directory
# or file cannot be opened.
sub check_RefG_split {
    my ( $dir ) = @_;
    my $tot_RefG_seq = 0;    # sequences in the unsplit *.goodSeq file
    my $tot_seq      = 0;    # sequences found in the split chunks

    opendir(my $dh, $dir) or return 10;
    my @names = readdir $dh;
    closedir $dh;

    foreach my $name (@names) {
        if ($name =~ /\.goodSeq$/) {
            open(my $in, '<', $dir."/".$name) or return 10;
            while (my $line = <$in>) {
                $tot_RefG_seq++ if $line =~ />/;
            }
            close $in;
        }

        if ($name =~ /\.goodSeq_RefGblast$/) {    # Blast RefG directory
            my $full_path = $dir."/".$name;
            opendir(my $sub_dh, $full_path) or return 10;
            my @files = readdir $sub_dh;
            closedir $sub_dh;

            foreach my $file (@files) {
                next unless $file =~ /\.fa$/ && $file !~ /\.RefGfiltered\.fa/;
                open(my $in, '<', $full_path."/".$file) or return 10;
                while (my $line = <$in>) {
                    $tot_seq++ if $line =~ />/;
                }
                close $in;
            }
        }
    }

    return 1 if $tot_RefG_seq == $tot_seq;

    # The previous version walked the RefGblast directories here, but the
    # unlink/rmdir calls were commented out, so nothing was ever removed.
    # The split output is therefore left in place and only the failure code
    # is reported.
    return 10;
}
78 |
--------------------------------------------------------------------------------
/check_split_cdhit.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

# Checks the split of a sample's .cdhit_out file and reports the outcome
# through the process exit status: whatever check_split_output() returns.
my $usage='
perl script <sample dir>
<sample dir> = full path of the folder holding files for a sample

';
die $usage unless scalar @ARGV == 1;
my ( $dir ) = @ARGV;

my $finished = check_split_output($dir);
#print $finished;
exit ($finished);
14 |
15 | ###########################################################################
# check_split_output($dir)
#
# Verifies that splitting the *.cdhit_out file of a sample into chunks lost no
# sequences.
#
#   $dir - sample directory.
#
# Counts the lines containing ">" in every "*.cdhit_out" file directly under
# $dir and compares that with the number of such lines in the "*.fa" files
# inside every "*.cdhit_out_RepeatMasker" sub-directory.
#
# Returns 1 when both counts agree.  Returns 10 when they differ (after
# emptying each RepeatMasker sub-directory by removing and re-creating it) or
# when any directory or file cannot be opened.
sub check_split_output {
    my ( $dir ) = @_;
    my $tot_cdhit_seq = 0;    # sequences in the unsplit *.cdhit_out file
    my $tot_seq       = 0;    # sequences found in the split chunks

    opendir(my $dh, $dir) or return 10;
    my @names = readdir $dh;
    closedir $dh;

    my @repeatmasker_dirs;    # remembered for the clean-up pass below
    foreach my $name (@names) {
        if ($name =~ /\.cdhit_out$/) {
            open(my $in, '<', $dir."/".$name) or return 10;
            while (my $line = <$in>) {
                $tot_cdhit_seq++ if $line =~ />/;
            }
            close $in;
        }
        if ($name =~ /\.cdhit_out_RepeatMasker$/) {    # RepeatMasker directory
            my $full_path = $dir."/".$name;
            push @repeatmasker_dirs, $full_path;

            opendir(my $sub_dh, $full_path) or return 10;
            my @files = readdir $sub_dh;
            closedir $sub_dh;

            foreach my $file (@files) {
                next unless $file =~ /\.fa$/;
                open(my $in, '<', $full_path."/".$file) or return 10;
                while (my $line = <$in>) {
                    $tot_seq++ if $line =~ />/;
                }
                close $in;
            }
        }
    }

    return 1 if $tot_cdhit_seq == $tot_seq;

    # Counts differ: reset each RepeatMasker directory to an empty one.
    # Done in-process instead of through "rm -rf"/"mkdir" shell commands so
    # that paths containing spaces or shell metacharacters are handled safely.
    require File::Path;
    foreach my $full_path (@repeatmasker_dirs) {
        File::Path::remove_tree($full_path);
        mkdir $full_path;
    }
    return 10;
}
75 |
76 |
--------------------------------------------------------------------------------
/generate_final_report_gi.pl:
--------------------------------------------------------------------------------
1 |
#!/usr/bin/perl
use strict;

my $usage = "
This script will read corresponding files in the given directory and
generate a report which contains SampleDescription, SequenceReport,
AssignmentSummary, InterestingReads.

perl $0 <run folder> <program version>
<run folder> = full path of the folder holding files for this sequence run
<program version> = version label printed in the report header

";
die $usage unless scalar @ARGV == 2;
my ( $dir, $version ) = @ARGV;

# The report is named after the last path component of the run folder.
my @temp = split("/", $dir);
my $run_name = pop @temp;
my $outFile = $dir."/Analysis_Report_".$run_name;

# OUT stays a package-level handle: the generate_* subs below print to it.
open (OUT, '>', $outFile) or die "can not open file $outFile!\n";

# Scalar localtime looks like "Thu Sep  9 12:00:00 2024".
my ($wkday,$month,$day,$time,$year) = split(/\s+/, scalar localtime);
print OUT "PathHit V${version}; Processing date: $day-$month-$year\n";

# Section separators; $c2 is also used by the generate_* subs.
my $c = "**************************************************************************\n";
my $c2 = "#########################################################################\n\n";
print OUT $c;


print OUT "Summary:\n\n";
generate_SampleDescription( $dir );
print OUT "End of Summary\n\n";
#print OUT $c ;

#print OUT "\n\nSequence Report\n\n";
#&generate_SequenceReport( $dir );
#print OUT "End of Sequence Report\n\n";
#print OUT $c ;

#print OUT "\n\nTaxonomy Assignment:\n\n";
#&generate_AssignmentSummary( $dir );
#print OUT "End of Assignment\n\n";
#print OUT $c ;

#print OUT "\n\nInteresting Reads\n\n";
#&generate_InterestingReads( $dir );
#print OUT "End of Interesting Reads\n\n";
#print "\n";

#print OUT "# Finished\n";

# Buffered write errors only surface when the handle is closed.
close OUT or die "can not write file $outFile!\n";
exit;
53 |
54 | ############################################################################
# generate_SampleDescription($dir)
#
# Writes the per-sample summary table to the OUT handle.
#
#   $dir - run directory; every entry without a "." in its name that is a
#          directory is treated as a sample.
#
# For each sample it prints the sample name and the sequence count of
# <sample>/<sample>.fa, then one row per record of
# <sample>/<sample>.gi.AssignmentSummary that reports at least one read.
# Dies when the run directory or a summary file cannot be opened.
sub generate_SampleDescription {
    my ($dir) = @_;

    # sample name => num of total sequence in the sample
    my %total_seq = ();

    print OUT $dir,"\n";
    printf OUT "%10s\t", " ";
    printf OUT "%5s\t%15s\t%10s\t%40s\n", "NoHumanRead", "PercentIDrange", "gi", "IdentifiedNoHuman";

    opendir(my $dh, $dir) or die "Can not open dir $dir!\n";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $name (sort {$a cmp $b} @files) {
        # name is either file name or sample name (directory)
        next if $name =~ /\./;
        my $full_path = $dir."/".$name;
        next unless -d $full_path;    # only sample directories

        # get total number of sequences in the sample
        my $tempF = $full_path."/".$name.".fa";
        $total_seq{$name} = &count_num_of_seq($tempF);

        # print out report for this sample
        printf OUT "%30s\t%8d\n", $name, $total_seq{$name};

        my $Summary_file = $full_path."/".$name.".gi.AssignmentSummary";
        if (!-e $Summary_file) {
            print OUT "$Summary_file does not exist!\n";
            next;
        }

        open (my $in, '<', $Summary_file) or die "can not open file $Summary_file!\n";

        # Skip the first 17 lines of the summary file.
        # NOTE(review): presumably a fixed-size header - confirm against the
        # script that writes .gi.AssignmentSummary.
        foreach (1..17) {
            my $skipped = <$in>;
        }

        while (my $line = <$in>) {
            next if $line =~ /^\s*$/;                          # empty line
            next if $line =~ /# Finished Assignment Summary/;  # trailer

            chomp $line;
            # The last three tab-separated fields are, from the end:
            # range, read-count text, lineage/virus text.  The first field
            # of what remains is printed in the "gi" column.
            my @fields     = split(/\t/, $line);
            my $range      = pop @fields;
            my $info       = pop @fields;
            my $virus_info = pop @fields;
            my $gi         = $fields[0];
            next unless defined $virus_info;    # too few fields to report

            my $number_reads = 0;
            if ($info =~ /total number of reads: (\d+)/) {
                $number_reads = $1;
            }

            my $virus = "";
            if ($virus_info =~ /hit does not have taxonomy entry/) {
                # text before the first comma
                my @parts = split (",", $virus_info);
                $virus = shift @parts;
            }
            else {
                # last element of the ";"-separated list
                my @parts = split(";", $virus_info);
                $virus = pop @parts;
            }

            if ($number_reads >= 1) {
                printf OUT "%10s\t", " ";
                printf OUT "%5d\t%20s\t%10s\t%40s\n", $number_reads, $range, $gi, $virus;
            }
        }
        close $in;
    }
}
128 |
129 | #####################################################################
130 | # Assignment Summary
# generate_AssignmentSummary($dir)
#
# Copies each sample's <sample>/<sample>.gi.AssignmentSummary file to the OUT
# handle, leaving out lines that contain "# Finished Assignment Summary",
# and prints the $c2 separator after every sample directory.
#
#   $dir - run directory; every entry without a "." in its name that is a
#          directory is treated as a sample.
#
# Dies when the run directory or an existing summary file cannot be opened.
sub generate_AssignmentSummary {
    my ( $dir ) = @_;

    opendir(my $dh, $dir) or die "Can not open dir $dir!\n";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $name (sort {$a cmp $b} @files) {
        # name is either file name or sample name (directory)
        my $full_path = $dir."/".$name;
        next if $name =~ /\./;
        next unless -d $full_path;

        my $Summary_file = $full_path."/".$name.".gi.AssignmentSummary";
        if (-e $Summary_file) {
            open (my $in, '<', $Summary_file) or die "can not open file $Summary_file!\n";
            while (my $line = <$in>) {
                next if $line =~ /# Finished Assignment Summary/;
                print OUT $line;
            }
            close $in;
        }
        print OUT $c2 ;
    }
}
157 |
158 | ##########################################################################
# generate_SequenceReport($dir)
#
# Writes one table row per sample, plus a final "total" row, to the OUT
# handle.  All percentages are relative to the number of sequences in
# <sample>/<sample>.fa.
#
#   $dir - run directory; every entry without a "." in its name that is a
#          directory is treated as a sample.
#
# Files read for each sample (all under <sample>/):
#   <sample>.fa                              total sequences
#   <sample>.fa.cdhit_out                    unique sequences
#   <sample>.fa.cdhit_out.masked.badSeq      "good seq = N", "bad seq = N",
#                                            "Repeat and Low complexicity seq = N"
#   <sample>.BNFiltered.fa                   sequences left after BLASTN (optional)
#   <sample>.gi.unassigned.fa                sequences left unassigned (optional)
#
# Returns early, without the "total" row, when a sample has no cdhit_out
# file.  Dies when the run directory or a .badSeq file cannot be opened.
sub generate_SequenceReport {
    my ( $dir ) = @_;

    # All hashes are keyed by sample name; the key "total" is added at the
    # end for the run-wide row.
    my %total_seq       = ();    # sequences in the sample
    my %unique_seq      = ();    # sequences in the cdhit_out file
    my %bad_seq         = ();    # "bad seq" count from the .badSeq file
    my %lowComplex_seq  = ();    # "Repeat and Low complexicity seq" count
    my %good_seq        = ();    # "good seq" count from the .badSeq file
    my %blastn_assigned = ();    # good_seq minus sequences in .BNFiltered.fa
    my %blastx_assigned = ();    # .BNFiltered.fa minus .gi.unassigned.fa

    # Percentage of $whole; 0 for an empty sample instead of dying with
    # "Illegal division by zero".
    my $percent_of = sub {
        my ($part, $whole) = @_;
        return $whole ? $part*100/$whole : 0;
    };

    # Prints the table row stored under the given key.
    my $print_row = sub {
        my ($key) = @_;
        my $total = $total_seq{$key};
        printf OUT "%30s\t%5d\t%5d\t%5.1f\t", $key, $total, $unique_seq{$key}, $percent_of->($unique_seq{$key}, $total);
        printf OUT "%5d\t%5.1f\t%5d\t%5.1f\t%5d\t%5.1f\t", $bad_seq{$key}, $percent_of->($bad_seq{$key}, $total), $lowComplex_seq{$key}, $percent_of->($lowComplex_seq{$key}, $total), $good_seq{$key}, $percent_of->($good_seq{$key}, $total);
        printf OUT "%5d\t%9.1f\t%5d\t%5.1f\n", $blastn_assigned{$key}, $percent_of->($blastn_assigned{$key}, $total), $blastx_assigned{$key}, $percent_of->($blastx_assigned{$key}, $total);
    };

    print OUT $dir,"\n";
    printf OUT "%30s\t", "sampleName";
    print OUT "total\tuniq\t\%\t Filtered\t\%\tLowComplex\t\%\tgood\t\%\tBNassign\t\%\tBXassign\t\%\n";

    opendir(my $dh, $dir) or die "Can not open dir $dir!\n";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $name (sort {$a cmp $b} @files) {
        # name is either file name or sample name (directory)
        my $full_path = $dir."/".$name;
        next if $name =~ /\./;
        next unless -d $full_path;

        # get total number of sequences in the sample
        my $tempF = $full_path."/".$name.".fa";
        $total_seq{$name} = &count_num_of_seq($tempF);

        # get number of unique sequence in the sample
        $tempF = $full_path."/".$name.".fa.cdhit_out";
        if (-e $tempF) {
            $unique_seq{$name} = &count_num_of_seq($tempF);
            print "total # seq = ", $total_seq{$name}, " unique # seq: ", $unique_seq{$name}, "\n";
        }
        else {
            print OUT "$full_path does not have cdhit_out file!\n";
            return;
        }

        # get number of Filtered and good sequences
        # need to change here if seg masker enabled
        $good_seq{$name}       = 0;
        $bad_seq{$name}        = 0;
        $lowComplex_seq{$name} = 0;
        $tempF = $full_path."/".$name.".fa.cdhit_out.masked.badSeq";
        open (my $in, '<', $tempF) or die "can not open file $tempF!\n";
        while (my $line = <$in>) {
            if ($line =~ /good seq = (\d+)/) {
                $good_seq{$name} = $1;
            }
            if ($line =~ /bad seq = (\d+)/) {
                $bad_seq{$name} = $1;
            }
            if ($line =~ /Repeat and Low complexicity seq = (\d+)/) {
                $lowComplex_seq{$name} = $1;
            }
        }
        close $in;

        # Sequences assigned by BLASTN: the good ones that are no longer
        # present in the BLASTN-filtered file (0 left when it is missing).
        # NOTE(review): other scripts in this repository spell the suffix
        # ".BNfiltered.fa" (lower-case "f") - confirm which one is produced.
        $tempF = $full_path."/".$name.".BNFiltered.fa";
        my $BNFiltered = -e $tempF ? &count_num_of_seq($tempF) : 0;
        $blastn_assigned{$name} = $good_seq{$name} - $BNFiltered;

        # Sequences assigned in the following step: BLASTN leftovers that
        # are not in the unassigned file (0 unassigned when it is missing).
        $tempF = $full_path."/".$name.".gi.unassigned.fa";
        my $unassigned = -e $tempF ? &count_num_of_seq($tempF) : 0;
        $blastx_assigned{$name} = $BNFiltered - $unassigned;

        # print out report for this sample
        $print_row->($name);
    }

    # calculate and print statistics for this run
    my @samples = keys %total_seq;
    my %column_for = (
        total      => \%total_seq,
        unique     => \%unique_seq,
        bad        => \%bad_seq,
        lowComplex => \%lowComplex_seq,   # previously reported the "bad" sum by mistake
        good       => \%good_seq,
        BNassign   => \%blastn_assigned,
        BXassign   => \%blastx_assigned,
    );
    foreach my $column (values %column_for) {
        my $sum = 0;
        $sum += $column->{$_} foreach @samples;
        $column->{"total"} = $sum;
    }

    $print_row->("total");
}
319 |
320 | ############################################################################
# count_num_of_seq($fastaFile)
#
# Returns the number of lines in $fastaFile that contain a ">" character,
# i.e. the number of FASTA header lines.  Dies when the file cannot be opened.
#
# The former empty prototype "()" was dropped: it declared a sub taking no
# arguments although one is required, and only went unnoticed because every
# caller uses the "&name(...)" form, which bypasses prototypes.
sub count_num_of_seq {
    my ($fastaFile) = @_;
    my $count = 0;

    open (my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
    while (my $line = <$fasta_fh>) {
        $count++ if $line =~ />/;
    }
    close $fasta_fh;

    return $count;
}
335 |
336 | ####################################################################################
337 | # Assignment Summary
# generate_InterestingReads($dir)
#
# For every sample directory prints the sample name, the full content of
# <sample>/<sample>.gi.InterestingReads (or a notice when that file does not
# exist) and the $c2 separator to the OUT handle.
#
#   $dir - run directory; every entry without a "." in its name that is a
#          directory is treated as a sample.
#
# Dies when the run directory or an existing reads file cannot be opened.
sub generate_InterestingReads {
    my ( $dir ) = @_;

    opendir(my $dh, $dir) or die "Can not open dir $dir!\n";
    my @files = readdir $dh;
    closedir $dh;

    foreach my $name (sort {$a cmp $b} @files) {
        # name is either file name or sample name (directory)
        my $full_path = $dir."/".$name;
        next if $name =~ /\./;
        next unless -d $full_path;

        print OUT $name, "\n";
        my $tempF = $full_path."/".$name.".gi.InterestingReads";
        if ( -e $tempF ) {
            open (my $in, '<', $tempF) or die "can not open file $tempF!\n";
            while (my $line = <$in>) {
                print OUT $line;
            }
            close $in;
        }
        else {
            print OUT "$name does not have .InteresingReads file!\n";
        }
        print OUT $c2;
    }
}
365 |
--------------------------------------------------------------------------------
/get_fasta_from_bam_filter.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;

# Converts tab-separated alignment records into FASTA.
#
# For every input line, column 1 becomes the FASTA header and column 10 the
# sequence, unless column 3 names one of the two excluded references
# (gi|548558394 or gi|9626372).
# NOTE(review): the column layout looks like SAM (QNAME, RNAME, SEQ) as
# produced by "samtools view" without header lines - confirm with the caller.
my $usage = '
perl $file_in $file_out
';
die $usage unless scalar @ARGV == 2;
my ( $file_in, $file_out) = @ARGV;

open(my $in,  '<', $file_in)  or die "can not open file $file_in: $!\n";
open(my $out, '>', $file_out) or die "can not open file $file_out: $!\n";

while (my $line = <$in>) {
    chomp($line);
    my @ss = split("\t", $line);

    # Records without a 10th column carry no sequence to export.
    next if @ss < 10;

    # Drop reads aligned to either excluded reference.
    next if $ss[2] =~ /gi\|548558394/ || $ss[2] =~ /gi\|9626372/;

    print {$out} ">",$ss[0],"\n";
    print {$out} $ss[9],"\n";
}

close $in;
close $out or die "can not write file $file_out: $!\n";
24 |
--------------------------------------------------------------------------------
/import_gi_taxid_nucl.sql:
--------------------------------------------------------------------------------
-- Rebuilds the gi_taxid_nucl lookup table (gi -> tax_id) from a
-- tab-separated dump file.  Any existing table of that name is dropped first.
\! echo 'Loading gi_taxid_nucl.dmp - this can take a very long time'

DROP TABLE IF EXISTS `gi_taxid_nucl`;

-- Surrogate auto-increment key plus one index per lookup column.
CREATE TABLE `gi_taxid_nucl` (
    `gi_taxid_nucl_id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `gi`               INT(10) UNSIGNED DEFAULT NULL,
    `tax_id`           INT(10) UNSIGNED DEFAULT NULL,
    PRIMARY KEY (`gi_taxid_nucl_id`),
    KEY `tax_id` (`tax_id`),
    KEY `gi` (`gi`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- NOTE(review): '/directory/' looks like a placeholder that has to be
-- replaced with the real location of the dump file - confirm before running.
LOAD DATA LOCAL INFILE '/directory/gi_taxid_nucl.dmp'
    INTO TABLE gi_taxid_nucl
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
    (gi, tax_id);
17 |
--------------------------------------------------------------------------------
/send_email.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;
use warnings;

# Sends a short "data processing finished" notification for a directory to
# one e-mail address through /usr/sbin/sendmail -t.
die "usage:\t$0 <dir> <email>\n" unless scalar @ARGV == 2;
my ( $dir, $email ) = @ARGV;

# The address is copied into two header lines; a line break inside it would
# allow arbitrary extra headers to be injected.
die "invalid e-mail address\n" if $email =~ /[\r\n]/;

my @sendmail = ("/usr/sbin/sendmail", "-t");
my $reply_to = "Reply-to: $email\n";
my $subject = "Subject: data processing finished\n";
my $content = "The $dir data processing has finished.\n";
my $send_to = "To: $email\n";

# List-form pipe open: the command is run directly, without a shell.
open(my $mail, '|-', @sendmail) or die "Cannot open @sendmail: $!";
print {$mail} $reply_to;
print {$mail} $subject;
print {$mail} $send_to;
print {$mail} "Content-type: text/plain\n\n";
print {$mail} $content;

# For a pipe, close also waits for sendmail and reports a failing exit status.
close $mail or die "@sendmail failed: " . ($! || "exit status $?") . "\n";
exit;
18 |
--------------------------------------------------------------------------------
/split_fasta.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
use Getopt::Long;

# Splits a FASTA file into several files named <out_dir>/<prefix><k>.fa
# (k = 1, 2, ...).
my %opts;
GetOptions(\%opts, "i:s", "o:s", "n=i", "p=s", "h");

# -n is required as well: it is the divisor used to compute the chunk size.
die "usage:\t$0 <-i fasta_file> <-o out_put_dir> <-n number_of_file> <-p prefix_of_seq>[-h]\n" if(defined($opts{h}) || !defined($opts{i}) || !defined($opts{o}) || !defined($opts{p}) || !defined($opts{n}) || $opts{n} < 1);

# get total number of sequence in input file
my $count_seq= &count_num_of_seq($opts{i});

# calculate how many sequences in each file (may be fractional)
my $size = $count_seq/$opts{n};
#print "$count_seq $size\n";

# start spliting
my $count = 1;            # index of the output file being written
my $count_seq_each=0;     # records written to the current output file
open(my $out, '>', "$opts{o}/$opts{p}${count}".".fa") or die $!;

open(my $seq, '<', $opts{i}) or die "cannot open file : $opts{i}\n";
{
    # Read one FASTA record per iteration by using ">" as the record
    # separator; "local" restores the normal line mode after this block.
    local $/ = '>';

    # Discard everything up to and including the first ">".
    my $leading = <$seq>;

    while (my $record = <$seq>) {
        chomp $record;    # strips the trailing ">" that starts the next record

        # NOTE(review): a new file is started only once the current one
        # holds MORE than $size records, so files receive floor($size)+1
        # records and fewer than -n files can be produced.  Kept as is in
        # case the calling pipeline relies on it - confirm.
        if($count_seq_each > $size) {
            close $out;
            $count++; $count_seq_each=0;
            open($out, '>', "$opts{o}/$opts{p}${count}".".fa") or die $!;
        }
        print {$out} $/.$record;

        $count_seq_each++;
    }
}
close $out;
close $seq;
37 |
38 | ############################################################################
# count_num_of_seq($fastaFile)
#
# Returns the number of lines in $fastaFile that contain a ">" character,
# i.e. the number of FASTA header lines.  Dies when the file cannot be opened.
#
# The former empty prototype "()" was dropped: it declared a sub taking no
# arguments although one is required, and only went unnoticed because the
# caller uses the "&name(...)" form, which bypasses prototypes.
sub count_num_of_seq {
    my ($fastaFile) = @_;
    my $count = 0;

    open (my $fasta_fh, '<', $fastaFile) or die "Can't Open FASTA file: $fastaFile";
    while (my $line = <$fasta_fh>) {
        $count++ if $line =~ />/;
    }
    close $fasta_fh;

    return $count;
}
53 |
54 |
--------------------------------------------------------------------------------
/trim_readid.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my $usage = "
This script will trim the name of read id

perl $0 <input fasta> <output fasta>

";

die $usage unless scalar @ARGV == 2;
my ( $filein, $fileout ) = @ARGV;

open(my $in,  '<', $filein)  or die "can not open file $filein: $!\n";
open(my $out, '>', $fileout) or die "can not open file $fileout: $!\n";

# Running number of the header lines seen so far.
my $cc=0;

while (my $line = <$in>) {
    # Every line starting with ">" is replaced by ">read<N>"; all other
    # lines are copied unchanged.
    if($line=~/^\>/) { $cc++; print {$out} ">read".$cc,"\n"; }
    else { print {$out} $line; }
}

close $in;
close $out or die "can not write file $fileout: $!\n";
30 |
--------------------------------------------------------------------------------
]