├── ExCID_v2.0.zip ├── ExCID_v2.1 ├── Config.txt ├── ExCID.BatchScript_v2.1-threading_Final.pl ├── ExCID.grep_gene_list_pct_Final-miRNA.pl ├── ExCID.grep_gene_list_pct_Final.pl ├── ExCID_v2.1-Batch.pl ├── Get_HGNC.pl ├── bed_file-annotator_V2_CCDS-miRBASE.pl ├── bed_file-annotator_V2_RefSeq-VEGA.pl ├── bin │ ├── CapStatsV2.5.jar │ ├── CapStatsV2.6.jar │ ├── CovFasta_Generator.jar │ ├── Java_code │ │ ├── CaptureStatsBAM5_extended.java │ │ ├── ParseOpts.java │ │ ├── WGS_Stats_v1.java │ │ ├── picard-1.88.jar │ │ └── sam-1.88.jar │ ├── WGSStats_v1.1.jar │ └── WGSStats_v1.jar ├── change_log.txt ├── check_HGNC_individual_CCDSDB.pl ├── check_HGNC_individual_RefSeqDB.pl ├── check_HGNC_individual_VEGADB.pl ├── check_HGNC_individual_mirnaDB.pl ├── creat_bed_UCSC_coding.pl ├── database.tgz ├── external_programs │ └── BEDTools.v2.17.0.tar.gz ├── miRBASE_r20.gff2 ├── reformat.pl ├── setup.sh └── update_databases.sh └── README.md /ExCID_v2.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.0.zip -------------------------------------------------------------------------------- /ExCID_v2.1/Config.txt: -------------------------------------------------------------------------------- 1 | DataBaseDir= 2 | AnnotationDir= 3 | -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.BatchScript_v2.1-threading_Final.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/ExCID.BatchScript_v2.1-threading_Final.pl -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.grep_gene_list_pct_Final-miRNA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use warnings; 4 | use diagnostics; 5 | use Getopt::Long; 6 | use File::Path; 7 | use Time::localtime; 8 | use Fcntl qw(:flock); 9 | use File::Basename; 10 | 11 | 12 | unless (scalar @ARGV > 0){USAGE(); exit; } 13 | 14 | my %opt; 15 | my $script_dir; 16 | my $script_dir_tmp = $0; 17 | $script_dir_tmp =~m/^.+\//; 18 | $script_dir=$&; 19 | 20 | my @tmp = `grep "DataBaseDir=" $script_dir/Config.txt`; 21 | $tmp[0]=~s/DataBaseDir=//; 22 | chomp($tmp[0]); 23 | my $database_dir = $tmp[0]; 24 | 25 | my $database1 = "$database_dir/refGene.txt.bed-exon_HGNC.bed"; 26 | my $database2 = "$database_dir/ccdsGene.txt.bed-exon_HGNC.bed"; 27 | my $database3 = "$database_dir/vegaGene.txt.bed-exon_HGNC.bed"; 28 | my $database4 = "$database_dir/miRBASE_r20_HGNC.bed"; 29 | my $combined=''; 30 | 31 | 32 | ### OPTIONS ### 33 | GetOptions('i:s' => \$opt{i}, 'l:s' => \$opt{l}, 'list:s' => \$opt{g}, 'db:s' => \$opt{db}) || &USAGE; 34 | my $bed = $opt{i} || "null"; 35 | my $low_cov_file = $opt{l} || "null"; 36 | my $gene_list = $opt{g} || "null"; 37 | my $targeted_database = $opt{db} || "null" ; 38 | my $pct_file; 39 | my @genes_lest; 40 | 41 | 42 | if ($gene_list eq "null" && $targeted_database eq "null") { 43 | print STDERR "Please provide either list of genes (HGNC Symbols) or the targeted Gene database. 
Check the documentation for database format\n"; 44 | exit; 45 | } 46 | 47 | if ($bed eq "null") { 48 | print STDERR "Please provide Targeted Bedfile.\n"; 49 | exit; 50 | } 51 | 52 | if($low_cov_file eq "null"){ 53 | print STDERR "Please provide inadequately covered bases in bed file format.\n"; 54 | exit; 55 | } 56 | 57 | 58 | 59 | 60 | if($targeted_database && -e $targeted_database ){ 61 | print STDERR "Targeted Data base provided. Obtaining Gene percent coverages.\n"; 62 | $pct_file = genes_check_db($low_cov_file, $bed, $targeted_database); 63 | exit 0; 64 | }else{ 65 | print STDERR "Some issue with the run. Please check the commands.\n"; 66 | exit -1; 67 | } 68 | 69 | 70 | 71 | ########## SUBROUTINES ########### 72 | sub USAGE { 73 | print "\nUSAGE: $0 -i -l -list -db \n"; 74 | print " -i: Annotated bed file\n"; 75 | print " -l: Low cov bed file\n"; 76 | print " -list: List of genes. one per line\n"; 77 | print " -db: Database of interested Genes. Please see documentation for database format. \n"; 78 | exit; 79 | } 80 | 81 | sub genes_check_db { 82 | 83 | my ($low_cov_file, $anno_bed, $db_list) = @_; 84 | my $db_name = basename($db_list); 85 | my $out_file = $low_cov_file; 86 | $out_file=~ s{.*/}{}; # remove path 87 | $out_file.="-miRBASE-anno.bed"; 88 | my $nottargeted_rfs = $anno_bed; 89 | $nottargeted_rfs=~ s{.*/}{}; # remove path 90 | $nottargeted_rfs.="-notTargeted_in_DB-$out_file.bed"; 91 | my $tmp = $out_file; 92 | $tmp=~ s{.*/}{}; # remove path 93 | $tmp=~s/\.bed$//; 94 | $combined = $nottargeted_rfs; 95 | $combined=~s/\-$out_file//; 96 | $combined=~s/\.bed$//; 97 | $combined.="-$tmp.bed"; 98 | 99 | system("$script_dir/bin/bedtools subtract -a $db_list -b $anno_bed > $nottargeted_rfs"); 100 | system("$script_dir/bin/bedtools intersect -a $db_list -b $low_cov_file > $out_file"); 101 | system("cat $nottargeted_rfs $out_file | $script_dir/bin/bedtools sort -i > $combined"); 102 | system("$script_dir/bin/bedtools merge -i $combined > $combined-tmp"); 103 | system("$script_dir/bin/bedtools intersect -a $db_list -b $combined-tmp > $combined"); 104 | my $db_list_transcript_size = get_Transcirpt_size_db($db_list); 105 | my $out_FILE = get_Transcirpt_size($low_cov_file,$combined, $db_list_transcript_size); 106 | my $rm_file = "$combined-tmp"; 107 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 108 | $rm_file = $out_file; 109 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 110 | $rm_file = $nottargeted_rfs; 111 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 112 | $rm_file = "$combined"; 113 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 114 | 115 | $db_name=~ s{\.[^.]+$}{}; # removes extension 116 | system("mv $out_FILE $db_name-pct.txt"); 117 | return "$db_name-pct.txt"; 118 | 119 | return $out_FILE; 120 | } 121 | 122 | sub get_Transcirpt_size{ 123 | my ($low_cov_file,$combined,$db)=@_; 124 | 125 | my $tmp = $combined; 126 | $tmp=~ s{.*/}{}; # remove path 127 | $tmp=~s/\.bed$//; 128 | 129 | 130 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 131 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 132 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 133 | 134 | while(<$in>) 135 | { 136 | chomp; my $line = $_; 137 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 138 | my $exonsize = $stop - $start + 1; 139 | my @tranarray = split(/\;/, $gene); 140 | my $arraysize = scalar(@tranarray) - 1; 141 | for (my $j=0; $j<=$arraysize; $j++) { 142 | print $out 
"$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 143 | } 144 | } 145 | 146 | close($in); 147 | close($out); 148 | my $outFile = get_pct($outfile_final,$db); 149 | return $outFile; 150 | } 151 | 152 | sub get_Transcirpt_size_db{ 153 | my ($combined)=@_; 154 | 155 | my $tmp = $combined; 156 | $tmp=~ s{.*/}{}; # remove path 157 | $tmp=~s/\.bed$//; 158 | 159 | 160 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 161 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 162 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 163 | 164 | while(<$in>) 165 | { 166 | chomp; my $line = $_; 167 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 168 | my $exonsize = $stop - $start + 1; 169 | my @tranarray = split(/\;/, $gene); 170 | my $arraysize = scalar(@tranarray) - 1; 171 | for (my $j=0; $j<=$arraysize; $j++) { 172 | print $out "$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 173 | } 174 | } 175 | 176 | close($in); 177 | close($out); 178 | return $outfile_final; 179 | } 180 | 181 | sub get_pct{ 182 | 183 | my ($infile,$control) =@_; 184 | 185 | 186 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 187 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 188 | 189 | my $outfile = $infile; 190 | $outfile=~s/\_transcriptSIZE\.txt$/\_pct\.txt/; 191 | my %control; 192 | my %file_data; 193 | open(my $fh,"<$control") or die $!; 194 | 195 | while (my $line = <$fh>) { 196 | chomp($line); 197 | my @data1 = split("\t",$line); 198 | #print STDERR $data1[0]."\n"; 199 | my @data = split(/\|/,$data1[0]); 200 | if (scalar @data1 ==1) { 201 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = 0; 202 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 203 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 204 | } 205 | if (scalar @data1 ==5) { 206 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = $data1[1]; 207 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 208 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 209 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"chr"} = $data1[2]; 210 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"start"} = $data1[3]; 211 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"stop"} = $data1[4]; 212 | } 213 | } 214 | close($fh); 215 | 216 | open(my $fh1,"<$infile") or die $!; 217 | while (my $line = <$fh1>) { 218 | chomp($line); 219 | my @data1 = split("\t",$line); 220 | my @data = split(/\|/,$data1[0]); 221 | if (scalar @data1 ==1) { 222 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = 0; 223 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 224 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 225 | } 226 | if (scalar @data1 ==5) { 227 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = $data1[1]; 228 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 229 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 230 | } 231 | } 232 | close($fh1); 233 | 234 | my @low_region_keys = keys %file_data; 235 | open(my $fho,">$outfile") or die $!; 236 | 237 | foreach my $key (keys %control){ 238 | my @key_split = split(/_/,$key); 239 | my $no_match = 1; 240 | 241 | my @matches = grep { /$key_split[0]/ } @low_region_keys; 242 | foreach my $match (@matches){ 243 | my @low_key_split = split(/_/,$match); 244 | 245 | if (($key_split[0] eq $low_key_split[0]) && ($key_split[1] eq $low_key_split[1]) && $key_split[2] <= $low_key_split[2]) { 246 | my $tmp = $file_data{$match}{"val"}/$control{$key}{"val"}; 247 | 
my $pct = sprintf("%.3f",(1-$tmp)*100); 248 | 249 | my $grep_key = $control{$key}{"Gene"}."|".$control{$key}{"ID"}; 250 | my @regions = `grep -w \"$grep_key\" $combined `; 251 | my $lowcov_coords=''; 252 | foreach my $lowcov_region (@regions){ 253 | my @tmp = split ("\t",$lowcov_region); 254 | if ("$tmp[0]" eq $control{$key}{"chr"} && $tmp[1]>= $control{$key}{"start"} && $tmp[2] <= $control{$key}{"stop"}) { 255 | $lowcov_coords.="$tmp[1]-$tmp[2];"; 256 | } 257 | } 258 | $lowcov_coords=~s/;$//; 259 | 260 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t".$control{$key}{"ID"}."\t1\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 261 | $no_match = 0; 262 | } 263 | 264 | } 265 | if ($no_match == 1) { 266 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t".$control{$key}{"ID"}."\t1\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 267 | } 268 | } 269 | close($fho); 270 | my $rm_file = $infile; 271 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 272 | 273 | return $outfile; 274 | } 275 | 276 | ############################################################################################################################## 277 | 278 | sub timestamp { 279 | my $t = localtime; 280 | return sprintf( "%04d-%02d-%02d_%02d-%02d-%02d", $t->year + 1900, $t->mon + 1, $t->mday, $t->hour, $t->min, $t->sec ); 281 | } -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.grep_gene_list_pct_Final.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use warnings; 4 | use diagnostics; 5 | use Getopt::Long; 6 | use File::Path; 7 | use Time::localtime; 8 | use Fcntl qw(:flock); 9 | use File::Basename; 10 | 11 | 12 | unless (scalar @ARGV > 0){USAGE(); exit; } 13 | 14 | my %opt; 15 | my $script_dir; 16 | my $script_dir_tmp = $0; 17 | $script_dir_tmp =~m/^.+\//; 18 | $script_dir=$&; 19 | 20 | my @tmp = `grep "DataBaseDir=" $script_dir/Config.txt`; 21 | $tmp[0]=~s/DataBaseDir=//; 22 | chomp($tmp[0]); 23 | my $database_dir = $tmp[0]; 24 | 25 | my $database1 = "$database_dir/refGene.txt.bed-exon_HGNC.bed"; 26 | my $database2 = "$database_dir/ccdsGene.txt.bed-exon_HGNC.bed"; 27 | my $database3 = "$database_dir/vegaGene.txt.bed-exon_HGNC.bed"; 28 | my $database4 = "$database_dir/miRBASE_r20_HGNC.bed"; 29 | my $combined=''; 30 | my $HGMD_db = "$database_dir/HGMD_2014_v4.bed"; 31 | 32 | 33 | ### OPTIONS ### 34 | my $check_HGMD; 35 | GetOptions('i:s' => \$opt{i}, 'l:s' => \$opt{l}, 'list:s' => \$opt{g}, 'db:s' => \$opt{db}, 'checkHGMD' => \$check_HGMD) || &USAGE; 36 | my $bed = $opt{i} || "null"; 37 | my $low_cov_file = $opt{l} || "null"; 38 | my $gene_list; 39 | $gene_list = $opt{g} if($opt{g}); 40 | my $targeted_database; 41 | $targeted_database = $opt{db} if($opt{db}) ; 42 | my $pct_file; 43 | my @genes_lest; 44 | 45 | 46 | if (!$gene_list && !$targeted_database) { 47 | print STDERR "Please provide either list of genes (HGNC Symbols) or the targeted Gene database. Check the documentation for database format\n"; 48 | exit; 49 | } 50 | 51 | if ($bed eq "null") { 52 | print STDERR "Please provide Targeted Bedfile.\n"; 53 | exit; 54 | } 55 | 56 | if($low_cov_file eq "null"){ 57 | print STDERR "Please provide inadequately covered bases in bed file format.\n"; 58 | exit; 59 | } 60 | 61 | 62 | 63 | if($targeted_database && -e $targeted_database && !$gene_list){ 64 | print STDERR "Targeted Data base provided. 
Obtaining Gene percent coverages.\n"; 65 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 66 | if($check_HGMD) { 67 | HGMDcheck($pct_file,@genes_lest) ; 68 | }else{ 69 | averageGene($pct_file,@genes_lest) ; 70 | } 71 | exit 0 ; 72 | }elsif($targeted_database && -e $targeted_database && $gene_list && -e $gene_list){ 73 | print STDERR "Targeted Data base provided. Obtaining Gene percent coverages.\n"; 74 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 75 | open(my $glfh, "< $gene_list") or die $!; 76 | @genes_lest= <$glfh>; 77 | close($glfh); 78 | if($check_HGMD) { 79 | HGMDcheck($pct_file,@genes_lest) ; 80 | }else{ 81 | averageGene($pct_file,@genes_lest) ; 82 | } 83 | exit 0 ; 84 | }elsif($gene_list && -e $gene_list && !$targeted_database){ 85 | open(my $glfh, "< $gene_list") or die $!; 86 | @genes_lest= <$glfh>; 87 | close($glfh); 88 | }else{ 89 | print STDERR "Some issue with the run. Please check the commands.\n"; 90 | } 91 | 92 | my $output_dir; 93 | my $output_dir_tmp = $low_cov_file; 94 | $output_dir_tmp =~m/^.+\//; 95 | $output_dir=$&; 96 | 97 | $targeted_database = $output_dir."/".basename($gene_list)."-database.bed"; 98 | my $targeted_miRBASE = $output_dir."/".basename($gene_list)."-miRBASE.bed"; 99 | open(my $gldb, "> $targeted_database") or die $!; 100 | open(my $gldbm, "> $targeted_miRBASE") or die $!; 101 | my $mirfound = 0; 102 | foreach my $gene (@genes_lest){ 103 | chomp($gene); 104 | next unless(length($gene) > 0); 105 | my $found = 0; 106 | my @regions = `grep -w -P \"\t$gene\$\" $database1 `; 107 | if (scalar(@regions) > 0) { 108 | foreach my $reg (@regions){ 109 | chomp($reg); 110 | my @tmp_split = split("\t",$reg); 111 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[5]|$tmp_split[4]\n" ; 112 | } 113 | $found = 1; 114 | } 115 | 116 | @regions = `grep -w -P \"\t$gene\$\" $database2 `; 117 | if (scalar(@regions) > 0) { 118 | foreach my $reg (@regions){ 119 | chomp($reg); 120 | my @tmp_split = split("\t",$reg); 121 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[4]|$tmp_split[3]\n" ; 122 | } 123 | $found = 1; 124 | } 125 | 126 | @regions = `grep -w -P \"\t$gene\$\" $database3 `; 127 | if (scalar(@regions) > 0) { 128 | foreach my $reg (@regions){ 129 | chomp($reg); 130 | my @tmp_split = split("\t",$reg); 131 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[5]|$tmp_split[4]\n" ; 132 | } 133 | $found = 1; 134 | } 135 | 136 | @regions = `grep -w -P \"\t$gene\$\" $database4 `; 137 | if (scalar(@regions) > 0) { 138 | foreach my $reg (@regions){ 139 | chomp($reg); 140 | my @tmp_split = split("\t",$reg); 141 | print $gldbm "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[4]|$tmp_split[3]\n" ; 142 | } 143 | $found = 1; 144 | $mirfound = 1; 145 | } 146 | 147 | if ($found != 1) { 148 | print STDERR "$gene is not a HGNC symbol. 
Please check the Gene or update the databases.\n"; 149 | } 150 | 151 | } 152 | 153 | close($gldb); 154 | close($gldbm); 155 | 156 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 157 | 158 | open(my $glfh, "< $gene_list") or die $!; 159 | @genes_lest= <$glfh>; 160 | close($glfh); 161 | 162 | if($check_HGMD) { 163 | HGMDcheck($pct_file,@genes_lest) ; 164 | }else{ 165 | averageGene($pct_file,@genes_lest) ; 166 | } 167 | 168 | if ($mirfound == 1) { 169 | system("$script_dir/ExCID.grep_gene_list_pct_Final-miRNA.pl -i $bed -l $low_cov_file -db $targeted_miRBASE "); 170 | } 171 | 172 | 173 | 174 | ########## SUBROUTINES ########### 175 | sub USAGE { 176 | print "\nUSAGE: $0 -i -l -list -db \n"; 177 | print " -i: Annotated bed file\n"; 178 | print " -l: Low cov bed file\n"; 179 | print " -list: List of genes. one per line\n"; 180 | print " -checkHGMD: Output only HGMD Transcripts if present or Average among all the transcripts.\n"; 181 | print " -db: Database of interested Genes. Please see documentation for database format. \n"; 182 | exit; 183 | } 184 | 185 | sub genes_check_db { 186 | 187 | my ($low_cov_file, $Bed_file, $db_list) = @_; 188 | 189 | my $output_file_name = basename($low_cov_file); 190 | $output_file_name=~ s{\.[^.]+$}{}; # removes extension 191 | my $output_dir; 192 | my $output_dir_tmp = $low_cov_file; 193 | $output_dir_tmp =~m/^.+\//; 194 | $output_dir=$&; 195 | my $db_name = basename($db_list); 196 | my $out_file = $output_file_name; 197 | $out_file=~ s{.*/}{}; # remove path 198 | $out_file.="-$db_name.bed"; 199 | my $nottargeted = $Bed_file; 200 | $nottargeted=~ s{.*/}{}; # remove path 201 | $nottargeted.="-notTrgtdin-$db_name-$output_file_name.bed"; 202 | my $tmp = $out_file; 203 | $tmp=~ s{.*/}{}; # remove path 204 | $tmp=~s/\.bed$//; 205 | $combined = $nottargeted; 206 | $combined=~s/\-$output_file_name//; 207 | $combined=~s/\.bed$//; 208 | $combined.="-$tmp.bed"; 209 | 210 | system("$script_dir/bin/bedtools subtract -a $db_list -b $Bed_file > $nottargeted"); 211 | system("$script_dir/bin/bedtools intersect -a $db_list -b $low_cov_file > $out_file"); 212 | system("cat $nottargeted $out_file | $script_dir/bin/bedtools sort -i > $combined"); 213 | system("$script_dir/bin/bedtools merge -i $combined > $combined-tmp"); 214 | system("$script_dir/bin/bedtools intersect -a $db_list -b $combined-tmp > $combined"); 215 | my $db_list_transcript_size = get_Transcirpt_size_db($low_cov_file,$db_list); 216 | my $out_FILE = get_Transcirpt_size($low_cov_file,$combined, $db_list_transcript_size); 217 | 218 | my $rm_file = "$combined-tmp"; 219 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 220 | $rm_file = $out_file; 221 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 222 | $rm_file = $nottargeted; 223 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 224 | $rm_file = "$combined"; 225 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 226 | $rm_file = $db_list_transcript_size; 227 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 228 | $rm_file=~s/\.txt$/\_perExon\.txt/; 229 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 230 | 231 | 232 | $db_name=~ s{\.[^.]+$}{}; # removes extension 233 | system("mv $out_FILE $output_dir/$output_file_name-$db_name-pct.txt"); 234 | $out_FILE=~s/\_pct\.txt$/\_perExon\_pct\.txt/; 235 | system("mv $out_FILE $output_dir/$output_file_name-$db_name-perExon_pct.txt"); 236 | return "$output_dir/$output_file_name-$db_name-pct.txt"; 237 | } 238 | 239 | sub get_Transcirpt_size{ 240 | my 
($low_cov_file,$combined,$db)=@_; 241 | 242 | my $tmp = $combined; 243 | $tmp=~ s{.*/}{}; # remove path 244 | $tmp=~s/\.bed$//; 245 | 246 | 247 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 248 | my $outfile_final_exon = "$tmp"."_transcriptSIZE_perExon.txt"; 249 | my %sizehash; 250 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 251 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 252 | open(my $outex, ">$outfile_final_exon") || die "Can't open $outfile_final_exon: $!\n"; 253 | 254 | while(<$in>) 255 | { 256 | chomp; my $line = $_; 257 | next unless (length($line) != 0); 258 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 259 | my $exonsize = $stop - $start + 1; 260 | my @tranarray = split(/\;/, $gene); 261 | my $arraysize = scalar(@tranarray) - 1; 262 | for (my $j=0; $j<=$arraysize; $j++) { 263 | print $outex "$tranarray[$j]\t$exonsize\n"; 264 | my @tmp = split(/\_/, $tranarray[$j]); 265 | my $unit = join("_",@tmp[0..(scalar(@tmp)-3)]); 266 | my $cds = $tmp[(scalar(@tmp)-2)]; 267 | my $exon = $tmp[(scalar(@tmp)-1)]; 268 | push @{$sizehash{$unit}}, $exonsize; 269 | } 270 | } 271 | 272 | foreach my $key ( keys %sizehash ) 273 | { 274 | my $total_size = eval join '+', @{$sizehash{$key}}; 275 | print $out "$key\t$total_size\n"; 276 | } 277 | 278 | close($in); 279 | close($out); 280 | close($outex); 281 | my $outFile = get_pct($outfile_final,$db); 282 | my $outFile_perExon = get_pct_perExon($outfile_final,$db); 283 | return $outFile; 284 | } 285 | 286 | sub get_Transcirpt_size_db{ 287 | my ($low_cov_file, $db_list)=@_; 288 | 289 | my $tmp = $low_cov_file; 290 | $tmp=~ s{\.[^.]+$}{}; # removes extension 291 | 292 | my $tmp1 = basename($db_list); 293 | $tmp.="-$tmp1"; 294 | $tmp=~s/\.bed$//; 295 | 296 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 297 | my $outfile_final_exon = "$tmp"."_transcriptSIZE_perExon.txt"; 298 | my %sizehash; 299 | open(my $in,"< $db_list") || die "Can't open $db_list: $!\n"; 300 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 301 | open(my $outex, ">$outfile_final_exon") || die "Can't open $outfile_final_exon: $!\n"; 302 | 303 | while(<$in>) 304 | { 305 | chomp; my $line = $_; 306 | unless(length($line) != 0){next;} 307 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 308 | my $exonsize = $stop - $start + 1; 309 | my @tranarray = split(/\;/, $gene); 310 | my $arraysize = scalar(@tranarray) - 1; 311 | for (my $j=0; $j<=$arraysize; $j++) { 312 | print $outex "$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 313 | my @tmp = split(/\_/, $tranarray[$j]); 314 | my $unit = join("_",@tmp[0..(scalar(@tmp)-3)]); 315 | my $cds = $tmp[(scalar(@tmp)-2)]; 316 | my $exon = $tmp[(scalar(@tmp)-1)]; 317 | push @{$sizehash{$unit}{'exonsize'}}, $exonsize; 318 | $sizehash{$unit}{'chr'} = $chr; 319 | } 320 | } 321 | 322 | foreach my $key ( keys %sizehash ) 323 | { 324 | my $total_size = eval join '+', @{$sizehash{$key}{'exonsize'}}; 325 | print $out "$key\t$total_size\t".$sizehash{$key}{'chr'}."\t".scalar(@{$sizehash{$key}{'exonsize'}})."\n"; 326 | my @tmp= split(/\|/,$key); 327 | push @genes_lest,$tmp[0]; 328 | } 329 | 330 | close($in); 331 | close($out); 332 | close($outex); 333 | return $outfile_final; 334 | } 335 | 336 | sub get_pct{ 337 | 338 | my ($infile,$control) =@_; 339 | 340 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 341 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 342 | 343 | my $outfile = $infile; 344 | 
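# NOTE (added commentary, not in upstream ExCID): the report name is derived by
# swapping suffixes via the substitution below, e.g. a hypothetical
# Sample1_transcriptSIZE.txt becomes Sample1_pct.txt.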
$outfile=~s/\_transcriptSIZE\.txt$/\_pct\.txt/; 345 | my %control; 346 | my %file_data; 347 | open(my $fh,"<$control") or die $!; 348 | 349 | while (my $line = <$fh>) { 350 | chomp($line); 351 | my @data1 = split("\t",$line); 352 | #print STDERR $data1[0]."\n"; 353 | my @data = split(/\|/,$data1[0]); 354 | #print STDERR $data[0]."\n"; 355 | if (scalar @data1 ==1) { 356 | $control{$data1[0]}{"val"} = 0; 357 | $control{$data1[0]}{"Gene"} = $data[0]; 358 | } 359 | if (scalar @data1 ==4) { 360 | $control{$data1[0]}{"val"} = $data1[1]; 361 | $control{$data1[0]}{"Gene"} = $data[0]; 362 | $control{$data1[0]}{"chr"} = $data1[2]; 363 | $control{$data1[0]}{"exons"} = $data1[3]; 364 | } 365 | } 366 | close($fh); 367 | 368 | open(my $fh1,"<$infile") or die $!; 369 | while (my $line = <$fh1>) { 370 | chomp($line); 371 | my @data1 = split("\t",$line); 372 | my @data = split(/\|/,$data1[0]); 373 | #print STDERR scalar @data."\n"; 374 | if (scalar @data1 ==1) { 375 | $file_data{$data1[0]}{"val"} = 0; 376 | $file_data{$data1[0]}{"Gene"} = $data[0]; 377 | } 378 | if (scalar @data1 ==2) { 379 | $file_data{$data1[0]}{"val"} = $data1[1]; 380 | $file_data{$data1[0]}{"Gene"} = $data[0]; 381 | } 382 | } 383 | close($fh1); 384 | 385 | open(my $fho,">$outfile") or die $!; 386 | 387 | foreach my $key (keys %control){ 388 | my @key_split = split(/\|/,$key); 389 | print STDERR "$key\n" unless($key_split[1]); 390 | if (exists $file_data{$key}) { 391 | my $tmp = $file_data{$key}{"val"}/$control{$key}{"val"}; 392 | my $pct = sprintf("%.3f",(1-$tmp)*100); 393 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t$key_split[1]\t".$control{$key}{"val"}."\t".$control{$key}{"exons"}."\t$pct%\n"; 394 | }else { 395 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t$key_split[1]\t".$control{$key}{"val"}."\t".$control{$key}{"exons"}."\t100.000%\n"; 396 | } 397 | } 398 | close($fho); 399 | my $rm_file = $infile; 400 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 401 | 402 | return $outfile; 403 | } 404 | 405 | 406 | 407 | sub get_pct_perExon{ 408 | 409 | my ($infile,$control) =@_; 410 | 411 | $control=~s/\.txt$/\_perExon\.txt/; 412 | $infile=~s/\.txt$/\_perExon\.txt/; 413 | 414 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 415 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 416 | 417 | my $outfile_pex = $infile; 418 | $outfile_pex=~s/\_transcriptSIZE\_perExon\.txt/\_perExon\_pct\.txt/; 419 | 420 | my %control; 421 | my %file_data; 422 | 423 | open(my $fh,"<$control") or die $!; 424 | 425 | while (my $line = <$fh>) { 426 | chomp($line); 427 | my @data1 = split("\t",$line); 428 | #print STDERR $data1[0]."\n"; 429 | if (scalar @data1 ==1) { 430 | $control{$data1[0]}{"val"} = 0; 431 | } 432 | if (scalar @data1 ==5) { 433 | $control{$data1[0]}{"val"} = $data1[1]; 434 | $control{$data1[0]}{"chr"} = $data1[2]; 435 | $control{$data1[0]}{"start"} = $data1[3]; 436 | $control{$data1[0]}{"stop"} = $data1[4]; 437 | } 438 | } 439 | close($fh); 440 | 441 | open(my $fh1,"<$infile") or die $!; 442 | while (my $line = <$fh1>) { 443 | chomp($line); 444 | my @data1 = split("\t",$line); 445 | if (scalar @data1 ==1) { 446 | $file_data{$data1[0]}{"val"} = 0; 447 | } 448 | if (scalar @data1 ==2) { 449 | $file_data{$data1[0]}{"val"} = $data1[1]; 450 | } 451 | } 452 | close($fh1); 453 | 454 | open(my $fho,">$outfile_pex") or die $!; 455 | 456 | foreach my $key (keys %control){ 457 | my @key_split = split(/\|/,$key); 458 | my @NM_details = split("_",$key_split[1]); 459 
| if (exists $file_data{$key}) { 460 | my $tmp = $file_data{$key}{"val"}/$control{$key}{"val"}; 461 | my $pct = sprintf("%.3f",(1-$tmp)*100); 462 | 463 | my @regions = `grep -w \"$key\" $combined`; 464 | my $lowcov_coords=''; 465 | foreach my $lowcov_region (@regions){ 466 | my @tmp = split ("\t",$lowcov_region); 467 | $lowcov_coords.="$tmp[1]-$tmp[2];"; 468 | } 469 | $lowcov_coords=~s/;$//; 470 | if (scalar(@NM_details)==4) { 471 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]_$NM_details[1]\t$NM_details[2]_$NM_details[3]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 472 | }else{ 473 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]\t$NM_details[1]_$NM_details[2]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 474 | } 475 | }else { 476 | if (scalar(@NM_details)==4) { 477 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]_$NM_details[1]\t$NM_details[2]_$NM_details[3]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 478 | }else{ 479 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]\t$NM_details[1]_$NM_details[2]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 480 | } 481 | 482 | } 483 | } 484 | close($fho); 485 | my $rm_file = $infile; 486 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 487 | 488 | return $outfile_pex; 489 | } 490 | 491 | 492 | ####### Added Feb 5th ######### 493 | sub HGMDcheck { 494 | my ($pct_file,@genes_lest) = @_; 495 | my %genes =(); 496 | my $outfile=$pct_file; 497 | $outfile=~s/-pct\.txt/-pctCov\.txt/; 498 | foreach my $gene (@genes_lest) { 499 | chomp($gene); 500 | $gene=~s/\(/\\\(/; 501 | $gene=~s/\)/\\\)/; 502 | my @grep_gene = `grep -w -P \"\t$gene\t\" $pct_file`; 503 | if ((scalar @grep_gene) == 0) { 504 | print STDERR "$gene is not present in the provided database\n"; 505 | } 506 | 507 | foreach my $gene_trans (@grep_gene){ 508 | chomp($gene_trans); 509 | my @tmp = split("\t",$gene_trans); 510 | my $is_HGMD = `grep -w -P \"\t$tmp[2]\$\" $HGMD_db`; 511 | if ($is_HGMD) { 512 | $genes{$tmp[1]}{$tmp[2]}{'is_HGMD'} = "true"; 513 | }else{ 514 | $genes{$tmp[1]}{$tmp[2]}{'is_HGMD'} = "false"; 515 | } 516 | $genes{$tmp[1]}{$tmp[2]}{'line'} = $gene_trans; 517 | $genes{$tmp[1]}{$tmp[2]}{'cov'} = $tmp[5]; 518 | $genes{$tmp[1]}{$tmp[2]}{'cov'}=~ s/%$//; 519 | } 520 | } 521 | 522 | open(my $fho," > $outfile") or die $!; 523 | 524 | #my $total_genes = keys %genes; 525 | #print STDERR "$total_genes\n"; 526 | foreach my $gene (sort keys %genes){ 527 | my $written = 0; 528 | foreach my $transcript (sort keys %{$genes{$gene}}){ 529 | if ($genes{$gene}{$transcript}{'is_HGMD'} eq "true") { 530 | print $fho "$gene\t$transcript\t$genes{$gene}{$transcript}{'cov'}\tHGMD\n"; 531 | $written=1; 532 | } 533 | 534 | } 535 | #print STDERR "$gene\t$no_trans\t$written\n"; 536 | if ($written == 0) { 537 | my $print = "$gene\t"; 538 | my $avg_cov = 0; 539 | my $no_trans = 0; 540 | foreach my $transcript (keys %{$genes{$gene}}){ 541 | $no_trans++; 542 | $avg_cov+=$genes{$gene}{$transcript}{'cov'}; 543 | $print .= "$transcript($genes{$gene}{$transcript}{'cov'});"; 544 | } 545 | $print=~ s/;$//; 546 | $avg_cov = sprintf("%0.2f",($avg_cov/$no_trans)); 547 | print $fho "$print\t$avg_cov\n"; 548 | } 549 | } 550 | 551 | close($fho); 552 | } 553 | ####### Added Feb 5th ######### 554 | 555 | ####### Added Feb 10th ######### 556 | sub averageGene { 557 | my ($pct_file,@genes_lest) = @_; 558 | my 
%genes =(); 559 | my $outfile=$pct_file; 560 | $outfile=~s/-pct\.txt/-pctCov\.txt/; 561 | foreach my $gene (@genes_lest) { 562 | chomp($gene); 563 | $gene=~s/\(/\\\(/; 564 | $gene=~s/\)/\\\)/; 565 | my @grep_gene = `grep -w -P \"\t$gene\t\" $pct_file`; 566 | if ((scalar @grep_gene) == 0) { 567 | print STDERR "$gene is not present in the provided database\n"; 568 | } 569 | 570 | foreach my $gene_trans (@grep_gene){ 571 | chomp($gene_trans); 572 | my @tmp = split("\t",$gene_trans); 573 | $genes{$tmp[1]}{$tmp[2]}{'line'} = $gene_trans; 574 | $genes{$tmp[1]}{$tmp[2]}{'cov'} = $tmp[5]; 575 | $genes{$tmp[1]}{$tmp[2]}{'cov'}=~ s/%$//; 576 | } 577 | } 578 | 579 | open(my $fho," > $outfile") or die $!; 580 | 581 | #my $total_genes = keys %genes; 582 | #print STDERR "$total_genes\n"; 583 | foreach my $gene (sort keys %genes){ 584 | my $print = "$gene\t"; 585 | my $avg_cov = 0; 586 | my $no_trans = 0; 587 | foreach my $transcript (keys %{$genes{$gene}}){ 588 | $no_trans++; 589 | $avg_cov+=$genes{$gene}{$transcript}{'cov'}; 590 | $print .= "$transcript($genes{$gene}{$transcript}{'cov'});"; 591 | } 592 | $print=~ s/;$//; 593 | $avg_cov = sprintf("%0.2f",($avg_cov/$no_trans)); 594 | print $fho "$print\t$avg_cov\n"; 595 | } 596 | 597 | close($fho); 598 | 599 | } 600 | ####### Added Feb 10th ######### 601 | 602 | ############################################################################################################################## 603 | 604 | sub timestamp { 605 | my $t = localtime; 606 | return sprintf( "%04d-%02d-%02d_%02d-%02d-%02d", $t->year + 1900, $t->mon + 1, $t->mday, $t->hour, $t->min, $t->sec ); 607 | } -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID_v2.1-Batch.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/ExCID_v2.1-Batch.pl -------------------------------------------------------------------------------- /ExCID_v2.1/Get_HGNC.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use LWP::Simple; 4 | my $url = 'http://www.genenames.org/cgi-bin/download?'. 5 | 'col=gd_app_sym&'. 6 | 'col=gd_app_name&'. 7 | 'col=gd_status&'. 8 | 'col=gd_prev_sym&'. 9 | 'col=gd_aliases&'. 10 | 'col=gd_name_aliases&'. 11 | 'col=gd_pub_chrom_map&'. 12 | 'col=gd_pub_acc_ids&'. 13 | 'col=gd_pub_ensembl_id&'. 14 | 'col=gd_pub_refseq_ids&'. 15 | 'col=gd_ccds_ids&'. 16 | 'col=gd_vega_ids&'. 17 | 'col=md_mim_id&'. 18 | 'col=md_ucsc_id&'. 19 | 'status=Approved&'. 20 | 'status_opt=2&'. 21 | 'where=&'. 22 | 'order_by=gd_app_sym_sort&'. 23 | 'format=text&'. 24 | 'limit=&'. 25 | 'submit=submit'; 26 | getprint($url); 27 | -------------------------------------------------------------------------------- /ExCID_v2.1/bed_file-annotator_V2_CCDS-miRBASE.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ## Annotator for BED file.
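# NOTE (added commentary, not in upstream ExCID): this script annotates a BED
# file against one annotation database. It links the BED into the output
# directory, runs "bedtools intersect -wao" and keeps the appended
# gene/transcript columns with awk, collapses duplicate overlaps per interval
# into HGNC-gene / transcript / other-gene fields, then sorts the result.
# Illustrative invocation (the input BED name and output directory are
# examples only; the database name is one of the shipped defaults):
#   perl bed_file-annotator_V2_CCDS-miRBASE.pl capture.bed \
#       ccdsGene.txt.bed-exon_HGNC.bed /path/to/output_dir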
3 | use strict; 4 | use diagnostics; 5 | use Getopt::Std; 6 | use File::Basename; 7 | 8 | ### GLOBAL VARS ### 9 | my $bed = $ARGV[0] || &USAGE; 10 | my $database = $ARGV[1] || &USAGE; 11 | my $output_dir = $ARGV[2] || dirname($bed); 12 | my $db_name = basename($database); 13 | my $script_dir; 14 | my $script_dir_tmp = $0; 15 | $script_dir_tmp =~m/^.+\//; 16 | $script_dir=$&; 17 | 18 | system("ln -s $bed $output_dir/"); 19 | 20 | $bed = "$output_dir/".basename($bed); 21 | my %bedfile; 22 | 23 | open(my $fhb,"<$bed") or die $!; 24 | my $line = <$fhb>; 25 | chomp($line); 26 | my @tmp = split("\t",$line); 27 | my $cols = scalar @tmp; 28 | close($fhb); 29 | 30 | my $cmd=""; 31 | 32 | for(my $i =0; $i < $cols; $i++){ 33 | my $tmp = $i+1; 34 | $cmd .= "\$$tmp\"\t\""; 35 | } 36 | $cmd .= "\$".($cols+3+1)."\"\t\""."\$".($cols+3+1+1)." "; 37 | 38 | 39 | system("$script_dir/bin/bedtools intersect -a $bed -b $database -wao | awk -F\$\'\t\' '{print $cmd}' > $bed.$db_name.Annotated"); 40 | 41 | my %data_annotation; 42 | open(my $fh, "< $bed.$db_name.Annotated") or die $!; 43 | while (my $line = <$fh>) { 44 | chomp($line); 45 | $line=~s/\t-1\t/\t.\t/; 46 | my @columns = split("\t",$line); 47 | my $key = "$columns[0]_$columns[1]_$columns[2]"; 48 | unless (exists $data_annotation{$key}){ 49 | $data_annotation{$key}{'HGNC_gene'} = ""; 50 | $data_annotation{$key}{'transcript'} = ""; 51 | $data_annotation{$key}{'other_Genes'} = ""; 52 | $data_annotation{$key}{'rest'} = join("\t", @columns[3..(scalar(@columns)-3)]); 53 | } 54 | 55 | if ($columns[scalar(@columns)-1] ne ".") { 56 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 57 | $data_annotation{$key}{'HGNC_gene'} = $columns[scalar(@columns)-1].";"; 58 | }else{ 59 | my $check = $columns[scalar(@columns)-1].";"; 60 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 61 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-1].";"; 62 | } 63 | } 64 | } 65 | 66 | if ($columns[scalar(@columns)-2] ne ".") { 67 | if (length($data_annotation{$key}{'transcript'}) == 0) { 68 | $data_annotation{$key}{'transcript'} = $columns[scalar(@columns)-2].";"; 69 | }else{ 70 | my $check = $columns[scalar(@columns)-2].";"; 71 | unless (index($data_annotation{$key}{'transcript'}, $check) != -1) { 72 | $data_annotation{$key}{'transcript'} .= $columns[scalar(@columns)-2].";"; 73 | } 74 | } 75 | } 76 | 77 | 78 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 79 | $data_annotation{$key}{'HGNC_gene'} ="."; 80 | } 81 | if (length($data_annotation{$key}{'transcript'}) == 0) { 82 | $data_annotation{$key}{'transcript'} ="."; 83 | } 84 | if (length($data_annotation{$key}{'other_Genes'}) == 0) { 85 | $data_annotation{$key}{'other_Genes'} ="."; 86 | } 87 | } 88 | close($fh); 89 | 90 | open(my $fho, ">$bed.$db_name.Annotated.edit") or die $!; 91 | foreach my $key (keys %data_annotation){ 92 | my @columns = split("_",$key); 93 | 94 | $data_annotation{$key}{'HGNC_gene'} =~ s/;$//; 95 | $data_annotation{$key}{'other_Genes'} =~ s/;$//; 96 | $data_annotation{$key}{'transcript'}=~ s/;$//; 97 | $data_annotation{$key}{'other_Genes'} =~ s/^\.// if ($data_annotation{$key}{'other_Genes'} ne "."); 98 | 99 | if (length($data_annotation{$key}{'rest'}) < 1) { 100 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t.\n"; 101 | $out_line=~s/\t-1\t/\t.\t/; 102 | print $fho $out_line; 103 | 
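# NOTE (added commentary, not in upstream ExCID): with -wao, bedtools reports
# intervals lacking a database hit with NULL B fields (".", -1, -1); the
# substitution above rewrites the stray -1 coordinate fields to "." so empty
# annotations print consistently.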
}else{ 104 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t$data_annotation{$key}{'rest'}\n"; 105 | $out_line=~s/\t-1\t/\t.\t/; 106 | print $fho $out_line; 107 | } 108 | } 109 | close($fho); 110 | 111 | 112 | 113 | system("$script_dir/bin/bedtools sort -i $bed.$db_name.Annotated.edit > $bed-$db_name"); 114 | 115 | my $rm_file = "$bed.$db_name.Annotated.edit"; 116 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 117 | $rm_file = "$bed.$db_name.Annotated"; 118 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 119 | 120 | 121 | ### SUBROUTINES ### 122 | sub USAGE { 123 | print "USAGE: $0 <bed file> <annotation database> [output dir]\n\n"; 124 | exit; 125 | } -------------------------------------------------------------------------------- /ExCID_v2.1/bed_file-annotator_V2_RefSeq-VEGA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ## Annotator for BED file. 3 | use strict; 4 | use diagnostics; 5 | use Getopt::Std; 6 | use File::Basename; 7 | 8 | ### GLOBAL VARS ### 9 | my $bed = $ARGV[0] || &USAGE; 10 | my $database = $ARGV[1] || &USAGE; 11 | my $output_dir = $ARGV[2] || dirname($bed); 12 | my $db_name = basename($database); 13 | my $script_dir; 14 | my $script_dir_tmp = $0; 15 | $script_dir_tmp =~m/^.+\//; 16 | $script_dir=$&; 17 | 18 | system("ln -s $bed $output_dir/"); 19 | 20 | $bed = "$output_dir/".basename($bed); 21 | my %bedfile; 22 | 23 | open(my $fhb,"<$bed") or die $!; 24 | my $line = <$fhb>; 25 | chomp($line); 26 | my @tmp = split("\t",$line); 27 | my $cols = scalar @tmp; 28 | close($fhb); 29 | 30 | my $cmd=""; 31 | 32 | for(my $i =0; $i < $cols; $i++){ 33 | my $tmp = $i+1; 34 | $cmd .= "\$$tmp\"\t\""; 35 | } 36 | $cmd .= "\$".($cols+3+1)."\"\t\""."\$".($cols+3+1+1)."\"\t\""."\$".($cols+3+1+1+1)." 
"; 37 | 38 | 39 | system("$script_dir/bin/bedtools intersect -a $bed -b $database -wao | awk -F\$\'\t\' '{print $cmd}' > $bed.$db_name.Annotated"); 40 | 41 | my %data_annotation; 42 | open(my $fh, "< $bed.$db_name.Annotated") or die $!; 43 | while (my $line = <$fh>) { 44 | chomp($line); 45 | $line=~s/\t-1\t/\t.\t/; 46 | my @columns = split("\t",$line); 47 | my $key = "$columns[0]_$columns[1]_$columns[2]"; 48 | unless (exists $data_annotation{$key}){ 49 | $data_annotation{$key}{'HGNC_gene'} = ""; 50 | $data_annotation{$key}{'transcript'} = ""; 51 | $data_annotation{$key}{'other_Genes'} = ""; 52 | $data_annotation{$key}{'rest'} = join("\t", @columns[3..(scalar(@columns)-4)]); 53 | } 54 | 55 | if ($columns[scalar(@columns)-1] ne ".") { 56 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 57 | $data_annotation{$key}{'HGNC_gene'} = $columns[scalar(@columns)-1].";"; 58 | }else{ 59 | my $check = $columns[scalar(@columns)-1].";"; 60 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 61 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-1].";"; 62 | } 63 | } 64 | } 65 | 66 | if ($columns[scalar(@columns)-2] ne ".") { 67 | if (length($data_annotation{$key}{'transcript'}) == 0) { 68 | $data_annotation{$key}{'transcript'} = $columns[scalar(@columns)-2].";"; 69 | }else{ 70 | my $check = $columns[scalar(@columns)-2].";"; 71 | unless (index($data_annotation{$key}{'transcript'}, $check) != -1) { 72 | $data_annotation{$key}{'transcript'} .= $columns[scalar(@columns)-2].";"; 73 | } 74 | } 75 | } 76 | 77 | 78 | if ($columns[scalar(@columns)-3] ne ".") { 79 | my $check = $columns[scalar(@columns)-3].";"; 80 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 81 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-3].";"; 82 | } 83 | } 84 | 85 | 86 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 87 | $data_annotation{$key}{'HGNC_gene'} ="."; 88 | } 89 | if (length($data_annotation{$key}{'transcript'}) == 0) { 90 | $data_annotation{$key}{'transcript'} ="."; 91 | } 92 | if (length($data_annotation{$key}{'other_Genes'}) == 0) { 93 | $data_annotation{$key}{'other_Genes'} ="."; 94 | } 95 | } 96 | close($fh); 97 | 98 | open(my $fho, ">$bed.$db_name.Annotated.edit") or die $!; 99 | foreach my $key (keys %data_annotation){ 100 | my @columns = split("_",$key); 101 | 102 | $data_annotation{$key}{'HGNC_gene'} =~ s/;$//; 103 | $data_annotation{$key}{'other_Genes'} =~ s/;$//; 104 | $data_annotation{$key}{'transcript'}=~ s/;$//; 105 | $data_annotation{$key}{'other_Genes'} =~ s/^\.// if ($data_annotation{$key}{'other_Genes'} ne "."); 106 | 107 | if (length($data_annotation{$key}{'rest'}) < 1) { 108 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t.\n"; 109 | $out_line=~s/\t-1\t/\t.\t/; 110 | print $fho $out_line; 111 | }else{ 112 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t$data_annotation{$key}{'rest'}\n"; 113 | $out_line=~s/\t-1\t/\t.\t/; 114 | print $fho $out_line; 115 | } 116 | } 117 | close($fho); 118 | 119 | 120 | 121 | system("$script_dir/bin/bedtools sort -i $bed.$db_name.Annotated.edit > $bed-$db_name"); 122 | 123 | my $rm_file = "$bed.$db_name.Annotated.edit"; 124 | unlink 
$rm_file or warn "Could not unlink $rm_file: $!"; 125 | $rm_file = "$bed.$db_name.Annotated"; 126 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 127 | 128 | 129 | ### SUBROUTINES ### 130 | sub USAGE { 131 | print "USAGE: $0 \n\n"; 132 | exit; 133 | } -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CapStatsV2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CapStatsV2.5.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CapStatsV2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CapStatsV2.6.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CovFasta_Generator.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CovFasta_Generator.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/Java_code/CaptureStatsBAM5_extended.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import java.util.*; 3 | 4 | import net.sf.samtools.*; 5 | import net.sf.samtools.SAMFileReader.ValidationStringency; 6 | 7 | /** 8 | * Generates capture statistics from a BAM input file. 9 | * Generates 3 output files: 10 | * @author bainbrid 11 | * 12 | */ 13 | public class CaptureStatsBAM5_extended 14 | { 15 | static final int prime_size = 1000; 16 | static int[] fivePrime = new int[prime_size]; //stores data about coverage upstream of the target 17 | static int[] threePrime = new int[prime_size];//stores data about coverage downstream of the target 18 | static int[] targetCov = new int[101]; //stores data about the coverage across a target 19 | static final int BUFFER = 100; //buffer region around a target 20 | static int tempcnt = 0; 21 | static int totalReadsProduced = 0; //total reads contains in the bam 22 | static int totalReadsAligned = 0; //total reads aligned to a target region 23 | static long totalTargetCoverage = 0; //total number of read bases aligned to the target 24 | static int totalReadsPaired = 0; //total number of reads with mate pairs (if any) 25 | static int totalPairedreadsWithmappedMates = 0; //total number of aligned reads which have mapped mates 26 | static int offtargetReadHitCount = 0; //total number of reads which do not align to a target region 27 | static int ontargetReadHitCount = 0; //total number of reads which align to a target region 28 | static int inbufferReadHitCount = 0; //total number of reads which align to the buffer region 29 | static int readsAlignedINCLUDINGDUPES = 0; //total reads aligned including duplicates 30 | static int duplicateReads = 0; //total number of duplicate reads 31 | static long totalAlignedBases = 0; //total number of aligned bases 32 | static int totalTargetedBases = 0; //total number of bases targeted 33 | static int totalBufferBases = 0; //total number of bases in the buffer region 34 | static int basesWithOneHitorMore = 0; //total targeted bases with at least 1 coverage 35 | static int basesWith10HitsorMore = 0; //total targeted bases with at least 10 coverage 36 | static int 
basesWith20HitsorMore = 0; //total targeted bases with at least 20 coverage 37 | static int basesWith40HitsorMore = 0; //total targeted bases with at least 40 coverage 38 | static int basesWith50HitsorMore = 0; //total targeted bases with at least 50 coverage 39 | static int basesWith100HitsorMore = 0; //total targeted bases with at least 100 coverage 40 | static int basesWith500HitsorMore = 0; //total targeted bases with at least 500 coverage 41 | static int basesWith1000HitsorMore = 0; //total targeted bases with at least 1000 coverage 42 | static int totalTargets = 0; //total targeted regions 43 | static int hitTargetCount = 0; //total targets with at least 1 read aligned to them 44 | static int hitTarget_bufferonly_Count = 0; //total targets with no hits, except in buffer 45 | static int[] dupHisto = new int[9]; //deprecated 46 | static int[] covHisto = new int[1001]; //coverage histogram 47 | static int nonTragetGoodHits = 0; //regions that have high coverage but are not in the target 48 | static String VERSION = "CapStatsV2.6 2015-03-25"; 49 | static Hashtable fht ; 50 | static final String dummy = "dummy"; 51 | static boolean removeDupes = false; //do we not consider duplicate reads 52 | static boolean writeWGC = false; //write whole genome coverage statistics 53 | static double _percentage = 1.0; //proportion of reads to take (to randomly dump some) 54 | static Random RAND = new Random(88651); //random number generator, good for removing a proportion of the reads 55 | static String[] targetChrs; 56 | static int[] targetStarts; 57 | static int[] targetStops; 58 | static double minmapscore = -1.0; 59 | static double minbasescore = -1.0; 60 | static int size =0 ; //Holds the size of the Chromosome being analyzed. 61 | static int[] coverage_forMedian = new int[1000]; //Used for calculating the median coverage. 62 | static int median_coverage = 0; 63 | static long READLENGTH = 0; //Gets READ Length 64 | 65 | /** 66 | * ONLY WORKS ON A SINGLE FILE THAT HAS BEEN SORTED!!!!!!!!!! 67 | * looks like it works in 4000M of ram. 68 | * 69 | * usage: -o <output base name> -t <target file> -i <BAM file> [-d] [-w] [-m] [-b] [-p] 70 | * 71 | * i: BAM File 72 | * t: target file 73 | * o: output directory and base file name 74 | * d: remove duplicates, and do not use them for statistics 75 | * w: write whole genome coverage 76 | * m: minimum mapscore (mapscore must be >= to this) 77 | * b: minimum base score (basescore must be >= to this) 78 | * p: only take this proportion of reads into consideration (for scale back experiments) 79 | * 80 | * 81 | * @throws Exception 82 | * 83 | */ 84 | public static void main(String[] args) throws Exception 85 | { 86 | if(args.length == 0) {usage();System.exit(0);}; 87 | String[] validargs = {"r","t","o","d","i","w","p","m","b"}; 88 | String[] arguments = new String[validargs.length]; 89 | String warns = ParseOpts.parse(validargs, args, arguments); 90 | if(arguments[0] == null) 91 | { 92 | //deprecated. no longer needed. 93 | } 94 | if(arguments[1] == null) 95 | { 96 | System.err.println("No target file specified!!! Exiting\n"); 97 | usage(); 98 | System.exit(2); 99 | } 100 | if(arguments[2] == null) 101 | { 102 | System.err.println("No output file specified!!! Exiting\n"); 103 | usage(); 104 | System.exit(3); 105 | } 106 | int index = arguments[2].lastIndexOf(File.separator); 107 | String fileName = arguments[2].substring(index + 1); 108 | if(fileName == null) 109 | { 110 | System.err.println("Please provide proper output file!!! 
Exiting\n"); 111 | usage(); 112 | System.exit(3); 113 | } 114 | if(arguments[3]!= null ) 115 | { 116 | removeDupes = true; 117 | } 118 | if(arguments[4] == null) 119 | { 120 | System.err.println("No alignment file specified!!! Exiting\n"); 121 | usage(); 122 | System.exit(4); 123 | } 124 | if(arguments[5] != null) 125 | { 126 | writeWGC = true; 127 | } 128 | if(arguments[6] != null) 129 | { 130 | _percentage = Double.parseDouble(arguments[6]); 131 | } 132 | if(arguments[7] != null) 133 | { 134 | minmapscore = Double.parseDouble(arguments[7]); 135 | } 136 | if(arguments[8] != null) 137 | { 138 | minbasescore = Double.parseDouble(arguments[8]); 139 | } 140 | String targetFile = arguments[1]; 141 | String alignmentFile = arguments[4]; 142 | checkFile(alignmentFile); 143 | checkFile(targetFile); 144 | String outfile = arguments[2]; 145 | String covFile = outfile+".cov.fasta"; 146 | System.out.println("Writing to: "+outfile); 147 | FileWriter wgcFasta = null; 148 | if(writeWGC) 149 | { 150 | wgcFasta = new FileWriter(outfile+".wholeGenomeCov.fasta"); 151 | } 152 | FileWriter covFasta = new FileWriter(covFile); 153 | FileWriter wig = new FileWriter(outfile+".missReads.wig"); 154 | FileWriter missTar = new FileWriter(outfile+".missTargets.txt"); 155 | fht = new Hashtable(50); 156 | wig.write("track type=wiggle_0 name="+alignmentFile+"\n"); 157 | loadTargets(targetFile); 158 | readBAM(alignmentFile, wig, covFasta, missTar, wgcFasta); 159 | covFasta.close(); 160 | if(writeWGC) wgcFasta.close(); 161 | wig.close(); 162 | writeReport(outfile); 163 | missTar.close(); 164 | } 165 | 166 | /** 167 | * load the target regions into memory and leave them there 168 | * @param targetFile 169 | */ 170 | public static void loadTargets(String targetFile) throws Exception 171 | { 172 | BufferedReader br = new BufferedReader(new FileReader(targetFile)); 173 | String line; 174 | int cnt = 0; 175 | int start = 0; 176 | int stop = 0; 177 | while((line = br.readLine())!=null) 178 | { 179 | String[] tokens = line.split("[ \t]+"); 180 | if(tokens.length < 3) {continue;} 181 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 182 | try 183 | { 184 | start = Integer.parseInt(tokens[1]); 185 | stop = Integer.parseInt(tokens[2]); 186 | } 187 | catch(NumberFormatException e){continue;} 188 | cnt++; 189 | } 190 | targetChrs = new String[cnt]; 191 | targetStarts = new int[cnt]; 192 | targetStops = new int[cnt]; 193 | cnt = 0; 194 | br.close(); 195 | br = new BufferedReader(new FileReader(targetFile)); 196 | while((line = br.readLine())!=null) 197 | { 198 | String[] tokens = line.split("[ \t]+"); 199 | if(tokens.length < 3) {continue;} 200 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 201 | try 202 | { 203 | start = Integer.parseInt(tokens[1]); 204 | stop = Integer.parseInt(tokens[2]); 205 | } 206 | catch(NumberFormatException e){continue;} 207 | targetChrs[cnt] = tokens[0]; 208 | targetStarts[cnt]=start; 209 | targetStops[cnt] = stop; 210 | cnt++; 211 | } 212 | totalTargets = cnt; /* Should be used to count Targets Oct3_2014*///totalTargets = TR.length; 213 | } 214 | 215 | 216 | 217 | /** 218 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
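 * NOTE (added commentary, not in upstream ExCID): records are consumed in
 * coordinate order; a single short[] coverage array is held for the current
 * chromosome only, and per-chromosome results are flushed via
 * getTargetsAndWriteCoverage() and findWhereReadsHit() whenever the reference
 * name changes, which is why the input BAM must be coordinate-sorted.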
219 | * @param bamfile The bam file to read in 220 | * @param wig The output wig file to write offtarget high coverage regions 221 | * @param covFasta the coverage fasta output file 222 | * @param missTraget the miss target file 223 | * @param wgCoverage whole genome coverage file 224 | * @throws Exception 225 | * 226 | */ 227 | public static void readBAM(String bamfile, FileWriter wig, FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 228 | { 229 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 230 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 231 | SAMFileHeader header = sfr.getFileHeader(); 232 | Iterator iter = sfr.iterator(); 233 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 234 | short[] COVERAGE = new short[0]; 235 | char[] TR = new char[0]; 236 | int cnt = 0; 237 | SAMRecord rec = null; 238 | while(iter.hasNext()) 239 | { 240 | try 241 | { 242 | rec = iter.next(); 243 | if(_percentage < 1.0) 244 | { 245 | double rd = RAND.nextDouble(); 246 | if(rd < _percentage) continue; 247 | } 248 | 249 | totalReadsProduced++; 250 | if(rec.getMappingQuality() < minmapscore) 251 | { 252 | continue; 253 | } 254 | if(rec.getReadFailsVendorQualityCheckFlag()){ 255 | continue; 256 | } 257 | 258 | if(rec.getNotPrimaryAlignmentFlag()){ 259 | continue; 260 | } 261 | 262 | if(rec.getReadUnmappedFlag()) 263 | { 264 | //System.out.println("Unmapped! region. breaking!!!"); 265 | continue; 266 | } 267 | totalReadsAligned++; 268 | 269 | if(rec.getReadPairedFlag()) 270 | { 271 | totalReadsPaired++; 272 | if(!rec.getMateUnmappedFlag()){ 273 | totalPairedreadsWithmappedMates++; 274 | } 275 | } 276 | 277 | if(rec.getDuplicateReadFlag()) 278 | { 279 | duplicateReads++; 280 | if(removeDupes){continue;} 281 | } 282 | 283 | /////////////////Added on Feb 11, 2015///////////////////////////// 284 | READLENGTH = rec.getReadLength(); 285 | 286 | /////////////////////////////////////////////////////////////////// 287 | 288 | String currchr = rec.getReferenceName(); 289 | if(!currchr.equals(lastchr)) 290 | { 291 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 292 | { 293 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 294 | findWhereReadsHit(lastchr, COVERAGE,wig); 295 | } 296 | lastchr = currchr; 297 | System.out.println(currchr); 298 | size = header.getSequence(currchr).getSequenceLength()+1; 299 | COVERAGE = new short[size]; 300 | TR = getTargetPos(currchr, size); 301 | if(TR == null || COVERAGE == null) 302 | { 303 | System.err.println("COVERAGE or TR is null! "+currchr); 304 | } 305 | 306 | } 307 | processRecord(rec, TR, COVERAGE); 308 | } 309 | catch(Exception e) 310 | { 311 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 312 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 313 | // throw e; 314 | } 315 | cnt++; 316 | } 317 | System.out.println("Done read bam"); 318 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 319 | findWhereReadsHit(lastchr, COVERAGE,wig); 320 | COVERAGE = null; 321 | } 322 | 323 | /** 324 | * Processes read record from a BAM. Adding the alignment to coverage array. 
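 * NOTE (added commentary, not in upstream ExCID): each aligned position is
 * checked against the per-chromosome target mask (TR[i]==1 on target,
 * TR[i]==2 in buffer) to classify the read, and COVERAGE is incremented at
 * the mapped reference position, gated on per-base quality when a minimum
 * base score (-b) was supplied.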
325 | * @param rec 326 | * @param TR 327 | * @param COVERAGE 328 | */ 329 | public static void processRecord(SAMRecord rec, char[] TR, short[] COVERAGE) throws Exception 330 | { 331 | boolean inbuffer = false; 332 | boolean ontarget = false; 333 | int start = rec.getAlignmentStart(); 334 | int stop = rec.getAlignmentEnd(); 335 | int referenceposition = 0; 336 | byte[] baseQual = rec.getBaseQualities(); 337 | /*if(rec.getReadNegativeStrandFlag()) 338 | { 339 | start-=25; 340 | } 341 | else 342 | { 343 | stop+=25; 344 | }*/ 345 | 346 | for(int i = start; i <= stop; i++) 347 | { 348 | totalAlignedBases++; 349 | try 350 | { 351 | if(TR[i] == 1) 352 | { 353 | ontarget = true; 354 | } 355 | else if(TR[i] == 2) 356 | { 357 | inbuffer = true; 358 | } 359 | 360 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1); 361 | if(referenceposition==0){continue;} 362 | 363 | if(minbasescore > 0){ 364 | if((double)baseQual[i-start] >= minbasescore) { 365 | COVERAGE[referenceposition]++; 366 | } 367 | }else { 368 | COVERAGE[referenceposition]++; 369 | } 370 | } 371 | catch(Exception e) 372 | { 373 | System.err.println("array size:"+COVERAGE.length); 374 | System.err.println("start stop:"+start+" "+stop); 375 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace())); 376 | //throw e; 377 | break; 378 | } 379 | } 380 | if(ontarget) 381 | { 382 | ontargetReadHitCount++; 383 | } 384 | else if(inbuffer) 385 | { 386 | inbufferReadHitCount++; 387 | } 388 | else 389 | { 390 | offtargetReadHitCount++; 391 | } 392 | } 393 | 394 | /** 395 | * makes sure a file exists 396 | * @param s 397 | * @throws Exception 398 | */ 399 | public static void checkFile(String s) throws Exception 400 | { 401 | File f = new File(s); 402 | if(!f.exists()) 403 | { 404 | throw new FileNotFoundException("No such file as \""+s+"\". File not found."); 405 | } 406 | } 407 | 408 | public static void usage() 409 | { 410 | String s= "Version: "+VERSION+"\nUsage: -o -t -i [-d] [-w] [-m ] [-b ]\n\t* t: target file\n\t* o: output directory and base file name"; 411 | s+="\n\t* d: remove duplicates, and do not use them for statistics\n\t* i: alignment file (multiple files are not allowed)\n\t* w: write whole genome coverage\n\t* m: minimum mapscore (mapscore must be >= to this)\n\t* b: minimum base quality\n"; 412 | System.out.println(s); 413 | } 414 | 415 | /** 416 | * removes the "chr" portion of any chromosome name 417 | * @param c 418 | * @return 419 | */ 420 | public static String removechr(String c) 421 | { 422 | if(c.length() > 3 && c.substring(0, 3).equalsIgnoreCase("chr")){ 423 | c = c.substring(3); 424 | } 425 | return c; 426 | } 427 | 428 | /** 429 | * converts fractions into percentages with 2 decimal positions 430 | * @param num 431 | * @param dom 432 | * @return 433 | */ 434 | public static double pc(int num, int dom) 435 | { 436 | double pc = (double)num/(double)dom; 437 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 438 | return pc; 439 | } 440 | 441 | /** 442 | * Writes all the statistical information to an output file. 443 | * @param fname output file name 444 | * @throws Exception 445 | */ 446 | public static void writeReport(String fname) throws Exception 447 | { 448 | int nonduplicatereads = totalReadsAligned - duplicateReads; 449 | if(totalTargetedBases == 0) 450 | { 451 | System.err.println("Total targeted bases is zero. This means that no read has aligned to a chromosome that contains a target. 
No target matches a chromosome in the BAM, or something else very weird. Aborting.");
452 | System.exit(1);
453 | }
454 | if(totalReadsAligned == 0)
455 | {
456 | System.err.println("No reads aligned. Aborting.");
457 | System.exit(2);
458 | }
459 | if(nonduplicatereads == 0)
460 | {
461 | System.err.println("All reads are duplicates. Aborting.");
462 | System.exit(3);
463 | }
464 | if(totalTargets == 0)
465 | {
466 | //I don't think we should ever see this error, as it's dealt with above.
467 | System.err.println("No target regions given. Aborting.");
468 | System.exit(4);
469 | }
470 | 
471 | 
472 | int sum =0;
473 | for(int i = 0; i < coverage_forMedian.length; i++){
474 | if((sum + coverage_forMedian[i]) >= (totalTargetedBases/2)){
475 | median_coverage = i;
476 | break;
477 | }else{
478 | sum+=coverage_forMedian[i];
479 | }
480 | }
481 | 
482 | 
483 | FileWriter report = new FileWriter(fname+".CoverageReport.csv");
484 | report.write("Version: "+VERSION+"\n");
485 | report.write("BUFFER size:,"+BUFFER+"\n");
486 | report.write("Read Stats\n");
487 | report.write("Total Reads Produced:,"+totalReadsProduced+"\n");
488 | report.write("Total Yield Produced:,"+(READLENGTH * totalReadsProduced)+"\n");
489 | report.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n");
490 | report.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n");
491 | report.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)");
492 | report.write(",reads paired:,"+totalReadsPaired);
493 | report.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n");
494 | report.write("Aligned Reads On-Buffer:,"+inbufferReadHitCount+",("+pc(inbufferReadHitCount,totalReadsAligned)+"%)\n");
495 | report.write("Aligned Reads On-Target:,"+ontargetReadHitCount+",("+pc(ontargetReadHitCount,totalReadsAligned)+"%)\n");
496 | report.write("Average Coverage:,-,("+((int)(totalTargetCoverage/totalTargetedBases))+")\n");
497 | report.write("Median Coverage:,-,("+median_coverage+")\n");
498 | int hittot = inbufferReadHitCount+ontargetReadHitCount;
499 | report.write("Reads that hit target or buffer:,"+hittot+",("+pc(hittot,totalReadsAligned)+"%)\n");
500 | report.write("Total Aligned Reads (expected):,"+totalReadsAligned+"\n");
501 | report.write("Total Aligned Reads (calculated):,"+(offtargetReadHitCount+inbufferReadHitCount+ontargetReadHitCount)+"\n");
502 | report.write("Target Stats\n");
503 | report.write("Targets Hit:,"+hitTargetCount+",("+pc(hitTargetCount,totalTargets)+"%)\n");
504 | report.write("Target Buffers Hit:,"+hitTarget_bufferonly_Count+",("+pc(hitTarget_bufferonly_Count,totalTargets)+"%)\n");
505 | report.write("Total Targets:,"+totalTargets+"\n");
506 | report.write("Non target regions with high coverage:,"+nonTragetGoodHits+"\n");
507 | report.write("Base Stats\n");
508 | report.write("Bases Targeted:,"+totalTargetedBases+"\n");
509 | report.write("Buffer Bases:,"+totalBufferBases+"\n");
510 | report.write("Bases with 1+ coverage:,"+basesWithOneHitorMore+",("+pc(basesWithOneHitorMore,totalTargetedBases)+"%)\n");
511 | report.write("Bases with 10+ coverage:,"+basesWith10HitsorMore+",("+pc(basesWith10HitsorMore,totalTargetedBases)+"%)\n");
512 | report.write("Bases with 20+ coverage:,"+basesWith20HitsorMore+",("+pc(basesWith20HitsorMore,totalTargetedBases)+"%)\n");
513 | report.write("Bases with 40+ coverage:,"+basesWith40HitsorMore+",("+pc(basesWith40HitsorMore,totalTargetedBases)+"%)\n");
514 | report.write("Bases with 50+ 
coverage:,"+basesWith50HitsorMore+",("+pc(basesWith50HitsorMore,totalTargetedBases)+"%)\n"); 515 | report.write("Bases with 100+ coverage:,"+basesWith100HitsorMore+",("+pc(basesWith100HitsorMore,totalTargetedBases)+"%)\n"); 516 | report.write("Bases with 500+ coverage:,"+basesWith500HitsorMore+",("+pc(basesWith500HitsorMore,totalTargetedBases)+"%)\n"); 517 | report.write("Bases with 1000+ coverage:,"+basesWith1000HitsorMore+",("+pc(basesWith1000HitsorMore,totalTargetedBases)+"%)\n"); 518 | report.write("Duplicate read distribution\n"); 519 | report.write("1,2,3,4,5,6to10,11to20,20plus\n"); 520 | int[] mult = {1,1,2,3,4,5,8,15,25}; 521 | for(int i = 1 ; i < dupHisto.length; i++) 522 | { 523 | report.write(pc(dupHisto[i]*mult[i],totalReadsAligned)+"%,"); 524 | } 525 | report.write("\n"); 526 | report.write("Coverage Histogram (may look weird if target regions overlap...)\n"); 527 | for(int i = 0; i < covHisto.length; i++){ 528 | report.write(i+","); 529 | } 530 | report.write("\n"); 531 | for(int i = 0; i < covHisto.length; i++){ 532 | report.write(covHisto[i]+","); 533 | } 534 | report.write("\n"); 535 | 536 | report.write("Target and region coverage plot\n"); 537 | report.write("Position,5'count,3'count\n"); 538 | for(int i = 20; i <= prime_size; i+=20) 539 | { 540 | report.write(i+","+fivePrime[fivePrime.length-(i-1)-1]+","+threePrime[i-1]+"\n"); 541 | } 542 | report.write("%tar-Pos,count\n"); 543 | for(int i = 0; i < 101; i+=2) 544 | { 545 | report.write(i+","+targetCov[i]+"\n"); 546 | } 547 | report.close(); 548 | } 549 | 550 | /** 551 | * This method is destructive to the data structure, no further work can be done after this method has ran. 552 | * Works out whether reads are on or off target and how far off target they are 553 | * 554 | * @param chromo -- current chromosome 555 | * @param COVERAGE -- the coverage of the genome 556 | * @param wig -- writes a wig file 557 | * @throws Exception 558 | */ 559 | public static void findWhereReadsHit(String chromo, short COVERAGE[], FileWriter wig) throws Exception 560 | { 561 | for(int j = 0; j < targetChrs.length; j++) 562 | { 563 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) continue; 564 | int start = targetStarts[j]; 565 | int end = targetStops[j]; 566 | for(int i = start - 500; i < end +500; i++) 567 | { 568 | if(i < 0 || i >= size) {continue;} 569 | COVERAGE[i] = 0; 570 | } 571 | } 572 | 573 | for(int i = 0; i < COVERAGE.length; i++) 574 | { 575 | if(COVERAGE[i] > 20) 576 | { 577 | int j = i; 578 | nonTragetGoodHits++; 579 | while(i < COVERAGE.length && COVERAGE[i] > 0) 580 | { 581 | i++; 582 | } 583 | 584 | while(j > 0 && COVERAGE[j] > 0) 585 | { 586 | j--; 587 | } 588 | wig.write("fixedStep chrom="+chromo+" start="+j+" step=1\n"); 589 | for(int h = j; h < i; h++) 590 | { 591 | wig.write(COVERAGE[h]+"\n"); 592 | } 593 | } 594 | } 595 | wig.flush(); 596 | } 597 | 598 | static boolean supertets = false; 599 | 600 | /** 601 | * Gets the target regions from the target file, and writes over the coverage fasta files, as well as determines many of the coverage statistics. 602 | * @param chromo Current chromosome 603 | * @param COVERAGE The array which contains the coverage of every base in the genome 604 | * @param covFasta The filewriter for the target-specific coverage 605 | * @param missTraget Write a "wig" format file (good for ucsc) which shows you where all off-target regions with high coverage are 606 | * @param wgCoverage A filewriter for the whole genome coverage... 
if null, this file won't be written 607 | * @throws Exception 608 | */ 609 | public static void getTargetsAndWriteCoverage(String chromo, short COVERAGE[], FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 610 | { 611 | if(wgCoverage != null) 612 | { 613 | wgCoverage.write(">"+chromo); 614 | for(int i = 0; i < COVERAGE.length; i++) 615 | { 616 | if(i%100==0) wgCoverage.write("\n"); 617 | wgCoverage.write(COVERAGE[i]+" "); 618 | } 619 | wgCoverage.write("\n"); 620 | } 621 | for(int j = 0; j < targetChrs.length; j++) 622 | { 623 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) {continue;} 624 | //totalTargets++; 625 | int start = targetStarts[j]; 626 | int end = targetStops[j]; 627 | int length = end - start+1; 628 | boolean collectTargetCov = length > 99 ; 629 | 630 | if(supertets) 631 | { 632 | System.out.println(targetChrs[j]+" "+start+" "+end); 633 | } 634 | if(collectTargetCov) 635 | { 636 | for(int i = 0; i < prime_size; i++) 637 | { 638 | if((start - i) < 0 || (end+i) >= size){ 639 | ///System.err.println("The BED Target "+targetChrs[j]+" "+start+" "+end+" is going out of Bound!!!\n"); 640 | continue; 641 | } 642 | fivePrime[i]+=COVERAGE[start-i]; 643 | threePrime[i]+=COVERAGE[end+i]; 644 | 645 | ///fivePrime[i]+=COVERAGE[end-i+300]; 646 | //threePrime[i]+=COVERAGE[start+i-300]; 647 | 648 | } 649 | } 650 | 651 | if(supertets) 652 | { 653 | 654 | for(int i = 0; i < 500; i++) 655 | { 656 | if((start-i) < 0) {continue;} 657 | System.out.print( (start-i)+" "); 658 | } 659 | System.out.print("\n"); 660 | for(int i = 0; i < 500; i++) 661 | { 662 | if((end+i) >= size) {continue;} 663 | System.out.print( (end+i)+" "); 664 | } 665 | System.out.print("\n"); 666 | 667 | 668 | supertets= false; 669 | } 670 | 671 | boolean targetHit = false; 672 | short[] pc = new short[101]; 673 | short[] pc2 = new short[101]; 674 | 675 | covFasta.write(">"+chromo+" "+start+" "+end+"\n"); 676 | boolean spaceit = false; 677 | if(end - start > 10000) spaceit = true; 678 | for(int i = 0; i < length; i++) 679 | { 680 | if((i+start) >= size) {continue;} 681 | if(spaceit && i%100 == 0) covFasta.write("\n"); 682 | short cov = COVERAGE[i+start]; 683 | if(cov < 0) 684 | { 685 | cov = Short.MAX_VALUE; 686 | System.err.println("Coverage less than 0!!!!!!!\n"); 687 | } 688 | short temp_cov = cov; 689 | if(temp_cov >= covHisto.length){ 690 | temp_cov = (short) (covHisto.length-1); 691 | } 692 | covHisto[temp_cov]++; 693 | totalTargetCoverage+=cov; 694 | if(cov > 0) 695 | { 696 | targetHit=true; 697 | basesWithOneHitorMore++; 698 | } 699 | if(cov > 9){ 700 | basesWith10HitsorMore++;} 701 | if(cov > 19){ 702 | basesWith20HitsorMore++;} 703 | if(cov > 39){ 704 | basesWith40HitsorMore++;} 705 | if(cov > 49){ 706 | basesWith50HitsorMore++;} 707 | if(cov > 99){ 708 | basesWith100HitsorMore++;} 709 | if(cov > 499){ 710 | basesWith500HitsorMore++;} 711 | if(cov > 999){ 712 | basesWith1000HitsorMore++;} 713 | 714 | covFasta.write(cov+" "); 715 | 716 | if(cov < coverage_forMedian.length){ 717 | coverage_forMedian[cov]++; 718 | }else{ 719 | int[] tmp = new int[coverage_forMedian.length]; 720 | System.arraycopy(coverage_forMedian, 0, tmp, 0, coverage_forMedian.length); 721 | coverage_forMedian = new int[cov+1]; 722 | System.arraycopy(tmp, 0, coverage_forMedian, 0, tmp.length); 723 | coverage_forMedian[cov]++; 724 | } 725 | 726 | if(collectTargetCov) 727 | { 728 | int pcpos = (int)((double)i/(double)length*100+0.5); 729 | pc[pcpos] += cov; 730 | pc2[pcpos]++; 731 | } 732 | } 733 | 
covFasta.write("\n"); 734 | 735 | 736 | for(int index = 0; index < pc.length; index++) 737 | { 738 | if(pc2[index] != 0) 739 | { 740 | int d = (int) (((double)pc[index]/(double)pc2[index])+0.5); 741 | pc[index] = (short) d; 742 | } 743 | } 744 | 745 | for(int i = 0; i < 101; i++) 746 | { 747 | targetCov[i]+=pc[i]; 748 | } 749 | if(targetHit) 750 | { 751 | hitTargetCount++; 752 | } 753 | else 754 | { 755 | missTraget.write(targetChrs[j]+"\t"+targetStarts[j]+"\t"+targetStops[j]+"\n"); 756 | boolean hit = false; 757 | for(int i = start - BUFFER; i < start && !hit; i++) 758 | { 759 | if(i < 0) {continue;} 760 | if(COVERAGE[i] > 0){ 761 | hit=true; 762 | } 763 | } 764 | for(int i = end; i < end+BUFFER && !hit; i++) 765 | { 766 | if(i >= size) {continue;} 767 | if(COVERAGE[i] > 0){ 768 | hit=true; 769 | } 770 | } 771 | if(hit){ 772 | hitTarget_bufferonly_Count++; 773 | } 774 | } 775 | /*for(int i = 0; i < length; i++){ 776 | if((i+start) >= size) {continue;} 777 | COVERAGE[i+start]=0; 778 | }*/ //Commented out on June 13th 2014, to include non flattened bedfile. 779 | } 780 | } 781 | 782 | /** 783 | * 784 | * @param chromo the current chromosome to load 785 | * @param size the size of the chromosome 786 | * @return 787 | * @throws Exception 788 | */ 789 | public static char[] getTargetPos(String chromo, int size) throws Exception 790 | { 791 | char[] TR = new char[size]; 792 | chromo = removechr(chromo); 793 | for(int j = 0; j < targetChrs.length; j++) 794 | { 795 | try{ 796 | if(!removechr(targetChrs[j]).equals(chromo))continue; 797 | int start = targetStarts[j]; 798 | int end = targetStops[j]; 799 | for(int i = start; i <= end; i++) 800 | { 801 | if(i >= size) { 802 | continue; 803 | }else{ 804 | TR[i] = 1; 805 | } 806 | } 807 | for(int i = start -BUFFER; i 0) 23 | { 24 | System.out.println("Warning: "); 25 | System.out.println(warns); 26 | } 27 | for(int i = 0; i < on.length; i++) 28 | { 29 | System.out.println(on[i]+" "+vals[i]); 30 | } 31 | 32 | } 33 | 34 | /** 35 | * Parses args based on optnames, into values. 36 | * @param optnames The names of allowable options 37 | * @param args The input arguments 38 | * @param values The return values for each optname (co-indexed) 39 | * @return 40 | */ 41 | public static String parse(String[] optnames, String[] args, String[] values) 42 | { 43 | StringBuffer warnings = new StringBuffer(100); 44 | for(int i = 0; i < args.length; i++) 45 | { 46 | String curr = args[i]; 47 | if(curr.charAt(0) == '-') 48 | { 49 | curr = curr.substring(1); 50 | String value = null; 51 | String optname = null; 52 | if(curr.indexOf('=')!= -1) 53 | { 54 | optname = curr.substring(0, curr.indexOf('=')); 55 | value = curr.substring(curr.indexOf('=')+1); 56 | } 57 | else 58 | { 59 | if(i fht ; 63 | static final String dummy = "dummy"; 64 | static boolean removeDupes = false; //do we not consider duplicate reads 65 | static boolean writeWGC = false; //write whole genome coverage statistics 66 | static double percentage = 1.0; //number of read to take (to randomly dump some) 67 | static Random RAND = new Random(88651); //random number generator, good for removing a proportion of the reads 68 | static String[] targetChrs; 69 | static int[] targetStarts; 70 | static int[] targetStops; 71 | static double minmapscore = -1.0; 72 | static double minbasescore = -1.0; 73 | static int size =0 ; //Holds the size of the Chromosome being analyzed. 74 | static int[] coverage_forMedian = new int[1000]; //Used for calculating the median coverage. 
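// coverage_forMedian and coverage_forMedian_WG are depth histograms: element c counts bases
// observed at depth c. writeReport() walks c upward and takes the first c at which the running
// sum of counts reaches half of the relevant base total (totalTargetedBases or totalGenomeBases),
// i.e. the median depth; getTargetsAndWriteCoverage() grows the arrays on demand whenever a base
// exceeds the highest depth tracked so far.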
75 | static int[] coverage_forMedian_WG = new int[1000]; //Used for calculating the median coverage for Whole Genome. 76 | static int median_coverage = 0; 77 | static int median_coverage_WG = 0; 78 | static boolean is_target = false; //Is target file provided 79 | static long READLENGTH = 0; //Gets READ Length 80 | 81 | /** 82 | * ONLY WORKS ON A SINGLE FILES THAT HAS BEEN SORTED!!!!!!!!!! 83 | * looks like it works in 4000M of ram. 84 | * 85 | * usage: -o -t - -i -r [-d] [-w] [-m] [-b] 86 | * 87 | * i: BAM File 88 | * t: target file 89 | * o: output directory and base file name 90 | * d: remove duplicates, and do not use them for statistics 91 | * m: minimum mapscore (mapscore must be >= to this) 92 | * b: minimum base score (basescore must be >= to this) 93 | * @param args 94 | * 95 | * @throws Exception 96 | * 97 | */ 98 | public static void main(String[] args) throws Exception 99 | { 100 | if(args.length == 0) {usage();System.exit(0);}; 101 | String[] validargs = {"t","o","d","i","m","b"}; 102 | String[] arguments = new String[validargs.length]; 103 | String warns = ParseOpts.parse(validargs, args, arguments); 104 | if(arguments[0] == null) 105 | { 106 | System.err.println("No target file specified!!! Only calculating Whole genome stats\n"); 107 | }else{is_target = true;} 108 | if(arguments[1] == null) 109 | { 110 | System.err.println("No output file specified!!! Exiting\n"); 111 | usage(); 112 | System.exit(3); 113 | } 114 | int index = arguments[1].lastIndexOf(File.separator); 115 | String fileName = arguments[1].substring(index + 1); 116 | if(fileName == null) 117 | { 118 | System.err.println("Please provide proper output file!!! Exiting\n"); 119 | usage(); 120 | System.exit(3); 121 | } 122 | if(arguments[2]!= null ) 123 | { 124 | removeDupes = true; 125 | } 126 | if(arguments[3] == null) 127 | { 128 | System.err.println("No alignment file specified!!! 
Exiting\n"); 129 | usage(); 130 | System.exit(4); 131 | } 132 | if(arguments[4] != null) 133 | { 134 | minmapscore = Double.parseDouble(arguments[4]); 135 | } 136 | if(arguments[5] != null) 137 | { 138 | minbasescore = Double.parseDouble(arguments[5]); 139 | } 140 | 141 | String targetFile = null; 142 | 143 | if(is_target) { 144 | targetFile = arguments[0]; 145 | checkFile(targetFile); 146 | } 147 | 148 | String alignmentFile = arguments[3]; 149 | checkFile(alignmentFile); 150 | String outfile = arguments[1]; 151 | String covFile = outfile+".cov.fasta"; 152 | System.out.println("Writing to: "+outfile); 153 | 154 | FileWriter wgcFasta = new FileWriter(outfile+".wholeGenomeCov.fasta");; 155 | 156 | FileWriter missTar = new FileWriter(outfile+".missTargets.txt"); 157 | fht = new Hashtable(50); 158 | if(is_target) { 159 | loadTargets(targetFile); 160 | FileWriter covFasta = new FileWriter(covFile); 161 | readBAM(alignmentFile, covFasta, missTar, wgcFasta); 162 | covFasta.close(); 163 | }else{ 164 | readBAM(alignmentFile, missTar, wgcFasta); 165 | } 166 | 167 | wgcFasta.close(); 168 | writeReport(outfile); 169 | missTar.close(); 170 | } 171 | 172 | /** 173 | * load the target regions into memory and leave them there 174 | * @param targetFile 175 | */ 176 | public static void loadTargets(String targetFile) throws Exception 177 | { 178 | BufferedReader br = new BufferedReader(new FileReader(targetFile)); 179 | String line; 180 | int cnt = 0; 181 | int start = 0; 182 | int stop = 0; 183 | while((line = br.readLine())!=null) 184 | { 185 | String[] tokens = line.split("[ \t]+"); 186 | if(tokens.length < 3) continue; 187 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 188 | try 189 | { 190 | start = Integer.parseInt(tokens[1]); 191 | stop = Integer.parseInt(tokens[2]); 192 | } 193 | catch(NumberFormatException e){continue;} 194 | cnt++; 195 | } 196 | targetChrs = new String[cnt]; 197 | targetStarts = new int[cnt]; 198 | targetStops = new int[cnt]; 199 | cnt = 0; 200 | br.close(); 201 | br = new BufferedReader(new FileReader(targetFile)); 202 | while((line = br.readLine())!=null) 203 | { 204 | String[] tokens = line.split("[ \t]+"); 205 | if(tokens.length < 3) continue; 206 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 207 | try 208 | { 209 | start = Integer.parseInt(tokens[1]); 210 | stop = Integer.parseInt(tokens[2]); 211 | } 212 | catch(NumberFormatException e){continue;} 213 | targetChrs[cnt] = tokens[0]; 214 | targetStarts[cnt]=start; 215 | targetStops[cnt] = stop; 216 | cnt++; 217 | } 218 | totalTargets = cnt; /* Should be used to count Targets Oct3_2014*///totalTargets = TR.length; 219 | } 220 | 221 | 222 | 223 | /** 224 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
225 | * @param bamfile The bam file to read in 226 | * @param covFasta the coverage fasta output file 227 | * @param missTraget the miss target file 228 | * @param wgCoverage whole genome coverage file 229 | * @throws Exception 230 | * 231 | */ 232 | public static void readBAM(String bamfile, FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 233 | { 234 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 235 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 236 | SAMFileHeader header = sfr.getFileHeader(); 237 | Iterator iter = sfr.iterator(); 238 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 239 | int[] COVERAGE = new int[0]; 240 | char[] TR = new char[0]; 241 | int cnt = 0; 242 | SAMRecord rec = null; 243 | while(iter.hasNext()) 244 | { 245 | try 246 | { 247 | rec = iter.next(); 248 | if(percentage < 1.0) 249 | { 250 | double rd = RAND.nextDouble(); 251 | if(rd < percentage) continue; 252 | } 253 | 254 | totalReadsProduced++; 255 | if(rec.getMappingQuality() < minmapscore) 256 | { 257 | continue; 258 | } 259 | 260 | if(rec.getReadFailsVendorQualityCheckFlag()){ 261 | continue; 262 | } 263 | 264 | if(rec.getNotPrimaryAlignmentFlag()){ 265 | continue; 266 | } 267 | 268 | if(rec.getReadUnmappedFlag()) 269 | { 270 | //System.out.println("Unmapped! region. breaking!!!"); 271 | continue; 272 | } 273 | totalReadsAligned++; 274 | 275 | if(rec.getReadPairedFlag()) 276 | { 277 | totalReadsPaired++; 278 | if(!rec.getMateUnmappedFlag()){ 279 | totalPairedreadsWithmappedMates++; 280 | } 281 | } 282 | 283 | if(rec.getDuplicateReadFlag()) 284 | { 285 | duplicateReads++; 286 | if(removeDupes){continue;} 287 | } 288 | 289 | /////////////////Added on Feb 11, 2015///////////////////////////// 290 | READLENGTH = rec.getReadLength(); 291 | 292 | /////////////////////////////////////////////////////////////////// 293 | 294 | String currchr = rec.getReferenceName(); 295 | if(!currchr.equals(lastchr)) 296 | { 297 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 298 | { 299 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 300 | } 301 | lastchr = currchr; 302 | System.out.println(currchr); 303 | size = header.getSequence(currchr).getSequenceLength()+1; 304 | totalGenomeBases+=size; 305 | COVERAGE = new int[size]; 306 | TR = getTargetPos(currchr, size); 307 | if(TR == null || COVERAGE == null) 308 | { 309 | System.err.println("COVERAGE or TR is null! "+currchr); 310 | } 311 | 312 | } 313 | processRecord(rec, TR, COVERAGE); 314 | } 315 | catch(Exception e) 316 | { 317 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 318 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 319 | // throw e; 320 | } 321 | cnt++; 322 | } 323 | System.out.println("Done read bam"); 324 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 325 | COVERAGE = null; 326 | } 327 | 328 | /** 329 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
330 | * @param bamfile The bam file to read in 331 | * @param missTarget the miss target file 332 | * @param wgCoverage whole genome coverage file 333 | * @throws Exception 334 | * 335 | */ 336 | public static void readBAM(String bamfile, FileWriter missTarget, FileWriter wgCoverage) throws Exception 337 | { 338 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 339 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 340 | SAMFileHeader header = sfr.getFileHeader(); 341 | Iterator iter = sfr.iterator(); 342 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 343 | int[] COVERAGE = new int[0]; 344 | char[] TR = new char[0]; 345 | int cnt = 0; 346 | SAMRecord rec = null; 347 | while(iter.hasNext()) 348 | { 349 | try 350 | { 351 | rec = iter.next(); 352 | if(percentage < 1.0) 353 | { 354 | double rd = RAND.nextDouble(); 355 | if(rd < percentage) continue; 356 | } 357 | 358 | totalReadsProduced++; 359 | if(rec.getMappingQuality() < minmapscore) 360 | { 361 | continue; 362 | } 363 | if(rec.getReadFailsVendorQualityCheckFlag()){ 364 | continue; 365 | } 366 | 367 | if(rec.getNotPrimaryAlignmentFlag()){ 368 | continue; 369 | } 370 | 371 | if(rec.getReadUnmappedFlag()) 372 | { 373 | //System.out.println("Unmapped! region. breaking!!!"); 374 | continue; 375 | } 376 | totalReadsAligned++; 377 | if(rec.getReadPairedFlag()) 378 | { 379 | totalReadsPaired++; 380 | if(!rec.getMateUnmappedFlag()){ 381 | totalPairedreadsWithmappedMates++; 382 | } 383 | } 384 | if(rec.getDuplicateReadFlag()) 385 | { 386 | duplicateReads++; 387 | if(removeDupes){ 388 | continue; 389 | } 390 | } 391 | 392 | /////////////////Added on Feb 11, 2015///////////////////////////// 393 | READLENGTH = rec.getReadLength(); 394 | /////////////////////////////////////////////////////////////////// 395 | 396 | 397 | String currchr = rec.getReferenceName(); 398 | if(!currchr.equals(lastchr)) 399 | { 400 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 401 | { 402 | getTargetsAndWriteCoverage(lastchr, COVERAGE, null, missTarget, wgCoverage); 403 | } 404 | lastchr = currchr; 405 | System.out.println(currchr); 406 | size = header.getSequence(currchr).getSequenceLength()+1; 407 | totalGenomeBases+=size; 408 | COVERAGE = new int[size]; 409 | if(COVERAGE == null) 410 | { 411 | System.err.println("COVERAGE or TR is null! "+currchr); 412 | } 413 | 414 | } 415 | processRecord(rec, COVERAGE); 416 | } 417 | catch(Exception e) 418 | { 419 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 420 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 421 | // throw e; 422 | } 423 | cnt++; 424 | } 425 | System.out.println("Done read bam"); 426 | getTargetsAndWriteCoverage(lastchr, COVERAGE, null, missTarget, wgCoverage); 427 | COVERAGE = null; 428 | } 429 | 430 | /** 431 | * Processes read record from a BAM. Adding the alignment to coverage array. 
432 | * @param rec
433 | * @param TR
434 | * @param COVERAGE
435 | */
436 | public static void processRecord(SAMRecord rec, char[] TR, int[] COVERAGE) throws Exception
437 | {
438 | boolean inbuffer = false;
439 | boolean ontarget = false;
440 | int start = rec.getAlignmentStart();
441 | int stop = rec.getAlignmentEnd();
442 | int referenceposition = 0;
443 | byte[] baseQual = rec.getBaseQualities();
444 | 
445 | for(int i = start; i <= stop; i++)
446 | {
447 | totalAlignedBases++;
448 | try
449 | {
450 | if(TR[i] == 1)
451 | {
452 | ontarget = true;
453 | }
454 | else if(TR[i] == 2)
455 | {
456 | inbuffer = true;
457 | }
458 | 
459 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1);
460 | if(referenceposition==0){continue;}
461 | 
462 | if(minbasescore > 0){
463 | if((double)baseQual[i-start] >= minbasescore) {
464 | COVERAGE[referenceposition]++;
465 | }
466 | }else {
467 | COVERAGE[referenceposition]++;
468 | }
469 | }
470 | catch(Exception e)
471 | {
472 | System.err.println("array size:"+COVERAGE.length);
473 | System.err.println("start stop:"+start+" "+stop);
474 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace()));
475 | //throw e;
476 | break;
477 | }
478 | }
479 | if(ontarget)
480 | {
481 | ontargetReadHitCount++;
482 | }
483 | else if(inbuffer)
484 | {
485 | inbufferReadHitCount++;
486 | }
487 | else
488 | {
489 | offtargetReadHitCount++;
490 | }
491 | }
492 | 
493 | /**
494 | * Processes read record from a BAM. Adding the alignment to coverage array.
495 | * @param rec
496 | * @param COVERAGE
497 | */
498 | public static void processRecord(SAMRecord rec, int[] COVERAGE) throws Exception
499 | {
500 | int start = rec.getAlignmentStart();
501 | int stop = rec.getAlignmentEnd();
502 | int referenceposition = 0;
503 | byte[] baseQual = rec.getBaseQualities();
504 | 
505 | for(int i = start; i <= stop; i++)
506 | {
507 | totalAlignedBases++;
508 | try
509 | {
510 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1); // computed before the quality branch so the unfiltered path below never increments a stale position
511 | if(referenceposition==0){continue;}
512 | 
513 | if(minbasescore > 0){
514 | if((double)baseQual[i-start] >= minbasescore) {
515 | COVERAGE[referenceposition]++;
516 | }
517 | }else {
518 | COVERAGE[referenceposition]++;
519 | }
520 | }
521 | 
522 | catch(Exception e)
523 | {
524 | System.err.println("array size:"+COVERAGE.length);
525 | System.err.println("start stop:"+start+" "+stop);
526 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace()));
527 | //throw e;
528 | break;
529 | }
530 | }
531 | }
532 | 
533 | /**
534 | * makes sure a file exists
535 | * @param s
536 | * @throws Exception
537 | */
538 | public static void checkFile(String s) throws Exception
539 | {
540 | File f = new File(s);
541 | if(!f.exists())
542 | {
543 | throw new FileNotFoundException("No such file as \""+s+"\". 
File not found."); 544 | } 545 | } 546 | 547 | public static void usage() 548 | { 549 | String s= "Version: "+VERSION+"\nUsage: -o -t -i [-d] [-m ] [-b ]\n\t* t: target file\n\t* o: output directory and base file name"; 550 | s+="\n\t* d: remove duplicates, and do not use them for statistics\n\t* i: alignment file (multiple files are not allowed)\n"; 551 | s+="\t* m: minimum mapping quality\n\t* b: minimum base quality\n"; 552 | System.out.println(s); 553 | } 554 | 555 | /** 556 | * removes the "chr" portion of any chromosome name 557 | * @param c 558 | * @return 559 | */ 560 | public static String removechr(String c) 561 | { 562 | if(c.length() > 3 && c.substring(0, 3).equalsIgnoreCase("chr")){ 563 | c = c.substring(3); 564 | } 565 | return c; 566 | } 567 | 568 | /** 569 | * converts fractions into percentages with 2 decimal positions 570 | * @param num 571 | * @param dom 572 | * @return 573 | */ 574 | public static double pc(int num, int dom) 575 | { 576 | double pc = (double)num/(double)dom; 577 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 578 | return pc; 579 | } 580 | 581 | /** 582 | * converts fractions into percentages with 2 decimal positions 583 | * @param num 584 | * @param dom 585 | * @return 586 | */ 587 | public static double pc(long num, long dom) 588 | { 589 | double pc = (double)num/(double)dom; 590 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 591 | return pc; 592 | } 593 | 594 | /** 595 | * Writes all the statistical information to an output file. 596 | * @param fname output file name 597 | * @throws Exception 598 | */ 599 | public static void writeReport(String fname) throws Exception 600 | { 601 | long nonduplicatereads = totalReadsAligned - duplicateReads; 602 | if(is_target && totalTargetedBases == 0) 603 | { 604 | System.err.println("Total targeted bases is zero. This means that no read has aligned to a chromosome that contains a target. No target matches a chromosome in the BAM, or something else very weird. Aborting."); 605 | System.exit(1); 606 | } 607 | if(totalReadsAligned == 0) 608 | { 609 | System.err.println("No reads aligned. Aborting."); 610 | System.exit(2); 611 | } 612 | if(nonduplicatereads == 0) 613 | { 614 | System.err.println("All reads are duplicates. Aborting."); 615 | System.exit(3); 616 | } 617 | if(is_target && totalTargets == 0) 618 | { 619 | //I don't think we should ever see this error, as its dealt with above. 620 | System.err.println("No target regions given. 
Aborting."); 621 | System.exit(4); 622 | } 623 | 624 | 625 | int sum =0; 626 | for(int i = 0; i= (totalGenomeBases/2)){ 628 | median_coverage_WG = i; 629 | break; 630 | }else{ 631 | sum+=coverage_forMedian_WG[i]; 632 | } 633 | } 634 | 635 | FileWriter report_WG = new FileWriter(fname+".WGCoverageReport.csv"); 636 | report_WG.write("Version: "+VERSION+"\n"); 637 | report_WG.write("Read Stats\n"); 638 | report_WG.write("Total Reads Produced:,"+totalReadsProduced+"\n"); 639 | report_WG.write("Total Yield Produced:"+READLENGTH+","+(READLENGTH * totalReadsProduced)+"\n"); 640 | report_WG.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n"); 641 | report_WG.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n"); 642 | report_WG.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)"); 643 | report_WG.write(",reads paired:,"+totalReadsPaired); 644 | report_WG.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n"); 645 | report_WG.write("Average Coverage:,-,("+((int)(totalGenomeCoverage/totalGenomeBases))+")\n"); 646 | report_WG.write("Median Coverage:,-,("+median_coverage_WG+")\n"); 647 | report_WG.write("Base Stats\n"); 648 | report_WG.write("Bases Targeted:,"+totalGenomeBases+"\n"); 649 | report_WG.write("Bases with 0 coverage:,"+covHisto_WG[0]+",("+pc(covHisto_WG[0],totalGenomeBases)+"%)\n"); 650 | report_WG.write("Bases with 1+ coverage:,"+basesWithOneHitorMore_WG+",("+pc(basesWithOneHitorMore_WG, totalGenomeBases)+"%)\n"); 651 | report_WG.write("Bases with 5+ coverage:,"+basesWith5HitorMore_WG+",("+pc(basesWith5HitorMore_WG,totalGenomeBases)+"%)\n"); 652 | report_WG.write("Bases with 10+ coverage:,"+basesWith10HitsorMore_WG+",("+pc(basesWith10HitsorMore_WG,totalGenomeBases)+"%)\n"); 653 | report_WG.write("Bases with 15+ coverage:,"+basesWith15HitorMore_WG+",("+pc(basesWith15HitorMore_WG,totalGenomeBases)+"%)\n"); 654 | report_WG.write("Bases with 20+ coverage:,"+basesWith20HitsorMore_WG+",("+pc(basesWith20HitsorMore_WG,totalGenomeBases)+"%)\n"); 655 | report_WG.write("Bases with 30+ coverage:,"+basesWith30HitorMore_WG+",("+pc(basesWith30HitorMore_WG,totalGenomeBases)+"%)\n"); 656 | report_WG.write("Bases with 40+ coverage:,"+basesWith40HitsorMore_WG+",("+pc(basesWith40HitsorMore_WG,totalGenomeBases)+"%)\n"); 657 | report_WG.write("Bases with 50+ coverage:,"+basesWith50HitsorMore_WG+",("+pc(basesWith50HitsorMore_WG,totalGenomeBases)+"%)\n"); 658 | report_WG.write("Bases with 60+ coverage:,"+basesWith60HitsorMore_WG+",("+pc(basesWith60HitsorMore_WG,totalGenomeBases)+"%)\n"); 659 | report_WG.write("Bases with 100+ coverage:,"+basesWith100HitsorMore_WG+",("+pc(basesWith100HitsorMore_WG,totalGenomeBases)+"%)\n"); 660 | report_WG.write("\n"); 661 | report_WG.write("Coverage Histogram for Whole Genome (may look weird if target regions overlap...)\n"); 662 | for(int i = 0; i < covHisto_WG.length; i++) 663 | report_WG.write(i+","); 664 | report_WG.write("\n"); 665 | for(int i = 0; i < covHisto_WG.length; i++) 666 | report_WG.write(covHisto_WG[i]+","); 667 | report_WG.write("\n"); 668 | 669 | report_WG.close(); 670 | 671 | 672 | if(is_target){ 673 | sum =0; 674 | for(int i = 0; i= (totalTargetedBases/2)){ 676 | median_coverage = i; 677 | break; 678 | }else{ 679 | sum+=coverage_forMedian[i]; 680 | } 681 | } 682 | FileWriter report = new FileWriter(fname+".CoverageReport.csv"); 683 | report.write("Version: "+VERSION+"\n"); 684 | report.write("BUFFER 
size:,"+BUFFER+"\n"); 685 | report.write("Read Stats\n"); 686 | report.write("Total Reads Produced:,"+totalReadsProduced+"\n"); 687 | report.write("Total Yield Produced:,"+(READLENGTH * totalReadsProduced)+"\n"); 688 | report.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n"); 689 | report.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n"); 690 | report.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)"); 691 | report.write(",reads paired:,"+totalReadsPaired); 692 | report.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n"); 693 | //report.write("Aligned Reads On-Buffer:,"+inbufferReadHitCount+",("+pc(inbufferReadHitCount,totalReadsAligned)+"%)\n"); 694 | //report.write("Aligned Reads On-Target:,"+ontargetReadHitCount+",("+pc(ontargetReadHitCount,totalReadsAligned)+"%)\n"); 695 | report.write("Average Coverage:,-,("+((int)(totalTargetCoverage/totalTargetedBases))+")\n"); 696 | report.write("Median Coverage:,-,("+median_coverage+")\n"); 697 | int hittot = inbufferReadHitCount+ontargetReadHitCount; 698 | //report.write("Reads that hit target or buffer:,"+hittot+",("+pc(hittot,totalReadsAligned)+"%)\n"); 699 | report.write("Total Aligned Reads (expected):,"+totalReadsAligned+"\n"); 700 | report.write("Total Aligned Reads (calculated):,"+(offtargetReadHitCount+inbufferReadHitCount+ontargetReadHitCount)+"\n"); 701 | report.write("Target Stats\n"); 702 | report.write("Targets Hit:,"+hitTargetCount+",("+pc(hitTargetCount,totalTargets)+"%)\n"); 703 | //report.write("Target Buffers Hit:,"+hitTarget_bufferonly_Count+",("+pc(hitTarget_bufferonly_Count,totalTargets)+"%)\n"); 704 | report.write("Total Targets:,"+totalTargets+"\n"); 705 | report.write("Non target regions with high coverage:,"+nonTragetGoodHits+"\n"); 706 | report.write("Base Stats\n"); 707 | report.write("Bases Targeted:,"+totalTargetedBases+"\n"); 708 | report.write("Buffer Bases:,"+totalBufferBases+"\n"); 709 | report.write("Bases with 0 coverage:,"+covHisto[0]+",("+pc(covHisto[0],totalTargetedBases)+"%)\n"); 710 | report.write("Bases with 1+ coverage:,"+basesWithOneHitorMore+",("+pc(basesWithOneHitorMore,totalTargetedBases)+"%)\n"); 711 | report.write("Bases with 5+ coverage:,"+basesWith5HitorMore+",("+pc(basesWith5HitorMore,totalTargetedBases)+"%)\n"); 712 | report.write("Bases with 10+ coverage:,"+basesWith10HitsorMore+",("+pc(basesWith10HitsorMore,totalTargetedBases)+"%)\n"); 713 | report.write("Bases with 15+ coverage:,"+basesWith15HitorMore+",("+pc(basesWith15HitorMore,totalTargetedBases)+"%)\n"); 714 | report.write("Bases with 20+ coverage:,"+basesWith20HitsorMore+",("+pc(basesWith20HitsorMore,totalTargetedBases)+"%)\n"); 715 | report.write("Bases with 30+ coverage:,"+basesWith30HitorMore+",("+pc(basesWith30HitorMore,totalTargetedBases)+"%)\n"); 716 | report.write("Bases with 40+ coverage:,"+basesWith40HitsorMore+",("+pc(basesWith40HitsorMore,totalTargetedBases)+"%)\n"); 717 | report.write("Bases with 50+ coverage:,"+basesWith50HitsorMore+",("+pc(basesWith50HitsorMore,totalTargetedBases)+"%)\n"); 718 | report.write("Bases with 60+ coverage:,"+basesWith60HitsorMore+",("+pc(basesWith60HitsorMore,totalTargetedBases)+"%)\n"); 719 | report.write("Bases with 100+ coverage:,"+basesWith100HitsorMore+",("+pc(basesWith100HitsorMore,totalTargetedBases)+"%)\n"); 720 | report.write("\n"); 721 | report.write("Coverage Histogram (may look weird if target regions overlap...)\n"); 722 | 
for(int i = 0; i < covHisto.length; i++) 723 | report.write(i+","); 724 | report.write("\n"); 725 | for(int i = 0; i < covHisto.length; i++) 726 | report.write(covHisto[i]+","); 727 | report.write("\n"); 728 | report.write("Target and region coverage plot\n"); 729 | report.write("Position,5'count,3'count\n"); 730 | for(int i = 20; i <= prime_size; i+=20) 731 | { 732 | report.write(i+","+fivePrime[fivePrime.length-(i-1)-1]+","+threePrime[i-1]+"\n"); 733 | } 734 | report.write("%tar-Pos,count\n"); 735 | for(int i = 0; i < 101; i+=2) 736 | { 737 | report.write(i+","+targetCov[i]+"\n"); 738 | } 739 | report.close(); 740 | } 741 | } 742 | 743 | static boolean supertets = false; 744 | 745 | /** 746 | * Gets the target regions from the target file, and writes over the coverage fasta files, as well as determines many of the coverage statistics. 747 | * @param chromo Current chromosome 748 | * @param COVERAGE The array which contains the coverage of every base in the genome 749 | * @param covFasta The FileWriter for the target-specific coverage 750 | * @param missTarget Write a "wig" format file (good for UCSC) which shows you where all off-target regions with high coverage are 751 | * @param wgCoverage A FileWriter for the whole genome coverage... if null, this file won't be written 752 | * @throws Exception 753 | */ 754 | public static void getTargetsAndWriteCoverage(String chromo, int COVERAGE[], FileWriter covFasta, FileWriter missTarget, FileWriter wgCoverage) throws Exception 755 | { 756 | wgCoverage.write(">"+chromo+" 1 "+(size-1)); 757 | for(int i = 0; i < COVERAGE.length; i++) 758 | { 759 | if(i%100==0) wgCoverage.write("\n"); 760 | int cov = COVERAGE[i]; 761 | if(cov < 0) 762 | { 763 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i)+"\n"); 764 | cov = Short.MAX_VALUE; 765 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i)+"\n"); 766 | } 767 | int temp_cov = cov; 768 | if(temp_cov >= covHisto_WG.length) 769 | temp_cov = (covHisto_WG.length-1); 770 | covHisto_WG[temp_cov]++; 771 | totalGenomeCoverage+=cov; 772 | if(cov > 0){ 773 | basesWithOneHitorMore_WG++;} 774 | if(cov > 4){ 775 | basesWith5HitorMore_WG++;} 776 | if(cov > 9){ 777 | basesWith10HitsorMore_WG++;} 778 | if(cov > 14){ 779 | basesWith15HitorMore_WG++;} 780 | if(cov > 19){ 781 | basesWith20HitsorMore_WG++;} 782 | if(cov > 29){ 783 | basesWith30HitorMore_WG++;} 784 | if(cov > 39){ 785 | basesWith40HitsorMore_WG++;} 786 | if(cov > 49){ 787 | basesWith50HitsorMore_WG++;} 788 | if(cov > 59){ 789 | basesWith60HitsorMore_WG++;} 790 | if(cov > 99){ 791 | basesWith100HitsorMore_WG++;} 792 | 793 | wgCoverage.write(cov+" "); 794 | 795 | 796 | if(cov < coverage_forMedian_WG.length){ 797 | coverage_forMedian_WG[cov]++; 798 | }else{ 799 | int[] tmp = new int[coverage_forMedian_WG.length]; 800 | System.arraycopy(coverage_forMedian_WG, 0, tmp, 0, coverage_forMedian_WG.length); 801 | coverage_forMedian_WG = new int[cov+1]; 802 | System.arraycopy(tmp, 0, coverage_forMedian_WG, 0, tmp.length); 803 | coverage_forMedian_WG[cov]++; 804 | } 805 | } 806 | wgCoverage.write("\n"); 807 | 808 | if(covFasta != null){ 809 | for(int j = 0; j < targetChrs.length; j++) 810 | { 811 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) {continue;} 812 | //totalTargets++; 813 | int start = targetStarts[j]; 814 | int end = targetStops[j]; 815 | int length = end - start+1; 816 | boolean collectTargetCov = length > 99 ; 817 | 818 | //System.err.println(targetChrs[j]+" "+start+" "+end); 819 | 820 | if(supertets) 821 | { 822 | 
System.out.println(targetChrs[j]+" "+start+" "+end); 823 | } 824 | if(collectTargetCov) 825 | { 826 | for(int i = 0; i < prime_size; i++) 827 | { 828 | if((start - i) < 0 || (end+i) >= size){ 829 | continue; 830 | } 831 | fivePrime[i]+=COVERAGE[start-i]; 832 | threePrime[i]+=COVERAGE[end+i]; 833 | } 834 | } 835 | 836 | if(supertets) 837 | { 838 | 839 | for(int i = 0; i < 500; i++) 840 | { 841 | if((start-i) < 0) {continue;} 842 | System.out.print( (start-i)+" "); 843 | } 844 | System.out.print("\n"); 845 | for(int i = 0; i < 500; i++) 846 | { 847 | if((end+i) >= size) {continue;} 848 | System.out.print( (end+i)+" "); 849 | } 850 | System.out.print("\n"); 851 | 852 | supertets= false; 853 | } 854 | 855 | boolean targetHit = false; 856 | short[] pc = new short[101]; 857 | short[] pc2 = new short[101]; 858 | 859 | covFasta.write(">"+chromo+" "+start+" "+end+"\n"); 860 | boolean spaceit = false; 861 | if(end - start > 10000) spaceit = true; 862 | for(int i = 0; i < length; i++) 863 | { 864 | if((i+start) >= size) {continue;} 865 | if(spaceit && i%100 == 0) {covFasta.write("\n");} 866 | int cov = COVERAGE[i+start]; 867 | if(cov < 0) 868 | { 869 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i+start)+"\n"); 870 | cov = Short.MAX_VALUE; 871 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i+start)+"\n"); 872 | } 873 | int temp_cov = cov; 874 | if(temp_cov >= covHisto.length){ 875 | temp_cov = (covHisto.length-1); 876 | } 877 | 878 | covHisto[temp_cov]++; 879 | totalTargetCoverage+=cov; 880 | 881 | if(cov > 0) 882 | { 883 | targetHit=true; 884 | basesWithOneHitorMore++; 885 | } 886 | if(cov > 4){ 887 | basesWith5HitorMore++;} 888 | if(cov > 9){ 889 | basesWith10HitsorMore++;} 890 | if(cov > 14){ 891 | basesWith15HitorMore++;} 892 | if(cov > 19){ 893 | basesWith20HitsorMore++;} 894 | if(cov > 29){ 895 | basesWith30HitorMore++;} 896 | if(cov > 39){ 897 | basesWith40HitsorMore++;} 898 | if(cov > 49){ 899 | basesWith50HitsorMore++;} 900 | if(cov > 59){ 901 | basesWith60HitsorMore++;} 902 | if(cov > 99){ 903 | basesWith100HitsorMore++;} 904 | 905 | covFasta.write(cov+" "); 906 | 907 | 908 | if(cov < coverage_forMedian.length){ 909 | coverage_forMedian[cov]++; 910 | }else{ 911 | int[] tmp = new int[coverage_forMedian.length]; 912 | System.arraycopy(coverage_forMedian, 0, tmp, 0, coverage_forMedian.length); 913 | coverage_forMedian = new int[cov+1]; 914 | System.arraycopy(tmp, 0, coverage_forMedian, 0, tmp.length); 915 | coverage_forMedian[cov]++; 916 | } 917 | 918 | 919 | if(collectTargetCov) 920 | { 921 | int pcpos = (int)((double)i/(double)length*100+0.5); 922 | pc[pcpos] += cov; 923 | pc2[pcpos]++; 924 | } 925 | } 926 | covFasta.write("\n"); 927 | 928 | for(int index = 0; index < pc.length; index++) 929 | { 930 | if(pc2[index] != 0) 931 | { 932 | int d = (int) (((double)pc[index]/(double)pc2[index])+0.5); 933 | pc[index] = (short) d; 934 | } 935 | } 936 | 937 | for(int i = 0; i < 101; i++) 938 | { 939 | targetCov[i]+=pc[i]; 940 | } 941 | 942 | if(targetHit) 943 | { 944 | hitTargetCount++; 945 | }else{ 946 | missTarget.write(targetChrs[j]+"\t"+targetStarts[j]+"\t"+targetStops[j]+"\n"); 947 | boolean hit = false; 948 | for(int i = start - BUFFER; i < start && !hit; i++) 949 | { 950 | if(i < 0) {continue;} 951 | if(COVERAGE[i] > 0){ 952 | hit=true; 953 | } 954 | } 955 | for(int i = end; i < end+BUFFER && !hit; i++) 956 | { 957 | if(i >= size) {continue;} 958 | if(COVERAGE[i] > 0){ 959 | hit=true; 960 | } 961 | } 962 | if(hit){ 963 | hitTarget_bufferonly_Count++; 964 | } 965 
| } 966 | } 967 | } 968 | } 969 | 
970 | /**
971 | *
972 | * @param chromo the current chromosome to load
973 | * @param size the size of the chromosome
974 | * @return
975 | * @throws Exception
976 | */
977 | public static char[] getTargetPos(String chromo, int size) throws Exception
978 | {
979 | char[] TR = new char[size];
980 | chromo = removechr(chromo);
981 | for(int j = 0; j < targetChrs.length; j++)
982 | {
983 | try{
984 | if(!removechr(targetChrs[j]).equals(chromo))continue;
985 | int start = targetStarts[j];
986 | int end = targetStops[j];
987 | for(int i = start; i <= end; i++)
988 | {
989 | if(i >= size) {
990 | continue;
991 | }else{
992 | TR[i] = 1;
993 | }
994 | }
995 | for(int i = start -BUFFER; i < start; i++)
996 | {
997 | if(i >= 0 && TR[i] == 0) {TR[i] = 2; totalBufferBases++;}
998 | }
999 | for(int i = end+1; i <= end+BUFFER; i++)
1000 | {
1001 | if(i < size && TR[i] == 0) {TR[i] = 2; totalBufferBases++;}
1002 | }
1003 | }catch(Exception e){
1004 | System.err.println(e.getMessage()+" "+Arrays.toString(e.getStackTrace()));
1005 | }
1006 | }
1007 | return TR;
1008 | }
1009 | }
--------------------------------------------------------------------------------
/ExCID_v2.1/change_log.txt:
--------------------------------------------------------------------------------
1 | Bugfix[WGS_Stats_v1.java]: The fasta header written for each chromosome in the *wholeGenomeCov.fasta output had a stop position one base past the chromosome end (eg: >1 1 249250622, Should Be: >1 1 249250621)
2 | Bugfix[WGS_Stats_v1.java]: covHisto_WG variable was defined as 'int' and could go over the data type limit while handling whole genome samples. This is now changed to 'long'.
3 | Bugfix[batch_*_Lowcov.bed]: There was an extra tab between the 'length' and 'gene' fields. The file has been fixed to match the headers.
4 | Bugfix[batch_*_Lowcov.bed]: Interval length now matches the length of the 0-based genome coordinate positions in the 'batch_*_Lowcov.bed' file.
5 | Bugfix[ExCID.BatchScript_v2.1-threading_Final.pl]: Target BED files with more than 3 columns were not handled correctly in the annotation step. This has been fixed; BED files with more than 3 columns are now accepted as input.
6 | Bugfix[ExCID.BatchScript_v2.1-threading_Final.pl]: In batch mode, while obtaining the Gene% coverage, '-checkHGMD' was treated as a mandatory option. This has been corrected; the option is now optional.
7 | Bugfix: The "ExCID.grep_gene_list_pct_Final-miRNA.pl" and "ExCID.grep_gene_list_pct_Final.pl" scripts made an assumption about the location of the database files. This has been fixed; the location is now read from Config.txt.
8 | Bugfix[CapStatsV2.6.java]: A VERSION variable is used to write the version in the *.csv file.
9 | 
10 | Devel[batch_*_Lowcov.bed]: The format of the file will be changed to 1-based, making it consistent with all other outputs.
11 | Devel[WGS_Stats_v1.java]: There is an extra 0 at the start of the coverage information of each chromosome in the "*wholeGenomeCov.fasta" output file that needs to be removed.
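The two interval-length notes above come down to a one-base difference between conventions: a 0-based, half-open BED record covers stop - start bases, while the same bases written 1-based and inclusive cover stop - start + 1. A minimal Perl sketch with a hypothetical record, for illustration only:

    #!/usr/bin/perl
    use strict; use warnings;
    # The same five bases (positions 101..105 in 1-based terms) in both conventions:
    my ($start0, $stop0) = (100, 105);    # 0-based, half-open (BED-style)
    my ($start1, $stop1) = (101, 105);    # 1-based, inclusive
    print $stop0 - $start0, "\n";         # 5
    print $stop1 - $start1 + 1, "\n";     # 5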
12 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_CCDSDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | 8 | 9 | open(my $fh,"<$annotated_index") or die $!; 10 | 11 | 12 | while (my $line = <$fh>) { 13 | 14 | chomp($line); 15 | 16 | my ($chr,$start,$Stop,$ID) = split("\t",$line); 17 | my @transcript_ID_split = split("_exon_",$ID); 18 | my $transcript_ID = $transcript_ID_split[0]; 19 | 20 | my @grep = `grep -w "$transcript_ID" $HGNC `; 21 | 22 | if (scalar(@grep) == 1) { 23 | my @tmp = split("\t",$grep[0]); 24 | my $gene_name = $tmp[0]; 25 | print "$chr\t$start\t$Stop\t$ID\t$gene_name\n"; 26 | }else{ 27 | print "$chr\t$start\t$Stop\t$ID\t.\n"; 28 | #print STDERR "$chr\t$start\t$Stop\t$ID\t.\n"; 29 | } 30 | 31 | } 32 | 33 | close($fh); 34 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_RefSeqDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | 8 | 9 | open(my $fh,"<$annotated_index") or die $!; 10 | 11 | 12 | while (my $line = <$fh>) { 13 | 14 | chomp($line); 15 | 16 | my ($chr,$start,$Stop,$gene,$ID) = split("\t",$line); 17 | my @transcript_ID_tmp_split = split("_exon_",$ID); 18 | my $transcript_ID = $transcript_ID_tmp_split[0]; 19 | 20 | my @grep = `grep -w "$transcript_ID" $HGNC `; 21 | 22 | if (scalar(@grep) == 1) { 23 | my @tmp = split("\t",$grep[0]); 24 | my $gene_name = $tmp[0]; 25 | print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; 26 | }else{ 27 | print "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; 28 | # print STDERR "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; 29 | } 30 | 31 | } 32 | 33 | close($fh); 34 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_VEGADB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | my $VEGA_HGNC_names = $ARGV[2]; 8 | 9 | my %words_to_genes = (); # not a perfect index, but will do the same as "grep -w" 10 | open(my $hgnc_fh, "<$HGNC") or die $!; 11 | while (my $line = <$hgnc_fh>) { 12 | chomp $line; 13 | my @row = split(/[\s,]/, $line); 14 | my $gene = shift @row; 15 | map { $words_to_genes{$_} = $gene } @row; 16 | } 17 | close $hgnc_fh or die $!; 18 | 19 | my %vega_index = (); 20 | open(my $vega_fh, "<$VEGA_HGNC_names") or die $!; 21 | while (my $line = <$vega_fh>) { 22 | chomp $line; 23 | my ($a, $b) = split "\t", $line; 24 | $vega_index{$b} = $a; 25 | } 26 | close $vega_fh or die $!; 27 | 28 | open(my $fh,"<$annotated_index") or die $!; 29 | 30 | while (my $line = <$fh>) { 31 | 32 | chomp($line); 33 | 34 | my ($chr,$start,$Stop,$gene,$ID) = split("\t",$line); 35 | my @transcript_ID_tmp_split = split("_exon_",$ID); 36 | my $transcript_ID = $transcript_ID_tmp_split[0]; 37 | 38 | $transcript_ID = $vega_index{$transcript_ID} || $transcript_ID; 39 | my $gene_name = $words_to_genes{$transcript_ID} || '.'; 40 | 41 | print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; 42 | 43 | } 44 | 45 | close($fh); -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_mirnaDB.pl: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 | 
4 | 
5 | my $annotated_index = $ARGV[0];
6 | my $HGNC = $ARGV[1];
7 | 
8 | 
9 | open(my $fh,"<$annotated_index") or die $!;
10 | 
11 | 
12 | while (my $line = <$fh>) {
13 | 
14 | chomp($line);
15 | 
16 | my ($chr,$start,$Stop,$transcript_ID) = split("\t",$line);
17 | 
18 | my @grep = `grep -w "$transcript_ID" $HGNC `;
19 | 
20 | if (scalar(@grep) == 1) {
21 | my @tmp = split("\t",$grep[0]);
22 | my $gene_name = $tmp[0];
23 | print "$chr\t$start\t$Stop\t$transcript_ID\t$gene_name\n";
24 | }else{
25 | #my @tmp = split("-",$transcript_ID);
26 | #my $gene_name = "MIR";
27 | #for(my $i = 2; $i < scalar(@tmp); $i++){
28 | # $gene_name.=uc($tmp[$i]);
29 | #}
30 | print "$chr\t$start\t$Stop\t$transcript_ID\t.\n";
31 | #print STDERR "$chr\t$start\t$Stop\t$gene_name\t$transcript_ID\n";
32 | }
33 | 
34 | }
35 | 
36 | close($fh); -------------------------------------------------------------------------------- /ExCID_v2.1/creat_bed_UCSC_coding.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | 
3 | ### GLOBAL VARS ###
4 | ###################
5 | my $infile = $ARGV[0];
6 | ### MAIN CODE ###
7 | #################
8 | open(FIN,"$infile") || die "Can't open $infile: $!\n";
9 | 
10 | while(<FIN>){
11 | 
12 | chomp; my $line = $_;
13 | my ($chr, $cds_start, $cds_stop, $ID, $gene, $exon_num, $exon_start, $exon_stop) = split(/\s/, $line);
14 | unless($gene){$gene = " " ;}
15 | $chr=~s/^chr//;
16 | my ($id, $version) = split(/\./, $ID); #for ccds data
17 | my @chrarray = split(/\_/, $chr);
18 | my $chrNum = $chrarray[0];
19 | my $chrcount = scalar(@chrarray);
20 | if ($chrcount == 1){ #haplotype contigs like chr6_hap need to be screened out
21 | my @exon_startarray = split(/\,/, $exon_start);
22 | my @exon_stoparray = split(/\,/, $exon_stop);
23 | my $outfile1 = "${infile}-exon.bed";
24 | my $outfile2 = "${infile}-Coding_region.bed";
25 | open(OUT1, ">>$outfile1") || die "Can't open $outfile1: $!\n";
26 | #open(OUT2, ">>$outfile2") || die "Can't open $outfile2: $!\n";
27 | #print OUT2 "$chrNum\t$cds_start\t$cds_stop\t${id}\n" if($cds_start != $cds_stop);
28 | #print OUT2 "${chrNum}\t$cds_start\t$cds_stop\t${gene}\n";
29 | for (my $j=0; $j<$exon_num; $j++) { #for gene
30 | #for (my $j=0; $j<$exon_num; $j++){ #for ;
31 | next if ($exon_startarray[$j] > $cds_stop);
32 | next if ($exon_stoparray[$j]< $cds_start);
33 | 
34 | 
35 | if($cds_start != $cds_stop) {
36 | if ($exon_startarray[$j] >= $cds_start && $exon_stoparray[$j] <= $cds_stop) {
37 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
38 | }elsif($exon_startarray[$j] < $cds_start && $exon_stoparray[$j] <= $cds_stop) {
39 | print OUT1 "$chrNum\t$cds_start\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
40 | }elsif($exon_startarray[$j] >= $cds_start && $exon_stoparray[$j] > $cds_stop) {
41 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$cds_stop\t$gene\t${id}_exon_${j}\n";
42 | }elsif($exon_startarray[$j] < $cds_start && $exon_stoparray[$j] > $cds_stop) {
43 | print OUT1 "$chrNum\t$cds_start\t$cds_stop\t$gene\t${id}_exon_${j}\n";
44 | }
45 | }else{
46 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
47 | }
48 | 
49 | }
50 | }
51 | 
52 | }
53 | 
54 | 
55 | close(FIN);
56 | close(OUT1);
57 | #close(OUT2);
58 | -------------------------------------------------------------------------------- /ExCID_v2.1/database.tgz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/database.tgz -------------------------------------------------------------------------------- /ExCID_v2.1/external_programs/BEDTools.v2.17.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/external_programs/BEDTools.v2.17.0.tar.gz -------------------------------------------------------------------------------- /ExCID_v2.1/reformat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # This script takes in the annotated BED file (eg the VCrome+PKv2 fully annotated) and checks 3 | # the gene names and other gene names columns with the HGNC database to add the Approved names in 4 | # the gene names column and put other names in the other gene names column. 5 | # 6 | 7 | use strict; 8 | 9 | 10 | my $annotated_index = $ARGV[0]; 11 | my $HGNC = $ARGV[1]; 12 | 13 | 14 | open(my $fh,"<$annotated_index") or die $!; 15 | 16 | 17 | while (my $line = <$fh>) { 18 | 19 | chomp($line); 20 | 21 | $line=~s/\t-1\t/\t.\t/; 22 | my $not_anno = 0; 23 | my @line_tmp = split("\t",$line); 24 | my $target = $line_tmp[0]."\t".$line_tmp[1]."\t".$line_tmp[2]; 25 | my $gene_name=""; 26 | my $prev_name=""; 27 | my $synonyms_names = ""; 28 | my $refseq_IDs = $line_tmp[11]; 29 | my $CCDS_IDs = $line_tmp[12]; 30 | my $VEGA_IDs = $line_tmp[13]; 31 | my $miRNA_IDs = $line_tmp[14]; 32 | my $rest = join("\t", @line_tmp[15..(scalar(@line_tmp)-1)]); 33 | 34 | 35 | ## Gene_name 36 | 37 | if ($line_tmp[3] ne "." ) { 38 | $gene_name =$line_tmp[3].";"; 39 | my $check = $line_tmp[4].";"; 40 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 41 | $synonyms_names = $line_tmp[4].";"; ## First time Syn Name is given a value. 42 | } 43 | $check = $line_tmp[5].";"; 44 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 45 | $synonyms_names .= $line_tmp[5].";"; 46 | } 47 | $check = $line_tmp[6].";"; 48 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 49 | $synonyms_names .= $line_tmp[6].";"; 50 | } 51 | }elsif($line_tmp[4] ne "." ){ 52 | $gene_name =$line_tmp[4].";"; 53 | my $check = $line_tmp[5].";"; 54 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 55 | $synonyms_names = $line_tmp[5].";"; ## First time Syn Name is given a value. 56 | } 57 | $check = $line_tmp[6].";"; 58 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 59 | $synonyms_names .= $line_tmp[6].";"; 60 | } 61 | }elsif($line_tmp[5] ne "." ){ 62 | $gene_name =$line_tmp[5].";"; 63 | my $check = $line_tmp[6].";"; 64 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 65 | $synonyms_names = $line_tmp[6].";"; ## First time Syn Name is given a value. 66 | } 67 | }elsif($line_tmp[6] ne "." 
/ExCID_v2.1/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

BASEDIR=$(cd `dirname ${0}`; pwd)
source Config.txt

mkdir -p $DataBaseDir;
mkdir -p $AnnotationDir;

# Unpack and build the bundled bedtools package, then stage the binary in bin/.
tar xzvf external_programs/BEDTools.v2.17.0.tar.gz 1>>setup.log 2>>setup.log ;
cd bedtools-2.17.0 ;
make 1>>setup.log 2>>setup.log ;
cp bin/bedtools ../bin/ ;

cd $BASEDIR;
printf "Extracting external databases... ";
tar -xzf $BASEDIR/database.tgz;
printf "DONE.\n";

mv $BASEDIR/database/* $DataBaseDir/ ;

# Download the current RefSeq, CCDS, and VEGA tables for hg19 from UCSC.
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ccdsGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGtp.txt.gz $DataBaseDir ;
cp miRBASE_r20.gff2 $DataBaseDir/miRBASE_r20.gff2 ;

# Decompress the tables and reduce each genePred file to eight columns:
# chrom, cdsStart, cdsEnd, name, name2, exonCount, exonStarts, exonEnds.
if [ "$(uname)" == "Darwin" ]; then
    ls $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ;
    ls $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ;
elif [ "$(uname)" == "Linux" ]; then
    ls --color=never $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ;
    ls --color=never $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ;
fi

# vegaGtp maps VEGA IDs to HGNC names; keep only that two-column mapping.
rm $DataBaseDir/vegaGtp.txt.bed ;
awk -F "\t" '{print $1"\t"$2}' $DataBaseDir/vegaGtp.txt > $DataBaseDir/VEGA-hgnc_names ;
rm $DataBaseDir/vegaGtp.txt ;

# Extract chrom, start, stop, and the ID attribute from the miRBase GFF2 file.
awk -F "\t| " '{print $1"\t"$4"\t"$5"\t"$10}' $DataBaseDir/miRBASE_r20.gff2 | sed s/ID=\"// | sed s/\"\;// | grep "^#" -v > $DataBaseDir/miRBASE_r20.bed ;

# Build per-exon coding-region BED files for each source.
perl creat_bed_UCSC_coding.pl $DataBaseDir/refGene.txt.bed ;
perl creat_bed_UCSC_coding.pl $DataBaseDir/ccdsGene.txt.bed ;
perl creat_bed_UCSC_coding.pl $DataBaseDir/vegaGene.txt.bed ;

# For CCDS, keep the CCDS ID (column 5) as the name column.
awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$5}' $DataBaseDir/ccdsGene.txt.bed-exon.bed > tmp ; mv tmp $DataBaseDir/ccdsGene.txt.bed-exon.bed ;

sed s/^chr// $DataBaseDir/miRBASE_r20.bed > tmp ; mv tmp $DataBaseDir/miRBASE_r20.bed ;

perl Get_HGNC.pl > $DataBaseDir/HGNC_database.txt ;

# Attach HGNC gene symbols to each database, one background job per source.
perl check_HGNC_individual_mirnaDB.pl $DataBaseDir/miRBASE_r20.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/miRBASE_r20_HGNC.bed &
perl check_HGNC_individual_CCDSDB.pl $DataBaseDir/ccdsGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/ccdsGene.txt.bed-exon_HGNC.bed &
perl check_HGNC_individual_VEGADB.pl $DataBaseDir/vegaGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt $DataBaseDir/VEGA-hgnc_names > $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed &
perl check_HGNC_individual_RefSeqDB.pl $DataBaseDir/refGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/refGene.txt.bed-exon_HGNC.bed &

wait;

#grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed | $BASEDIR/bin/bedtools intersect -a - -b $DataBaseDir/refGene.txt.bed-Coding_region.bed -u > refGene.txt.bed-exon_HGNC.bed_tmp ;
#grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed -v | cat - refGene.txt.bed-exon_HGNC.bed_tmp > tmp ;
#mv tmp $DataBaseDir/refGene.txt.bed-exon_HGNC.bed ;

#$BASEDIR/bin/bedtools intersect -a $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed -b $DataBaseDir/vegaGene.txt.bed-Coding_region.bed -u > tmp;
#mv tmp $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed ;

# Remove intermediate files.
#rm refGene.txt.bed-exon_HGNC.bed_tmp ;
rm $DataBaseDir/miRBASE_r20.gff2 ;
rm $DataBaseDir/miRBASE_r20.bed ;
rm $DataBaseDir/ccdsGene.txt ;
rm $DataBaseDir/vegaGene.txt ;
rm $DataBaseDir/refGene.txt ;
rm $DataBaseDir/ccdsGene.txt.bed ;
rm $DataBaseDir/vegaGene.txt.bed ;
rm $DataBaseDir/refGene.txt.bed ;
rm $DataBaseDir/ccdsGene.txt.bed-exon.bed ;
rm $DataBaseDir/vegaGene.txt.bed-exon.bed ;
rm $DataBaseDir/refGene.txt.bed-exon.bed ;
--------------------------------------------------------------------------------
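A note on the awk reduction used in setup.sh above and in update_databases.sh below: it assumes the standard UCSC genePred column layout (bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, score, name2, ...). An equivalent Perl sketch with the fields spelled out (illustrative only):

    #!/usr/bin/perl -w
    use strict;

    # Reduce a UCSC genePred table (read line by line from STDIN or a file
    # argument) to the eight columns consumed by creat_bed_UCSC_coding.pl.
    while (<>) {
        chomp;
        my ($bin, $name, $chrom, $strand, $tx_start, $tx_end,
            $cds_start, $cds_end, $exon_count, $exon_starts, $exon_ends,
            $score, $name2) = split /\t/;
        print join("\t", $chrom, $cds_start, $cds_end, $name, $name2,
                   $exon_count, $exon_starts, $exon_ends), "\n";
    }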
"; 18 | tar -xzf $BASEDIR/database.tgz; 19 | printf "DONE.\n"; 20 | 21 | mv $BASEDIR/database/* $DataBaseDir/ ; 22 | 23 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz $DataBaseDir ; 24 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ccdsGene.txt.gz $DataBaseDir ; 25 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGene.txt.gz $DataBaseDir ; 26 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGtp.txt.gz $DataBaseDir ; 27 | cp miRBASE_r20.gff2 $DataBaseDir/miRBASE_r20.gff2 ; 28 | 29 | if [ "$(uname)" == "Darwin" ]; then 30 | ls $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ; 31 | ls $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ; 32 | elif [ "$(uname)" == "Linux" ]; then 33 | ls --color=never $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ; 34 | ls --color=never $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ; 35 | fi 36 | 37 | rm $DataBaseDir/vegaGtp.txt.bed ; 38 | awk -F "\t" '{print $1"\t"$2}' $DataBaseDir/vegaGtp.txt > $DataBaseDir/VEGA-hgnc_names ; 39 | rm $DataBaseDir/vegaGtp.txt ; 40 | 41 | awk -F "\t| " '{print $1"\t"$4"\t"$5"\t"$10}' $DataBaseDir/miRBASE_r20.gff2 | sed s/ID=\"// | sed s/\"\;// | grep "^#" -v > $DataBaseDir/miRBASE_r20.bed ; 42 | 43 | perl creat_bed_UCSC_coding.pl $DataBaseDir/refGene.txt.bed ; 44 | perl creat_bed_UCSC_coding.pl $DataBaseDir/ccdsGene.txt.bed ; 45 | perl creat_bed_UCSC_coding.pl $DataBaseDir/vegaGene.txt.bed ; 46 | 47 | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$5}' $DataBaseDir/ccdsGene.txt.bed-exon.bed > tmp ; mv tmp $DataBaseDir/ccdsGene.txt.bed-exon.bed ; 48 | 49 | sed s/^chr// $DataBaseDir/miRBASE_r20.bed > tmp ; mv tmp $DataBaseDir/miRBASE_r20.bed ; 50 | 51 | perl Get_HGNC.pl > $DataBaseDir/HGNC_database.txt ; 52 | 53 | perl check_HGNC_individual_mirnaDB.pl $DataBaseDir/miRBASE_r20.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/miRBASE_r20_HGNC.bed & 54 | perl check_HGNC_individual_CCDSDB.pl $DataBaseDir/ccdsGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/ccdsGene.txt.bed-exon_HGNC.bed & 55 | perl check_HGNC_individual_VEGADB.pl $DataBaseDir/vegaGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt $DataBaseDir/VEGA-hgnc_names > $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed & 56 | perl check_HGNC_individual_RefSeqDB.pl $DataBaseDir/refGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/refGene.txt.bed-exon_HGNC.bed & 57 | 58 | wait; 59 | 60 | #grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed | $BASEDIR/bin/bedtools intersect -a - -b $DataBaseDir/refGene.txt.bed-Coding_region.bed -u > refGene.txt.bed-exon_HGNC.bed_tmp ; 61 | #grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed -v | cat - refGene.txt.bed-exon_HGNC.bed_tmp > tmp ; 62 | #mv tmp $DataBaseDir/refGene.txt.bed-exon_HGNC.bed ; 63 | 64 | #$BASEDIR/bin/bedtools intersect -a $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed -b $DataBaseDir/vegaGene.txt.bed-Coding_region.bed -u > tmp; 65 | #mv tmp $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed ; 66 | 67 | 68 | #rm refGene.txt.bed-exon_HGNC.bed_tmp ; 69 | rm $DataBaseDir/miRBASE_r20.gff2 ; 70 | rm $DataBaseDir/miRBASE_r20.bed ; 71 | rm $DataBaseDir/ccdsGene.txt ; 72 | rm $DataBaseDir/vegaGene.txt ; 73 | rm $DataBaseDir/refGene.txt ; 74 | rm $DataBaseDir/ccdsGene.txt.bed ; 
/README.md:
--------------------------------------------------------------------------------
## About the ExCID Report ##

The Exome Coverage and Identification (ExCID) Report is a software tool developed at BCM-HGSC to assess sequence depth in user-defined targeted regions. The tool was initially developed for targeted capture applications, but its functionality has grown to encompass any sequencing application, from amplicon and targeted capture sequencing to WGS. ExCID analyzes the sequence depth of any sequencing event, reports the average coverage across each target, and identifies bases below a user-defined threshold (20X coverage by default). It also annotates each target with the latest gene, transcript, and exon information from RefSeq and the Human Gene Mutation Database (HGMD). The report can optionally output data tracks of the sample targets and coverage that can be visualized in the UCSC and IGV genome browsers.

## Outputs ##
* Length, average coverage, and gene annotations for each target in BCM-HGSC VCRome (or your custom design)
* All regions of low coverage, including their length and average coverage
* A coverage track across the regions of interest, viewable in a standard genome browser
* The percentage of each gene covered by the design, for all GeneTests genes (clinically important genes) and for any user-provided gene lists or gene databases

## Installation ##

Requirements:

1. Recent versions of Java and Perl.
2. On a Mac, you might need to install Xcode: https://developer.apple.com/xcode/downloads/

1) Fill in the information in Config.txt:

    DataBaseDir=/path/to/directory/to_put_the_databases/
    AnnotationDir=/path/to/directory/to_put_the_annotations_of_bed_files/

2) Run the setup.sh script from the command line:

    $ ./setup.sh

The setup script installs bedtools version 2.17.0, which is released under the GNU General Public License version 2 (GPLv2) and maintained by the Quinlan Laboratory at the University of Virginia. It then downloads the latest RefSeq, VEGA, CCDS, and miRBase databases for BED file annotation. The generated databases cover coding regions only.

## Usage ##

1) To run against the VCRome regions:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold>

Multiple BAM files can be provided, e.g.:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file 1> -bam <BAM file 2> -bam <BAM file 3> -m <minimum threshold>

or

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bamList <BAM list file> -m <minimum threshold>

where the BAM list is a text file with one BAM file per line.
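For example, a BAM list is plain text with one path per line (the paths here are hypothetical):

    /path/to/sample1.bam
    /path/to/sample2.bam
    /path/to/sample3.bam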
If the minimum threshold is not provided by the user, 20x coverage is assumed by default.

2) To run with a user-defined BED file:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file>

Multiple BAM files can be provided, e.g.:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file 1> -bam <BAM file 2> -bam <BAM file 3> -m <minimum threshold> -i <BED file>

If the minimum threshold is not provided by the user, 20x coverage is assumed by default.

3) To generate a wig file for all target regions and a BED file of the low-coverage regions, both viewable in a standard genome browser, use the '-wig' option:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file> -wig

4) The '-d' option excludes duplicate reads when generating the coverage statistics:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file> -d
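These options can be combined. For instance (with hypothetical file names), to analyze two BAM files against a custom design at a 30x threshold, ignoring duplicate reads and emitting browser tracks:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam sample1.bam -bam sample2.bam -m 30 -i my_design.bed -wig -d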
The BED file will be annotated with RefSeq, CCDS, VEGA, and miRBase gene annotations. These databases can be updated by running:

    $ ./update_databases.sh

The GeneTests genes were compiled and annotated in November 2013.

## File Formats ##

1) To obtain gene coverage percentages from a custom gene database, the database should have the following format:

    CHR START STOP GENE|TRANSCRIPT_exon_number

Example:

    10 100177320 100177483 HPS1|NM_000195_cds_0
    10 100177931 100178014 HPS1|NM_000195_cds_1
    10 100179801 100179915 HPS1|NM_000195_cds_2

--------------------------------------------------------------------------------