├── ExCID_v2.0.zip ├── ExCID_v2.1 ├── Config.txt ├── ExCID.BatchScript_v2.1-threading_Final.pl ├── ExCID.grep_gene_list_pct_Final-miRNA.pl ├── ExCID.grep_gene_list_pct_Final.pl ├── ExCID_v2.1-Batch.pl ├── Get_HGNC.pl ├── bed_file-annotator_V2_CCDS-miRBASE.pl ├── bed_file-annotator_V2_RefSeq-VEGA.pl ├── bin │ ├── CapStatsV2.5.jar │ ├── CapStatsV2.6.jar │ ├── CovFasta_Generator.jar │ ├── Java_code │ │ ├── CaptureStatsBAM5_extended.java │ │ ├── ParseOpts.java │ │ ├── WGS_Stats_v1.java │ │ ├── picard-1.88.jar │ │ └── sam-1.88.jar │ ├── WGSStats_v1.1.jar │ └── WGSStats_v1.jar ├── change_log.txt ├── check_HGNC_individual_CCDSDB.pl ├── check_HGNC_individual_RefSeqDB.pl ├── check_HGNC_individual_VEGADB.pl ├── check_HGNC_individual_mirnaDB.pl ├── creat_bed_UCSC_coding.pl ├── database.tgz ├── external_programs │ └── BEDTools.v2.17.0.tar.gz ├── miRBASE_r20.gff2 ├── reformat.pl ├── setup.sh └── update_databases.sh └── README.md /ExCID_v2.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.0.zip -------------------------------------------------------------------------------- /ExCID_v2.1/Config.txt: -------------------------------------------------------------------------------- 1 | DataBaseDir= 2 | AnnotationDir= 3 | -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.BatchScript_v2.1-threading_Final.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/ExCID.BatchScript_v2.1-threading_Final.pl -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.grep_gene_list_pct_Final-miRNA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use warnings; 4 | use diagnostics; 5 | use Getopt::Long; 6 | use File::Path; 7 | use Time::localtime; 8 | use Fcntl qw(:flock); 9 | use File::Basename; 10 | 11 | 12 | unless (scalar @ARGV > 0){USAGE(); exit; } 13 | 14 | my %opt; 15 | my $script_dir; 16 | my $script_dir_tmp = $0; 17 | $script_dir_tmp =~m/^.+\//; 18 | $script_dir=$&; 19 | 20 | my @tmp = `grep "DataBaseDir=" $script_dir/Config.txt`; 21 | $tmp[0]=~s/DataBaseDir=//; 22 | chomp($tmp[0]); 23 | my $database_dir = $tmp[0]; 24 | 25 | my $database1 = "$database_dir/refGene.txt.bed-exon_HGNC.bed"; 26 | my $database2 = "$database_dir/ccdsGene.txt.bed-exon_HGNC.bed"; 27 | my $database3 = "$database_dir/vegaGene.txt.bed-exon_HGNC.bed"; 28 | my $database4 = "$database_dir/miRBASE_r20_HGNC.bed"; 29 | my $combined=''; 30 | 31 | 32 | ### OPTIONS ### 33 | GetOptions('i:s' => \$opt{i}, 'l:s' => \$opt{l}, 'list:s' => \$opt{g}, 'db:s' => \$opt{db}) || &USAGE; 34 | my $bed = $opt{i} || "null"; 35 | my $low_cov_file = $opt{l} || "null"; 36 | my $gene_list = $opt{g} || "null"; 37 | my $targeted_database = $opt{db} || "null" ; 38 | my $pct_file; 39 | my @genes_lest; 40 | 41 | 42 | if ($gene_list eq "null" && $targeted_database eq "null") { 43 | print STDERR "Please provide either list of genes (HGNC Symbols) or the targeted Gene database. 
Check the documentation for database format\n"; 44 | exit; 45 | } 46 | 47 | if ($bed eq "null") { 48 | print STDERR "Please provide Targeted Bedfile.\n"; 49 | exit; 50 | } 51 | 52 | if($low_cov_file eq "null"){ 53 | print STDERR "Please provide inadequately covered bases in bed file format.\n"; 54 | exit; 55 | } 56 | 57 | 58 | 59 | 60 | if($targeted_database && -e $targeted_database ){ 61 | print STDERR "Targeted Data base provided. Obtaining Gene percent coverages.\n"; 62 | $pct_file = genes_check_db($low_cov_file, $bed, $targeted_database); 63 | exit 0; 64 | }else{ 65 | print STDERR "Some issue with the run. Please check the commands.\n"; 66 | exit -1; 67 | } 68 | 69 | 70 | 71 | ########## SUBROUTINES ########### 72 | sub USAGE { 73 | print "\nUSAGE: $0 -i -l -list -db \n"; 74 | print " -i: Annotated bed file\n"; 75 | print " -l: Low cov bed file\n"; 76 | print " -list: List of genes. one per line\n"; 77 | print " -db: Database of interested Genes. Please see documentation for database format. \n"; 78 | exit; 79 | } 80 | 81 | sub genes_check_db { 82 | 83 | my ($low_cov_file, $anno_bed, $db_list) = @_; 84 | my $db_name = basename($db_list); 85 | my $out_file = $low_cov_file; 86 | $out_file=~ s{.*/}{}; # remove path 87 | $out_file.="-miRBASE-anno.bed"; 88 | my $nottargeted_rfs = $anno_bed; 89 | $nottargeted_rfs=~ s{.*/}{}; # remove path 90 | $nottargeted_rfs.="-notTargeted_in_DB-$out_file.bed"; 91 | my $tmp = $out_file; 92 | $tmp=~ s{.*/}{}; # remove path 93 | $tmp=~s/\.bed$//; 94 | $combined = $nottargeted_rfs; 95 | $combined=~s/\-$out_file//; 96 | $combined=~s/\.bed$//; 97 | $combined.="-$tmp.bed"; 98 | 99 | system("$script_dir/bin/bedtools subtract -a $db_list -b $anno_bed > $nottargeted_rfs"); 100 | system("$script_dir/bin/bedtools intersect -a $db_list -b $low_cov_file > $out_file"); 101 | system("cat $nottargeted_rfs $out_file | $script_dir/bin/bedtools sort -i > $combined"); 102 | system("$script_dir/bin/bedtools merge -i $combined > $combined-tmp"); 103 | system("$script_dir/bin/bedtools intersect -a $db_list -b $combined-tmp > $combined"); 104 | my $db_list_transcript_size = get_Transcirpt_size_db($db_list); 105 | my $out_FILE = get_Transcirpt_size($low_cov_file,$combined, $db_list_transcript_size); 106 | my $rm_file = "$combined-tmp"; 107 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 108 | $rm_file = $out_file; 109 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 110 | $rm_file = $nottargeted_rfs; 111 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 112 | $rm_file = "$combined"; 113 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 114 | 115 | $db_name=~ s{\.[^.]+$}{}; # removes extension 116 | system("mv $out_FILE $db_name-pct.txt"); 117 | return "$db_name-pct.txt"; 118 | 119 | return $out_FILE; 120 | } 121 | 122 | sub get_Transcirpt_size{ 123 | my ($low_cov_file,$combined,$db)=@_; 124 | 125 | my $tmp = $combined; 126 | $tmp=~ s{.*/}{}; # remove path 127 | $tmp=~s/\.bed$//; 128 | 129 | 130 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 131 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 132 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 133 | 134 | while(<$in>) 135 | { 136 | chomp; my $line = $_; 137 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 138 | my $exonsize = $stop - $start + 1; 139 | my @tranarray = split(/\;/, $gene); 140 | my $arraysize = scalar(@tranarray) - 1; 141 | for (my $j=0; $j<=$arraysize; $j++) { 142 | print $out 
"$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 143 | } 144 | } 145 | 146 | close($in); 147 | close($out); 148 | my $outFile = get_pct($outfile_final,$db); 149 | return $outFile; 150 | } 151 | 152 | sub get_Transcirpt_size_db{ 153 | my ($combined)=@_; 154 | 155 | my $tmp = $combined; 156 | $tmp=~ s{.*/}{}; # remove path 157 | $tmp=~s/\.bed$//; 158 | 159 | 160 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 161 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 162 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 163 | 164 | while(<$in>) 165 | { 166 | chomp; my $line = $_; 167 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 168 | my $exonsize = $stop - $start + 1; 169 | my @tranarray = split(/\;/, $gene); 170 | my $arraysize = scalar(@tranarray) - 1; 171 | for (my $j=0; $j<=$arraysize; $j++) { 172 | print $out "$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 173 | } 174 | } 175 | 176 | close($in); 177 | close($out); 178 | return $outfile_final; 179 | } 180 | 181 | sub get_pct{ 182 | 183 | my ($infile,$control) =@_; 184 | 185 | 186 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 187 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 188 | 189 | my $outfile = $infile; 190 | $outfile=~s/\_transcriptSIZE\.txt$/\_pct\.txt/; 191 | my %control; 192 | my %file_data; 193 | open(my $fh,"<$control") or die $!; 194 | 195 | while (my $line = <$fh>) { 196 | chomp($line); 197 | my @data1 = split("\t",$line); 198 | #print STDERR $data1[0]."\n"; 199 | my @data = split(/\|/,$data1[0]); 200 | if (scalar @data1 ==1) { 201 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = 0; 202 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 203 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 204 | } 205 | if (scalar @data1 ==5) { 206 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = $data1[1]; 207 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 208 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 209 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"chr"} = $data1[2]; 210 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"start"} = $data1[3]; 211 | $control{"$data1[0]_$data1[2]_$data1[3]"}{"stop"} = $data1[4]; 212 | } 213 | } 214 | close($fh); 215 | 216 | open(my $fh1,"<$infile") or die $!; 217 | while (my $line = <$fh1>) { 218 | chomp($line); 219 | my @data1 = split("\t",$line); 220 | my @data = split(/\|/,$data1[0]); 221 | if (scalar @data1 ==1) { 222 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = 0; 223 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 224 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 225 | } 226 | if (scalar @data1 ==5) { 227 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"val"} = $data1[1]; 228 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"Gene"} = $data[0]; 229 | $file_data{"$data1[0]_$data1[2]_$data1[3]"}{"ID"} = $data[1]; 230 | } 231 | } 232 | close($fh1); 233 | 234 | my @low_region_keys = keys %file_data; 235 | open(my $fho,">$outfile") or die $!; 236 | 237 | foreach my $key (keys %control){ 238 | my @key_split = split(/_/,$key); 239 | my $no_match = 1; 240 | 241 | my @matches = grep { /$key_split[0]/ } @low_region_keys; 242 | foreach my $match (@matches){ 243 | my @low_key_split = split(/_/,$match); 244 | 245 | if (($key_split[0] eq $low_key_split[0]) && ($key_split[1] eq $low_key_split[1]) && $key_split[2] <= $low_key_split[2]) { 246 | my $tmp = $file_data{$match}{"val"}/$control{$key}{"val"}; 247 | 
my $pct = sprintf("%.3f",(1-$tmp)*100); 248 | 249 | my $grep_key = $control{$key}{"Gene"}."|".$control{$key}{"ID"}; 250 | my @regions = `grep -w \"$grep_key\" $combined `; 251 | my $lowcov_coords=''; 252 | foreach my $lowcov_region (@regions){ 253 | my @tmp = split ("\t",$lowcov_region); 254 | if ("$tmp[0]" eq $control{$key}{"chr"} && $tmp[1]>= $control{$key}{"start"} && $tmp[2] <= $control{$key}{"stop"}) { 255 | $lowcov_coords.="$tmp[1]-$tmp[2];"; 256 | } 257 | } 258 | $lowcov_coords=~s/;$//; 259 | 260 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t".$control{$key}{"ID"}."\t1\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 261 | $no_match = 0; 262 | } 263 | 264 | } 265 | if ($no_match == 1) { 266 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t".$control{$key}{"ID"}."\t1\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 267 | } 268 | } 269 | close($fho); 270 | my $rm_file = $infile; 271 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 272 | 273 | return $outfile; 274 | } 275 | 276 | ############################################################################################################################## 277 | 278 | sub timestamp { 279 | my $t = localtime; 280 | return sprintf( "%04d-%02d-%02d_%02d-%02d-%02d", $t->year + 1900, $t->mon + 1, $t->mday, $t->hour, $t->min, $t->sec ); 281 | } -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID.grep_gene_list_pct_Final.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use warnings; 4 | use diagnostics; 5 | use Getopt::Long; 6 | use File::Path; 7 | use Time::localtime; 8 | use Fcntl qw(:flock); 9 | use File::Basename; 10 | 11 | 12 | unless (scalar @ARGV > 0){USAGE(); exit; } 13 | 14 | my %opt; 15 | my $script_dir; 16 | my $script_dir_tmp = $0; 17 | $script_dir_tmp =~m/^.+\//; 18 | $script_dir=$&; 19 | 20 | my @tmp = `grep "DataBaseDir=" $script_dir/Config.txt`; 21 | $tmp[0]=~s/DataBaseDir=//; 22 | chomp($tmp[0]); 23 | my $database_dir = $tmp[0]; 24 | 25 | my $database1 = "$database_dir/refGene.txt.bed-exon_HGNC.bed"; 26 | my $database2 = "$database_dir/ccdsGene.txt.bed-exon_HGNC.bed"; 27 | my $database3 = "$database_dir/vegaGene.txt.bed-exon_HGNC.bed"; 28 | my $database4 = "$database_dir/miRBASE_r20_HGNC.bed"; 29 | my $combined=''; 30 | my $HGMD_db = "$database_dir/HGMD_2014_v4.bed"; 31 | 32 | 33 | ### OPTIONS ### 34 | my $check_HGMD; 35 | GetOptions('i:s' => \$opt{i}, 'l:s' => \$opt{l}, 'list:s' => \$opt{g}, 'db:s' => \$opt{db}, 'checkHGMD' => \$check_HGMD) || &USAGE; 36 | my $bed = $opt{i} || "null"; 37 | my $low_cov_file = $opt{l} || "null"; 38 | my $gene_list; 39 | $gene_list = $opt{g} if($opt{g}); 40 | my $targeted_database; 41 | $targeted_database = $opt{db} if($opt{db}) ; 42 | my $pct_file; 43 | my @genes_lest; 44 | 45 | 46 | if (!$gene_list && !$targeted_database) { 47 | print STDERR "Please provide either list of genes (HGNC Symbols) or the targeted Gene database. Check the documentation for database format\n"; 48 | exit; 49 | } 50 | 51 | if ($bed eq "null") { 52 | print STDERR "Please provide Targeted Bedfile.\n"; 53 | exit; 54 | } 55 | 56 | if($low_cov_file eq "null"){ 57 | print STDERR "Please provide inadequately covered bases in bed file format.\n"; 58 | exit; 59 | } 60 | 61 | 62 | 63 | if($targeted_database && -e $targeted_database && !$gene_list){ 64 | print STDERR "Targeted Data base provided. 
Obtaining Gene percent coverages.\n"; 65 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 66 | if($check_HGMD) { 67 | HGMDcheck($pct_file,@genes_lest) ; 68 | }else{ 69 | averageGene($pct_file,@genes_lest) ; 70 | } 71 | exit 0 ; 72 | }elsif($targeted_database && -e $targeted_database && $gene_list && -e $gene_list){ 73 | print STDERR "Targeted Data base provided. Obtaining Gene percent coverages.\n"; 74 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 75 | open(my $glfh, "< $gene_list") or die $!; 76 | @genes_lest= <$glfh>; 77 | close($glfh); 78 | if($check_HGMD) { 79 | HGMDcheck($pct_file,@genes_lest) ; 80 | }else{ 81 | averageGene($pct_file,@genes_lest) ; 82 | } 83 | exit 0 ; 84 | }elsif($gene_list && -e $gene_list && !$targeted_database){ 85 | open(my $glfh, "< $gene_list") or die $!; 86 | @genes_lest= <$glfh>; 87 | close($glfh); 88 | }else{ 89 | print STDERR "Some issue with the run. Please check the commands.\n"; 90 | } 91 | 92 | my $output_dir; 93 | my $output_dir_tmp = $low_cov_file; 94 | $output_dir_tmp =~m/^.+\//; 95 | $output_dir=$&; 96 | 97 | $targeted_database = $output_dir."/".basename($gene_list)."-database.bed"; 98 | my $targeted_miRBASE = $output_dir."/".basename($gene_list)."-miRBASE.bed"; 99 | open(my $gldb, "> $targeted_database") or die $!; 100 | open(my $gldbm, "> $targeted_miRBASE") or die $!; 101 | my $mirfound = 0; 102 | foreach my $gene (@genes_lest){ 103 | chomp($gene); 104 | next unless(length($gene) > 0); 105 | my $found = 0; 106 | my @regions = `grep -w -P \"\t$gene\$\" $database1 `; 107 | if (scalar(@regions) > 0) { 108 | foreach my $reg (@regions){ 109 | chomp($reg); 110 | my @tmp_split = split("\t",$reg); 111 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[5]|$tmp_split[4]\n" ; 112 | } 113 | $found = 1; 114 | } 115 | 116 | @regions = `grep -w -P \"\t$gene\$\" $database2 `; 117 | if (scalar(@regions) > 0) { 118 | foreach my $reg (@regions){ 119 | chomp($reg); 120 | my @tmp_split = split("\t",$reg); 121 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[4]|$tmp_split[3]\n" ; 122 | } 123 | $found = 1; 124 | } 125 | 126 | @regions = `grep -w -P \"\t$gene\$\" $database3 `; 127 | if (scalar(@regions) > 0) { 128 | foreach my $reg (@regions){ 129 | chomp($reg); 130 | my @tmp_split = split("\t",$reg); 131 | print $gldb "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[5]|$tmp_split[4]\n" ; 132 | } 133 | $found = 1; 134 | } 135 | 136 | @regions = `grep -w -P \"\t$gene\$\" $database4 `; 137 | if (scalar(@regions) > 0) { 138 | foreach my $reg (@regions){ 139 | chomp($reg); 140 | my @tmp_split = split("\t",$reg); 141 | print $gldbm "$tmp_split[0]\t$tmp_split[1]\t$tmp_split[2]\t$tmp_split[4]|$tmp_split[3]\n" ; 142 | } 143 | $found = 1; 144 | $mirfound = 1; 145 | } 146 | 147 | if ($found != 1) { 148 | print STDERR "$gene is not a HGNC symbol. 
Please check the Gene or update the databases.\n"; 149 | } 150 | 151 | } 152 | 153 | close($gldb); 154 | close($gldbm); 155 | 156 | $pct_file=genes_check_db($low_cov_file, $bed, $targeted_database); 157 | 158 | open(my $glfh, "< $gene_list") or die $!; 159 | @genes_lest= <$glfh>; 160 | close($glfh); 161 | 162 | if($check_HGMD) { 163 | HGMDcheck($pct_file,@genes_lest) ; 164 | }else{ 165 | averageGene($pct_file,@genes_lest) ; 166 | } 167 | 168 | if ($mirfound == 1) { 169 | system("$script_dir/ExCID.grep_gene_list_pct_Final-miRNA.pl -i $bed -l $low_cov_file -db $targeted_miRBASE "); 170 | } 171 | 172 | 173 | 174 | ########## SUBROUTINES ########### 175 | sub USAGE { 176 | print "\nUSAGE: $0 -i -l -list -db \n"; 177 | print " -i: Annotated bed file\n"; 178 | print " -l: Low cov bed file\n"; 179 | print " -list: List of genes. one per line\n"; 180 | print " -checkHGMD: Output only HGMD Transcripts if present or Average among all the transcripts.\n"; 181 | print " -db: Database of interested Genes. Please see documentation for database format. \n"; 182 | exit; 183 | } 184 | 185 | sub genes_check_db { 186 | 187 | my ($low_cov_file, $Bed_file, $db_list) = @_; 188 | 189 | my $output_file_name = basename($low_cov_file); 190 | $output_file_name=~ s{\.[^.]+$}{}; # removes extension 191 | my $output_dir; 192 | my $output_dir_tmp = $low_cov_file; 193 | $output_dir_tmp =~m/^.+\//; 194 | $output_dir=$&; 195 | my $db_name = basename($db_list); 196 | my $out_file = $output_file_name; 197 | $out_file=~ s{.*/}{}; # remove path 198 | $out_file.="-$db_name.bed"; 199 | my $nottargeted = $Bed_file; 200 | $nottargeted=~ s{.*/}{}; # remove path 201 | $nottargeted.="-notTrgtdin-$db_name-$output_file_name.bed"; 202 | my $tmp = $out_file; 203 | $tmp=~ s{.*/}{}; # remove path 204 | $tmp=~s/\.bed$//; 205 | $combined = $nottargeted; 206 | $combined=~s/\-$output_file_name//; 207 | $combined=~s/\.bed$//; 208 | $combined.="-$tmp.bed"; 209 | 210 | system("$script_dir/bin/bedtools subtract -a $db_list -b $Bed_file > $nottargeted"); 211 | system("$script_dir/bin/bedtools intersect -a $db_list -b $low_cov_file > $out_file"); 212 | system("cat $nottargeted $out_file | $script_dir/bin/bedtools sort -i > $combined"); 213 | system("$script_dir/bin/bedtools merge -i $combined > $combined-tmp"); 214 | system("$script_dir/bin/bedtools intersect -a $db_list -b $combined-tmp > $combined"); 215 | my $db_list_transcript_size = get_Transcirpt_size_db($low_cov_file,$db_list); 216 | my $out_FILE = get_Transcirpt_size($low_cov_file,$combined, $db_list_transcript_size); 217 | 218 | my $rm_file = "$combined-tmp"; 219 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 220 | $rm_file = $out_file; 221 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 222 | $rm_file = $nottargeted; 223 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 224 | $rm_file = "$combined"; 225 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 226 | $rm_file = $db_list_transcript_size; 227 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 228 | $rm_file=~s/\.txt$/\_perExon\.txt/; 229 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 230 | 231 | 232 | $db_name=~ s{\.[^.]+$}{}; # removes extension 233 | system("mv $out_FILE $output_dir/$output_file_name-$db_name-pct.txt"); 234 | $out_FILE=~s/\_pct\.txt$/\_perExon\_pct\.txt/; 235 | system("mv $out_FILE $output_dir/$output_file_name-$db_name-perExon_pct.txt"); 236 | return "$output_dir/$output_file_name-$db_name-pct.txt"; 237 | } 238 | 239 | sub get_Transcirpt_size{ 240 | my 
($low_cov_file,$combined,$db)=@_; 241 | 242 | my $tmp = $combined; 243 | $tmp=~ s{.*/}{}; # remove path 244 | $tmp=~s/\.bed$//; 245 | 246 | 247 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 248 | my $outfile_final_exon = "$tmp"."_transcriptSIZE_perExon.txt"; 249 | my %sizehash; 250 | open(my $in,"$combined") || die "Can't open $combined: $!\n"; 251 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 252 | open(my $outex, ">$outfile_final_exon") || die "Can't open $outfile_final_exon: $!\n"; 253 | 254 | while(<$in>) 255 | { 256 | chomp; my $line = $_; 257 | next unless (length($line) != 0); 258 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 259 | my $exonsize = $stop - $start + 1; 260 | my @tranarray = split(/\;/, $gene); 261 | my $arraysize = scalar(@tranarray) - 1; 262 | for (my $j=0; $j<=$arraysize; $j++) { 263 | print $outex "$tranarray[$j]\t$exonsize\n"; 264 | my @tmp = split(/\_/, $tranarray[$j]); 265 | my $unit = join("_",@tmp[0..(scalar(@tmp)-3)]); 266 | my $cds = $tmp[(scalar(@tmp)-2)]; 267 | my $exon = $tmp[(scalar(@tmp)-1)]; 268 | push @{$sizehash{$unit}}, $exonsize; 269 | } 270 | } 271 | 272 | foreach my $key ( keys %sizehash ) 273 | { 274 | my $total_size = eval join '+', @{$sizehash{$key}}; 275 | print $out "$key\t$total_size\n"; 276 | } 277 | 278 | close($in); 279 | close($out); 280 | close($outex); 281 | my $outFile = get_pct($outfile_final,$db); 282 | my $outFile_perExon = get_pct_perExon($outfile_final,$db); 283 | return $outFile; 284 | } 285 | 286 | sub get_Transcirpt_size_db{ 287 | my ($low_cov_file, $db_list)=@_; 288 | 289 | my $tmp = $low_cov_file; 290 | $tmp=~ s{\.[^.]+$}{}; # removes extension 291 | 292 | my $tmp1 = basename($db_list); 293 | $tmp.="-$tmp1"; 294 | $tmp=~s/\.bed$//; 295 | 296 | my $outfile_final = "$tmp"."_transcriptSIZE.txt"; 297 | my $outfile_final_exon = "$tmp"."_transcriptSIZE_perExon.txt"; 298 | my %sizehash; 299 | open(my $in,"< $db_list") || die "Can't open $db_list: $!\n"; 300 | open(my $out, ">$outfile_final") || die "Can't open $outfile_final: $!\n"; 301 | open(my $outex, ">$outfile_final_exon") || die "Can't open $outfile_final_exon: $!\n"; 302 | 303 | while(<$in>) 304 | { 305 | chomp; my $line = $_; 306 | unless(length($line) != 0){next;} 307 | my ($chr, $start, $stop, $gene) = split(/\s/, $line); 308 | my $exonsize = $stop - $start + 1; 309 | my @tranarray = split(/\;/, $gene); 310 | my $arraysize = scalar(@tranarray) - 1; 311 | for (my $j=0; $j<=$arraysize; $j++) { 312 | print $outex "$tranarray[$j]\t$exonsize\t$chr\t$start\t$stop\n"; 313 | my @tmp = split(/\_/, $tranarray[$j]); 314 | my $unit = join("_",@tmp[0..(scalar(@tmp)-3)]); 315 | my $cds = $tmp[(scalar(@tmp)-2)]; 316 | my $exon = $tmp[(scalar(@tmp)-1)]; 317 | push @{$sizehash{$unit}{'exonsize'}}, $exonsize; 318 | $sizehash{$unit}{'chr'} = $chr; 319 | } 320 | } 321 | 322 | foreach my $key ( keys %sizehash ) 323 | { 324 | my $total_size = eval join '+', @{$sizehash{$key}{'exonsize'}}; 325 | print $out "$key\t$total_size\t".$sizehash{$key}{'chr'}."\t".scalar(@{$sizehash{$key}{'exonsize'}})."\n"; 326 | my @tmp= split(/\|/,$key); 327 | push @genes_lest,$tmp[0]; 328 | } 329 | 330 | close($in); 331 | close($out); 332 | close($outex); 333 | return $outfile_final; 334 | } 335 | 336 | sub get_pct{ 337 | 338 | my ($infile,$control) =@_; 339 | 340 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 341 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 342 | 343 | my $outfile = $infile; 344 | 
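# NOTE (added commentary, not in upstream ExCID): the report name is derived by
# swapping suffixes via the substitution below, e.g. a hypothetical
# Sample1_transcriptSIZE.txt becomes Sample1_pct.txt.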
$outfile=~s/\_transcriptSIZE\.txt$/\_pct\.txt/; 345 | my %control; 346 | my %file_data; 347 | open(my $fh,"<$control") or die $!; 348 | 349 | while (my $line = <$fh>) { 350 | chomp($line); 351 | my @data1 = split("\t",$line); 352 | #print STDERR $data1[0]."\n"; 353 | my @data = split(/\|/,$data1[0]); 354 | #print STDERR $data[0]."\n"; 355 | if (scalar @data1 ==1) { 356 | $control{$data1[0]}{"val"} = 0; 357 | $control{$data1[0]}{"Gene"} = $data[0]; 358 | } 359 | if (scalar @data1 ==4) { 360 | $control{$data1[0]}{"val"} = $data1[1]; 361 | $control{$data1[0]}{"Gene"} = $data[0]; 362 | $control{$data1[0]}{"chr"} = $data1[2]; 363 | $control{$data1[0]}{"exons"} = $data1[3]; 364 | } 365 | } 366 | close($fh); 367 | 368 | open(my $fh1,"<$infile") or die $!; 369 | while (my $line = <$fh1>) { 370 | chomp($line); 371 | my @data1 = split("\t",$line); 372 | my @data = split(/\|/,$data1[0]); 373 | #print STDERR scalar @data."\n"; 374 | if (scalar @data1 ==1) { 375 | $file_data{$data1[0]}{"val"} = 0; 376 | $file_data{$data1[0]}{"Gene"} = $data[0]; 377 | } 378 | if (scalar @data1 ==2) { 379 | $file_data{$data1[0]}{"val"} = $data1[1]; 380 | $file_data{$data1[0]}{"Gene"} = $data[0]; 381 | } 382 | } 383 | close($fh1); 384 | 385 | open(my $fho,">$outfile") or die $!; 386 | 387 | foreach my $key (keys %control){ 388 | my @key_split = split(/\|/,$key); 389 | print STDERR "$key\n" unless($key_split[1]); 390 | if (exists $file_data{$key}) { 391 | my $tmp = $file_data{$key}{"val"}/$control{$key}{"val"}; 392 | my $pct = sprintf("%.3f",(1-$tmp)*100); 393 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t$key_split[1]\t".$control{$key}{"val"}."\t".$control{$key}{"exons"}."\t$pct%\n"; 394 | }else { 395 | print $fho $control{$key}{"chr"}."\t".$control{$key}{"Gene"}."\t$key_split[1]\t".$control{$key}{"val"}."\t".$control{$key}{"exons"}."\t100.000%\n"; 396 | } 397 | } 398 | close($fho); 399 | my $rm_file = $infile; 400 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 401 | 402 | return $outfile; 403 | } 404 | 405 | 406 | 407 | sub get_pct_perExon{ 408 | 409 | my ($infile,$control) =@_; 410 | 411 | $control=~s/\.txt$/\_perExon\.txt/; 412 | $infile=~s/\.txt$/\_perExon\.txt/; 413 | 414 | unless (-e $control) {print STDERR "$control does not exist\n"; exit;} 415 | unless (-e $infile) {print STDERR "$infile does not exist\n"; exit;} 416 | 417 | my $outfile_pex = $infile; 418 | $outfile_pex=~s/\_transcriptSIZE\_perExon\.txt/\_perExon\_pct\.txt/; 419 | 420 | my %control; 421 | my %file_data; 422 | 423 | open(my $fh,"<$control") or die $!; 424 | 425 | while (my $line = <$fh>) { 426 | chomp($line); 427 | my @data1 = split("\t",$line); 428 | #print STDERR $data1[0]."\n"; 429 | if (scalar @data1 ==1) { 430 | $control{$data1[0]}{"val"} = 0; 431 | } 432 | if (scalar @data1 ==5) { 433 | $control{$data1[0]}{"val"} = $data1[1]; 434 | $control{$data1[0]}{"chr"} = $data1[2]; 435 | $control{$data1[0]}{"start"} = $data1[3]; 436 | $control{$data1[0]}{"stop"} = $data1[4]; 437 | } 438 | } 439 | close($fh); 440 | 441 | open(my $fh1,"<$infile") or die $!; 442 | while (my $line = <$fh1>) { 443 | chomp($line); 444 | my @data1 = split("\t",$line); 445 | if (scalar @data1 ==1) { 446 | $file_data{$data1[0]}{"val"} = 0; 447 | } 448 | if (scalar @data1 ==2) { 449 | $file_data{$data1[0]}{"val"} = $data1[1]; 450 | } 451 | } 452 | close($fh1); 453 | 454 | open(my $fho,">$outfile_pex") or die $!; 455 | 456 | foreach my $key (keys %control){ 457 | my @key_split = split(/\|/,$key); 458 | my @NM_details = split("_",$key_split[1]); 459 
| if (exists $file_data{$key}) { 460 | my $tmp = $file_data{$key}{"val"}/$control{$key}{"val"}; 461 | my $pct = sprintf("%.3f",(1-$tmp)*100); 462 | 463 | my @regions = `grep -w \"$key\" $combined`; 464 | my $lowcov_coords=''; 465 | foreach my $lowcov_region (@regions){ 466 | my @tmp = split ("\t",$lowcov_region); 467 | $lowcov_coords.="$tmp[1]-$tmp[2];"; 468 | } 469 | $lowcov_coords=~s/;$//; 470 | if (scalar(@NM_details)==4) { 471 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]_$NM_details[1]\t$NM_details[2]_$NM_details[3]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 472 | }else{ 473 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]\t$NM_details[1]_$NM_details[2]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t$pct%\t$lowcov_coords\n"; 474 | } 475 | }else { 476 | if (scalar(@NM_details)==4) { 477 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]_$NM_details[1]\t$NM_details[2]_$NM_details[3]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 478 | }else{ 479 | print $fho $control{$key}{"chr"}."\t$key_split[0]\t$NM_details[0]\t$NM_details[1]_$NM_details[2]\t".$control{$key}{"start"}."\t".$control{$key}{"stop"}."\t100.000%\t.\n"; 480 | } 481 | 482 | } 483 | } 484 | close($fho); 485 | my $rm_file = $infile; 486 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 487 | 488 | return $outfile_pex; 489 | } 490 | 491 | 492 | ####### Added Feb 5th ######### 493 | sub HGMDcheck { 494 | my ($pct_file,@genes_lest) = @_; 495 | my %genes =(); 496 | my $outfile=$pct_file; 497 | $outfile=~s/-pct\.txt/-pctCov\.txt/; 498 | foreach my $gene (@genes_lest) { 499 | chomp($gene); 500 | $gene=~s/\(/\\\(/; 501 | $gene=~s/\)/\\\)/; 502 | my @grep_gene = `grep -w -P \"\t$gene\t\" $pct_file`; 503 | if ((scalar @grep_gene) == 0) { 504 | print STDERR "$gene is not present in the provided database\n"; 505 | } 506 | 507 | foreach my $gene_trans (@grep_gene){ 508 | chomp($gene_trans); 509 | my @tmp = split("\t",$gene_trans); 510 | my $is_HGMD = `grep -w -P \"\t$tmp[2]\$\" $HGMD_db`; 511 | if ($is_HGMD) { 512 | $genes{$tmp[1]}{$tmp[2]}{'is_HGMD'} = "true"; 513 | }else{ 514 | $genes{$tmp[1]}{$tmp[2]}{'is_HGMD'} = "false"; 515 | } 516 | $genes{$tmp[1]}{$tmp[2]}{'line'} = $gene_trans; 517 | $genes{$tmp[1]}{$tmp[2]}{'cov'} = $tmp[5]; 518 | $genes{$tmp[1]}{$tmp[2]}{'cov'}=~ s/%$//; 519 | } 520 | } 521 | 522 | open(my $fho," > $outfile") or die $!; 523 | 524 | #my $total_genes = keys %genes; 525 | #print STDERR "$total_genes\n"; 526 | foreach my $gene (sort keys %genes){ 527 | my $written = 0; 528 | foreach my $transcript (sort keys %{$genes{$gene}}){ 529 | if ($genes{$gene}{$transcript}{'is_HGMD'} eq "true") { 530 | print $fho "$gene\t$transcript\t$genes{$gene}{$transcript}{'cov'}\tHGMD\n"; 531 | $written=1; 532 | } 533 | 534 | } 535 | #print STDERR "$gene\t$no_trans\t$written\n"; 536 | if ($written == 0) { 537 | my $print = "$gene\t"; 538 | my $avg_cov = 0; 539 | my $no_trans = 0; 540 | foreach my $transcript (keys %{$genes{$gene}}){ 541 | $no_trans++; 542 | $avg_cov+=$genes{$gene}{$transcript}{'cov'}; 543 | $print .= "$transcript($genes{$gene}{$transcript}{'cov'});"; 544 | } 545 | $print=~ s/;$//; 546 | $avg_cov = sprintf("%0.2f",($avg_cov/$no_trans)); 547 | print $fho "$print\t$avg_cov\n"; 548 | } 549 | } 550 | 551 | close($fho); 552 | } 553 | ####### Added Feb 5th ######### 554 | 555 | ####### Added Feb 10th ######### 556 | sub averageGene { 557 | my ($pct_file,@genes_lest) = @_; 558 | my 
%genes =(); 559 | my $outfile=$pct_file; 560 | $outfile=~s/-pct\.txt/-pctCov\.txt/; 561 | foreach my $gene (@genes_lest) { 562 | chomp($gene); 563 | $gene=~s/\(/\\\(/; 564 | $gene=~s/\)/\\\)/; 565 | my @grep_gene = `grep -w -P \"\t$gene\t\" $pct_file`; 566 | if ((scalar @grep_gene) == 0) { 567 | print STDERR "$gene is not present in the provided database\n"; 568 | } 569 | 570 | foreach my $gene_trans (@grep_gene){ 571 | chomp($gene_trans); 572 | my @tmp = split("\t",$gene_trans); 573 | $genes{$tmp[1]}{$tmp[2]}{'line'} = $gene_trans; 574 | $genes{$tmp[1]}{$tmp[2]}{'cov'} = $tmp[5]; 575 | $genes{$tmp[1]}{$tmp[2]}{'cov'}=~ s/%$//; 576 | } 577 | } 578 | 579 | open(my $fho," > $outfile") or die $!; 580 | 581 | #my $total_genes = keys %genes; 582 | #print STDERR "$total_genes\n"; 583 | foreach my $gene (sort keys %genes){ 584 | my $print = "$gene\t"; 585 | my $avg_cov = 0; 586 | my $no_trans = 0; 587 | foreach my $transcript (keys %{$genes{$gene}}){ 588 | $no_trans++; 589 | $avg_cov+=$genes{$gene}{$transcript}{'cov'}; 590 | $print .= "$transcript($genes{$gene}{$transcript}{'cov'});"; 591 | } 592 | $print=~ s/;$//; 593 | $avg_cov = sprintf("%0.2f",($avg_cov/$no_trans)); 594 | print $fho "$print\t$avg_cov\n"; 595 | } 596 | 597 | close($fho); 598 | 599 | } 600 | ####### Added Feb 10th ######### 601 | 602 | ############################################################################################################################## 603 | 604 | sub timestamp { 605 | my $t = localtime; 606 | return sprintf( "%04d-%02d-%02d_%02d-%02d-%02d", $t->year + 1900, $t->mon + 1, $t->mday, $t->hour, $t->min, $t->sec ); 607 | } -------------------------------------------------------------------------------- /ExCID_v2.1/ExCID_v2.1-Batch.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/ExCID_v2.1-Batch.pl -------------------------------------------------------------------------------- /ExCID_v2.1/Get_HGNC.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use LWP::Simple; 4 | my $url = 'http://www.genenames.org/cgi-bin/download?'. 5 | 'col=gd_app_sym&'. 6 | 'col=gd_app_name&'. 7 | 'col=gd_status&'. 8 | 'col=gd_prev_sym&'. 9 | 'col=gd_aliases&'. 10 | 'col=gd_name_aliases&'. 11 | 'col=gd_pub_chrom_map&'. 12 | 'col=gd_pub_acc_ids&'. 13 | 'col=gd_pub_ensembl_id&'. 14 | 'col=gd_pub_refseq_ids&'. 15 | 'col=gd_ccds_ids&'. 16 | 'col=gd_vega_ids&'. 17 | 'col=md_mim_id&'. 18 | 'col=md_ucsc_id&'. 19 | 'status=Approved&'. 20 | 'status_opt=2&'. 21 | 'where=&'. 22 | 'order_by=gd_app_sym_sort&'. 23 | 'format=text&'. 24 | 'limit=&'. 25 | 'submit=submit'; 26 | getprint($url); 27 | -------------------------------------------------------------------------------- /ExCID_v2.1/bed_file-annotator_V2_CCDS-miRBASE.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ## Annotator for BED file.
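# NOTE (added commentary, not in upstream ExCID): this script annotates a BED
# file against one annotation database. It links the BED into the output
# directory, runs "bedtools intersect -wao" and keeps the appended
# gene/transcript columns with awk, collapses duplicate overlaps per interval
# into HGNC-gene / transcript / other-gene fields, then sorts the result.
# Illustrative invocation (the input BED name and output directory are
# examples only; the database name is one of the shipped defaults):
#   perl bed_file-annotator_V2_CCDS-miRBASE.pl capture.bed \
#       ccdsGene.txt.bed-exon_HGNC.bed /path/to/output_dir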
3 | use strict; 4 | use diagnostics; 5 | use Getopt::Std; 6 | use File::Basename; 7 | 8 | ### GLOBAL VARS ### 9 | my $bed = $ARGV[0] || &USAGE; 10 | my $database = $ARGV[1] || &USAGE; 11 | my $output_dir = $ARGV[2] || dirname($bed); 12 | my $db_name = basename($database); 13 | my $script_dir; 14 | my $script_dir_tmp = $0; 15 | $script_dir_tmp =~m/^.+\//; 16 | $script_dir=$&; 17 | 18 | system("ln -s $bed $output_dir/"); 19 | 20 | $bed = "$output_dir/".basename($bed); 21 | my %bedfile; 22 | 23 | open(my $fhb,"<$bed") or die $!; 24 | my $line = <$fhb>; 25 | chomp($line); 26 | my @tmp = split("\t",$line); 27 | my $cols = scalar @tmp; 28 | close($fhb); 29 | 30 | my $cmd=""; 31 | 32 | for(my $i =0; $i < $cols; $i++){ 33 | my $tmp = $i+1; 34 | $cmd .= "\$$tmp\"\t\""; 35 | } 36 | $cmd .= "\$".($cols+3+1)."\"\t\""."\$".($cols+3+1+1)." "; 37 | 38 | 39 | system("$script_dir/bin/bedtools intersect -a $bed -b $database -wao | awk -F\$\'\t\' '{print $cmd}' > $bed.$db_name.Annotated"); 40 | 41 | my %data_annotation; 42 | open(my $fh, "< $bed.$db_name.Annotated") or die $!; 43 | while (my $line = <$fh>) { 44 | chomp($line); 45 | $line=~s/\t-1\t/\t.\t/; 46 | my @columns = split("\t",$line); 47 | my $key = "$columns[0]_$columns[1]_$columns[2]"; 48 | unless (exists $data_annotation{$key}){ 49 | $data_annotation{$key}{'HGNC_gene'} = ""; 50 | $data_annotation{$key}{'transcript'} = ""; 51 | $data_annotation{$key}{'other_Genes'} = ""; 52 | $data_annotation{$key}{'rest'} = join("\t", @columns[3..(scalar(@columns)-3)]); 53 | } 54 | 55 | if ($columns[scalar(@columns)-1] ne ".") { 56 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 57 | $data_annotation{$key}{'HGNC_gene'} = $columns[scalar(@columns)-1].";"; 58 | }else{ 59 | my $check = $columns[scalar(@columns)-1].";"; 60 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 61 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-1].";"; 62 | } 63 | } 64 | } 65 | 66 | if ($columns[scalar(@columns)-2] ne ".") { 67 | if (length($data_annotation{$key}{'transcript'}) == 0) { 68 | $data_annotation{$key}{'transcript'} = $columns[scalar(@columns)-2].";"; 69 | }else{ 70 | my $check = $columns[scalar(@columns)-2].";"; 71 | unless (index($data_annotation{$key}{'transcript'}, $check) != -1) { 72 | $data_annotation{$key}{'transcript'} .= $columns[scalar(@columns)-2].";"; 73 | } 74 | } 75 | } 76 | 77 | 78 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 79 | $data_annotation{$key}{'HGNC_gene'} ="."; 80 | } 81 | if (length($data_annotation{$key}{'transcript'}) == 0) { 82 | $data_annotation{$key}{'transcript'} ="."; 83 | } 84 | if (length($data_annotation{$key}{'other_Genes'}) == 0) { 85 | $data_annotation{$key}{'other_Genes'} ="."; 86 | } 87 | } 88 | close($fh); 89 | 90 | open(my $fho, ">$bed.$db_name.Annotated.edit") or die $!; 91 | foreach my $key (keys %data_annotation){ 92 | my @columns = split("_",$key); 93 | 94 | $data_annotation{$key}{'HGNC_gene'} =~ s/;$//; 95 | $data_annotation{$key}{'other_Genes'} =~ s/;$//; 96 | $data_annotation{$key}{'transcript'}=~ s/;$//; 97 | $data_annotation{$key}{'other_Genes'} =~ s/^\.// if ($data_annotation{$key}{'other_Genes'} ne "."); 98 | 99 | if (length($data_annotation{$key}{'rest'}) < 1) { 100 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t.\n"; 101 | $out_line=~s/\t-1\t/\t.\t/; 102 | print $fho $out_line; 103 | 
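# NOTE (added commentary, not in upstream ExCID): with -wao, bedtools reports
# intervals lacking a database hit with NULL B fields (".", -1, -1); the
# substitution above rewrites the stray -1 coordinate fields to "." so empty
# annotations print consistently.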
}else{ 104 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t$data_annotation{$key}{'rest'}\n"; 105 | $out_line=~s/\t-1\t/\t.\t/; 106 | print $fho $out_line; 107 | } 108 | } 109 | close($fho); 110 | 111 | 112 | 113 | system("$script_dir/bin/bedtools sort -i $bed.$db_name.Annotated.edit > $bed-$db_name"); 114 | 115 | my $rm_file = "$bed.$db_name.Annotated.edit"; 116 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 117 | $rm_file = "$bed.$db_name.Annotated"; 118 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 119 | 120 | 121 | ### SUBROUTINES ### 122 | sub USAGE { 123 | print "USAGE: $0 <bed file> <annotation database> [output dir]\n\n"; 124 | exit; 125 | } -------------------------------------------------------------------------------- /ExCID_v2.1/bed_file-annotator_V2_RefSeq-VEGA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ## Annotator for BED file. 3 | use strict; 4 | use diagnostics; 5 | use Getopt::Std; 6 | use File::Basename; 7 | 8 | ### GLOBAL VARS ### 9 | my $bed = $ARGV[0] || &USAGE; 10 | my $database = $ARGV[1] || &USAGE; 11 | my $output_dir = $ARGV[2] || dirname($bed); 12 | my $db_name = basename($database); 13 | my $script_dir; 14 | my $script_dir_tmp = $0; 15 | $script_dir_tmp =~m/^.+\//; 16 | $script_dir=$&; 17 | 18 | system("ln -s $bed $output_dir/"); 19 | 20 | $bed = "$output_dir/".basename($bed); 21 | my %bedfile; 22 | 23 | open(my $fhb,"<$bed") or die $!; 24 | my $line = <$fhb>; 25 | chomp($line); 26 | my @tmp = split("\t",$line); 27 | my $cols = scalar @tmp; 28 | close($fhb); 29 | 30 | my $cmd=""; 31 | 32 | for(my $i =0; $i < $cols; $i++){ 33 | my $tmp = $i+1; 34 | $cmd .= "\$$tmp\"\t\""; 35 | } 36 | $cmd .= "\$".($cols+3+1)."\"\t\""."\$".($cols+3+1+1)."\"\t\""."\$".($cols+3+1+1+1)." 
"; 37 | 38 | 39 | system("$script_dir/bin/bedtools intersect -a $bed -b $database -wao | awk -F\$\'\t\' '{print $cmd}' > $bed.$db_name.Annotated"); 40 | 41 | my %data_annotation; 42 | open(my $fh, "< $bed.$db_name.Annotated") or die $!; 43 | while (my $line = <$fh>) { 44 | chomp($line); 45 | $line=~s/\t-1\t/\t.\t/; 46 | my @columns = split("\t",$line); 47 | my $key = "$columns[0]_$columns[1]_$columns[2]"; 48 | unless (exists $data_annotation{$key}){ 49 | $data_annotation{$key}{'HGNC_gene'} = ""; 50 | $data_annotation{$key}{'transcript'} = ""; 51 | $data_annotation{$key}{'other_Genes'} = ""; 52 | $data_annotation{$key}{'rest'} = join("\t", @columns[3..(scalar(@columns)-4)]); 53 | } 54 | 55 | if ($columns[scalar(@columns)-1] ne ".") { 56 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 57 | $data_annotation{$key}{'HGNC_gene'} = $columns[scalar(@columns)-1].";"; 58 | }else{ 59 | my $check = $columns[scalar(@columns)-1].";"; 60 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 61 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-1].";"; 62 | } 63 | } 64 | } 65 | 66 | if ($columns[scalar(@columns)-2] ne ".") { 67 | if (length($data_annotation{$key}{'transcript'}) == 0) { 68 | $data_annotation{$key}{'transcript'} = $columns[scalar(@columns)-2].";"; 69 | }else{ 70 | my $check = $columns[scalar(@columns)-2].";"; 71 | unless (index($data_annotation{$key}{'transcript'}, $check) != -1) { 72 | $data_annotation{$key}{'transcript'} .= $columns[scalar(@columns)-2].";"; 73 | } 74 | } 75 | } 76 | 77 | 78 | if ($columns[scalar(@columns)-3] ne ".") { 79 | my $check = $columns[scalar(@columns)-3].";"; 80 | if (index($data_annotation{$key}{'HGNC_gene'},$check) == -1 && index($data_annotation{$key}{'other_Genes'},$check) == -1) { 81 | $data_annotation{$key}{'other_Genes'} = $columns[scalar(@columns)-3].";"; 82 | } 83 | } 84 | 85 | 86 | if (length($data_annotation{$key}{'HGNC_gene'}) == 0) { 87 | $data_annotation{$key}{'HGNC_gene'} ="."; 88 | } 89 | if (length($data_annotation{$key}{'transcript'}) == 0) { 90 | $data_annotation{$key}{'transcript'} ="."; 91 | } 92 | if (length($data_annotation{$key}{'other_Genes'}) == 0) { 93 | $data_annotation{$key}{'other_Genes'} ="."; 94 | } 95 | } 96 | close($fh); 97 | 98 | open(my $fho, ">$bed.$db_name.Annotated.edit") or die $!; 99 | foreach my $key (keys %data_annotation){ 100 | my @columns = split("_",$key); 101 | 102 | $data_annotation{$key}{'HGNC_gene'} =~ s/;$//; 103 | $data_annotation{$key}{'other_Genes'} =~ s/;$//; 104 | $data_annotation{$key}{'transcript'}=~ s/;$//; 105 | $data_annotation{$key}{'other_Genes'} =~ s/^\.// if ($data_annotation{$key}{'other_Genes'} ne "."); 106 | 107 | if (length($data_annotation{$key}{'rest'}) < 1) { 108 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t.\n"; 109 | $out_line=~s/\t-1\t/\t.\t/; 110 | print $fho $out_line; 111 | }else{ 112 | my $out_line = "$columns[0]\t$columns[1]\t$columns[2]\t$data_annotation{$key}{'HGNC_gene'}\t$data_annotation{$key}{'transcript'}\t$data_annotation{$key}{'other_Genes'}\t$data_annotation{$key}{'rest'}\n"; 113 | $out_line=~s/\t-1\t/\t.\t/; 114 | print $fho $out_line; 115 | } 116 | } 117 | close($fho); 118 | 119 | 120 | 121 | system("$script_dir/bin/bedtools sort -i $bed.$db_name.Annotated.edit > $bed-$db_name"); 122 | 123 | my $rm_file = "$bed.$db_name.Annotated.edit"; 124 | unlink 
$rm_file or warn "Could not unlink $rm_file: $!"; 125 | $rm_file = "$bed.$db_name.Annotated"; 126 | unlink $rm_file or warn "Could not unlink $rm_file: $!"; 127 | 128 | 129 | ### SUBROUTINES ### 130 | sub USAGE { 131 | print "USAGE: $0 \n\n"; 132 | exit; 133 | } -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CapStatsV2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CapStatsV2.5.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CapStatsV2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CapStatsV2.6.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/CovFasta_Generator.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/bin/CovFasta_Generator.jar -------------------------------------------------------------------------------- /ExCID_v2.1/bin/Java_code/CaptureStatsBAM5_extended.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import java.util.*; 3 | 4 | import net.sf.samtools.*; 5 | import net.sf.samtools.SAMFileReader.ValidationStringency; 6 | 7 | /** 8 | * Generates capture statistics from a BAM input file. 9 | * Generates 3 output files: 10 | * @author bainbrid 11 | * 12 | */ 13 | public class CaptureStatsBAM5_extended 14 | { 15 | static final int prime_size = 1000; 16 | static int[] fivePrime = new int[prime_size]; //stores data about coverage upstream of the target 17 | static int[] threePrime = new int[prime_size];//stores data about coverage downstream of the target 18 | static int[] targetCov = new int[101]; //stores data about the coverage across a target 19 | static final int BUFFER = 100; //buffer region around a target 20 | static int tempcnt = 0; 21 | static int totalReadsProduced = 0; //total reads contains in the bam 22 | static int totalReadsAligned = 0; //total reads aligned to a target region 23 | static long totalTargetCoverage = 0; //total number of read bases aligned to the target 24 | static int totalReadsPaired = 0; //total number of reads with mate pairs (if any) 25 | static int totalPairedreadsWithmappedMates = 0; //total number of aligned reads which have mapped mates 26 | static int offtargetReadHitCount = 0; //total number of reads which do not align to a target region 27 | static int ontargetReadHitCount = 0; //total number of reads which align to a target region 28 | static int inbufferReadHitCount = 0; //total number of reads which align to the buffer region 29 | static int readsAlignedINCLUDINGDUPES = 0; //total reads aligned including duplicates 30 | static int duplicateReads = 0; //total number of duplicate reads 31 | static long totalAlignedBases = 0; //total number of aligned bases 32 | static int totalTargetedBases = 0; //total number of bases targeted 33 | static int totalBufferBases = 0; //total number of bases in the buffer region 34 | static int basesWithOneHitorMore = 0; //total targeted bases with at least 1 coverage 35 | static int basesWith10HitsorMore = 0; //total targeted bases with at least 10 coverage 36 | static int 
basesWith20HitsorMore = 0; //total targeted bases with at least 20 coverage 37 | static int basesWith40HitsorMore = 0; //total targeted bases with at least 40 coverage 38 | static int basesWith50HitsorMore = 0; //total targeted bases with at least 50 coverage 39 | static int basesWith100HitsorMore = 0; //total targeted bases with at least 100 coverage 40 | static int basesWith500HitsorMore = 0; //total targeted bases with at least 500 coverage 41 | static int basesWith1000HitsorMore = 0; //total targeted bases with at least 1000 coverage 42 | static int totalTargets = 0; //total targeted regions 43 | static int hitTargetCount = 0; //total targets with at least 1 read aligned to them 44 | static int hitTarget_bufferonly_Count = 0; //total targets with no hits, except in buffer 45 | static int[] dupHisto = new int[9]; //deprecated 46 | static int[] covHisto = new int[1001]; //coverage histogram 47 | static int nonTragetGoodHits = 0; //regions that have high coverage but are not in the target 48 | static String VERSION = "CapStatsV2.6 2015-03-25"; 49 | static Hashtable fht ; 50 | static final String dummy = "dummy"; 51 | static boolean removeDupes = false; //do we not consider duplicate reads 52 | static boolean writeWGC = false; //write whole genome coverage statistics 53 | static double _percentage = 1.0; //proportion of reads to take (to randomly dump some) 54 | static Random RAND = new Random(88651); //random number generator, good for removing a proportion of the reads 55 | static String[] targetChrs; 56 | static int[] targetStarts; 57 | static int[] targetStops; 58 | static double minmapscore = -1.0; 59 | static double minbasescore = -1.0; 60 | static int size =0 ; //Holds the size of the Chromosome being analyzed. 61 | static int[] coverage_forMedian = new int[1000]; //Used for calculating the median coverage. 62 | static int median_coverage = 0; 63 | static long READLENGTH = 0; //Gets READ Length 64 | 65 | /** 66 | * ONLY WORKS ON A SINGLE FILE THAT HAS BEEN SORTED!!!!!!!!!! 67 | * looks like it works in 4000M of ram. 68 | * 69 | * usage: -o <output base name> -t <target file> -i <BAM file> [-d] [-w] [-m] [-b] [-p] 70 | * 71 | * i: BAM File 72 | * t: target file 73 | * o: output directory and base file name 74 | * d: remove duplicates, and do not use them for statistics 75 | * w: write whole genome coverage 76 | * m: minimum mapscore (mapscore must be >= to this) 77 | * b: minimum base score (basescore must be >= to this) 78 | * p: only take this proportion of reads into consideration (for scale back experiments) 79 | * 80 | * 81 | * @throws Exception 82 | * 83 | */ 84 | public static void main(String[] args) throws Exception 85 | { 86 | if(args.length == 0) {usage();System.exit(0);}; 87 | String[] validargs = {"r","t","o","d","i","w","p","m","b"}; 88 | String[] arguments = new String[validargs.length]; 89 | String warns = ParseOpts.parse(validargs, args, arguments); 90 | if(arguments[0] == null) 91 | { 92 | //deprecated. no longer needed. 93 | } 94 | if(arguments[1] == null) 95 | { 96 | System.err.println("No target file specified!!! Exiting\n"); 97 | usage(); 98 | System.exit(2); 99 | } 100 | if(arguments[2] == null) 101 | { 102 | System.err.println("No output file specified!!! Exiting\n"); 103 | usage(); 104 | System.exit(3); 105 | } 106 | int index = arguments[2].lastIndexOf(File.separator); 107 | String fileName = arguments[2].substring(index + 1); 108 | if(fileName == null) 109 | { 110 | System.err.println("Please provide proper output file!!! 
Exiting\n"); 111 | usage(); 112 | System.exit(3); 113 | } 114 | if(arguments[3]!= null ) 115 | { 116 | removeDupes = true; 117 | } 118 | if(arguments[4] == null) 119 | { 120 | System.err.println("No alignment file specified!!! Exiting\n"); 121 | usage(); 122 | System.exit(4); 123 | } 124 | if(arguments[5] != null) 125 | { 126 | writeWGC = true; 127 | } 128 | if(arguments[6] != null) 129 | { 130 | _percentage = Double.parseDouble(arguments[6]); 131 | } 132 | if(arguments[7] != null) 133 | { 134 | minmapscore = Double.parseDouble(arguments[7]); 135 | } 136 | if(arguments[8] != null) 137 | { 138 | minbasescore = Double.parseDouble(arguments[8]); 139 | } 140 | String targetFile = arguments[1]; 141 | String alignmentFile = arguments[4]; 142 | checkFile(alignmentFile); 143 | checkFile(targetFile); 144 | String outfile = arguments[2]; 145 | String covFile = outfile+".cov.fasta"; 146 | System.out.println("Writing to: "+outfile); 147 | FileWriter wgcFasta = null; 148 | if(writeWGC) 149 | { 150 | wgcFasta = new FileWriter(outfile+".wholeGenomeCov.fasta"); 151 | } 152 | FileWriter covFasta = new FileWriter(covFile); 153 | FileWriter wig = new FileWriter(outfile+".missReads.wig"); 154 | FileWriter missTar = new FileWriter(outfile+".missTargets.txt"); 155 | fht = new Hashtable(50); 156 | wig.write("track type=wiggle_0 name="+alignmentFile+"\n"); 157 | loadTargets(targetFile); 158 | readBAM(alignmentFile, wig, covFasta, missTar, wgcFasta); 159 | covFasta.close(); 160 | if(writeWGC) wgcFasta.close(); 161 | wig.close(); 162 | writeReport(outfile); 163 | missTar.close(); 164 | } 165 | 166 | /** 167 | * load the target regions into memory and leave them there 168 | * @param targetFile 169 | */ 170 | public static void loadTargets(String targetFile) throws Exception 171 | { 172 | BufferedReader br = new BufferedReader(new FileReader(targetFile)); 173 | String line; 174 | int cnt = 0; 175 | int start = 0; 176 | int stop = 0; 177 | while((line = br.readLine())!=null) 178 | { 179 | String[] tokens = line.split("[ \t]+"); 180 | if(tokens.length < 3) {continue;} 181 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 182 | try 183 | { 184 | start = Integer.parseInt(tokens[1]); 185 | stop = Integer.parseInt(tokens[2]); 186 | } 187 | catch(NumberFormatException e){continue;} 188 | cnt++; 189 | } 190 | targetChrs = new String[cnt]; 191 | targetStarts = new int[cnt]; 192 | targetStops = new int[cnt]; 193 | cnt = 0; 194 | br.close(); 195 | br = new BufferedReader(new FileReader(targetFile)); 196 | while((line = br.readLine())!=null) 197 | { 198 | String[] tokens = line.split("[ \t]+"); 199 | if(tokens.length < 3) {continue;} 200 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 201 | try 202 | { 203 | start = Integer.parseInt(tokens[1]); 204 | stop = Integer.parseInt(tokens[2]); 205 | } 206 | catch(NumberFormatException e){continue;} 207 | targetChrs[cnt] = tokens[0]; 208 | targetStarts[cnt]=start; 209 | targetStops[cnt] = stop; 210 | cnt++; 211 | } 212 | totalTargets = cnt; /* Should be used to count Targets Oct3_2014*///totalTargets = TR.length; 213 | } 214 | 215 | 216 | 217 | /** 218 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
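 * NOTE (added commentary, not in upstream ExCID): records are consumed in
 * coordinate order; a single short[] coverage array is held for the current
 * chromosome only, and per-chromosome results are flushed via
 * getTargetsAndWriteCoverage() and findWhereReadsHit() whenever the reference
 * name changes, which is why the input BAM must be coordinate-sorted.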
219 | * @param bamfile The bam file to read in 220 | * @param wig The output wig file to write offtarget high coverage regions 221 | * @param covFasta the coverage fasta output file 222 | * @param missTraget the miss target file 223 | * @param wgCoverage whole genome coverage file 224 | * @throws Exception 225 | * 226 | */ 227 | public static void readBAM(String bamfile, FileWriter wig, FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 228 | { 229 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 230 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 231 | SAMFileHeader header = sfr.getFileHeader(); 232 | Iterator iter = sfr.iterator(); 233 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 234 | short[] COVERAGE = new short[0]; 235 | char[] TR = new char[0]; 236 | int cnt = 0; 237 | SAMRecord rec = null; 238 | while(iter.hasNext()) 239 | { 240 | try 241 | { 242 | rec = iter.next(); 243 | if(_percentage < 1.0) 244 | { 245 | double rd = RAND.nextDouble(); 246 | if(rd < _percentage) continue; 247 | } 248 | 249 | totalReadsProduced++; 250 | if(rec.getMappingQuality() < minmapscore) 251 | { 252 | continue; 253 | } 254 | if(rec.getReadFailsVendorQualityCheckFlag()){ 255 | continue; 256 | } 257 | 258 | if(rec.getNotPrimaryAlignmentFlag()){ 259 | continue; 260 | } 261 | 262 | if(rec.getReadUnmappedFlag()) 263 | { 264 | //System.out.println("Unmapped! region. breaking!!!"); 265 | continue; 266 | } 267 | totalReadsAligned++; 268 | 269 | if(rec.getReadPairedFlag()) 270 | { 271 | totalReadsPaired++; 272 | if(!rec.getMateUnmappedFlag()){ 273 | totalPairedreadsWithmappedMates++; 274 | } 275 | } 276 | 277 | if(rec.getDuplicateReadFlag()) 278 | { 279 | duplicateReads++; 280 | if(removeDupes){continue;} 281 | } 282 | 283 | /////////////////Added on Feb 11, 2015///////////////////////////// 284 | READLENGTH = rec.getReadLength(); 285 | 286 | /////////////////////////////////////////////////////////////////// 287 | 288 | String currchr = rec.getReferenceName(); 289 | if(!currchr.equals(lastchr)) 290 | { 291 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 292 | { 293 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 294 | findWhereReadsHit(lastchr, COVERAGE,wig); 295 | } 296 | lastchr = currchr; 297 | System.out.println(currchr); 298 | size = header.getSequence(currchr).getSequenceLength()+1; 299 | COVERAGE = new short[size]; 300 | TR = getTargetPos(currchr, size); 301 | if(TR == null || COVERAGE == null) 302 | { 303 | System.err.println("COVERAGE or TR is null! "+currchr); 304 | } 305 | 306 | } 307 | processRecord(rec, TR, COVERAGE); 308 | } 309 | catch(Exception e) 310 | { 311 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 312 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 313 | // throw e; 314 | } 315 | cnt++; 316 | } 317 | System.out.println("Done read bam"); 318 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 319 | findWhereReadsHit(lastchr, COVERAGE,wig); 320 | COVERAGE = null; 321 | } 322 | 323 | /** 324 | * Processes read record from a BAM. Adding the alignment to coverage array. 
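 * NOTE (added commentary, not in upstream ExCID): each aligned position is
 * checked against the per-chromosome target mask (TR[i]==1 on target,
 * TR[i]==2 in buffer) to classify the read, and COVERAGE is incremented at
 * the mapped reference position, gated on per-base quality when a minimum
 * base score (-b) was supplied.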
325 | * @param rec 326 | * @param TR 327 | * @param COVERAGE 328 | */ 329 | public static void processRecord(SAMRecord rec, char[] TR, short[] COVERAGE) throws Exception 330 | { 331 | boolean inbuffer = false; 332 | boolean ontarget = false; 333 | int start = rec.getAlignmentStart(); 334 | int stop = rec.getAlignmentEnd(); 335 | int referenceposition = 0; 336 | byte[] baseQual = rec.getBaseQualities(); 337 | /*if(rec.getReadNegativeStrandFlag()) 338 | { 339 | start-=25; 340 | } 341 | else 342 | { 343 | stop+=25; 344 | }*/ 345 | 346 | for(int i = start; i <= stop; i++) 347 | { 348 | totalAlignedBases++; 349 | try 350 | { 351 | if(TR[i] == 1) 352 | { 353 | ontarget = true; 354 | } 355 | else if(TR[i] == 2) 356 | { 357 | inbuffer = true; 358 | } 359 | 360 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1); 361 | if(referenceposition==0){continue;} 362 | 363 | if(minbasescore > 0){ 364 | if((double)baseQual[i-start] >= minbasescore) { 365 | COVERAGE[referenceposition]++; 366 | } 367 | }else { 368 | COVERAGE[referenceposition]++; 369 | } 370 | } 371 | catch(Exception e) 372 | { 373 | System.err.println("array size:"+COVERAGE.length); 374 | System.err.println("start stop:"+start+" "+stop); 375 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace())); 376 | //throw e; 377 | break; 378 | } 379 | } 380 | if(ontarget) 381 | { 382 | ontargetReadHitCount++; 383 | } 384 | else if(inbuffer) 385 | { 386 | inbufferReadHitCount++; 387 | } 388 | else 389 | { 390 | offtargetReadHitCount++; 391 | } 392 | } 393 | 394 | /** 395 | * makes sure a file exists 396 | * @param s 397 | * @throws Exception 398 | */ 399 | public static void checkFile(String s) throws Exception 400 | { 401 | File f = new File(s); 402 | if(!f.exists()) 403 | { 404 | throw new FileNotFoundException("No such file as \""+s+"\". File not found."); 405 | } 406 | } 407 | 408 | public static void usage() 409 | { 410 | String s= "Version: "+VERSION+"\nUsage: -o -t -i [-d] [-w] [-m ] [-b ]\n\t* t: target file\n\t* o: output directory and base file name"; 411 | s+="\n\t* d: remove duplicates, and do not use them for statistics\n\t* i: alignment file (multiple files are not allowed)\n\t* w: write whole genome coverage\n\t* m: minimum mapscore (mapscore must be >= to this)\n\t* b: minimum base quality\n"; 412 | System.out.println(s); 413 | } 414 | 415 | /** 416 | * removes the "chr" portion of any chromosome name 417 | * @param c 418 | * @return 419 | */ 420 | public static String removechr(String c) 421 | { 422 | if(c.length() > 3 && c.substring(0, 3).equalsIgnoreCase("chr")){ 423 | c = c.substring(3); 424 | } 425 | return c; 426 | } 427 | 428 | /** 429 | * converts fractions into percentages with 2 decimal positions 430 | * @param num 431 | * @param dom 432 | * @return 433 | */ 434 | public static double pc(int num, int dom) 435 | { 436 | double pc = (double)num/(double)dom; 437 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 438 | return pc; 439 | } 440 | 441 | /** 442 | * Writes all the statistical information to an output file. 443 | * @param fname output file name 444 | * @throws Exception 445 | */ 446 | public static void writeReport(String fname) throws Exception 447 | { 448 | int nonduplicatereads = totalReadsAligned - duplicateReads; 449 | if(totalTargetedBases == 0) 450 | { 451 | System.err.println("Total targeted bases is zero. This means that no read has aligned to a chromosome that contains a target. 
No target matches a chromosome in the BAM, or something else very weird. Aborting.");
452 | System.exit(1);
453 | }
454 | if(totalReadsAligned == 0)
455 | {
456 | System.err.println("No reads aligned. Aborting.");
457 | System.exit(2);
458 | }
459 | if(nonduplicatereads == 0)
460 | {
461 | System.err.println("All reads are duplicates. Aborting.");
462 | System.exit(3);
463 | }
464 | if(totalTargets == 0)
465 | {
466 | //I don't think we should ever see this error, as it's dealt with above.
467 | System.err.println("No target regions given. Aborting.");
468 | System.exit(4);
469 | }
470 | 
471 | 
472 | int sum =0;
473 | for(int i = 0; i < coverage_forMedian.length; i++){
474 | if((sum + coverage_forMedian[i]) >= (totalTargetedBases/2)){
475 | median_coverage = i;
476 | break;
477 | }else{
478 | sum+=coverage_forMedian[i];
479 | }
480 | }
481 | 
482 | 
483 | FileWriter report = new FileWriter(fname+".CoverageReport.csv");
484 | report.write("Version: "+VERSION+"\n");
485 | report.write("BUFFER size:,"+BUFFER+"\n");
486 | report.write("Read Stats\n");
487 | report.write("Total Reads Produced:,"+totalReadsProduced+"\n");
488 | report.write("Total Yield Produced:,"+(READLENGTH * totalReadsProduced)+"\n");
489 | report.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n");
490 | report.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n");
491 | report.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)");
492 | report.write(",reads paired:,"+totalReadsPaired);
493 | report.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n");
494 | report.write("Aligned Reads On-Buffer:,"+inbufferReadHitCount+",("+pc(inbufferReadHitCount,totalReadsAligned)+"%)\n");
495 | report.write("Aligned Reads On-Target:,"+ontargetReadHitCount+",("+pc(ontargetReadHitCount,totalReadsAligned)+"%)\n");
496 | report.write("Average Coverage:,-,("+((int)(totalTargetCoverage/totalTargetedBases))+")\n");
497 | report.write("Median Coverage:,-,("+median_coverage+")\n");
498 | int hittot = inbufferReadHitCount+ontargetReadHitCount;
499 | report.write("Reads that hit target or buffer:,"+hittot+",("+pc(hittot,totalReadsAligned)+"%)\n");
500 | report.write("Total Aligned Reads (expected):,"+totalReadsAligned+"\n");
501 | report.write("Total Aligned Reads (calculated):,"+(offtargetReadHitCount+inbufferReadHitCount+ontargetReadHitCount)+"\n");
502 | report.write("Target Stats\n");
503 | report.write("Targets Hit:,"+hitTargetCount+",("+pc(hitTargetCount,totalTargets)+"%)\n");
504 | report.write("Target Buffers Hit:,"+hitTarget_bufferonly_Count+",("+pc(hitTarget_bufferonly_Count,totalTargets)+"%)\n");
505 | report.write("Total Targets:,"+totalTargets+"\n");
506 | report.write("Non target regions with high coverage:,"+nonTragetGoodHits+"\n");
507 | report.write("Base Stats\n");
508 | report.write("Bases Targeted:,"+totalTargetedBases+"\n");
509 | report.write("Buffer Bases:,"+totalBufferBases+"\n");
510 | report.write("Bases with 1+ coverage:,"+basesWithOneHitorMore+",("+pc(basesWithOneHitorMore,totalTargetedBases)+"%)\n");
511 | report.write("Bases with 10+ coverage:,"+basesWith10HitsorMore+",("+pc(basesWith10HitsorMore,totalTargetedBases)+"%)\n");
512 | report.write("Bases with 20+ coverage:,"+basesWith20HitsorMore+",("+pc(basesWith20HitsorMore,totalTargetedBases)+"%)\n");
513 | report.write("Bases with 40+ coverage:,"+basesWith40HitsorMore+",("+pc(basesWith40HitsorMore,totalTargetedBases)+"%)\n");
514 | report.write("Bases with 50+ 
coverage:,"+basesWith50HitsorMore+",("+pc(basesWith50HitsorMore,totalTargetedBases)+"%)\n"); 515 | report.write("Bases with 100+ coverage:,"+basesWith100HitsorMore+",("+pc(basesWith100HitsorMore,totalTargetedBases)+"%)\n"); 516 | report.write("Bases with 500+ coverage:,"+basesWith500HitsorMore+",("+pc(basesWith500HitsorMore,totalTargetedBases)+"%)\n"); 517 | report.write("Bases with 1000+ coverage:,"+basesWith1000HitsorMore+",("+pc(basesWith1000HitsorMore,totalTargetedBases)+"%)\n"); 518 | report.write("Duplicate read distribution\n"); 519 | report.write("1,2,3,4,5,6to10,11to20,20plus\n"); 520 | int[] mult = {1,1,2,3,4,5,8,15,25}; 521 | for(int i = 1 ; i < dupHisto.length; i++) 522 | { 523 | report.write(pc(dupHisto[i]*mult[i],totalReadsAligned)+"%,"); 524 | } 525 | report.write("\n"); 526 | report.write("Coverage Histogram (may look weird if target regions overlap...)\n"); 527 | for(int i = 0; i < covHisto.length; i++){ 528 | report.write(i+","); 529 | } 530 | report.write("\n"); 531 | for(int i = 0; i < covHisto.length; i++){ 532 | report.write(covHisto[i]+","); 533 | } 534 | report.write("\n"); 535 | 536 | report.write("Target and region coverage plot\n"); 537 | report.write("Position,5'count,3'count\n"); 538 | for(int i = 20; i <= prime_size; i+=20) 539 | { 540 | report.write(i+","+fivePrime[fivePrime.length-(i-1)-1]+","+threePrime[i-1]+"\n"); 541 | } 542 | report.write("%tar-Pos,count\n"); 543 | for(int i = 0; i < 101; i+=2) 544 | { 545 | report.write(i+","+targetCov[i]+"\n"); 546 | } 547 | report.close(); 548 | } 549 | 550 | /** 551 | * This method is destructive to the data structure, no further work can be done after this method has ran. 552 | * Works out whether reads are on or off target and how far off target they are 553 | * 554 | * @param chromo -- current chromosome 555 | * @param COVERAGE -- the coverage of the genome 556 | * @param wig -- writes a wig file 557 | * @throws Exception 558 | */ 559 | public static void findWhereReadsHit(String chromo, short COVERAGE[], FileWriter wig) throws Exception 560 | { 561 | for(int j = 0; j < targetChrs.length; j++) 562 | { 563 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) continue; 564 | int start = targetStarts[j]; 565 | int end = targetStops[j]; 566 | for(int i = start - 500; i < end +500; i++) 567 | { 568 | if(i < 0 || i >= size) {continue;} 569 | COVERAGE[i] = 0; 570 | } 571 | } 572 | 573 | for(int i = 0; i < COVERAGE.length; i++) 574 | { 575 | if(COVERAGE[i] > 20) 576 | { 577 | int j = i; 578 | nonTragetGoodHits++; 579 | while(i < COVERAGE.length && COVERAGE[i] > 0) 580 | { 581 | i++; 582 | } 583 | 584 | while(j > 0 && COVERAGE[j] > 0) 585 | { 586 | j--; 587 | } 588 | wig.write("fixedStep chrom="+chromo+" start="+j+" step=1\n"); 589 | for(int h = j; h < i; h++) 590 | { 591 | wig.write(COVERAGE[h]+"\n"); 592 | } 593 | } 594 | } 595 | wig.flush(); 596 | } 597 | 598 | static boolean supertets = false; 599 | 600 | /** 601 | * Gets the target regions from the target file, and writes over the coverage fasta files, as well as determines many of the coverage statistics. 602 | * @param chromo Current chromosome 603 | * @param COVERAGE The array which contains the coverage of every base in the genome 604 | * @param covFasta The filewriter for the target-specific coverage 605 | * @param missTraget Write a "wig" format file (good for ucsc) which shows you where all off-target regions with high coverage are 606 | * @param wgCoverage A filewriter for the whole genome coverage... 
if null, this file won't be written 607 | * @throws Exception 608 | */ 609 | public static void getTargetsAndWriteCoverage(String chromo, short COVERAGE[], FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 610 | { 611 | if(wgCoverage != null) 612 | { 613 | wgCoverage.write(">"+chromo); 614 | for(int i = 0; i < COVERAGE.length; i++) 615 | { 616 | if(i%100==0) wgCoverage.write("\n"); 617 | wgCoverage.write(COVERAGE[i]+" "); 618 | } 619 | wgCoverage.write("\n"); 620 | } 621 | for(int j = 0; j < targetChrs.length; j++) 622 | { 623 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) {continue;} 624 | //totalTargets++; 625 | int start = targetStarts[j]; 626 | int end = targetStops[j]; 627 | int length = end - start+1; 628 | boolean collectTargetCov = length > 99 ; 629 | 630 | if(supertets) 631 | { 632 | System.out.println(targetChrs[j]+" "+start+" "+end); 633 | } 634 | if(collectTargetCov) 635 | { 636 | for(int i = 0; i < prime_size; i++) 637 | { 638 | if((start - i) < 0 || (end+i) >= size){ 639 | ///System.err.println("The BED Target "+targetChrs[j]+" "+start+" "+end+" is going out of Bound!!!\n"); 640 | continue; 641 | } 642 | fivePrime[i]+=COVERAGE[start-i]; 643 | threePrime[i]+=COVERAGE[end+i]; 644 | 645 | ///fivePrime[i]+=COVERAGE[end-i+300]; 646 | //threePrime[i]+=COVERAGE[start+i-300]; 647 | 648 | } 649 | } 650 | 651 | if(supertets) 652 | { 653 | 654 | for(int i = 0; i < 500; i++) 655 | { 656 | if((start-i) < 0) {continue;} 657 | System.out.print( (start-i)+" "); 658 | } 659 | System.out.print("\n"); 660 | for(int i = 0; i < 500; i++) 661 | { 662 | if((end+i) >= size) {continue;} 663 | System.out.print( (end+i)+" "); 664 | } 665 | System.out.print("\n"); 666 | 667 | 668 | supertets= false; 669 | } 670 | 671 | boolean targetHit = false; 672 | short[] pc = new short[101]; 673 | short[] pc2 = new short[101]; 674 | 675 | covFasta.write(">"+chromo+" "+start+" "+end+"\n"); 676 | boolean spaceit = false; 677 | if(end - start > 10000) spaceit = true; 678 | for(int i = 0; i < length; i++) 679 | { 680 | if((i+start) >= size) {continue;} 681 | if(spaceit && i%100 == 0) covFasta.write("\n"); 682 | short cov = COVERAGE[i+start]; 683 | if(cov < 0) 684 | { 685 | cov = Short.MAX_VALUE; 686 | System.err.println("Coverage less than 0!!!!!!!\n"); 687 | } 688 | short temp_cov = cov; 689 | if(temp_cov >= covHisto.length){ 690 | temp_cov = (short) (covHisto.length-1); 691 | } 692 | covHisto[temp_cov]++; 693 | totalTargetCoverage+=cov; 694 | if(cov > 0) 695 | { 696 | targetHit=true; 697 | basesWithOneHitorMore++; 698 | } 699 | if(cov > 9){ 700 | basesWith10HitsorMore++;} 701 | if(cov > 19){ 702 | basesWith20HitsorMore++;} 703 | if(cov > 39){ 704 | basesWith40HitsorMore++;} 705 | if(cov > 49){ 706 | basesWith50HitsorMore++;} 707 | if(cov > 99){ 708 | basesWith100HitsorMore++;} 709 | if(cov > 499){ 710 | basesWith500HitsorMore++;} 711 | if(cov > 999){ 712 | basesWith1000HitsorMore++;} 713 | 714 | covFasta.write(cov+" "); 715 | 716 | if(cov < coverage_forMedian.length){ 717 | coverage_forMedian[cov]++; 718 | }else{ 719 | int[] tmp = new int[coverage_forMedian.length]; 720 | System.arraycopy(coverage_forMedian, 0, tmp, 0, coverage_forMedian.length); 721 | coverage_forMedian = new int[cov+1]; 722 | System.arraycopy(tmp, 0, coverage_forMedian, 0, tmp.length); 723 | coverage_forMedian[cov]++; 724 | } 725 | 726 | if(collectTargetCov) 727 | { 728 | int pcpos = (int)((double)i/(double)length*100+0.5); 729 | pc[pcpos] += cov; 730 | pc2[pcpos]++; 731 | } 732 | } 733 | 
covFasta.write("\n"); 734 | 735 | 736 | for(int index = 0; index < pc.length; index++) 737 | { 738 | if(pc2[index] != 0) 739 | { 740 | int d = (int) (((double)pc[index]/(double)pc2[index])+0.5); 741 | pc[index] = (short) d; 742 | } 743 | } 744 | 745 | for(int i = 0; i < 101; i++) 746 | { 747 | targetCov[i]+=pc[i]; 748 | } 749 | if(targetHit) 750 | { 751 | hitTargetCount++; 752 | } 753 | else 754 | { 755 | missTraget.write(targetChrs[j]+"\t"+targetStarts[j]+"\t"+targetStops[j]+"\n"); 756 | boolean hit = false; 757 | for(int i = start - BUFFER; i < start && !hit; i++) 758 | { 759 | if(i < 0) {continue;} 760 | if(COVERAGE[i] > 0){ 761 | hit=true; 762 | } 763 | } 764 | for(int i = end; i < end+BUFFER && !hit; i++) 765 | { 766 | if(i >= size) {continue;} 767 | if(COVERAGE[i] > 0){ 768 | hit=true; 769 | } 770 | } 771 | if(hit){ 772 | hitTarget_bufferonly_Count++; 773 | } 774 | } 775 | /*for(int i = 0; i < length; i++){ 776 | if((i+start) >= size) {continue;} 777 | COVERAGE[i+start]=0; 778 | }*/ //Commented out on June 13th 2014, to include non flattened bedfile. 779 | } 780 | } 781 | 782 | /** 783 | * 784 | * @param chromo the current chromosome to load 785 | * @param size the size of the chromosome 786 | * @return 787 | * @throws Exception 788 | */ 789 | public static char[] getTargetPos(String chromo, int size) throws Exception 790 | { 791 | char[] TR = new char[size]; 792 | chromo = removechr(chromo); 793 | for(int j = 0; j < targetChrs.length; j++) 794 | { 795 | try{ 796 | if(!removechr(targetChrs[j]).equals(chromo))continue; 797 | int start = targetStarts[j]; 798 | int end = targetStops[j]; 799 | for(int i = start; i <= end; i++) 800 | { 801 | if(i >= size) { 802 | continue; 803 | }else{ 804 | TR[i] = 1; 805 | } 806 | } 807 | for(int i = start -BUFFER; i 0) 23 | { 24 | System.out.println("Warning: "); 25 | System.out.println(warns); 26 | } 27 | for(int i = 0; i < on.length; i++) 28 | { 29 | System.out.println(on[i]+" "+vals[i]); 30 | } 31 | 32 | } 33 | 34 | /** 35 | * Parses args based on optnames, into values. 36 | * @param optnames The names of allowable options 37 | * @param args The input arguments 38 | * @param values The return values for each optname (co-indexed) 39 | * @return 40 | */ 41 | public static String parse(String[] optnames, String[] args, String[] values) 42 | { 43 | StringBuffer warnings = new StringBuffer(100); 44 | for(int i = 0; i < args.length; i++) 45 | { 46 | String curr = args[i]; 47 | if(curr.charAt(0) == '-') 48 | { 49 | curr = curr.substring(1); 50 | String value = null; 51 | String optname = null; 52 | if(curr.indexOf('=')!= -1) 53 | { 54 | optname = curr.substring(0, curr.indexOf('=')); 55 | value = curr.substring(curr.indexOf('=')+1); 56 | } 57 | else 58 | { 59 | if(i fht ; 63 | static final String dummy = "dummy"; 64 | static boolean removeDupes = false; //do we not consider duplicate reads 65 | static boolean writeWGC = false; //write whole genome coverage statistics 66 | static double percentage = 1.0; //number of read to take (to randomly dump some) 67 | static Random RAND = new Random(88651); //random number generator, good for removing a proportion of the reads 68 | static String[] targetChrs; 69 | static int[] targetStarts; 70 | static int[] targetStops; 71 | static double minmapscore = -1.0; 72 | static double minbasescore = -1.0; 73 | static int size =0 ; //Holds the size of the Chromosome being analyzed. 74 | static int[] coverage_forMedian = new int[1000]; //Used for calculating the median coverage. 
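// coverage_forMedian and coverage_forMedian_WG are depth histograms: element c counts bases
// observed at depth c. writeReport() walks c upward and takes the first c at which the running
// sum of counts reaches half of the relevant base total (totalTargetedBases or totalGenomeBases),
// i.e. the median depth; getTargetsAndWriteCoverage() grows the arrays on demand whenever a base
// exceeds the highest depth tracked so far.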
75 | static int[] coverage_forMedian_WG = new int[1000]; //Used for calculating the median coverage for Whole Genome. 76 | static int median_coverage = 0; 77 | static int median_coverage_WG = 0; 78 | static boolean is_target = false; //Is target file provided 79 | static long READLENGTH = 0; //Gets READ Length 80 | 81 | /** 82 | * ONLY WORKS ON A SINGLE FILES THAT HAS BEEN SORTED!!!!!!!!!! 83 | * looks like it works in 4000M of ram. 84 | * 85 | * usage: -o -t - -i -r [-d] [-w] [-m] [-b] 86 | * 87 | * i: BAM File 88 | * t: target file 89 | * o: output directory and base file name 90 | * d: remove duplicates, and do not use them for statistics 91 | * m: minimum mapscore (mapscore must be >= to this) 92 | * b: minimum base score (basescore must be >= to this) 93 | * @param args 94 | * 95 | * @throws Exception 96 | * 97 | */ 98 | public static void main(String[] args) throws Exception 99 | { 100 | if(args.length == 0) {usage();System.exit(0);}; 101 | String[] validargs = {"t","o","d","i","m","b"}; 102 | String[] arguments = new String[validargs.length]; 103 | String warns = ParseOpts.parse(validargs, args, arguments); 104 | if(arguments[0] == null) 105 | { 106 | System.err.println("No target file specified!!! Only calculating Whole genome stats\n"); 107 | }else{is_target = true;} 108 | if(arguments[1] == null) 109 | { 110 | System.err.println("No output file specified!!! Exiting\n"); 111 | usage(); 112 | System.exit(3); 113 | } 114 | int index = arguments[1].lastIndexOf(File.separator); 115 | String fileName = arguments[1].substring(index + 1); 116 | if(fileName == null) 117 | { 118 | System.err.println("Please provide proper output file!!! Exiting\n"); 119 | usage(); 120 | System.exit(3); 121 | } 122 | if(arguments[2]!= null ) 123 | { 124 | removeDupes = true; 125 | } 126 | if(arguments[3] == null) 127 | { 128 | System.err.println("No alignment file specified!!! 
Exiting\n"); 129 | usage(); 130 | System.exit(4); 131 | } 132 | if(arguments[4] != null) 133 | { 134 | minmapscore = Double.parseDouble(arguments[4]); 135 | } 136 | if(arguments[5] != null) 137 | { 138 | minbasescore = Double.parseDouble(arguments[5]); 139 | } 140 | 141 | String targetFile = null; 142 | 143 | if(is_target) { 144 | targetFile = arguments[0]; 145 | checkFile(targetFile); 146 | } 147 | 148 | String alignmentFile = arguments[3]; 149 | checkFile(alignmentFile); 150 | String outfile = arguments[1]; 151 | String covFile = outfile+".cov.fasta"; 152 | System.out.println("Writing to: "+outfile); 153 | 154 | FileWriter wgcFasta = new FileWriter(outfile+".wholeGenomeCov.fasta");; 155 | 156 | FileWriter missTar = new FileWriter(outfile+".missTargets.txt"); 157 | fht = new Hashtable(50); 158 | if(is_target) { 159 | loadTargets(targetFile); 160 | FileWriter covFasta = new FileWriter(covFile); 161 | readBAM(alignmentFile, covFasta, missTar, wgcFasta); 162 | covFasta.close(); 163 | }else{ 164 | readBAM(alignmentFile, missTar, wgcFasta); 165 | } 166 | 167 | wgcFasta.close(); 168 | writeReport(outfile); 169 | missTar.close(); 170 | } 171 | 172 | /** 173 | * load the target regions into memory and leave them there 174 | * @param targetFile 175 | */ 176 | public static void loadTargets(String targetFile) throws Exception 177 | { 178 | BufferedReader br = new BufferedReader(new FileReader(targetFile)); 179 | String line; 180 | int cnt = 0; 181 | int start = 0; 182 | int stop = 0; 183 | while((line = br.readLine())!=null) 184 | { 185 | String[] tokens = line.split("[ \t]+"); 186 | if(tokens.length < 3) continue; 187 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 188 | try 189 | { 190 | start = Integer.parseInt(tokens[1]); 191 | stop = Integer.parseInt(tokens[2]); 192 | } 193 | catch(NumberFormatException e){continue;} 194 | cnt++; 195 | } 196 | targetChrs = new String[cnt]; 197 | targetStarts = new int[cnt]; 198 | targetStops = new int[cnt]; 199 | cnt = 0; 200 | br.close(); 201 | br = new BufferedReader(new FileReader(targetFile)); 202 | while((line = br.readLine())!=null) 203 | { 204 | String[] tokens = line.split("[ \t]+"); 205 | if(tokens.length < 3) continue; 206 | //if(!tokens[0].substring(0,3).equalsIgnoreCase("chr"))continue; 207 | try 208 | { 209 | start = Integer.parseInt(tokens[1]); 210 | stop = Integer.parseInt(tokens[2]); 211 | } 212 | catch(NumberFormatException e){continue;} 213 | targetChrs[cnt] = tokens[0]; 214 | targetStarts[cnt]=start; 215 | targetStops[cnt] = stop; 216 | cnt++; 217 | } 218 | totalTargets = cnt; /* Should be used to count Targets Oct3_2014*///totalTargets = TR.length; 219 | } 220 | 221 | 222 | 223 | /** 224 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
225 | * @param bamfile The bam file to read in 226 | * @param covFasta the coverage fasta output file 227 | * @param missTraget the miss target file 228 | * @param wgCoverage whole genome coverage file 229 | * @throws Exception 230 | * 231 | */ 232 | public static void readBAM(String bamfile, FileWriter covFasta, FileWriter missTraget, FileWriter wgCoverage) throws Exception 233 | { 234 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 235 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 236 | SAMFileHeader header = sfr.getFileHeader(); 237 | Iterator iter = sfr.iterator(); 238 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 239 | int[] COVERAGE = new int[0]; 240 | char[] TR = new char[0]; 241 | int cnt = 0; 242 | SAMRecord rec = null; 243 | while(iter.hasNext()) 244 | { 245 | try 246 | { 247 | rec = iter.next(); 248 | if(percentage < 1.0) 249 | { 250 | double rd = RAND.nextDouble(); 251 | if(rd < percentage) continue; 252 | } 253 | 254 | totalReadsProduced++; 255 | if(rec.getMappingQuality() < minmapscore) 256 | { 257 | continue; 258 | } 259 | 260 | if(rec.getReadFailsVendorQualityCheckFlag()){ 261 | continue; 262 | } 263 | 264 | if(rec.getNotPrimaryAlignmentFlag()){ 265 | continue; 266 | } 267 | 268 | if(rec.getReadUnmappedFlag()) 269 | { 270 | //System.out.println("Unmapped! region. breaking!!!"); 271 | continue; 272 | } 273 | totalReadsAligned++; 274 | 275 | if(rec.getReadPairedFlag()) 276 | { 277 | totalReadsPaired++; 278 | if(!rec.getMateUnmappedFlag()){ 279 | totalPairedreadsWithmappedMates++; 280 | } 281 | } 282 | 283 | if(rec.getDuplicateReadFlag()) 284 | { 285 | duplicateReads++; 286 | if(removeDupes){continue;} 287 | } 288 | 289 | /////////////////Added on Feb 11, 2015///////////////////////////// 290 | READLENGTH = rec.getReadLength(); 291 | 292 | /////////////////////////////////////////////////////////////////// 293 | 294 | String currchr = rec.getReferenceName(); 295 | if(!currchr.equals(lastchr)) 296 | { 297 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 298 | { 299 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 300 | } 301 | lastchr = currchr; 302 | System.out.println(currchr); 303 | size = header.getSequence(currchr).getSequenceLength()+1; 304 | totalGenomeBases+=size; 305 | COVERAGE = new int[size]; 306 | TR = getTargetPos(currchr, size); 307 | if(TR == null || COVERAGE == null) 308 | { 309 | System.err.println("COVERAGE or TR is null! "+currchr); 310 | } 311 | 312 | } 313 | processRecord(rec, TR, COVERAGE); 314 | } 315 | catch(Exception e) 316 | { 317 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 318 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 319 | // throw e; 320 | } 321 | cnt++; 322 | } 323 | System.out.println("Done read bam"); 324 | getTargetsAndWriteCoverage(lastchr, COVERAGE, covFasta, missTraget, wgCoverage); 325 | COVERAGE = null; 326 | } 327 | 328 | /** 329 | * Major workhorse of the application. Reads in the BAM file and processes each record. 
330 | * @param bamfile The bam file to read in 331 | * @param missTarget the miss target file 332 | * @param wgCoverage whole genome coverage file 333 | * @throws Exception 334 | * 335 | */ 336 | public static void readBAM(String bamfile, FileWriter missTarget, FileWriter wgCoverage) throws Exception 337 | { 338 | SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); 339 | SAMFileReader sfr = new SAMFileReader(new File(bamfile)); 340 | SAMFileHeader header = sfr.getFileHeader(); 341 | Iterator iter = sfr.iterator(); 342 | String lastchr = "SOMEVERYFAKETHINGGOESHERE"; 343 | int[] COVERAGE = new int[0]; 344 | char[] TR = new char[0]; 345 | int cnt = 0; 346 | SAMRecord rec = null; 347 | while(iter.hasNext()) 348 | { 349 | try 350 | { 351 | rec = iter.next(); 352 | if(percentage < 1.0) 353 | { 354 | double rd = RAND.nextDouble(); 355 | if(rd < percentage) continue; 356 | } 357 | 358 | totalReadsProduced++; 359 | if(rec.getMappingQuality() < minmapscore) 360 | { 361 | continue; 362 | } 363 | if(rec.getReadFailsVendorQualityCheckFlag()){ 364 | continue; 365 | } 366 | 367 | if(rec.getNotPrimaryAlignmentFlag()){ 368 | continue; 369 | } 370 | 371 | if(rec.getReadUnmappedFlag()) 372 | { 373 | //System.out.println("Unmapped! region. breaking!!!"); 374 | continue; 375 | } 376 | totalReadsAligned++; 377 | if(rec.getReadPairedFlag()) 378 | { 379 | totalReadsPaired++; 380 | if(!rec.getMateUnmappedFlag()){ 381 | totalPairedreadsWithmappedMates++; 382 | } 383 | } 384 | if(rec.getDuplicateReadFlag()) 385 | { 386 | duplicateReads++; 387 | if(removeDupes){ 388 | continue; 389 | } 390 | } 391 | 392 | /////////////////Added on Feb 11, 2015///////////////////////////// 393 | READLENGTH = rec.getReadLength(); 394 | /////////////////////////////////////////////////////////////////// 395 | 396 | 397 | String currchr = rec.getReferenceName(); 398 | if(!currchr.equals(lastchr)) 399 | { 400 | if(!lastchr.equals("SOMEVERYFAKETHINGGOESHERE")) 401 | { 402 | getTargetsAndWriteCoverage(lastchr, COVERAGE, null, missTarget, wgCoverage); 403 | } 404 | lastchr = currchr; 405 | System.out.println(currchr); 406 | size = header.getSequence(currchr).getSequenceLength()+1; 407 | totalGenomeBases+=size; 408 | COVERAGE = new int[size]; 409 | if(COVERAGE == null) 410 | { 411 | System.err.println("COVERAGE or TR is null! "+currchr); 412 | } 413 | 414 | } 415 | processRecord(rec, COVERAGE); 416 | } 417 | catch(Exception e) 418 | { 419 | System.err.println("Error on record: "+cnt+"\n"+e.getMessage()+" "+Arrays.toString(e.getStackTrace())); 420 | System.err.println(rec.toString()+" "+rec.getReferenceName()+" "+rec.getAlignmentStart()+" "+rec.getAlignmentEnd()); 421 | // throw e; 422 | } 423 | cnt++; 424 | } 425 | System.out.println("Done read bam"); 426 | getTargetsAndWriteCoverage(lastchr, COVERAGE, null, missTarget, wgCoverage); 427 | COVERAGE = null; 428 | } 429 | 430 | /** 431 | * Processes read record from a BAM. Adding the alignment to coverage array. 
432 | * @param rec
433 | * @param TR
434 | * @param COVERAGE
435 | */
436 | public static void processRecord(SAMRecord rec, char[] TR, int[] COVERAGE) throws Exception
437 | {
438 | boolean inbuffer = false;
439 | boolean ontarget = false;
440 | int start = rec.getAlignmentStart();
441 | int stop = rec.getAlignmentEnd();
442 | int referenceposition = 0;
443 | byte[] baseQual = rec.getBaseQualities();
444 | 
445 | for(int i = start; i <= stop; i++)
446 | {
447 | totalAlignedBases++;
448 | try
449 | {
450 | if(TR[i] == 1)
451 | {
452 | ontarget = true;
453 | }
454 | else if(TR[i] == 2)
455 | {
456 | inbuffer = true;
457 | }
458 | 
459 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1);
460 | if(referenceposition==0){continue;}
461 | 
462 | if(minbasescore > 0){
463 | if((double)baseQual[i-start] >= minbasescore) {
464 | COVERAGE[referenceposition]++;
465 | }
466 | }else {
467 | COVERAGE[referenceposition]++;
468 | }
469 | }
470 | catch(Exception e)
471 | {
472 | System.err.println("array size:"+COVERAGE.length);
473 | System.err.println("start stop:"+start+" "+stop);
474 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace()));
475 | //throw e;
476 | break;
477 | }
478 | }
479 | if(ontarget)
480 | {
481 | ontargetReadHitCount++;
482 | }
483 | else if(inbuffer)
484 | {
485 | inbufferReadHitCount++;
486 | }
487 | else
488 | {
489 | offtargetReadHitCount++;
490 | }
491 | }
492 | 
493 | /**
494 | * Processes read record from a BAM. Adding the alignment to coverage array.
495 | * @param rec
496 | * @param COVERAGE
497 | */
498 | public static void processRecord(SAMRecord rec, int[] COVERAGE) throws Exception
499 | {
500 | int start = rec.getAlignmentStart();
501 | int stop = rec.getAlignmentEnd();
502 | int referenceposition = 0;
503 | byte[] baseQual = rec.getBaseQualities();
504 | 
505 | for(int i = start; i <= stop; i++)
506 | {
507 | totalAlignedBases++;
508 | try
509 | {
510 | referenceposition = rec.getReferencePositionAtReadPosition(i-start+1); // computed before the quality branch so the unfiltered path below never increments a stale position
511 | if(referenceposition==0){continue;}
512 | 
513 | if(minbasescore > 0){
514 | if((double)baseQual[i-start] >= minbasescore) {
515 | COVERAGE[referenceposition]++;
516 | }
517 | }else {
518 | COVERAGE[referenceposition]++;
519 | }
520 | }
521 | 
522 | catch(Exception e)
523 | {
524 | System.err.println("array size:"+COVERAGE.length);
525 | System.err.println("start stop:"+start+" "+stop);
526 | System.err.println(e.getMessage()+" -- "+e.getLocalizedMessage()+" -- "+e.getCause()+" -- "+Arrays.toString(e.getStackTrace()));
527 | //throw e;
528 | break;
529 | }
530 | }
531 | }
532 | 
533 | /**
534 | * makes sure a file exists
535 | * @param s
536 | * @throws Exception
537 | */
538 | public static void checkFile(String s) throws Exception
539 | {
540 | File f = new File(s);
541 | if(!f.exists())
542 | {
543 | throw new FileNotFoundException("No such file as \""+s+"\". 
File not found."); 544 | } 545 | } 546 | 547 | public static void usage() 548 | { 549 | String s= "Version: "+VERSION+"\nUsage: -o -t -i [-d] [-m ] [-b ]\n\t* t: target file\n\t* o: output directory and base file name"; 550 | s+="\n\t* d: remove duplicates, and do not use them for statistics\n\t* i: alignment file (multiple files are not allowed)\n"; 551 | s+="\t* m: minimum mapping quality\n\t* b: minimum base quality\n"; 552 | System.out.println(s); 553 | } 554 | 555 | /** 556 | * removes the "chr" portion of any chromosome name 557 | * @param c 558 | * @return 559 | */ 560 | public static String removechr(String c) 561 | { 562 | if(c.length() > 3 && c.substring(0, 3).equalsIgnoreCase("chr")){ 563 | c = c.substring(3); 564 | } 565 | return c; 566 | } 567 | 568 | /** 569 | * converts fractions into percentages with 2 decimal positions 570 | * @param num 571 | * @param dom 572 | * @return 573 | */ 574 | public static double pc(int num, int dom) 575 | { 576 | double pc = (double)num/(double)dom; 577 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 578 | return pc; 579 | } 580 | 581 | /** 582 | * converts fractions into percentages with 2 decimal positions 583 | * @param num 584 | * @param dom 585 | * @return 586 | */ 587 | public static double pc(long num, long dom) 588 | { 589 | double pc = (double)num/(double)dom; 590 | pc*=10000.0;pc+=0.5; int ipc = (int)pc; pc = (double)ipc/100; 591 | return pc; 592 | } 593 | 594 | /** 595 | * Writes all the statistical information to an output file. 596 | * @param fname output file name 597 | * @throws Exception 598 | */ 599 | public static void writeReport(String fname) throws Exception 600 | { 601 | long nonduplicatereads = totalReadsAligned - duplicateReads; 602 | if(is_target && totalTargetedBases == 0) 603 | { 604 | System.err.println("Total targeted bases is zero. This means that no read has aligned to a chromosome that contains a target. No target matches a chromosome in the BAM, or something else very weird. Aborting."); 605 | System.exit(1); 606 | } 607 | if(totalReadsAligned == 0) 608 | { 609 | System.err.println("No reads aligned. Aborting."); 610 | System.exit(2); 611 | } 612 | if(nonduplicatereads == 0) 613 | { 614 | System.err.println("All reads are duplicates. Aborting."); 615 | System.exit(3); 616 | } 617 | if(is_target && totalTargets == 0) 618 | { 619 | //I don't think we should ever see this error, as its dealt with above. 620 | System.err.println("No target regions given. 
Aborting."); 621 | System.exit(4); 622 | } 623 | 624 | 625 | int sum =0; 626 | for(int i = 0; i= (totalGenomeBases/2)){ 628 | median_coverage_WG = i; 629 | break; 630 | }else{ 631 | sum+=coverage_forMedian_WG[i]; 632 | } 633 | } 634 | 635 | FileWriter report_WG = new FileWriter(fname+".WGCoverageReport.csv"); 636 | report_WG.write("Version: "+VERSION+"\n"); 637 | report_WG.write("Read Stats\n"); 638 | report_WG.write("Total Reads Produced:,"+totalReadsProduced+"\n"); 639 | report_WG.write("Total Yield Produced:"+READLENGTH+","+(READLENGTH * totalReadsProduced)+"\n"); 640 | report_WG.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n"); 641 | report_WG.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n"); 642 | report_WG.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)"); 643 | report_WG.write(",reads paired:,"+totalReadsPaired); 644 | report_WG.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n"); 645 | report_WG.write("Average Coverage:,-,("+((int)(totalGenomeCoverage/totalGenomeBases))+")\n"); 646 | report_WG.write("Median Coverage:,-,("+median_coverage_WG+")\n"); 647 | report_WG.write("Base Stats\n"); 648 | report_WG.write("Bases Targeted:,"+totalGenomeBases+"\n"); 649 | report_WG.write("Bases with 0 coverage:,"+covHisto_WG[0]+",("+pc(covHisto_WG[0],totalGenomeBases)+"%)\n"); 650 | report_WG.write("Bases with 1+ coverage:,"+basesWithOneHitorMore_WG+",("+pc(basesWithOneHitorMore_WG, totalGenomeBases)+"%)\n"); 651 | report_WG.write("Bases with 5+ coverage:,"+basesWith5HitorMore_WG+",("+pc(basesWith5HitorMore_WG,totalGenomeBases)+"%)\n"); 652 | report_WG.write("Bases with 10+ coverage:,"+basesWith10HitsorMore_WG+",("+pc(basesWith10HitsorMore_WG,totalGenomeBases)+"%)\n"); 653 | report_WG.write("Bases with 15+ coverage:,"+basesWith15HitorMore_WG+",("+pc(basesWith15HitorMore_WG,totalGenomeBases)+"%)\n"); 654 | report_WG.write("Bases with 20+ coverage:,"+basesWith20HitsorMore_WG+",("+pc(basesWith20HitsorMore_WG,totalGenomeBases)+"%)\n"); 655 | report_WG.write("Bases with 30+ coverage:,"+basesWith30HitorMore_WG+",("+pc(basesWith30HitorMore_WG,totalGenomeBases)+"%)\n"); 656 | report_WG.write("Bases with 40+ coverage:,"+basesWith40HitsorMore_WG+",("+pc(basesWith40HitsorMore_WG,totalGenomeBases)+"%)\n"); 657 | report_WG.write("Bases with 50+ coverage:,"+basesWith50HitsorMore_WG+",("+pc(basesWith50HitsorMore_WG,totalGenomeBases)+"%)\n"); 658 | report_WG.write("Bases with 60+ coverage:,"+basesWith60HitsorMore_WG+",("+pc(basesWith60HitsorMore_WG,totalGenomeBases)+"%)\n"); 659 | report_WG.write("Bases with 100+ coverage:,"+basesWith100HitsorMore_WG+",("+pc(basesWith100HitsorMore_WG,totalGenomeBases)+"%)\n"); 660 | report_WG.write("\n"); 661 | report_WG.write("Coverage Histogram for Whole Genome (may look weird if target regions overlap...)\n"); 662 | for(int i = 0; i < covHisto_WG.length; i++) 663 | report_WG.write(i+","); 664 | report_WG.write("\n"); 665 | for(int i = 0; i < covHisto_WG.length; i++) 666 | report_WG.write(covHisto_WG[i]+","); 667 | report_WG.write("\n"); 668 | 669 | report_WG.close(); 670 | 671 | 672 | if(is_target){ 673 | sum =0; 674 | for(int i = 0; i= (totalTargetedBases/2)){ 676 | median_coverage = i; 677 | break; 678 | }else{ 679 | sum+=coverage_forMedian[i]; 680 | } 681 | } 682 | FileWriter report = new FileWriter(fname+".CoverageReport.csv"); 683 | report.write("Version: "+VERSION+"\n"); 684 | report.write("BUFFER 
size:,"+BUFFER+"\n"); 685 | report.write("Read Stats\n"); 686 | report.write("Total Reads Produced:,"+totalReadsProduced+"\n"); 687 | report.write("Total Yield Produced:,"+(READLENGTH * totalReadsProduced)+"\n"); 688 | report.write("Total Unique Yield Produced:,"+(READLENGTH * (totalReadsAligned-duplicateReads))+"\n"); 689 | report.write("Duplicate Reads:,"+duplicateReads+",("+pc(duplicateReads,totalReadsAligned)+"%)\n"); 690 | report.write("Total Reads Aligned:,"+totalReadsAligned+",("+pc(totalReadsAligned,totalReadsProduced)+"%)"); 691 | report.write(",reads paired:,"+totalReadsPaired); 692 | report.write(",reads paired with mapped mates:,"+totalPairedreadsWithmappedMates+"\n"); 693 | //report.write("Aligned Reads On-Buffer:,"+inbufferReadHitCount+",("+pc(inbufferReadHitCount,totalReadsAligned)+"%)\n"); 694 | //report.write("Aligned Reads On-Target:,"+ontargetReadHitCount+",("+pc(ontargetReadHitCount,totalReadsAligned)+"%)\n"); 695 | report.write("Average Coverage:,-,("+((int)(totalTargetCoverage/totalTargetedBases))+")\n"); 696 | report.write("Median Coverage:,-,("+median_coverage+")\n"); 697 | int hittot = inbufferReadHitCount+ontargetReadHitCount; 698 | //report.write("Reads that hit target or buffer:,"+hittot+",("+pc(hittot,totalReadsAligned)+"%)\n"); 699 | report.write("Total Aligned Reads (expected):,"+totalReadsAligned+"\n"); 700 | report.write("Total Aligned Reads (calculated):,"+(offtargetReadHitCount+inbufferReadHitCount+ontargetReadHitCount)+"\n"); 701 | report.write("Target Stats\n"); 702 | report.write("Targets Hit:,"+hitTargetCount+",("+pc(hitTargetCount,totalTargets)+"%)\n"); 703 | //report.write("Target Buffers Hit:,"+hitTarget_bufferonly_Count+",("+pc(hitTarget_bufferonly_Count,totalTargets)+"%)\n"); 704 | report.write("Total Targets:,"+totalTargets+"\n"); 705 | report.write("Non target regions with high coverage:,"+nonTragetGoodHits+"\n"); 706 | report.write("Base Stats\n"); 707 | report.write("Bases Targeted:,"+totalTargetedBases+"\n"); 708 | report.write("Buffer Bases:,"+totalBufferBases+"\n"); 709 | report.write("Bases with 0 coverage:,"+covHisto[0]+",("+pc(covHisto[0],totalTargetedBases)+"%)\n"); 710 | report.write("Bases with 1+ coverage:,"+basesWithOneHitorMore+",("+pc(basesWithOneHitorMore,totalTargetedBases)+"%)\n"); 711 | report.write("Bases with 5+ coverage:,"+basesWith5HitorMore+",("+pc(basesWith5HitorMore,totalTargetedBases)+"%)\n"); 712 | report.write("Bases with 10+ coverage:,"+basesWith10HitsorMore+",("+pc(basesWith10HitsorMore,totalTargetedBases)+"%)\n"); 713 | report.write("Bases with 15+ coverage:,"+basesWith15HitorMore+",("+pc(basesWith15HitorMore,totalTargetedBases)+"%)\n"); 714 | report.write("Bases with 20+ coverage:,"+basesWith20HitsorMore+",("+pc(basesWith20HitsorMore,totalTargetedBases)+"%)\n"); 715 | report.write("Bases with 30+ coverage:,"+basesWith30HitorMore+",("+pc(basesWith30HitorMore,totalTargetedBases)+"%)\n"); 716 | report.write("Bases with 40+ coverage:,"+basesWith40HitsorMore+",("+pc(basesWith40HitsorMore,totalTargetedBases)+"%)\n"); 717 | report.write("Bases with 50+ coverage:,"+basesWith50HitsorMore+",("+pc(basesWith50HitsorMore,totalTargetedBases)+"%)\n"); 718 | report.write("Bases with 60+ coverage:,"+basesWith60HitsorMore+",("+pc(basesWith60HitsorMore,totalTargetedBases)+"%)\n"); 719 | report.write("Bases with 100+ coverage:,"+basesWith100HitsorMore+",("+pc(basesWith100HitsorMore,totalTargetedBases)+"%)\n"); 720 | report.write("\n"); 721 | report.write("Coverage Histogram (may look weird if target regions overlap...)\n"); 722 | 
for(int i = 0; i < covHisto.length; i++) 723 | report.write(i+","); 724 | report.write("\n"); 725 | for(int i = 0; i < covHisto.length; i++) 726 | report.write(covHisto[i]+","); 727 | report.write("\n"); 728 | report.write("Target and region coverage plot\n"); 729 | report.write("Position,5'count,3'count\n"); 730 | for(int i = 20; i <= prime_size; i+=20) 731 | { 732 | report.write(i+","+fivePrime[fivePrime.length-(i-1)-1]+","+threePrime[i-1]+"\n"); 733 | } 734 | report.write("%tar-Pos,count\n"); 735 | for(int i = 0; i < 101; i+=2) 736 | { 737 | report.write(i+","+targetCov[i]+"\n"); 738 | } 739 | report.close(); 740 | } 741 | } 742 | 743 | static boolean supertets = false; 744 | 745 | /** 746 | * Gets the target regions from the target file, and writes over the coverage fasta files, as well as determines many of the coverage statistics. 747 | * @param chromo Current chromosome 748 | * @param COVERAGE The array which contains the coverage of every base in the genome 749 | * @param covFasta The FileWriter for the target-specific coverage 750 | * @param missTarget Write a "wig" format file (good for UCSC) which shows you where all off-target regions with high coverage are 751 | * @param wgCoverage A FileWriter for the whole genome coverage... if null, this file won't be written 752 | * @throws Exception 753 | */ 754 | public static void getTargetsAndWriteCoverage(String chromo, int COVERAGE[], FileWriter covFasta, FileWriter missTarget, FileWriter wgCoverage) throws Exception 755 | { 756 | wgCoverage.write(">"+chromo+" 1 "+(size-1)); 757 | for(int i = 0; i < COVERAGE.length; i++) 758 | { 759 | if(i%100==0) wgCoverage.write("\n"); 760 | int cov = COVERAGE[i]; 761 | if(cov < 0) 762 | { 763 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i)+"\n"); 764 | cov = Short.MAX_VALUE; 765 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i)+"\n"); 766 | } 767 | int temp_cov = cov; 768 | if(temp_cov >= covHisto_WG.length) 769 | temp_cov = (covHisto_WG.length-1); 770 | covHisto_WG[temp_cov]++; 771 | totalGenomeCoverage+=cov; 772 | if(cov > 0){ 773 | basesWithOneHitorMore_WG++;} 774 | if(cov > 4){ 775 | basesWith5HitorMore_WG++;} 776 | if(cov > 9){ 777 | basesWith10HitsorMore_WG++;} 778 | if(cov > 14){ 779 | basesWith15HitorMore_WG++;} 780 | if(cov > 19){ 781 | basesWith20HitsorMore_WG++;} 782 | if(cov > 29){ 783 | basesWith30HitorMore_WG++;} 784 | if(cov > 39){ 785 | basesWith40HitsorMore_WG++;} 786 | if(cov > 49){ 787 | basesWith50HitsorMore_WG++;} 788 | if(cov > 59){ 789 | basesWith60HitsorMore_WG++;} 790 | if(cov > 99){ 791 | basesWith100HitsorMore_WG++;} 792 | 793 | wgCoverage.write(cov+" "); 794 | 795 | 796 | if(cov < coverage_forMedian_WG.length){ 797 | coverage_forMedian_WG[cov]++; 798 | }else{ 799 | int[] tmp = new int[coverage_forMedian_WG.length]; 800 | System.arraycopy(coverage_forMedian_WG, 0, tmp, 0, coverage_forMedian_WG.length); 801 | coverage_forMedian_WG = new int[cov+1]; 802 | System.arraycopy(tmp, 0, coverage_forMedian_WG, 0, tmp.length); 803 | coverage_forMedian_WG[cov]++; 804 | } 805 | } 806 | wgCoverage.write("\n"); 807 | 808 | if(covFasta != null){ 809 | for(int j = 0; j < targetChrs.length; j++) 810 | { 811 | if(!removechr(targetChrs[j]).equals(removechr(chromo))) {continue;} 812 | //totalTargets++; 813 | int start = targetStarts[j]; 814 | int end = targetStops[j]; 815 | int length = end - start+1; 816 | boolean collectTargetCov = length > 99 ; 817 | 818 | //System.err.println(targetChrs[j]+" "+start+" "+end); 819 | 820 | if(supertets) 821 | { 822 | 
System.out.println(targetChrs[j]+" "+start+" "+end); 823 | } 824 | if(collectTargetCov) 825 | { 826 | for(int i = 0; i < prime_size; i++) 827 | { 828 | if((start - i) < 0 || (end+i) >= size){ 829 | continue; 830 | } 831 | fivePrime[i]+=COVERAGE[start-i]; 832 | threePrime[i]+=COVERAGE[end+i]; 833 | } 834 | } 835 | 836 | if(supertets) 837 | { 838 | 839 | for(int i = 0; i < 500; i++) 840 | { 841 | if((start-i) < 0) {continue;} 842 | System.out.print( (start-i)+" "); 843 | } 844 | System.out.print("\n"); 845 | for(int i = 0; i < 500; i++) 846 | { 847 | if((end+i) >= size) {continue;} 848 | System.out.print( (end+i)+" "); 849 | } 850 | System.out.print("\n"); 851 | 852 | supertets= false; 853 | } 854 | 855 | boolean targetHit = false; 856 | short[] pc = new short[101]; 857 | short[] pc2 = new short[101]; 858 | 859 | covFasta.write(">"+chromo+" "+start+" "+end+"\n"); 860 | boolean spaceit = false; 861 | if(end - start > 10000) spaceit = true; 862 | for(int i = 0; i < length; i++) 863 | { 864 | if((i+start) >= size) {continue;} 865 | if(spaceit && i%100 == 0) {covFasta.write("\n");} 866 | int cov = COVERAGE[i+start]; 867 | if(cov < 0) 868 | { 869 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i+start)+"\n"); 870 | cov = Short.MAX_VALUE; 871 | System.err.println("Coverage less than 0!!!!!!!\t"+cov+"\t"+(i+start)+"\n"); 872 | } 873 | int temp_cov = cov; 874 | if(temp_cov >= covHisto.length){ 875 | temp_cov = (covHisto.length-1); 876 | } 877 | 878 | covHisto[temp_cov]++; 879 | totalTargetCoverage+=cov; 880 | 881 | if(cov > 0) 882 | { 883 | targetHit=true; 884 | basesWithOneHitorMore++; 885 | } 886 | if(cov > 4){ 887 | basesWith5HitorMore++;} 888 | if(cov > 9){ 889 | basesWith10HitsorMore++;} 890 | if(cov > 14){ 891 | basesWith15HitorMore++;} 892 | if(cov > 19){ 893 | basesWith20HitsorMore++;} 894 | if(cov > 29){ 895 | basesWith30HitorMore++;} 896 | if(cov > 39){ 897 | basesWith40HitsorMore++;} 898 | if(cov > 49){ 899 | basesWith50HitsorMore++;} 900 | if(cov > 59){ 901 | basesWith60HitsorMore++;} 902 | if(cov > 99){ 903 | basesWith100HitsorMore++;} 904 | 905 | covFasta.write(cov+" "); 906 | 907 | 908 | if(cov < coverage_forMedian.length){ 909 | coverage_forMedian[cov]++; 910 | }else{ 911 | int[] tmp = new int[coverage_forMedian.length]; 912 | System.arraycopy(coverage_forMedian, 0, tmp, 0, coverage_forMedian.length); 913 | coverage_forMedian = new int[cov+1]; 914 | System.arraycopy(tmp, 0, coverage_forMedian, 0, tmp.length); 915 | coverage_forMedian[cov]++; 916 | } 917 | 918 | 919 | if(collectTargetCov) 920 | { 921 | int pcpos = (int)((double)i/(double)length*100+0.5); 922 | pc[pcpos] += cov; 923 | pc2[pcpos]++; 924 | } 925 | } 926 | covFasta.write("\n"); 927 | 928 | for(int index = 0; index < pc.length; index++) 929 | { 930 | if(pc2[index] != 0) 931 | { 932 | int d = (int) (((double)pc[index]/(double)pc2[index])+0.5); 933 | pc[index] = (short) d; 934 | } 935 | } 936 | 937 | for(int i = 0; i < 101; i++) 938 | { 939 | targetCov[i]+=pc[i]; 940 | } 941 | 942 | if(targetHit) 943 | { 944 | hitTargetCount++; 945 | }else{ 946 | missTarget.write(targetChrs[j]+"\t"+targetStarts[j]+"\t"+targetStops[j]+"\n"); 947 | boolean hit = false; 948 | for(int i = start - BUFFER; i < start && !hit; i++) 949 | { 950 | if(i < 0) {continue;} 951 | if(COVERAGE[i] > 0){ 952 | hit=true; 953 | } 954 | } 955 | for(int i = end; i < end+BUFFER && !hit; i++) 956 | { 957 | if(i >= size) {continue;} 958 | if(COVERAGE[i] > 0){ 959 | hit=true; 960 | } 961 | } 962 | if(hit){ 963 | hitTarget_bufferonly_Count++; 964 | } 965 
| } 966 | } 967 | } 968 | } 969 | 
970 | /**
971 | *
972 | * @param chromo the current chromosome to load
973 | * @param size the size of the chromosome
974 | * @return
975 | * @throws Exception
976 | */
977 | public static char[] getTargetPos(String chromo, int size) throws Exception
978 | {
979 | char[] TR = new char[size];
980 | chromo = removechr(chromo);
981 | for(int j = 0; j < targetChrs.length; j++)
982 | {
983 | try{
984 | if(!removechr(targetChrs[j]).equals(chromo))continue;
985 | int start = targetStarts[j];
986 | int end = targetStops[j];
987 | for(int i = start; i <= end; i++)
988 | {
989 | if(i >= size) {
990 | continue;
991 | }else{
992 | TR[i] = 1;
993 | }
994 | }
995 | for(int i = start -BUFFER; i < start; i++)
996 | {
997 | if(i >= 0 && TR[i] == 0) {TR[i] = 2; totalBufferBases++;}
998 | }
999 | for(int i = end+1; i <= end+BUFFER; i++)
1000 | {
1001 | if(i < size && TR[i] == 0) {TR[i] = 2; totalBufferBases++;}
1002 | }
1003 | }catch(Exception e){
1004 | System.err.println(e.getMessage()+" "+Arrays.toString(e.getStackTrace()));
1005 | }
1006 | }
1007 | return TR;
1008 | }
1009 | }
--------------------------------------------------------------------------------
/ExCID_v2.1/change_log.txt:
--------------------------------------------------------------------------------
1 | Bugfix[WGS_Stats_v1.java]: The fasta header written for each chromosome in the *wholeGenomeCov.fasta output had a stop position one base past the chromosome end (eg: >1 1 249250622, Should Be: >1 1 249250621)
2 | Bugfix[WGS_Stats_v1.java]: covHisto_WG variable was defined as 'int' and could go over the data type limit while handling whole genome samples. This is now changed to 'long'.
3 | Bugfix[batch_*_Lowcov.bed]: There was an extra tab between the 'length' and 'gene' fields. The file has been fixed to match the headers.
4 | Bugfix[batch_*_Lowcov.bed]: Interval length now matches the length of the 0-based genome coordinate positions in the 'batch_*_Lowcov.bed' file.
5 | Bugfix[ExCID.BatchScript_v2.1-threading_Final.pl]: Target BED files with more than 3 columns were not handled correctly in the annotation step. This has been fixed; BED files with more than 3 columns are now accepted as input.
6 | Bugfix[ExCID.BatchScript_v2.1-threading_Final.pl]: In batch mode, while obtaining the Gene% coverage, '-checkHGMD' was treated as a mandatory option. This has been corrected; the option is now optional.
7 | Bugfix: The "ExCID.grep_gene_list_pct_Final-miRNA.pl" and "ExCID.grep_gene_list_pct_Final.pl" scripts made an assumption about the location of the database files. This has been fixed; the location is now read from Config.txt.
8 | Bugfix[CapStatsV2.6.java]: A VERSION variable is used to write the version in the *.csv file.
9 | 
10 | Devel[batch_*_Lowcov.bed]: The format of the file will be changed to 1-based, making it consistent with all other outputs.
11 | Devel[WGS_Stats_v1.java]: There is an extra 0 at the start of the coverage information of each chromosome in the "*wholeGenomeCov.fasta" output file that needs to be removed.
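The two interval-length notes above come down to a one-base difference between conventions: a 0-based, half-open BED record covers stop - start bases, while the same bases written 1-based and inclusive cover stop - start + 1. A minimal Perl sketch with a hypothetical record, for illustration only:

    #!/usr/bin/perl
    use strict; use warnings;
    # The same five bases (positions 101..105 in 1-based terms) in both conventions:
    my ($start0, $stop0) = (100, 105);    # 0-based, half-open (BED-style)
    my ($start1, $stop1) = (101, 105);    # 1-based, inclusive
    print $stop0 - $start0, "\n";         # 5
    print $stop1 - $start1 + 1, "\n";     # 5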
12 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_CCDSDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | 8 | 9 | open(my $fh,"<$annotated_index") or die $!; 10 | 11 | 12 | while (my $line = <$fh>) { 13 | 14 | chomp($line); 15 | 16 | my ($chr,$start,$Stop,$ID) = split("\t",$line); 17 | my @transcript_ID_split = split("_exon_",$ID); 18 | my $transcript_ID = $transcript_ID_split[0]; 19 | 20 | my @grep = `grep -w "$transcript_ID" $HGNC `; 21 | 22 | if (scalar(@grep) == 1) { 23 | my @tmp = split("\t",$grep[0]); 24 | my $gene_name = $tmp[0]; 25 | print "$chr\t$start\t$Stop\t$ID\t$gene_name\n"; 26 | }else{ 27 | print "$chr\t$start\t$Stop\t$ID\t.\n"; 28 | #print STDERR "$chr\t$start\t$Stop\t$ID\t.\n"; 29 | } 30 | 31 | } 32 | 33 | close($fh); 34 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_RefSeqDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | 8 | 9 | open(my $fh,"<$annotated_index") or die $!; 10 | 11 | 12 | while (my $line = <$fh>) { 13 | 14 | chomp($line); 15 | 16 | my ($chr,$start,$Stop,$gene,$ID) = split("\t",$line); 17 | my @transcript_ID_tmp_split = split("_exon_",$ID); 18 | my $transcript_ID = $transcript_ID_tmp_split[0]; 19 | 20 | my @grep = `grep -w "$transcript_ID" $HGNC `; 21 | 22 | if (scalar(@grep) == 1) { 23 | my @tmp = split("\t",$grep[0]); 24 | my $gene_name = $tmp[0]; 25 | print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; 26 | }else{ 27 | print "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; 28 | # print STDERR "$chr\t$start\t$Stop\t$gene\t$ID\t.\n"; 29 | } 30 | 31 | } 32 | 33 | close($fh); 34 | -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_VEGADB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $annotated_index = $ARGV[0]; 6 | my $HGNC = $ARGV[1]; 7 | my $VEGA_HGNC_names = $ARGV[2]; 8 | 9 | my %words_to_genes = (); # not a perfect index, but will do the same as "grep -w" 10 | open(my $hgnc_fh, "<$HGNC") or die $!; 11 | while (my $line = <$hgnc_fh>) { 12 | chomp $line; 13 | my @row = split(/[\s,]/, $line); 14 | my $gene = shift @row; 15 | map { $words_to_genes{$_} = $gene } @row; 16 | } 17 | close $hgnc_fh or die $!; 18 | 19 | my %vega_index = (); 20 | open(my $vega_fh, "<$VEGA_HGNC_names") or die $!; 21 | while (my $line = <$vega_fh>) { 22 | chomp $line; 23 | my ($a, $b) = split "\t", $line; 24 | $vega_index{$b} = $a; 25 | } 26 | close $vega_fh or die $!; 27 | 28 | open(my $fh,"<$annotated_index") or die $!; 29 | 30 | while (my $line = <$fh>) { 31 | 32 | chomp($line); 33 | 34 | my ($chr,$start,$Stop,$gene,$ID) = split("\t",$line); 35 | my @transcript_ID_tmp_split = split("_exon_",$ID); 36 | my $transcript_ID = $transcript_ID_tmp_split[0]; 37 | 38 | $transcript_ID = $vega_index{$transcript_ID} || $transcript_ID; 39 | my $gene_name = $words_to_genes{$transcript_ID} || '.'; 40 | 41 | print "$chr\t$start\t$Stop\t$gene\t$ID\t$gene_name\n"; 42 | 43 | } 44 | 45 | close($fh); -------------------------------------------------------------------------------- /ExCID_v2.1/check_HGNC_individual_mirnaDB.pl: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 | 
4 | 
5 | my $annotated_index = $ARGV[0];
6 | my $HGNC = $ARGV[1];
7 | 
8 | 
9 | open(my $fh,"<$annotated_index") or die $!;
10 | 
11 | 
12 | while (my $line = <$fh>) {
13 | 
14 | chomp($line);
15 | 
16 | my ($chr,$start,$Stop,$transcript_ID) = split("\t",$line);
17 | 
18 | my @grep = `grep -w "$transcript_ID" $HGNC `;
19 | 
20 | if (scalar(@grep) == 1) {
21 | my @tmp = split("\t",$grep[0]);
22 | my $gene_name = $tmp[0];
23 | print "$chr\t$start\t$Stop\t$transcript_ID\t$gene_name\n";
24 | }else{
25 | #my @tmp = split("-",$transcript_ID);
26 | #my $gene_name = "MIR";
27 | #for(my $i = 2; $i < scalar(@tmp); $i++){
28 | # $gene_name.=uc($tmp[$i]);
29 | #}
30 | print "$chr\t$start\t$Stop\t$transcript_ID\t.\n";
31 | #print STDERR "$chr\t$start\t$Stop\t$gene_name\t$transcript_ID\n";
32 | }
33 | 
34 | }
35 | 
36 | close($fh); -------------------------------------------------------------------------------- /ExCID_v2.1/creat_bed_UCSC_coding.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | 
3 | ### GLOBAL VARS ###
4 | ###################
5 | my $infile = $ARGV[0];
6 | ### MAIN CODE ###
7 | #################
8 | open(FIN,"$infile") || die "Can't open $infile: $!\n";
9 | 
10 | while(<FIN>){
11 | 
12 | chomp; my $line = $_;
13 | my ($chr, $cds_start, $cds_stop, $ID, $gene, $exon_num, $exon_start, $exon_stop) = split(/\s/, $line);
14 | unless($gene){$gene = " " ;}
15 | $chr=~s/^chr//;
16 | my ($id, $version) = split(/\./, $ID); #for ccds data
17 | my @chrarray = split(/\_/, $chr);
18 | my $chrNum = $chrarray[0];
19 | my $chrcount = scalar(@chrarray);
20 | if ($chrcount == 1){ #haplotype contigs like chr6_hap need to be screened out
21 | my @exon_startarray = split(/\,/, $exon_start);
22 | my @exon_stoparray = split(/\,/, $exon_stop);
23 | my $outfile1 = "${infile}-exon.bed";
24 | my $outfile2 = "${infile}-Coding_region.bed";
25 | open(OUT1, ">>$outfile1") || die "Can't open $outfile1: $!\n";
26 | #open(OUT2, ">>$outfile2") || die "Can't open $outfile2: $!\n";
27 | #print OUT2 "$chrNum\t$cds_start\t$cds_stop\t${id}\n" if($cds_start != $cds_stop);
28 | #print OUT2 "${chrNum}\t$cds_start\t$cds_stop\t${gene}\n";
29 | for (my $j=0; $j<$exon_num; $j++) { #for gene
30 | #for (my $j=0; $j<$exon_num; $j++){ #for ;
31 | next if ($exon_startarray[$j] > $cds_stop);
32 | next if ($exon_stoparray[$j]< $cds_start);
33 | 
34 | 
35 | if($cds_start != $cds_stop) {
36 | if ($exon_startarray[$j] >= $cds_start && $exon_stoparray[$j] <= $cds_stop) {
37 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
38 | }elsif($exon_startarray[$j] < $cds_start && $exon_stoparray[$j] <= $cds_stop) {
39 | print OUT1 "$chrNum\t$cds_start\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
40 | }elsif($exon_startarray[$j] >= $cds_start && $exon_stoparray[$j] > $cds_stop) {
41 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$cds_stop\t$gene\t${id}_exon_${j}\n";
42 | }elsif($exon_startarray[$j] < $cds_start && $exon_stoparray[$j] > $cds_stop) {
43 | print OUT1 "$chrNum\t$cds_start\t$cds_stop\t$gene\t${id}_exon_${j}\n";
44 | }
45 | }else{
46 | print OUT1 "$chrNum\t$exon_startarray[$j]\t$exon_stoparray[$j]\t$gene\t${id}_exon_${j}\n";
47 | }
48 | 
49 | }
50 | }
51 | 
52 | }
53 | 
54 | 
55 | close(FIN);
56 | close(OUT1);
57 | #close(OUT2);
58 | -------------------------------------------------------------------------------- /ExCID_v2.1/database.tgz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/database.tgz -------------------------------------------------------------------------------- /ExCID_v2.1/external_programs/BEDTools.v2.17.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbuhay/ExCID/38741e72e2e11e72ceb6b45db0de3f37d96b2e85/ExCID_v2.1/external_programs/BEDTools.v2.17.0.tar.gz -------------------------------------------------------------------------------- /ExCID_v2.1/reformat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # This script takes in the annotated BED file (eg the VCrome+PKv2 fully annotated) and checks 3 | # the gene names and other gene names columns with the HGNC database to add the Approved names in 4 | # the gene names column and put other names in the other gene names column. 5 | # 6 | 7 | use strict; 8 | 9 | 10 | my $annotated_index = $ARGV[0]; 11 | my $HGNC = $ARGV[1]; 12 | 13 | 14 | open(my $fh,"<$annotated_index") or die $!; 15 | 16 | 17 | while (my $line = <$fh>) { 18 | 19 | chomp($line); 20 | 21 | $line=~s/\t-1\t/\t.\t/; 22 | my $not_anno = 0; 23 | my @line_tmp = split("\t",$line); 24 | my $target = $line_tmp[0]."\t".$line_tmp[1]."\t".$line_tmp[2]; 25 | my $gene_name=""; 26 | my $prev_name=""; 27 | my $synonyms_names = ""; 28 | my $refseq_IDs = $line_tmp[11]; 29 | my $CCDS_IDs = $line_tmp[12]; 30 | my $VEGA_IDs = $line_tmp[13]; 31 | my $miRNA_IDs = $line_tmp[14]; 32 | my $rest = join("\t", @line_tmp[15..(scalar(@line_tmp)-1)]); 33 | 34 | 35 | ## Gene_name 36 | 37 | if ($line_tmp[3] ne "." ) { 38 | $gene_name =$line_tmp[3].";"; 39 | my $check = $line_tmp[4].";"; 40 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 41 | $synonyms_names = $line_tmp[4].";"; ## First time Syn Name is given a value. 42 | } 43 | $check = $line_tmp[5].";"; 44 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 45 | $synonyms_names .= $line_tmp[5].";"; 46 | } 47 | $check = $line_tmp[6].";"; 48 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 49 | $synonyms_names .= $line_tmp[6].";"; 50 | } 51 | }elsif($line_tmp[4] ne "." ){ 52 | $gene_name =$line_tmp[4].";"; 53 | my $check = $line_tmp[5].";"; 54 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 55 | $synonyms_names = $line_tmp[5].";"; ## First time Syn Name is given a value. 56 | } 57 | $check = $line_tmp[6].";"; 58 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 59 | $synonyms_names .= $line_tmp[6].";"; 60 | } 61 | }elsif($line_tmp[5] ne "." ){ 62 | $gene_name =$line_tmp[5].";"; 63 | my $check = $line_tmp[6].";"; 64 | if(index($synonyms_names,$check) == -1 && index($gene_name,$check) == -1) { 65 | $synonyms_names = $line_tmp[6].";"; ## First time Syn Name is given a value. 66 | } 67 | }elsif($line_tmp[6] ne "." 
/ExCID_v2.1/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

BASEDIR=$(cd `dirname ${0}`; pwd)
source Config.txt

mkdir -p $DataBaseDir;
mkdir -p $AnnotationDir;

# Unpack and build the bundled bedtools package, then stage the binary in bin/.
tar xzvf external_programs/BEDTools.v2.17.0.tar.gz 1>>setup.log 2>>setup.log ;
cd bedtools-2.17.0 ;
make 1>>setup.log 2>>setup.log ;
cp bin/bedtools ../bin/ ;

cd $BASEDIR;
printf "Extracting external databases... ";
tar -xzf $BASEDIR/database.tgz;
printf "DONE.\n";

mv $BASEDIR/database/* $DataBaseDir/ ;

# Download the current RefSeq, CCDS, and VEGA tables for hg19 from UCSC.
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ccdsGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGene.txt.gz $DataBaseDir ;
rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGtp.txt.gz $DataBaseDir ;
cp miRBASE_r20.gff2 $DataBaseDir/miRBASE_r20.gff2 ;

# Decompress the tables and reduce each genePred file to eight columns:
# chrom, cdsStart, cdsEnd, name, name2, exonCount, exonStarts, exonEnds.
if [ "$(uname)" == "Darwin" ]; then
    ls $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ;
    ls $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ;
elif [ "$(uname)" == "Linux" ]; then
    ls --color=never $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ;
    ls --color=never $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ;
fi

# vegaGtp maps VEGA IDs to HGNC names; keep only that two-column mapping.
rm $DataBaseDir/vegaGtp.txt.bed ;
awk -F "\t" '{print $1"\t"$2}' $DataBaseDir/vegaGtp.txt > $DataBaseDir/VEGA-hgnc_names ;
rm $DataBaseDir/vegaGtp.txt ;

# Extract chrom, start, stop, and the ID attribute from the miRBase GFF2 file.
awk -F "\t| " '{print $1"\t"$4"\t"$5"\t"$10}' $DataBaseDir/miRBASE_r20.gff2 | sed s/ID=\"// | sed s/\"\;// | grep "^#" -v > $DataBaseDir/miRBASE_r20.bed ;

# Build per-exon coding-region BED files for each source.
perl creat_bed_UCSC_coding.pl $DataBaseDir/refGene.txt.bed ;
perl creat_bed_UCSC_coding.pl $DataBaseDir/ccdsGene.txt.bed ;
perl creat_bed_UCSC_coding.pl $DataBaseDir/vegaGene.txt.bed ;

# For CCDS, keep the CCDS ID (column 5) as the name column.
awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$5}' $DataBaseDir/ccdsGene.txt.bed-exon.bed > tmp ; mv tmp $DataBaseDir/ccdsGene.txt.bed-exon.bed ;

sed s/^chr// $DataBaseDir/miRBASE_r20.bed > tmp ; mv tmp $DataBaseDir/miRBASE_r20.bed ;

perl Get_HGNC.pl > $DataBaseDir/HGNC_database.txt ;

# Attach HGNC gene symbols to each database, one background job per source.
perl check_HGNC_individual_mirnaDB.pl $DataBaseDir/miRBASE_r20.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/miRBASE_r20_HGNC.bed &
perl check_HGNC_individual_CCDSDB.pl $DataBaseDir/ccdsGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/ccdsGene.txt.bed-exon_HGNC.bed &
perl check_HGNC_individual_VEGADB.pl $DataBaseDir/vegaGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt $DataBaseDir/VEGA-hgnc_names > $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed &
perl check_HGNC_individual_RefSeqDB.pl $DataBaseDir/refGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/refGene.txt.bed-exon_HGNC.bed &

wait;

#grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed | $BASEDIR/bin/bedtools intersect -a - -b $DataBaseDir/refGene.txt.bed-Coding_region.bed -u > refGene.txt.bed-exon_HGNC.bed_tmp ;
#grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed -v | cat - refGene.txt.bed-exon_HGNC.bed_tmp > tmp ;
#mv tmp $DataBaseDir/refGene.txt.bed-exon_HGNC.bed ;

#$BASEDIR/bin/bedtools intersect -a $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed -b $DataBaseDir/vegaGene.txt.bed-Coding_region.bed -u > tmp;
#mv tmp $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed ;

# Remove intermediate files.
#rm refGene.txt.bed-exon_HGNC.bed_tmp ;
rm $DataBaseDir/miRBASE_r20.gff2 ;
rm $DataBaseDir/miRBASE_r20.bed ;
rm $DataBaseDir/ccdsGene.txt ;
rm $DataBaseDir/vegaGene.txt ;
rm $DataBaseDir/refGene.txt ;
rm $DataBaseDir/ccdsGene.txt.bed ;
rm $DataBaseDir/vegaGene.txt.bed ;
rm $DataBaseDir/refGene.txt.bed ;
rm $DataBaseDir/ccdsGene.txt.bed-exon.bed ;
rm $DataBaseDir/vegaGene.txt.bed-exon.bed ;
rm $DataBaseDir/refGene.txt.bed-exon.bed ;
--------------------------------------------------------------------------------
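A note on the awk reduction used in setup.sh above and in update_databases.sh below: it assumes the standard UCSC genePred column layout (bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, score, name2, ...). An equivalent Perl sketch with the fields spelled out (illustrative only):

    #!/usr/bin/perl -w
    use strict;

    # Reduce a UCSC genePred table (read line by line from STDIN or a file
    # argument) to the eight columns consumed by creat_bed_UCSC_coding.pl.
    while (<>) {
        chomp;
        my ($bin, $name, $chrom, $strand, $tx_start, $tx_end,
            $cds_start, $cds_end, $exon_count, $exon_starts, $exon_ends,
            $score, $name2) = split /\t/;
        print join("\t", $chrom, $cds_start, $cds_end, $name, $name2,
                   $exon_count, $exon_starts, $exon_ends), "\n";
    }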
"; 18 | tar -xzf $BASEDIR/database.tgz; 19 | printf "DONE.\n"; 20 | 21 | mv $BASEDIR/database/* $DataBaseDir/ ; 22 | 23 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz $DataBaseDir ; 24 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ccdsGene.txt.gz $DataBaseDir ; 25 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGene.txt.gz $DataBaseDir ; 26 | rsync -a -P rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/vegaGtp.txt.gz $DataBaseDir ; 27 | cp miRBASE_r20.gff2 $DataBaseDir/miRBASE_r20.gff2 ; 28 | 29 | if [ "$(uname)" == "Darwin" ]; then 30 | ls $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ; 31 | ls $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ; 32 | elif [ "$(uname)" == "Linux" ]; then 33 | ls --color=never $DataBaseDir/*.gz | while read FILE ; do gzip -d "$FILE" ; done ; 34 | ls --color=never $DataBaseDir/*.txt | while read FILE ; do awk -F "\t" '{print $3"\t"$7"\t"$8"\t"$2"\t"$13"\t"$9"\t"$10"\t"$11}' "$FILE" > "$FILE.bed" ; done ; 35 | fi 36 | 37 | rm $DataBaseDir/vegaGtp.txt.bed ; 38 | awk -F "\t" '{print $1"\t"$2}' $DataBaseDir/vegaGtp.txt > $DataBaseDir/VEGA-hgnc_names ; 39 | rm $DataBaseDir/vegaGtp.txt ; 40 | 41 | awk -F "\t| " '{print $1"\t"$4"\t"$5"\t"$10}' $DataBaseDir/miRBASE_r20.gff2 | sed s/ID=\"// | sed s/\"\;// | grep "^#" -v > $DataBaseDir/miRBASE_r20.bed ; 42 | 43 | perl creat_bed_UCSC_coding.pl $DataBaseDir/refGene.txt.bed ; 44 | perl creat_bed_UCSC_coding.pl $DataBaseDir/ccdsGene.txt.bed ; 45 | perl creat_bed_UCSC_coding.pl $DataBaseDir/vegaGene.txt.bed ; 46 | 47 | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$5}' $DataBaseDir/ccdsGene.txt.bed-exon.bed > tmp ; mv tmp $DataBaseDir/ccdsGene.txt.bed-exon.bed ; 48 | 49 | sed s/^chr// $DataBaseDir/miRBASE_r20.bed > tmp ; mv tmp $DataBaseDir/miRBASE_r20.bed ; 50 | 51 | perl Get_HGNC.pl > $DataBaseDir/HGNC_database.txt ; 52 | 53 | perl check_HGNC_individual_mirnaDB.pl $DataBaseDir/miRBASE_r20.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/miRBASE_r20_HGNC.bed & 54 | perl check_HGNC_individual_CCDSDB.pl $DataBaseDir/ccdsGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/ccdsGene.txt.bed-exon_HGNC.bed & 55 | perl check_HGNC_individual_VEGADB.pl $DataBaseDir/vegaGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt $DataBaseDir/VEGA-hgnc_names > $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed & 56 | perl check_HGNC_individual_RefSeqDB.pl $DataBaseDir/refGene.txt.bed-exon.bed $DataBaseDir/HGNC_database.txt > $DataBaseDir/refGene.txt.bed-exon_HGNC.bed & 57 | 58 | wait; 59 | 60 | #grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed | $BASEDIR/bin/bedtools intersect -a - -b $DataBaseDir/refGene.txt.bed-Coding_region.bed -u > refGene.txt.bed-exon_HGNC.bed_tmp ; 61 | #grep -P "\tNM_" $DataBaseDir/refGene.txt.bed-exon_HGNC.bed -v | cat - refGene.txt.bed-exon_HGNC.bed_tmp > tmp ; 62 | #mv tmp $DataBaseDir/refGene.txt.bed-exon_HGNC.bed ; 63 | 64 | #$BASEDIR/bin/bedtools intersect -a $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed -b $DataBaseDir/vegaGene.txt.bed-Coding_region.bed -u > tmp; 65 | #mv tmp $DataBaseDir/vegaGene.txt.bed-exon_HGNC.bed ; 66 | 67 | 68 | #rm refGene.txt.bed-exon_HGNC.bed_tmp ; 69 | rm $DataBaseDir/miRBASE_r20.gff2 ; 70 | rm $DataBaseDir/miRBASE_r20.bed ; 71 | rm $DataBaseDir/ccdsGene.txt ; 72 | rm $DataBaseDir/vegaGene.txt ; 73 | rm $DataBaseDir/refGene.txt ; 74 | rm $DataBaseDir/ccdsGene.txt.bed ; 
/README.md:
--------------------------------------------------------------------------------
## About the ExCID Report ##

The Exome Coverage and Identification (ExCID) Report is a software tool developed at BCM-HGSC to assess sequence depth in user-defined targeted regions. The tool was initially developed for targeted capture applications, but its functionality has grown to encompass any sequencing application, from amplicon and targeted capture sequencing to WGS. ExCID analyzes the sequence depth of any sequencing event, reports the average coverage across each target, and identifies bases below a user-defined threshold (20X coverage by default). It also annotates each target with the latest gene, transcript, and exon information from RefSeq and the Human Gene Mutation Database (HGMD). The report can optionally output data tracks of the sample targets and coverage that can be visualized in the UCSC and IGV genome browsers.

## Outputs ##
* Length, average coverage, and gene annotations for each target in BCM-HGSC VCRome (or your custom design)
* All regions of low coverage, including their length and average coverage
* A coverage track across the regions of interest, viewable in a standard genome browser
* The percentage of each gene covered by the design, for all GeneTests genes (clinically important genes) and for any user-provided gene lists or gene databases

## Installation ##

Requirements:

1. Recent versions of Java and Perl.
2. On a Mac, you might need to install Xcode: https://developer.apple.com/xcode/downloads/

1) Fill in the information in Config.txt:

    DataBaseDir=/path/to/directory/to_put_the_databases/
    AnnotationDir=/path/to/directory/to_put_the_annotations_of_bed_files/

2) Run the setup.sh script from the command line:

    $ ./setup.sh

The setup script installs bedtools version 2.17.0, which is released under the GNU General Public License version 2 (GPLv2) and maintained by the Quinlan Laboratory at the University of Virginia. It then downloads the latest RefSeq, VEGA, CCDS, and miRBase databases for BED file annotation. The generated databases cover coding regions only.

## Usage ##

1) To run against the VCRome regions:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold>

Multiple BAM files can be provided, e.g.:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file 1> -bam <BAM file 2> -bam <BAM file 3> -m <minimum threshold>

or

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bamList <BAM list file> -m <minimum threshold>

where the BAM list is a text file with one BAM file per line.
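For example, a BAM list is plain text with one path per line (the paths here are hypothetical):

    /path/to/sample1.bam
    /path/to/sample2.bam
    /path/to/sample3.bam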
If the minimum threshold is not provided by the user, 20x coverage is assumed by default.

2) To run with a user-defined BED file:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file>

Multiple BAM files can be provided, e.g.:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file 1> -bam <BAM file 2> -bam <BAM file 3> -m <minimum threshold> -i <BED file>

If the minimum threshold is not provided by the user, 20x coverage is assumed by default.

3) To generate a wig file for all target regions and a BED file of the low-coverage regions, both viewable in a standard genome browser, use the '-wig' option:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file> -wig

4) The '-d' option excludes duplicate reads when generating the coverage statistics:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam <BAM file> -m <minimum threshold> -i <BED file> -d
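These options can be combined. For instance (with hypothetical file names), to analyze two BAM files against a custom design at a 30x threshold, ignoring duplicate reads and emitting browser tracks:

    $ perl ExCID.BatchScript_v2.1-threading_Final.pl -bam sample1.bam -bam sample2.bam -m 30 -i my_design.bed -wig -d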
The BED file will be annotated with RefSeq, CCDS, VEGA, and miRBase gene annotations. These databases can be updated by running:

    $ ./update_databases.sh

The GeneTests genes were compiled and annotated in November 2013.

## File Formats ##

1) To obtain gene coverage percentages from a custom gene database, the database should have the following format:

    CHR START STOP GENE|TRANSCRIPT_exon_number

Example:

    10 100177320 100177483 HPS1|NM_000195_cds_0
    10 100177931 100178014 HPS1|NM_000195_cds_1
    10 100179801 100179915 HPS1|NM_000195_cds_2

--------------------------------------------------------------------------------