├── webblast ├── cgi-bin │ ├── templates │ │ ├── blastn_results.html │ │ ├── blastp_results.html │ │ ├── blastx_results.html │ │ ├── tblastn_results.html │ │ ├── tblastx_results.html │ │ ├── sorttable.js │ │ ├── footer.html │ │ ├── site_defaults.csv │ │ ├── blastn_defaults.csv │ │ ├── blastp_defaults.csv │ │ ├── blastx_defaults.csv │ │ ├── tblastn_defaults.csv │ │ ├── tblastx_defaults.csv │ │ ├── blastn_query.html │ │ ├── blastp_query.html │ │ ├── blastx_query.html │ │ ├── tblastn_query.html │ │ ├── tblastx_query.html │ │ ├── header.html │ │ ├── blastn_params.html │ │ ├── blastp_params.html │ │ ├── blastx_params.html │ │ ├── tblastn_params.html │ │ └── tblastx_params.html │ └── db │ │ ├── ZEV_mmusmtDNA_bphagelambda.nhr │ │ ├── ZEV_mmusmtDNA_bphagelambda.nin │ │ └── ZEV_mmusmtDNA_bphagelambda.nsq └── style │ ├── css │ ├── tabs_bg.png │ ├── menu_active.png │ ├── menu_active.xcf │ ├── tabs_norm_left.png │ ├── tabs_norm_right.png │ ├── tabs_norm_left_on.png │ ├── tabs_norm_right_on.png │ ├── tabs_custom_corners_topleft.png │ ├── tabs_custom_corners_bottomleft.png │ ├── tabs_custom_corners_topright.png │ ├── tabs_custom_corners_bottomright.png │ ├── tabs_custom_corners_rightborder.png │ ├── main_area.css │ ├── tabs_style.css │ └── side_menu.css │ └── project_logo.png ├── affytools ├── Affy2SimpleGT.jar ├── Affy2SimpleGT.class ├── manifest_source.txt ├── .classpath └── .project ├── weblabeler └── room_label │ └── footer.html ├── README.md ├── aggregate.pl ├── transpose.pl ├── RCVS_filter.pl ├── bootstrap2rank.sh ├── blastx_boundaries.pl ├── last-rmident.pl ├── paf2mat.pl ├── obo2csv.pl ├── fastqcat.pl ├── gfa_cleaner.pl ├── fastx2fastm.pl ├── fastq2fasta.pl ├── align2csv.pl ├── chunkFilter.pl ├── gtcounts2MAF.pl ├── fourievar.r ├── fastm2fastx.pl ├── fasta-nsplit.pl ├── pileupPropView.r ├── mitochondrial_descendants.pl ├── minimus2_OCA.sh ├── mergeFast5events.pl ├── sam2mapped.pl ├── sam2LongestBase.pl ├── fastx-qualHist.pl ├── labid2UUID.pl ├── posAggregate.pl ├── vcf_normalise.pl ├── gtsubset.pl ├── tped2gtcounts.pl ├── vcf2simplegt.pl ├── vcf_windowSorter.pl ├── bam2proportion.sh ├── fastx-annotate.pl ├── dotplotAnnotator.r ├── markercsv2linkage.pl ├── fastx-grep.pl ├── quantile_subset.pl ├── power_analysis.r ├── sam-senseflagger.pl ├── fastx-rc.pl ├── fastx-isofilter.pl ├── exchange_rate.pl ├── fastx-sample.pl ├── pedtodot.sh ├── mpileupDC.pl ├── fastx-cgmasker.pl ├── addKraken2Silva.pl ├── edgebeeguildfinder.py ├── shufflefastx.pl ├── fastx-sort.pl ├── GenABEL2GIANT.pl ├── maf2gfa.pl ├── nbmt-translate.pl ├── fastx-length.pl ├── expenseTemplate.tex ├── maf_bcsplit.pl ├── rsfilter.pl ├── fastx-rlength.pl ├── fastx-hplength.pl ├── fmod.py ├── svgImport.r ├── tped2trios.pl ├── vcf2fq.pl ├── fastx-fetch.pl ├── readthrough.py ├── spiralign.r ├── fastx-repeatFilter.pl └── fastx-interleave.pl /webblast/cgi-bin/templates/blastn_results.html: -------------------------------------------------------------------------------- 1 |

Results

2 | %(results) 3 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastp_results.html: -------------------------------------------------------------------------------- 1 |

Results

2 | %(results) 3 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastx_results.html: -------------------------------------------------------------------------------- 1 |

Results

2 | %(results) 3 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastn_results.html: -------------------------------------------------------------------------------- 1 |

Results

2 | %(results) 3 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastx_results.html: -------------------------------------------------------------------------------- 1 |

Results

2 | %(results) 3 | -------------------------------------------------------------------------------- /affytools/Affy2SimpleGT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/affytools/Affy2SimpleGT.jar -------------------------------------------------------------------------------- /affytools/Affy2SimpleGT.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/affytools/Affy2SimpleGT.class -------------------------------------------------------------------------------- /webblast/style/css/tabs_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_bg.png -------------------------------------------------------------------------------- /webblast/style/project_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/project_logo.png -------------------------------------------------------------------------------- /webblast/style/css/menu_active.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/menu_active.png -------------------------------------------------------------------------------- /webblast/style/css/menu_active.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/menu_active.xcf -------------------------------------------------------------------------------- /affytools/manifest_source.txt: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Class-Path: affy2simplegt.jar 3 | Main-Class: affytools.Affy2SimpleGT 4 | -------------------------------------------------------------------------------- /webblast/style/css/tabs_norm_left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_left.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_norm_right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_right.png -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/sorttable.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/templates/sorttable.js -------------------------------------------------------------------------------- /webblast/style/css/tabs_norm_left_on.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_left_on.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_norm_right_on.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_right_on.png -------------------------------------------------------------------------------- /webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nhr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nhr -------------------------------------------------------------------------------- /webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nin -------------------------------------------------------------------------------- /webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nsq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nsq -------------------------------------------------------------------------------- /webblast/style/css/tabs_custom_corners_topleft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_topleft.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_custom_corners_bottomleft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_bottomleft.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_custom_corners_topright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_topright.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_custom_corners_bottomright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_bottomright.png -------------------------------------------------------------------------------- /webblast/style/css/tabs_custom_corners_rightborder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_rightborder.png -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/footer.html: -------------------------------------------------------------------------------- 1 |

SessionID: %(sessionID)

2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/site_defaults.csv: -------------------------------------------------------------------------------- 1 | site_name,SITE_
NAME 2 | gbrowse_patterns,mmus_mtDNA:%s 3 | -------------------------------------------------------------------------------- /weblabeler/room_label/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /affytools/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastn_defaults.csv: -------------------------------------------------------------------------------- 1 | results,There are no results 2 | runBlast,'' 3 | queryDB,'' 4 | MAX_NUM_SEQ,100 5 | SHORT_QUERY_ADJUST,checked 6 | EXPECT,10 7 | WORD_SIZE,7 8 | HSP_RANGE_MAX,0 9 | MATRIX_NAME,"" 10 | MATCH_SCORES,"2,-3" 11 | GAPCOSTS,"5 2" 12 | COMPOSITION_BASED_STATISTICS,"" 13 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastp_defaults.csv: -------------------------------------------------------------------------------- 1 | results,There are no results 2 | runBlast,'' 3 | queryDB,'' 4 | MAX_NUM_SEQ,100 5 | SHORT_QUERY_ADJUST,checked 6 | EXPECT,10 7 | WORD_SIZE,7 8 | HSP_RANGE_MAX,0 9 | MATRIX_NAME,"" 10 | MATCH_SCORES,"2,-3" 11 | GAPCOSTS,"5 2" 12 | COMPOSITION_BASED_STATISTICS,"" 13 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastx_defaults.csv: -------------------------------------------------------------------------------- 1 | results,There are no results 2 | runBlast,'' 3 | queryDB,'' 4 | MAX_NUM_SEQ,100 5 | SHORT_QUERY_ADJUST,checked 6 | EXPECT,10 7 | WORD_SIZE,7 8 | HSP_RANGE_MAX,0 9 | MATRIX_NAME,"" 10 | MATCH_SCORES,"2,-3" 11 | GAPCOSTS,"5 2" 12 | COMPOSITION_BASED_STATISTICS,"" 13 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastn_defaults.csv: -------------------------------------------------------------------------------- 1 | results,There are no results 2 | runBlast,'' 3 | queryDB,'' 4 | MAX_NUM_SEQ,100 5 | SHORT_QUERY_ADJUST,checked 6 | EXPECT,10 7 | WORD_SIZE,7 8 | HSP_RANGE_MAX,0 9 | MATRIX_NAME,"" 10 | MATCH_SCORES,"2,-3" 11 | GAPCOSTS,"5 2" 12 | COMPOSITION_BASED_STATISTICS,"" 13 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastx_defaults.csv: -------------------------------------------------------------------------------- 1 | results,There are no results 2 | runBlast,'' 3 | queryDB,'' 4 | MAX_NUM_SEQ,100 5 | SHORT_QUERY_ADJUST,checked 6 | EXPECT,10 7 | WORD_SIZE,7 8 | HSP_RANGE_MAX,0 9 | MATRIX_NAME,"" 10 | MATCH_SCORES,"2,-3" 11 | GAPCOSTS,"5 2" 12 | COMPOSITION_BASED_STATISTICS,"" 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | bioinfscripts 2 | ============= 3 | 4 | Bioinformatics scripts created or modified by David Eccles (gringer) 5 | 6 | Note: these scripts are now maintained on [gitlab](https://gitlab.com/gringer/bioinfscripts). If you want to use the most recent version of these scripts (including *many* additional features and bugfixes), download from there. 7 | -------------------------------------------------------------------------------- /aggregate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $keyName = ""; 7 | my @valueAggregate = (); 8 | 9 | while(<>){ 10 | chomp; 11 | my @F = split(",", $_); 12 | if($F[0] ne $keyName){ 13 | if($keyName){ 14 | printf("${keyName},%d\n", scalar(@valueAggregate)); 15 | } 16 | $keyName = $F[0]; 17 | @valueAggregate = (); 18 | } 19 | push(@valueAggregate, $F[1]); 20 | } 21 | -------------------------------------------------------------------------------- /affytools/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | affytools 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /transpose.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use List::MoreUtils qw(pairwise); 7 | 8 | my @rows = (); 9 | my $delim = "\t"; 10 | 11 | while(<>){ 12 | chomp; 13 | if(/(\s)/){ 14 | $delim = $1; 15 | } 16 | my @F = split(/\s+/); 17 | if(!@rows){ 18 | @rows = @F; 19 | } else { 20 | @rows = pairwise {$a . $delim . $b} @rows, @F; 21 | } 22 | } 23 | 24 | foreach my $row (@rows){ 25 | print($row."\n"); 26 | } 27 | -------------------------------------------------------------------------------- /RCVS_filter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | my $snpFileName = shift(@ARGV); 6 | open(my $snpFile, "<", $snpFileName); 7 | 8 | my %inMap = (); 9 | while(<$snpFile>){ 10 | chomp; 11 | $inMap{$_} = 1; 12 | } 13 | close($snpFile); 14 | 15 | $inMap{"dbSNP RS ID"} = 1; 16 | 17 | while(<>){ 18 | if(/^#/){ 19 | print; 20 | } else { 21 | my @fields = split(/\t/); 22 | if($inMap{$fields[14]}){ 23 | print; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastn_query.html: -------------------------------------------------------------------------------- 1 |

Query Database

2 | 3 | 6 | 7 | 8 |

Target Sequence

9 | 10 |

Enter a %(inputType) sequence in FASTA format:

11 | 12 | 13 | 14 |
15 | Or upload a file: 16 |
17 | 20 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastp_query.html: -------------------------------------------------------------------------------- 1 |

Query Database

2 | 3 | 6 | 7 | 8 |

Target Sequence

9 | 10 |

Enter a %(inputType) sequence in FASTA format:

11 | 12 | 13 | 14 |
15 | Or upload a file: 16 |
17 | 20 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastx_query.html: -------------------------------------------------------------------------------- 1 |

Query Database

2 | 3 | 6 | 7 | 8 |

Target Sequence

9 | 10 |

Enter a %(inputType) sequence in FASTA format:

11 | 12 | 13 | 14 |
15 | Or upload a file: 16 |
17 | 20 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastn_query.html: -------------------------------------------------------------------------------- 1 |

Query Database

2 | 3 | 6 | 7 | 8 |

Target Sequence

9 | 10 |

Enter a %(inputType) sequence in FASTA format:

11 | 12 | 13 | 14 |
15 | Or upload a file: 16 |
17 | 20 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastx_query.html: -------------------------------------------------------------------------------- 1 |

Query Database

2 | 3 | 6 | 7 | 8 |

Target Sequence

9 | 10 |

Enter a %(inputType) sequence in FASTA format:

11 | 12 | 13 | 14 |
15 | Or upload a file: 16 |
17 | 20 | -------------------------------------------------------------------------------- /bootstrap2rank.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/sh 2 | 3 | pv bootstrap100_anzgene200_vs_1kg200.csv.gz | zcat | sort -t',' -k 2,2n -k 3,3rn | perl -F',' -lane 'if($run != $F[1]){$run = $F[1]; $rank=1; $lastVal=0; $lastRank=0} if($F[1] ne "bs.run"){$nextVal = $F[2]; $F[2] = ($F[2] == $lastVal) ? $lastRank : $rank; $lastRank = $F[2]; $lastVal = $nextVal; $rank++} print join(",",@F)' | sort -t',' -k 1,1 -k 3,3n | perl -F',' -lane 'if($marker ne $F[0]){if($marker){print $lastLine; print $_} $marker = $F[0]} {$lastLine = $_} END{ print $lastLine}' | gzip > maxminRank_bootstrap100_anzgene200_vs_1kg200.csv.gz 4 | -------------------------------------------------------------------------------- /blastx_boundaries.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my %starts = (); 7 | my %ends = (); 8 | 9 | while(<>){ 10 | chomp; 11 | my @F = split(/\s+/); 12 | my $qs=$F[6]; 13 | my $qe=$F[7]; 14 | if($qe < $qs){ 15 | ($qs, $qe) = ($qe, $qs); 16 | } 17 | my $sig = $F[0]."-".$F[1]; 18 | if(!$starts{$sig} || ($starts{$sig} > $qs)){ 19 | $starts{$sig} = $qs 20 | } 21 | if(!$ends{$sig} || ($ends{$sig} < $qe)){ 22 | $ends{$sig} = $qe 23 | } 24 | } 25 | 26 | foreach my $key (keys(%starts)){ 27 | if($key =~ /(.*)-(.*)/){ 28 | my $target = $1; 29 | my $query = $2; 30 | printf("%s %s:%d-%d\n", $query, $target, $starts{$key}, $ends{$key}); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webblast/style/css/main_area.css: -------------------------------------------------------------------------------- 1 | a { 2 | text-decoration: none; 3 | color: black; 4 | } 5 | 6 | a:hover { 7 | text-decoration: underline; 8 | } 9 | 10 | #tabcontent, #contentarea { 11 | overflow: auto; 12 | position: fixed; 13 | top: 38px; 14 | left: 173px; 15 | right: 5px; 16 | width: auto; 17 | height: auto; 18 | bottom: 5px; 19 | padding: 8px; 20 | } 21 | 22 | #tabcontent { 23 | margin-top: 15px; 24 | margin-bottom: 15px; 25 | margin-right: 15px; 26 | margin-left: 7px; 27 | background-attachment: scroll; 28 | } 29 | 30 | #contentarea { 31 | border: 1px solid black; 32 | /* border-radius: 5px; */ 33 | box-shadow: 2px -2px 4px rgba(0,0,0,0.5); 34 | background: white; 35 | } 36 | 37 | #contentarea p { text-align: justify} -------------------------------------------------------------------------------- /last-rmident.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | my @lineBuffer = (); 4 | my @ids = (); 5 | 6 | while(<>){ 7 | my $line = $_; 8 | if(/^([a-z])\s(.*?)\s/){ 9 | my ($flag, $id) = ($1, $2); 10 | if($flag eq "s"){ 11 | push(@ids, $id); 12 | } elsif($flag eq "a"){ # new alignment 13 | if((scalar(@ids) != 2) || ($ids[0] ne $ids[1])){ 14 | print(join("",@lineBuffer)); 15 | } 16 | # if(@ids){ 17 | # printf("a '%s' '%s' %d %d\n", $ids[0], $ids[1], 18 | # scalar(@lineBuffer), scalar(@ids)); 19 | # } else { 20 | # print("a\n"); 21 | # } 22 | @lineBuffer = (); 23 | @ids = (); 24 | } 25 | } 26 | push(@lineBuffer, $line); 27 | } 28 | 29 | if((scalar(@ids) != 2) || ($ids[0] ne $ids[1])){ 30 | print(join("",@lineBuffer)); 31 | } 32 | -------------------------------------------------------------------------------- /paf2mat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $read = ""; 7 | my @tigs = (); 8 | my %links = (); 9 | while(<>){ 10 | my @F = split(/\s/); 11 | if($F[0] eq $read){ 12 | push(@tigs, $F[5]); 13 | } else { 14 | foreach my $t1 (@tigs){ 15 | foreach my $t2 (@tigs){ 16 | $links{$t1}{$t2}++; 17 | $links{$t2}{$t1}++; 18 | } 19 | } 20 | $read = $F[0]; 21 | @tigs = (); 22 | } 23 | } 24 | 25 | foreach my $t1 (@tigs){ 26 | foreach my $t2 (@tigs){ 27 | $links{$t1}{$t2}++; 28 | } 29 | } 30 | 31 | print("tig1,tig2,count\n"); 32 | foreach my $t1 (sort(keys(%links))){ 33 | foreach my $t2 (sort(keys(%{$links{$t1}}))){ 34 | my $line = sprintf("%s,%s,%d\n", $t1, $t2, $links{$t1}{$t2}); 35 | $line =~ s/Consensus_//g; 36 | print $line; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /obo2csv.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $category = ""; 7 | my %data = (); 8 | 9 | print "id,is_a,name,def\n"; 10 | 11 | while(<>){ 12 | chomp; 13 | if(/^\[(.*?)\]/){ 14 | if($category eq "Term"){ 15 | printf("%s,%s,\"%s\",%s\n", 16 | $data{"id"}, 17 | $data{"is_a"} ? $data{"is_a"} : "", 18 | $data{"name"} ? $data{"name"} : "", 19 | $data{"def"} ? $data{"def"} : ""); 20 | } 21 | %data = (); 22 | $category = $1; 23 | } elsif(/^([^:]+): (.*)$/){ 24 | my $field = $1; 25 | my $value = $2; 26 | $value =~ s/\[.*?\]//g; 27 | if($field eq "is_a"){ 28 | $value =~ s/ .*$//; 29 | } 30 | if($category eq "Term"){ 31 | $data{$field} = ($data{$field}) ? ($data{$field}.";".$value) : $value; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /fastqcat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_version auto_help pass_through); # for option parsing 6 | 7 | my $inQual = 0; # false 8 | my $seqID = ""; 9 | my $qualID = ""; 10 | my $seq = ""; 11 | my $qual = ""; 12 | my $minLen = 0; 13 | 14 | GetOptions('minlength=i' => \$minLen) or 15 | die("Error in command line arguments"); 16 | 17 | while(<>){ 18 | chomp; 19 | chomp; 20 | if(!$inQual){ 21 | if(/^@(.+)$/){ 22 | my $newSeqID = $1; 23 | if($seqID && (length($seq) >= $minLen)){ 24 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 25 | } 26 | $seq = ""; 27 | $qual = ""; 28 | $seqID = $newSeqID; 29 | } elsif(/^\+(.*)$/) { 30 | $inQual = 1; # true 31 | $qualID = $1; 32 | } else { 33 | $seq .= $_; 34 | } 35 | } else { 36 | $qual .= $_; 37 | if(length($qual) >= length($seq)){ 38 | $inQual = 0; # false 39 | } 40 | } 41 | } 42 | 43 | if($seqID && (length($seq) >= $minLen)){ 44 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 45 | } 46 | -------------------------------------------------------------------------------- /gfa_cleaner.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## gfa_cleaner.pl -- removes duplicate links and links that include 4 | ## vertexes with high numbers of outgoing or incoming edges 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my %seen = (); 10 | my %counts = (); 11 | my %displayBuffer = (); 12 | 13 | my $maxCount = 4; 14 | 15 | while(<>){ 16 | if(/^[^L]/){ 17 | print; 18 | next; 19 | } 20 | my $line = $_; 21 | chomp; 22 | my @F = split(/\t/); 23 | my $matchFwd = join(";",@F[(1,2,3,4)]); 24 | my $strA = join(";",@F[(1,2)]); 25 | grep {tr/\-\+/\+\-/} @F[(4,2)]; 26 | my $matchRev = join(";",@F[(3,4,1,2)]); 27 | my $strB = join(";",@F[(3,4)]); ## after sign flip 28 | if(!$seen{$matchFwd}){ 29 | $counts{$strA}++; 30 | $counts{$strB}++; 31 | $seen{$matchFwd} = 1; 32 | $seen{$matchRev} = 1; 33 | $displayBuffer{$line}{$strA} = 1; 34 | $displayBuffer{$line}{$strB} = 1; 35 | } 36 | } 37 | 38 | foreach my $line (sort(keys(%displayBuffer))){ 39 | my @strs = keys(%{$displayBuffer{$line}}); 40 | if(grep {$counts{$_} > $maxCount} @strs){ 41 | next; 42 | } 43 | print($line); 44 | } 45 | -------------------------------------------------------------------------------- /webblast/style/css/tabs_style.css: -------------------------------------------------------------------------------- 1 | #tabbar { 2 | z-index: 1; 3 | font-size: 93%; 4 | position: fixed; 5 | top: 5px; 6 | width: auto; 7 | height: 43px; 8 | left: 175px; 9 | right: 92px; 10 | bottom: 26px; 11 | padding: 0px; 12 | margin: 0px; 13 | } 14 | 15 | #tabbar ul { 16 | margin: 0; 17 | padding:5px 5px; 18 | list-style:none; 19 | } 20 | 21 | #tabbar button { 22 | border-style: none; 23 | background-color: transparent; 24 | } 25 | 26 | #tabbar p { 27 | color: gray; 28 | font-weight: bold; 29 | margin: 2px; 30 | } 31 | 32 | #tabbar li { 33 | display: block; 34 | float:left; 35 | border-radius: 5px 5px 0 0; 36 | box-shadow: 1px -1px 1px rgba(0,0,0,0.5); 37 | border: 1px solid black; 38 | border-bottom: 1px solid white; 39 | background: white; 40 | margin: 5px 5px 0 0; 41 | padding:0px 2px 0px 0px; 42 | } 43 | 44 | #tabbar .tabon p { 45 | color: black; 46 | } 47 | 48 | #tabbar .taboff { 49 | border-bottom: 1px solid black; 50 | } 51 | 52 | #tabbar .tabdisabled { 53 | background: rgba(192,192,192,1); 54 | visibility: hidden; 55 | } 56 | -------------------------------------------------------------------------------- /fastx2fastm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $inQual = 0; # false 7 | my $seqID = ""; 8 | my $qualID = ""; 9 | my $seq = ""; 10 | my $qual = ""; 11 | while(<>){ 12 | if(!$inQual){ 13 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 14 | my $newSeqID = $2; 15 | if($seq){ 16 | $seq =~ s/\n/\n /g; 17 | $seq =~ s/ $//; 18 | printf("#seq:%s\n", $seqID); 19 | printf(" %s", $seq); 20 | if($qual){ 21 | $qual =~ s/\n/\n /g; 22 | $qual =~ s/ $//; 23 | printf("#qual:%s\n", $seqID); 24 | printf(" %s", $qual); 25 | } 26 | } 27 | $seq = ""; 28 | $qual = ""; 29 | $seqID = $newSeqID; 30 | chomp $seqID; 31 | chomp $seqID; 32 | } elsif(/^\+(.*)$/) { 33 | $inQual = 1; # true 34 | $qualID = $1; 35 | } else { 36 | $seq .= $_; 37 | } 38 | } else { 39 | $qual .= $_; 40 | if(length($qual) >= length($seq)){ 41 | $inQual = 0; # false 42 | } 43 | } 44 | } 45 | 46 | if($seqID){ 47 | printf("#seq:%s\n", $seqID); 48 | printf(" %s", $seq); 49 | if($qual){ 50 | printf("#qual:%s\n", $seqID); 51 | printf(" %s", $qual); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /fastq2fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | 7 | my $minLength = 0; 8 | my $singleLine = 0; 9 | 10 | GetOptions("minLength=i" => \$minLength, "singleLine!" => \$singleLine) or 11 | die("Error in command line arguments"); 12 | 13 | my $inQual = 0; # false 14 | my $seqID = ""; 15 | my $qualID = ""; 16 | my $seq = ""; 17 | my $qual = ""; 18 | while(<>){ 19 | chomp; 20 | chomp; 21 | if(/^\s+$/){ 22 | next; 23 | } 24 | if(!$inQual){ 25 | if(/^@(.+)$/){ 26 | $seqID = $1; 27 | $seq = ""; 28 | } elsif(/^\+(.*)$/) { 29 | $inQual = 1; # true 30 | $qualID = $1; 31 | $qual = ""; 32 | if(length($seq) > $minLength){ 33 | my $printedSeq = $seq; 34 | if(!$singleLine){ 35 | $printedSeq =~ s/(.{60})/$1\n/g; 36 | $printedSeq =~ s/\n$//g; 37 | } 38 | printf(">%s\n%s\n", $seqID, $printedSeq); 39 | } 40 | } else { 41 | $seq .= $_; 42 | } 43 | } else { 44 | $qual .= $_; 45 | if(length($qual) >= length($seq)){ 46 | $inQual = 0; # false 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /align2csv.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | print("SeqID,RefID,Pos,Ref,Alt\n"); 7 | 8 | while(<>){ 9 | if(/^#/){ 10 | next; 11 | } 12 | chomp; 13 | my ($nameR, $nameQ, $score, $seqRef, $seqDiff, $seqQry) = split(/\t/, $_); 14 | my $refPos = 0; 15 | my $ins = ""; # false 16 | my $del = ""; # false 17 | my $indelPos = -1; 18 | for(my $i = 0; $i < length($seqRef); $i++){ 19 | my $r = substr($seqRef, $i, 1); 20 | my $d = substr($seqDiff, $i, 1); 21 | my $q = substr($seqQry, $i, 1); 22 | if($r ne "-"){ 23 | $refPos++; 24 | if(($q ne "-") && ($ins || $del)){ 25 | printf("%s,%s,%05d,%s,%s\n", $nameQ, $nameR, $indelPos, $del, $ins); 26 | $ins = ""; # false 27 | $del = ""; # false 28 | $indelPos = -1; 29 | } 30 | } 31 | if($r eq "-"){ 32 | $ins .= $q; 33 | if($indelPos == -1){ 34 | $indelPos = $refPos+1; 35 | } 36 | } 37 | if($q eq "-"){ 38 | $del .= $r; 39 | if($indelPos == -1){ 40 | $indelPos = $refPos; 41 | } 42 | } 43 | if(!($ins || $del) && ($r ne $q)){ 44 | printf("%s,%s,%05d,%s,%s\n", $nameQ, $nameR, $refPos, $r, $q); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /chunkFilter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $mw = 10; # match window 7 | my $maxChunkSize = 5000; # limit chunk size 8 | 9 | my $lastContig=""; 10 | my $start=-($mw+1); 11 | my $length=-($mw+1); 12 | 13 | my @lineBuffer = (); 14 | 15 | while(<>){ 16 | chomp; 17 | my @F = split(/ /); 18 | my $newStart = $F[1]; 19 | my $newEnd = $F[1] + $F[2]; 20 | my $oldEnd = $start + $length; 21 | if(($F[0] eq $lastContig) && ($length < ($maxChunkSize-$mw)) && 22 | (($start - $mw) < $newStart && ($oldEnd + $mw) > $newStart) || 23 | (($oldEnd + $mw) > $newEnd && ($start - $mw) < $newEnd)){ 24 | $start = $newStart if ($start > $newStart); 25 | $newEnd = $oldEnd if ($oldEnd > $newEnd); 26 | $F[1] = $start; 27 | $F[2] = $newEnd - $start; 28 | } else { 29 | push(@lineBuffer,""); 30 | my $sig = join(" ", ($lastContig, $start, $length)); 31 | print(join(" $sig\n", @lineBuffer)); 32 | @lineBuffer = (); 33 | $lastContig = ""; 34 | $start = -($mw+1); 35 | $length = -($mw+1); 36 | } 37 | push(@lineBuffer, $_); 38 | ($lastContig, $start, $length) = @F; 39 | } 40 | 41 | my $sig = join(" ", ($lastContig, $start, $length)); 42 | push(@lineBuffer,""); 43 | print(join(" $sig\n", @lineBuffer)); 44 | -------------------------------------------------------------------------------- /gtcounts2MAF.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use List::Util; 7 | 8 | sub processCounts{ 9 | my ($marker, $countRef) = @_; 10 | if($marker eq ""){ 11 | return; 12 | } 13 | #print(STDERR "Processing $marker\n"); 14 | foreach my $sex (keys(%{$countRef})){ 15 | my %tCounts = %{$countRef->{$sex}}; 16 | my $total = 0; 17 | my $minA = "0"; 18 | my $minC = undef; 19 | while(my ($a, $b) = each %tCounts) { 20 | if(!defined($minC) || ($b < $minC)){ 21 | $minC = $b; 22 | $minA = $a; 23 | } 24 | $total += $b; 25 | } 26 | my $minF = ($total > 0) ? $minC / $total : 0; 27 | printf("%s,%s,%s,%0.4f\n", $marker, $sex, $minA, $minF); 28 | } 29 | } 30 | 31 | my $lastMarker = ""; 32 | my $counts = {}; 33 | 34 | while(<>){ 35 | chomp; 36 | #print $_."\n"; 37 | my ($marker, $sex, $type, $bases, $count) = split(/,/); 38 | if($marker ne $lastMarker){ 39 | processCounts($lastMarker, $counts); 40 | $counts = {"m" => {}, "f" => {}, "a" => {}}; 41 | $lastMarker = $marker; 42 | } 43 | #print(join(";",%{$counts})."\n"); 44 | if(($type ne "a") || ($bases eq "0")){ 45 | next; 46 | } 47 | $counts->{$sex}->{$bases} = $count; 48 | } 49 | processCounts($lastMarker, $counts); 50 | -------------------------------------------------------------------------------- /fourievar.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | if(length(commandArgs(TRUE)) < 1){ 4 | cat("Error: no fasta file specified\n"); 5 | cat("syntax: ./fouriervar.r \n"); 6 | quit(save="no"); 7 | } 8 | 9 | library(Biostrings, quietly=TRUE, warn.conflicts=FALSE, verbose=FALSE); 10 | 11 | fuzz <- 2; 12 | doPlot <- FALSE; 13 | 14 | #fileName <- "/bioinf/presentations/2017-Sep-03/tig00022132_rpt0.fa"; 15 | fileName <- commandArgs(TRUE)[1]; 16 | seqs <- readDNAStringSet(fileName); 17 | 18 | print(seqs); 19 | 20 | for(si in 1:length(seqs)){ 21 | mySeq <- as.vector(seqs[[si]]); 22 | slen <- length(mySeq); 23 | fLimit <- 1000; 24 | fLimit <- min(slen-1, fLimit); 25 | 26 | baseCplx <- complex(real=c(A=1, C=0, G=-1, T=0)[mySeq], 27 | imaginary=c(A=0, C=1, G=0, T=-1)[mySeq]); 28 | 29 | spectrum <- sapply(2:fLimit, function(cycle){ 30 | sum(head(baseCplx, -cycle) == tail(baseCplx, -cycle)) / (slen-cycle); 31 | }); 32 | 33 | png("out.png"); 34 | plot(spectrum); 35 | invisible(dev.off()); 36 | 37 | names(spectrum) <- 2:fLimit; 38 | spectrum <- (spectrum - mean(spectrum)) / sd(spectrum); 39 | spectrum <- spectrum[order(-spectrum)]; 40 | 41 | print(head(cbind(spectrum, qProb = -10 * log10(1-pnorm(spectrum))),20)); 42 | } 43 | -------------------------------------------------------------------------------- /fastm2fastx.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $fastqMode = 0; # false 7 | my $inQual = 0; 8 | my $inSeq = 0; 9 | 10 | my $seqID = ""; 11 | my $seq = ""; 12 | my $qual = ""; 13 | 14 | while(<>){ 15 | if(/^#seq(:(.*$))?/){ 16 | my $newSeqID = $2; 17 | $inQual = 0; # false 18 | $inSeq = 1; # true 19 | if($seqID){ 20 | if($fastqMode){ 21 | printf("@%s\n%s+\n%s", $seqID, $seq, $qual); 22 | } else { 23 | printf(">%s\n%s", $seqID, $seq); 24 | } 25 | } 26 | $seqID = $newSeqID; 27 | $seq = ""; 28 | } elsif(/^#qual(:(.*$))?/){ 29 | my $newSeqID = $2; 30 | if($newSeqID && ($newSeqID ne $seqID)){ 31 | warn(sprintf("Quality ID [%s] and sequence ID [%s] do not match.". 32 | " Setting ID to sequence ID [%s]", 33 | $newSeqID, $seqID, $seqID)); 34 | } 35 | $fastqMode = 1; 36 | $inSeq = 0; # false 37 | $inQual = 1; # true 38 | } elsif(/^#/){ # some other unknown tag 39 | $inSeq = 0; # false 40 | $inQual = 0; # false 41 | } 42 | if(/^ (.*$)/){ 43 | if($inSeq){ 44 | $seq .= $1 . "\n"; 45 | } 46 | if($inQual){ 47 | $qual .= $1 . "\n"; 48 | } 49 | } 50 | } 51 | 52 | if($seqID){ 53 | if($fastqMode){ 54 | printf("@%s\n%s+\n%s", $seqID, $seq, $qual); 55 | } else { 56 | printf(">%s\n%s", $seqID, $seq); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /fasta-nsplit.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | my $seq = ""; 6 | my $shortSeqID = ""; 7 | my $seqID = ""; 8 | my $keep = 0; 9 | my $cumLength = 0; 10 | while(<>){ 11 | chomp; 12 | if(/^>((.+?)( .*?\s*)?)$/){ 13 | my $newID = $1; 14 | my $newShortID = $2; 15 | if($seq){ 16 | my $inc = 0; 17 | while($seq =~ s/(NNNN+)(.*)//){ 18 | my $nStretch = $1; 19 | my $newSeq = $2; 20 | printf(">%s.%s\n%s\n", $seqID, $inc++, $seq) if ($seq); 21 | $cumLength += length($seq); 22 | printf(STDERR "%s\t%d\t%d\n", $shortSeqID, $cumLength, 23 | $cumLength + length($nStretch)); 24 | $cumLength += length($nStretch); 25 | $seq = $newSeq; 26 | } 27 | printf(">%s\n%s\n", $seqID, $seq) if ($seq); 28 | } 29 | $seq = ""; 30 | $shortSeqID = $newShortID; 31 | $seqID = $newID; 32 | $cumLength = 0; 33 | } else { 34 | $seq .= $_; 35 | } 36 | } 37 | if($seq){ 38 | my $inc = 0; 39 | while($seq =~ s/(NNNN+)(.*)//){ 40 | my $nStretch = $1; 41 | my $newSeq = $2; 42 | printf(">%s.%s\n%s\n", $seqID, $inc++, $seq) if ($seq); 43 | $cumLength += length($seq); 44 | printf(STDERR "%s\t%d\t%d\n", $shortSeqID, $cumLength, 45 | $cumLength + length($nStretch)); 46 | $cumLength += length($nStretch); 47 | $seq = $newSeq; 48 | } 49 | printf(">%s\n%s\n", $seqID, $seq) if ($seq); 50 | } 51 | -------------------------------------------------------------------------------- /pileupPropView.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | pileupFileName <- commandArgs(TRUE)[1]; 4 | pos.start <- as.numeric(commandArgs(TRUE)[2]); 5 | pos.end <- as.numeric(commandArgs(TRUE)[3]); 6 | 7 | data.prop.df <- subset( 8 | read.csv(pileupFileName), 9 | (Position >= pos.start) & (Position <= pos.end)); 10 | 11 | prop.plot <- function(data.df){ 12 | par(mar=c(5,5,0.5,1)); 13 | res <- barplot(t(as.matrix(data.df[,c("A","C","G","T","d","pR","i")]) * 14 | data.df$Coverage), 15 | ylim=c(0,max(data.df$Coverage*1.4)), 16 | xlim=c(0,nrow(data.df)*1.1), 17 | xaxt="n", xlab = "Mitochondrial Genome Location", 18 | ylab="Read Coverage", border=NA, space=0, 19 | col=c("darkgreen","blue","black","red", 20 | "steelblue","grey90","grey60")); 21 | legend("right", horiz=FALSE, legend=c("A","C","G","T","Del","Ref","Ins"), 22 | fill=c("darkgreen","blue","black","red", 23 | "steelblue","grey90","grey60")); 24 | tckPoss <- pretty(data.df$Position); 25 | axis(1, at=res[match(tckPoss, data.df$Position)], 26 | labels=tckPoss); 27 | } 28 | 29 | png(paste0(sub("\\..*$","",pileupFileName),"_",pos.start,"-",pos.end,".png"), 30 | width=1366, height=718, pointsize=24); 31 | prop.plot(data.prop.df); 32 | dummy <- dev.off(); 33 | -------------------------------------------------------------------------------- /mitochondrial_descendants.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my %foundingLine = (); 7 | my %found = (); 8 | my %maternalIDs = (); 9 | my %sex = (); 10 | 11 | while (<>) { 12 | my @F = split(/\s+/); 13 | $maternalIDs{$F[1]} = $F[3]; 14 | $sex{$F[1]} = $F[4]; 15 | if(($F[2] == 0) && ($F[3] == 0)){ 16 | $foundingLine{$F[1]}{$F[1]} = 1; 17 | } 18 | } 19 | 20 | print(STDERR "Loaded all IDs\n"); 21 | 22 | my $changed = 1; 23 | 24 | print(STDERR "Finding mitochondrial descendants..."); 25 | while($changed){ 26 | $changed = 0; 27 | foreach my $line (keys(%foundingLine)){ 28 | my %lineInds = %{$foundingLine{$line}}; 29 | foreach my $ind (keys(%lineInds)){ 30 | if(!$found{$ind}){ 31 | #printf(STDERR "Finding mitochondrial descendants of <%d>\n", $ind); 32 | foreach my $ind2 (keys(%maternalIDs)){ 33 | if($maternalIDs{$ind2} == $ind){ 34 | $foundingLine{$line}{$ind2} = 1; 35 | } 36 | } 37 | $found{$ind} = 1; 38 | $changed = 1; 39 | } 40 | } 41 | } 42 | } 43 | 44 | print(STDERR " stored all descendants\n"); 45 | 46 | foreach my $line (sort {$a <=> $b} (keys(%foundingLine))){ 47 | if(scalar(keys(%{$foundingLine{$line}})) > 1){ 48 | printf("Mitochondrial descendants of <%s>:\n", $line); 49 | my %lineInds = %{$foundingLine{$line}}; 50 | foreach my $ind (sort {$a <=> $b} (keys(%lineInds))){ 51 | printf(" $ind %d\n", $sex{$ind}); 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /minimus2_OCA.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# -lt 1 ] 4 | then echo "Error: no arguments have been specified" 5 | echo "usage [path2/sample2.fasta] [-nomerge]" 6 | exit 7 | fi 8 | 9 | ffn1=$1 10 | d1=$(dirname $(readlink -e ${ffn1})) 11 | f1=$(basename ${ffn1}) 12 | y1=$(basename ${f1} .gz) 13 | y1=$(basename ${y1} .gzip) 14 | y1=$(basename ${y1} .fasta) 15 | y1=$(basename ${y1} .fa) 16 | mergeName=${y1}_mm2 17 | refCount=$(zgrep -c '^>' ${d1}/${f1}) 18 | d2="" 19 | f2="" 20 | if [ $# -gt 1 ] 21 | then ffn2=$2 22 | d2=$(dirname $(readlink -e ${ffn2})) 23 | f2=$(basename ${ffn2}) 24 | y2=$(basename ${f2} .gz) 25 | y2=$(basename ${y2} .gzip) 26 | y2=$(basename ${y2} .fasta) 27 | y2=$(basename ${y2} .fa) 28 | mergeName=${y1}_${y2}_mm2 29 | if [ $3 != "-nomerge" ] 30 | then echo "-- merging ${y2} into ${y1} --" 31 | else echo "-- carrying out all-vs-all overlap on ${y1} + ${y2} --" 32 | fi 33 | else echo "-- carrying out all-vs-all overlap on ${y1} --" 34 | fi 35 | mkdir -p ${d1}/${mergeName} 36 | cd ${d1}/${mergeName} 37 | zcat -f ${d1}/${f1} > ${mergeName}.seq 38 | if [ $# -gt 1 ] 39 | then zcat -f ${d2}/${f2} >> ${mergeName}.seq 40 | fi 41 | echo "currently in $(pwd)" 42 | ~/bin/amos/toAmos -s ${mergeName}.seq -o ${mergeName}.afg 43 | if [ $# -gt 2 ] 44 | then if [ $3 = "-nomerge" ] 45 | then ~/bin/amos/minimus2 ${mergeName} 46 | else ~/bin/amos/minimus2 ${mergeName} -D REFCOUNT=${refCount} 47 | fi 48 | else ~/bin/amos/minimus2 ${mergeName} -D REFCOUNT=${refCount} 49 | fi 50 | echo "-- DONE --" 51 | -------------------------------------------------------------------------------- /mergeFast5events.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my $model = ""; 7 | my $cumLength = 0; 8 | my $cumTotalLength = 0; 9 | my $meanSig = 0; 10 | my $bpPos = 0; 11 | my $lastMove = 0; 12 | my $lastStart = 0; 13 | 14 | my %colnums = (); 15 | 16 | while(<>){ 17 | chomp; 18 | my @F = split(",", $_); 19 | if($F[5] !~ /[0-9]/){ 20 | my $colNum = 0; 21 | grep {$colnums{$_} = $colNum++} @F; 22 | print("channel,mux,read,model_state,move,bppos,mean,start,startSecs,length\n"); 23 | next; 24 | } 25 | ## convert start/length to samples, assuming 4kHz sampling 26 | @F[($colnums{start},$colnums{length})] = 27 | map({$_ * $F[$colnums{sampleRate}]} @F[($colnums{start},$colnums{length})]); 28 | if($F[$colnums{model_state}] ne $model){ 29 | if($model){ 30 | print(join(",",(@F[($colnums{runID},$colnums{channel},$colnums{mux})], 31 | $model,$lastMove,$bpPos,sprintf('%0.2f',$meanSig), 32 | sprintf('%.0f', $lastStart), 33 | sprintf('%0.4f', $lastStart/$F[$colnums{sampleRate}]), 34 | sprintf('%.0f',$cumLength) 35 | )),"\n"); 36 | } 37 | $model = $F[$colnums{model_state}]; 38 | $lastMove = $F[$colnums{move}]; 39 | $lastStart = $F[$colnums{start}]; 40 | $bpPos += $F[$colnums{move}]; 41 | $cumLength = 0; 42 | $meanSig = 0; 43 | } 44 | $meanSig = ($meanSig * $cumLength + $F[$colnums{mean}] * $F[$colnums{length}]) / ($cumLength + $F[$colnums{length}]); 45 | $cumLength = $cumLength + $F[$colnums{length}]; 46 | $cumLength = int($cumLength); 47 | $cumTotalLength += $cumLength; 48 | } 49 | -------------------------------------------------------------------------------- /sam2mapped.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## sam2mapped -- created mapped sequences (as fasta files) for aligned regions 4 | 5 | my $pos = -1; 6 | my $seqName = ""; 7 | my $bestFlags = ""; 8 | my $bestID = ""; 9 | my $bestSeq = ""; 10 | my $bestQual = ""; 11 | my $bestLine = ""; 12 | my $seenCount = 0; 13 | 14 | my $addSeq = 40; # amount of sequence to add to the start and end 15 | 16 | my $output = "fa"; # can be "fa" 17 | 18 | sub printFQ { 19 | my ($id, $seq, $qual) = @_; 20 | if($id){ 21 | printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual); 22 | } 23 | } 24 | 25 | sub printFA { 26 | my ($id, $seq, $qual) = @_; 27 | if($id){ 28 | printf(">%s\n%s\n", $id, $seq); 29 | } 30 | } 31 | 32 | while(<>){ 33 | if(/^@/){ 34 | next; 35 | } 36 | my $line = $_; 37 | chomp; 38 | my @F = split(/\t/); 39 | my $refPos = $F[3]; 40 | my $cigar = $F[5]; 41 | $cigar =~ s/[0-9]S$//; 42 | my $seq = $F[9]; 43 | my $qual = $F[10]; 44 | my $startTrim = 0; 45 | my $matchLen = 0; 46 | while($cigar =~ s/^([0-9]+)([MIDNSHP=X])//){ 47 | my $subLen = $1; 48 | my $op = $2; 49 | if($op eq "S"){ 50 | if($subLen > $addSeq){ 51 | $seq = substr($seq, $subLen - $addSeq); 52 | $qual = substr($qual, $subLen - $addSeq); 53 | } 54 | } 55 | if($op =~ /[M=XI]/){ 56 | $matchLen += $subLen; 57 | } 58 | } 59 | $seq = substr($seq, 0, $matchLen + $addSeq); 60 | $qual = substr($qual, 0, $matchLen + $addSeq); 61 | if($seq){ 62 | printFA($F[0], $seq, $qual); 63 | } 64 | } 65 | if($output eq "fastq"){ 66 | printSeq($bestID, $bestSeq, $bestQual); 67 | } elsif($output eq "sam"){ 68 | print($bestLine); 69 | } 70 | -------------------------------------------------------------------------------- /sam2LongestBase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## sam2LongestBase -- extracts a random sequence from the longest sequences at each base 4 | 5 | my $pos = -1; 6 | my $seqName = ""; 7 | my $bestFlags = ""; 8 | my $bestID = ""; 9 | my $bestSeq = ""; 10 | my $bestQual = ""; 11 | my $bestLine = ""; 12 | my $seenCount = 0; 13 | 14 | my $output = "sam"; # can be "sam" or "fastq" 15 | 16 | sub printSeq { 17 | my ($id, $seq, $qual) = @_; 18 | if($id){ 19 | printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual); 20 | } 21 | } 22 | 23 | while(<>){ 24 | if(/^@/){ 25 | print; 26 | next; 27 | } 28 | my $line = $_; 29 | chomp; 30 | my @F = split(/\t/); 31 | if(($F[2] ne $seqName) || ($F[3] != $pos) || (length($bestSeq) <= length($F[9]))){ 32 | if(length($bestSeq) == length($F[9])){ 33 | ## reservoir sampling with a reservoir size of 1 34 | ## See https://en.wikipedia.org/wiki/Reservoir_sampling 35 | ## * with probability 1/i, keep the new item instead of the current item 36 | $seenCount++; 37 | if(!rand($seenCount)){ 38 | ## i.e. if rand($seenCount) == 0, then continue with replacement 39 | next; 40 | } 41 | } else { 42 | $seenCount = 1; 43 | } 44 | if(($F[2] ne $seqName) || ($F[3] != $pos)){ 45 | if($output eq "fastq"){ 46 | printSeq($bestID, $bestSeq, $bestQual); 47 | } elsif($output eq "sam"){ 48 | print($bestLine); 49 | } 50 | } 51 | $seqName = $F[2]; 52 | $pos = $F[3]; 53 | $bestLine = $line; 54 | $bestID = $F[0]; 55 | $bestFlags = $F[1]; 56 | $bestSeq = $F[9]; 57 | $bestQual = $F[10]; 58 | } 59 | } 60 | if($output eq "fastq"){ 61 | printSeq($bestID, $bestSeq, $bestQual); 62 | } elsif($output eq "sam"){ 63 | print($bestLine); 64 | } 65 | -------------------------------------------------------------------------------- /fastx-qualHist.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | 8 | my $idFileName = ""; 9 | my $quiet = 0; 10 | my $base = 33; 11 | 12 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet, 13 | "base" => \$base ) or 14 | die("Error in command line arguments"); 15 | 16 | # unknown commands are treated as identifiers 17 | my @files = (); 18 | while(@ARGV){ 19 | my $arg = shift(@ARGV); 20 | if(-f $arg){ 21 | push(@files, $arg); 22 | } 23 | } 24 | @ARGV = @files; 25 | 26 | my %qualCounts = (); 27 | 28 | my $baseCount = 0; 29 | 30 | my $inQual = 0; # false 31 | my $seqID = ""; 32 | my $qualID = ""; 33 | my $seq = ""; 34 | my $qual = ""; 35 | while(<>){ 36 | chomp; 37 | chomp; 38 | if(!$inQual){ 39 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 40 | my $newSeqID = $2; 41 | my $newShortID = $3; 42 | $seq = ""; 43 | $qual = ""; 44 | $seqID = $newSeqID; 45 | } elsif(/^\+(.*)$/) { 46 | $inQual = 1; # true 47 | $qualID = $1; 48 | } else { 49 | $seq .= $_; 50 | } 51 | } else { 52 | grep {$qualCounts{$_}++} split(//,$_); 53 | $baseCount += length($_); 54 | $qual .= $_; 55 | if(length($qual) >= length($seq)){ 56 | $inQual = 0; # false 57 | } 58 | } 59 | } 60 | 61 | my $cumCount = 0; 62 | foreach my $qualChar (sort(keys(%qualCounts))){ 63 | $cumCount += $qualCounts{$qualChar}; 64 | printf("%s [%2d]: %6d (%6.2f%% / %6.2f%% )\n", 65 | $qualChar, ord($qualChar) - $base, 66 | $qualCounts{$qualChar}, $qualCounts{$qualChar} * 100 / $baseCount, 67 | $cumCount * 100 / $baseCount); 68 | } 69 | 70 | printf("Total sequence length: %d\n", $baseCount); 71 | -------------------------------------------------------------------------------- /labid2UUID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use Text::CSV; 7 | 8 | open(my $idLookupFile, "<", "/home/gringer/bioinf/GU-2012-Apr-01-RLMB/Miles/individuals/NI_UUID_Ped_2012-Oct-23.csv") or die("Cannot open lookup file"); 9 | 10 | my $csv = Text::CSV->new ({ binary => 1, eol => $/ }); 11 | 12 | my %col = (); 13 | my $headerRow = $csv->getline($idLookupFile); 14 | my $fieldCount = 0; 15 | foreach my $field (@$headerRow){ 16 | $col{$field} = $fieldCount++; 17 | } 18 | 19 | my %dataReplacement = (); 20 | my $fileType = "tfam"; 21 | 22 | while (my $row = $csv->getline($idLookupFile)){ 23 | my @fields = @$row; 24 | if($fields[$col{"LAB_ID"}] ne "NA"){ 25 | my $labid = $fields[$col{"LAB_ID"}]; 26 | my $uuid = $fields[$col{"UUID"}]; 27 | my $gender = $fields[$col{"Gender"}]; 28 | my $genderVal = ($gender eq "Male")?1:($gender eq "Female")?2:0; 29 | if(($fileType eq "tfam") || ($fileType eq "ped")){ 30 | $dataReplacement{$labid} = 31 | join(" ",1,$uuid, 32 | ($fields[$col{"patID"}] eq "NA")?0:$fields[$col{"patID"}], 33 | ($fields[$col{"matID"}] eq "NA")?0:$fields[$col{"matID"}], 34 | $genderVal); 35 | } else { 36 | $dataReplacement{$labid} = $uuid; 37 | } 38 | } 39 | } 40 | close($idLookupFile); 41 | 42 | while(<>){ 43 | if(($fileType eq "tfam") || ($fileType eq "ped")){ 44 | my $output = $_; 45 | my @fields = split(/\s+/,$output,6); 46 | if(exists($dataReplacement{$fields[1]})){ 47 | # try the 'individual' column first 48 | $output = $dataReplacement{$fields[1]}." ".$fields[5]; 49 | } elsif(exists($dataReplacement{$fields[0]})){ 50 | # try the 'family' column 51 | $output = $dataReplacement{$fields[0]}." ".$fields[5]; 52 | } 53 | print($output); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /posAggregate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## Aggregates adjacent LAST matches into a single result/match line 4 | 5 | use warnings; 6 | use strict; 7 | 8 | my ($oldquery, $oldtarget, $olddir, $oldqs, $oldqe, $oldqml, 9 | $oldql, $oldqpct, $oldts, $oldte, $oldtml, $oldtl, $oldtpct) = 10 | ("") x 13; 11 | 12 | while(<>){ 13 | chomp; 14 | my ($query, $target, $dir, $qs, $qe, $qml, 15 | $ql, $qpct, $ts, $te, $tml, $tl, $tpct) = 16 | split(/,/); 17 | if(/^query/){ 18 | print($_."\n"); 19 | next; 20 | } 21 | if(($query eq $oldquery) && ($target eq $oldtarget) && 22 | ($olddir eq $dir) && 23 | ((($dir eq "+") && ($oldts <= $ts)) || 24 | (($dir eq "-") && ($oldte >= $te)))){ 25 | $qs = $oldqs; 26 | $qml += $oldqml; 27 | $qpct = (($oldqpct * $oldqml) + ($qpct * $qml)) / ($oldqml + $qml); 28 | $ts = $oldts if ($dir eq "+"); 29 | $te = $oldte if ($dir eq "-"); 30 | $tml += $oldtml; 31 | $tpct = (($oldtpct * $oldtml) + ($tpct * $tml)) / ($oldtml + $tml); 32 | } elsif($oldquery ne "") { 33 | $qpct = sprintf("%0.2f", $qpct); 34 | $tpct = sprintf("%0.2f", $tpct); 35 | print(join(",",($oldquery, $oldtarget, $olddir, 36 | $oldqs, $oldqe, $oldqml, $oldql, $oldqpct, 37 | $oldts, $oldte, $oldtml, $oldtl, $oldtpct))."\n"); 38 | } 39 | ($oldquery, $oldtarget, $olddir, $oldqs, $oldqe, $oldqml, 40 | $oldql, $oldqpct, $oldts, $oldte, $oldtml, $oldtl, $oldtpct) = 41 | ($query, $target, $dir, $qs, $qe, $qml, 42 | $ql, $qpct, $ts, $te, $tml, $tl, $tpct); 43 | } 44 | 45 | $oldqpct = sprintf("%0.2f", $oldqpct); 46 | $oldtpct = sprintf("%0.2f", $oldtpct); 47 | print(join(",",($oldquery, $oldtarget, $olddir, 48 | $oldqs, $oldqe, $oldqml, $oldql, $oldqpct, 49 | $oldts, $oldte, $oldtml, $oldtl, $oldtpct))."\n"); 50 | -------------------------------------------------------------------------------- /vcf_normalise.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## vcf_normalise.pl -- removes VCF information from all but the 4 | ## highest coverage position, considering SNPs/INDELs separately 5 | 6 | ## RSRS 12 . TATCA TCATCA,TCAATCA 0 . INDEL;IDV=13;IMF=0.0634146;DP=92629;... 7 | ## RSRS 12 . T C,A,G 0 . DP=92629;... 8 | ## RSRS 12 . T C,G,A 0 . DP=7490;... 9 | ## RSRS 12 . T TT 0 . INDEL;IDV=1;IMF=0.2;DP=7490;... 10 | ## => 11 | ## RSRS 12 . TATCA TCATCA,TCAATCA 0 . INDEL;IDV=13;IMF=0.0634146;DP=92629;... 12 | ## RSRS 12 . T C,A,G 0 . DP=92629;... 13 | 14 | use warnings; 15 | use strict; 16 | 17 | my $indelCache = ""; 18 | my $snpCache = ""; 19 | my $bestSnpCov = 0; 20 | my $bestIndelCov = 0; 21 | my $oldChr = ""; 22 | my $oldLoc = -1; 23 | my $chr = ""; 24 | my $loc = 0; 25 | my $indel = ""; 26 | my $cov = 0; 27 | 28 | while(<>){ 29 | if(/^#/){ 30 | print; 31 | next; 32 | } 33 | my $line = $_; 34 | if($line =~ /^(.*?)\t([0-9]+)\t.*?(INDEL.*?;)?DP=([0-9]+)/){ 35 | ($chr, $loc, $indel, $cov) = ($1, $2, $3, $4); 36 | } 37 | if(($chr ne $oldChr) || ($loc != $oldLoc)){ 38 | print($indelCache); 39 | print($snpCache); 40 | if($indel){ 41 | $indelCache = $line; 42 | $bestIndelCov = $cov; 43 | $snpCache = ""; 44 | $bestSnpCov = 0; 45 | } else { 46 | $indelCache = ""; 47 | $bestIndelCov = 0; 48 | $snpCache = $line; 49 | $bestSnpCov = $cov; 50 | } 51 | $oldChr = $chr; 52 | $oldLoc = $loc; 53 | } elsif($indel){ 54 | if($cov > $bestIndelCov){ 55 | $indelCache = $line; 56 | $bestIndelCov = $cov; 57 | } 58 | } else{ 59 | if($cov > $bestSnpCov){ 60 | $snpCache = $line; 61 | $bestSnpCov = $cov; 62 | } 63 | } 64 | } 65 | 66 | print($indelCache); 67 | print($snpCache); 68 | -------------------------------------------------------------------------------- /gtsubset.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # gtsubset.pl -- subsets a simplegt file using an individual list 4 | 5 | use warnings; 6 | use strict; 7 | use FileHandle; 8 | use Getopt::Long qw(:config auto_version auto_help pass_through); 9 | 10 | my $idFileName = ""; 11 | my %ids = (); 12 | 13 | GetOptions( 14 | 'ids=s' => \$idFileName, 15 | ); 16 | 17 | my @fileArgs = (); 18 | 19 | foreach my $arg (@ARGV){ 20 | if(-f $arg){ 21 | $idFileName = $arg; 22 | } else { 23 | $ids{$arg} = 1; 24 | } 25 | } 26 | 27 | @ARGV = @fileArgs; 28 | 29 | if($idFileName){ 30 | print(STDERR "Reading in id file... "); 31 | open(my $idFile, "< $idFileName") 32 | or die("cannot open $idFileName for reading"); 33 | while(<$idFile>){ 34 | chomp; 35 | s/(\s|,).*$//; 36 | $ids{$_} = 1; 37 | } 38 | close($idFile); 39 | print(STDERR "done!\n"); 40 | } 41 | 42 | if(!%ids){ 43 | print(STDERR "Error: no IDs specified. Cannot continue.\n"); 44 | exit(1); 45 | } 46 | 47 | my %filteredIDs = (); 48 | my @idOrder = (); 49 | my @colOrder = (); 50 | 51 | while(<>){ 52 | chomp; 53 | if(/^#/){ 54 | s///g; 57 | s/^\s+//; 58 | s/\s+$//; 59 | my @origIDs = split(/\s+/); 60 | for(my $i = 0; $i <= $#origIDs; $i++){ 61 | if($ids{$origIDs[$i]}){ 62 | $filteredIDs{$origIDs[$i]} = $i; 63 | push(@idOrder, $origIDs[$i]); 64 | push(@colOrder, $i); 65 | } 66 | } 67 | if(!(@idOrder)){ 68 | print(STDERR "Error: no specified IDs found. Cannot continue.\n"); 69 | exit(1); 70 | } 71 | if(scalar(@idOrder) < scalar(keys(%ids))){ 72 | printf(STDERR "Warning: some ID values were not found:\n"); 73 | for my $id (keys(%ids)){ 74 | if(!$filteredIDs{$id}){ 75 | print(STDERR " $id"); 76 | } 77 | } 78 | print(STDERR "\n"); 79 | } 80 | printf("## ##\n", join(" ",@idOrder)); 81 | next; 82 | } 83 | my ($marker, @gts) = split(/\s+/); 84 | printf("%-15s %s\n", $marker, join(" ", @gts[@colOrder])); 85 | } 86 | 87 | -------------------------------------------------------------------------------- /tped2gtcounts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | my ($tfamFileName) = shift(@ARGV); 7 | 8 | my @maleGTs = (); 9 | my @femaleGTs = (); 10 | 11 | #print(STDERR "Processing TFAM file..."); 12 | open(my $tfamFile, "<", $tfamFileName) or die("Cannot open TFAM file '$tfamFileName'"); 13 | my $tfamPos = 0; 14 | my $lastGTPos = -1; 15 | while(<$tfamFile>){ 16 | chomp; 17 | my ($iid, $fid, $mat, $pat, $sex, $pheno) = split(/\s+/); 18 | if($sex == 1){ 19 | push(@maleGTs, $tfamPos); 20 | } elsif($sex == 2){ 21 | push(@femaleGTs, $tfamPos); 22 | } 23 | $tfamPos++; 24 | $lastGTPos++; 25 | } 26 | close($tfamFile); 27 | #print(STDERR " done\n"); 28 | 29 | my @maleAlleles = map {($_*2, $_*2+1)} @maleGTs; 30 | my @femaleAlleles = map {($_*2, $_*2+1)} @femaleGTs; 31 | 32 | my $processed = 0; 33 | 34 | print(STDERR "."); 35 | #printf("Marker,Sex,Type,Bases,Count\n"); 36 | my %counts = (); 37 | while(<>){ 38 | chomp; 39 | my $line = $_; 40 | my ($chr, $marker, $mapPos, $genPos, @alleles) = split(/\s+/, $_); 41 | my @genotypes = map {@alleles[$_*2].@alleles[$_*2+1]} (0..$lastGTPos); 42 | %counts = (); 43 | grep { 44 | my $gt = $_; 45 | my $a1 = substr($gt,0,1); 46 | my $a2 = substr($gt,1,1); 47 | $counts{"a"}{$gt}++; 48 | $counts{"a"}{$a1}++; 49 | $counts{"a"}{$a2}++; 50 | $counts{"m"}{$gt}++; 51 | $counts{"m"}{$a1}++; 52 | $counts{"m"}{$a2}++; 53 | } @genotypes[@maleGTs]; 54 | grep { 55 | my $gt = $_; 56 | my $a1 = substr($gt,0,1); 57 | my $a2 = substr($gt,1,1); 58 | $counts{"a"}{$gt}++; 59 | $counts{"a"}{$a1}++; 60 | $counts{"a"}{$a2}++; 61 | $counts{"f"}{$gt}++; 62 | $counts{"f"}{$a1}++; 63 | $counts{"f"}{$a2}++; 64 | } @genotypes[@femaleGTs]; 65 | foreach my $sex ("a","m","f"){ 66 | foreach my $bases (keys(%{$counts{$sex}})){ 67 | printf("%s,%s,%s,%s,%d\n", $marker, $sex, 68 | (length($bases) == 1) ? "a" : "g", 69 | $bases, 70 | $counts{$sex}{$bases}); 71 | } 72 | } 73 | $processed++; 74 | if($processed > 1000){ 75 | print(STDERR "."); 76 | $processed = 0; 77 | } 78 | } 79 | #print(STDERR " done\n"); 80 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %(site_name) BLAST Search (%(program)) 6 | 7 | 8 |
9 | 49 |
50 |
51 |
    52 |
  • 53 | 56 |
  • 57 |
  • 58 | 61 |
  • 62 |
  • 63 | 66 |
  • 67 |
68 |
69 |
70 |
71 |
72 | -------------------------------------------------------------------------------- /vcf2simplegt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | ## vcf2simplegt.pl -- convert from VCF file to simplegt format 7 | 8 | use Getopt::Long qw(:config auto_help pass_through); 9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 10 | 11 | my %colNums = (); 12 | 13 | my $idFileName = ""; 14 | my @idOrder = (); 15 | my $idsSpecified = 0; # false 16 | 17 | my %excludeCols = ( 18 | "#CHROM" => 1, "ID" => 1, "POS" => 1, "REF" => 1, 19 | "ALT" => 1, "QUAL" => 1, "FILTER" => 1, 20 | "INFO" => 1, "FORMAT" => 1); 21 | 22 | GetOptions("idFile=s" => \$idFileName) or 23 | die("Error in command line arguments"); 24 | 25 | if($idFileName){ 26 | print(STDERR "Retrieving id names from $idFileName..."); 27 | $idsSpecified = 1; # true 28 | my $idFile = 0; 29 | $idFile = new IO::Uncompress::Gunzip "$idFileName" or 30 | die "Unable to open $idFileName\n"; 31 | while(<$idFile>){ 32 | if(/^\"?(.*?)\"?[\s,]+/){ 33 | my $id = $1; 34 | push(@idOrder, $id); 35 | } 36 | } 37 | close($idFile); 38 | print(STDERR scalar(@idOrder)." id names extracted\n"); 39 | } 40 | 41 | my $nextAlleleNum = 0; 42 | my %alleleNums = (); 43 | my @alleles = (); 44 | 45 | while(<>){ 46 | if(/^##/){ 47 | next; 48 | } 49 | chomp; 50 | my @F = split(/\t/, $_); 51 | if(/(^|\s)ID(\s|$)/){ 52 | my $colNum = 0; 53 | foreach my $colName (@F){ 54 | $colNums{$colName} = $colNum; 55 | if(!$idsSpecified && (!$excludeCols{$colName})){ 56 | push(@idOrder, $colName); 57 | } 58 | $colNum++; 59 | } 60 | @idOrder = grep {defined($colNums{$_})} @idOrder; 61 | printf("## ##\n", 62 | join(" ", @idOrder)); 63 | next; 64 | } elsif(!defined($colNums{"ID"})){ 65 | die("IDs have not been defined / found"); 66 | } 67 | ## by this time, @idOrder should be populated with column IDs 68 | ## identify alleles 69 | @alleles = ($F[$colNums{"REF"}], split(/,/, $F[$colNums{"ALT"}]), "."); 70 | grep {$_ =~ s/\./$#alleles/ge} @alleles; 71 | printf("%-15s %s\n", $F[$colNums{"ID"}], 72 | join(" ",map {join("",@alleles[split(/\|/, $_)])} @F[@colNums{@idOrder}])); 73 | } 74 | -------------------------------------------------------------------------------- /vcf_windowSorter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # vcf_windowSorter.pl -- sorts a VCF file by position within a 4 | # specified window size. The effect of this is to convert a file 5 | # containing overlapping gene regions (with a given window of overlap) 6 | # into a sorted file that can be indexed. 7 | 8 | use warnings; 9 | use strict; 10 | 11 | my $windowSize = 50000; 12 | my $currentChr = ""; 13 | my $minPos = -1; 14 | my $maxPos = -1; 15 | 16 | my %posCache = (); 17 | my @storedPoss = (); 18 | 19 | while(<>){ 20 | my $line = $_; 21 | if(!/^#/){ 22 | my ($chr, $pos) = (/^(.*?)\t([0-9]+)/); 23 | if($chr ne $currentChr){ 24 | # spit out cache 25 | for my $oldPos (@storedPoss){ 26 | print($posCache{$oldPos}); 27 | } 28 | # reset min/max values and clear cache 29 | $currentChr = $chr; 30 | $minPos = -1; 31 | $maxPos = -1; 32 | %posCache = (); 33 | @storedPoss = (); 34 | } 35 | #print(join(":",($chr, $pos))."\n"); 36 | if($pos > ($minPos + $windowSize)){ 37 | #printf(STDERR "Pos: $pos, MinPos: $minPos, MaxPos: $maxPos, ArraySize: %d\n", scalar(@storedPoss)); 38 | while(@storedPoss && ($storedPoss[0] <= ($pos - $windowSize))){ 39 | # spit out first bit of cache 40 | my $oldPos = shift(@storedPoss); 41 | # extract and remove position information from cache 42 | print(delete($posCache{$oldPos})); 43 | } 44 | if(!(@storedPoss)){ # no remaining values, so no maximum value 45 | $maxPos = -1; 46 | } 47 | } 48 | if(!exists($posCache{$pos})){ # avoid inserting positions multiple times 49 | # pos might need to slot into array, so check with max value 50 | if($pos < $maxPos){ 51 | # this is expensive, but hopefully doesn't happen too often 52 | @storedPoss = sort({$a <=> $b} ($pos, @storedPoss)); 53 | # no change to maxPos 54 | } else { 55 | push(@storedPoss, $pos); 56 | $maxPos = $pos; 57 | } 58 | } 59 | # store line in cache 60 | $posCache{$pos} .= $line; 61 | # regenerate minimum value; storedPoss should have at least 1 element 62 | $minPos = $storedPoss[0]; 63 | } else { 64 | print($_); 65 | } 66 | } 67 | 68 | # print out remaining cached locations 69 | for my $oldPos (@storedPoss){ 70 | print($posCache{$oldPos}); 71 | } 72 | -------------------------------------------------------------------------------- /bam2proportion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | readType=$1; # IonTorrent, Nanopore, Illumina 4 | bamFile=$2; 5 | regFile=$3; # /data/all/david/diagnostic/design/IAD31669_Submitted.bed 6 | refFile=$(samtools view -H ${bamFile} | grep '\.fa' | perl -pe 's#^.*-f #/mnt/ihbi_ngs/iontorrent-kgq4#;s/(\.fa(sta)?).*$/$1/'); 7 | 8 | echo "BAM File: '${bamFile}'" > /dev/stderr; 9 | echo "Reference File: '${refFile}'" > /dev/stderr; 10 | echo "Region File: '${regFile}'" > /dev/stderr; 11 | 12 | pileupOpts=""; 13 | 14 | if [ ${readType} = "Nanopore" ] ; then 15 | # initial assumption: similar to IonTorrent, with lower base quality and lower INDEL coefficient 16 | pileupOpts="-d 10000 -L 10000 -Q 5 -h 20 -o 10 -e 17 -m 10 -f ${refFile} ${bamFile}"; 17 | fi 18 | 19 | if [ ${readType} = "Illumina" ] ; then 20 | # Illumina should work with default options 21 | pileupOpts="-f ${refFile} ${bamFile}"; 22 | fi 23 | 24 | # https://www.edgebio.com/variant-calling-ion-torrent-data 25 | 26 | if [ ${readType} = "IonTorrent" ] ; then 27 | pileupOpts="-d 10000 -L 10000 -Q 7 -h 50 -o 10 -e 17 -m 10 -f ${refFile} ${bamFile}"; 28 | fi 29 | 30 | # for Variant annotation with annovar 31 | # http://www.cureffi.org/2012/09/07/an-alternative-exome-sequencing-pipeline-using-bowtie2-and-samtools/ 32 | 33 | if [ -z ${regFile} ]; then 34 | #echo "samtools mpileup ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'" > /dev/stderr 35 | samtools mpileup ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g' 36 | else 37 | #echo "samtools mpileup -l ${regFile} ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'" > /dev/stderr 38 | samtools mpileup -l ${regFile} ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g' 39 | fi 40 | 41 | # for x in IonXpress_0*.bam; do echo -n ${x} "..."; ~/scripts/bam2proportion.sh IonTorrent ${x} /data/all/david/diagnostic/design/IAD31669_Submitted.bed > proportion_Diagnostics_AmpliSeq_QUT_NGS11_98_099/$(basename ${x} .bam).pileupprop.csv; echo "done"; done 42 | # for x in IonXpress_0*.bam; do echo -n ${x} "..."; ~/scripts/bam2proportion.sh IonTorrent ${x} > proportion_Shani_Mito_Run18_71_082/$(basename ${x} .bam).pileupprop.csv; echo "done"; done 43 | 44 | exit 0 45 | -------------------------------------------------------------------------------- /fastx-annotate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | 7 | my $idFileName = ""; 8 | my $quiet = 0; 9 | 10 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet) or 11 | die("Error in command line arguments"); 12 | 13 | my %idsToAnnotate = (); 14 | 15 | # unknown commands are treated as identifiers 16 | my @files = (); 17 | while(@ARGV){ 18 | my $arg = shift(@ARGV); 19 | if(-f $arg){ 20 | push(@files, $arg); 21 | } else { 22 | $idsToAnnotate{$arg} = shift(@ARGV); 23 | } 24 | } 25 | @ARGV = @files; 26 | 27 | if($idFileName){ 28 | # read sequence IDs from input file 29 | printf(STDERR "Reading from file '$idFileName'\n"); 30 | open(my $idFile, "<", $idFileName); 31 | while(<$idFile>){ 32 | chomp; 33 | s/^[>@]//; 34 | s/\"//g; 35 | my @F = split(/,/,$_,2); 36 | $idsToAnnotate{$F[0]} = $F[1]; 37 | } 38 | close($idFile); 39 | } 40 | 41 | if(!$quiet){ 42 | printf(STDERR "Read %d identifiers\n", scalar(keys(%idsToAnnotate))); 43 | } 44 | 45 | my $inQual = 0; # false 46 | my $seqID = ""; 47 | my $qualID = ""; 48 | my $seq = ""; 49 | my $qual = ""; 50 | while(<>){ 51 | chomp; 52 | chomp; 53 | if(!$inQual){ 54 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 55 | my $newSeqID = $2; 56 | my $newShortID = $3; 57 | if($seqID){ 58 | if($qual){ 59 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 60 | } else { 61 | printf(">%s\n%s\n", $seqID, $seq); 62 | } 63 | } 64 | $seq = ""; 65 | $qual = ""; 66 | if(exists($idsToAnnotate{$newSeqID})){ 67 | $seqID = $newSeqID . " " . $idsToAnnotate{$newSeqID}; 68 | } elsif(exists($idsToAnnotate{$newShortID})){ 69 | $seqID = $newShortID . " " . $idsToAnnotate{$newShortID}; 70 | } else { 71 | $seqID = $newSeqID; 72 | } 73 | } elsif(/^\+(.*)$/) { 74 | $inQual = 1; # true 75 | $qualID = $1; 76 | } else { 77 | $seq .= $_; 78 | } 79 | } else { 80 | $qual .= $_; 81 | if(length($qual) >= length($seq)){ 82 | $inQual = 0; # false 83 | } 84 | } 85 | } 86 | 87 | if($seqID){ 88 | if($qual){ 89 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 90 | } else { 91 | printf(">%s\n%s\n", $seqID, $seq); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /webblast/style/css/side_menu.css: -------------------------------------------------------------------------------- 1 | #menu_area { 2 | position: fixed; 3 | padding:0; 4 | margin:0; 5 | left:0; 6 | top:0; 7 | width: 170px; 8 | height: 100%; 9 | } 10 | 11 | #logo_area { 12 | position: absolute; 13 | padding:0; 14 | left:0; 15 | width: 130px; 16 | height: 130px; 17 | top: 0px; 18 | color: black; 19 | margin: 0px; 20 | text-decoration: none; 21 | } 22 | 23 | #logo { 24 | padding: 10px; 25 | } 26 | 27 | /* Menu Text */ 28 | #menu { 29 | color: black; 30 | margin-top: 130px; 31 | margin-left: 10px; 32 | font-family: "Bitstream Vera Sans", Verdana, Helvetica, Arial, sans-serif; 33 | } 34 | 35 | /* Gives padding to sub menu items (because they have links ) */ 36 | #menu a {background-color: transparent; padding: 7px; } 37 | 38 | /* Aligns menu better */ 39 | #menu ul {top: 200px; padding: 0; margin: 0; font-size: 1.05em; } 40 | 41 | /* Sub-menu background area - making it hidden */ 42 | #menu > ul ul { display: none ; } 43 | 44 | /* Sub-Menu Area */ 45 | #menu > ul li:hover > ul { 46 | width:200px; 47 | top: 1.25em; 48 | margin-left: -115px; 49 | margin-top: 12px; 50 | display: block; 51 | position: absolute; 52 | left: 100%; 53 | } 54 | 55 | /* Main Menu selector emphasis */ 56 | #menu > ul li:hover { 57 | width:150px; 58 | text-align: left; 59 | z-index: 10000; 60 | cursor: pointer; 61 | background: url("menu_active.png"); 62 | } 63 | 64 | /* Main Menu selector emphasis */ 65 | #menu ul li ul li:hover { 66 | width:100px; 67 | text-align: left; 68 | z-index: 10000; 69 | cursor: pointer; 70 | } 71 | /* Pop-out menu colour */ 72 | #menu > ul ul li { 73 | width:200px; 74 | cursor: pointer; 75 | line-height: 1.05em; 76 | } 77 | 78 | /* Sub-menu selector area decleration */ 79 | #menu > ul ul li:hover { 80 | width:165px; 81 | } 82 | 83 | /* Sub-menu text */ 84 | #menu > ul ul li a { 85 | display: block; 86 | color: black; 87 | text-decoration: none; } 88 | 89 | /* Menu text spacing */ 90 | #menu ul li { 91 | width:100%; 92 | list-style-type: none; 93 | position: relative; 94 | line-height: 2.0em; 95 | } 96 | 97 | #menu { padding-left: 5px;} 98 | 99 | #menu p { 100 | margin: 2px; 101 | } 102 | 103 | .programButton { 104 | border-style: none; 105 | background-color: transparent; 106 | } -------------------------------------------------------------------------------- /dotplotAnnotator.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | library(png); 4 | 5 | fname <- commandArgs(TRUE); 6 | img <- readPNG(fname); 7 | 8 | valToSci <- function(val, unit = ""){ 9 | sci.prefixes <- c("", "k", "M", "G", "T", "P", "E", "Z", "Y"); 10 | units <- rep(paste(sci.prefixes,unit,sep=""), each=3); 11 | logRegion <- floor(log10(val))+1; 12 | conv.units <- units[logRegion]; 13 | conv.div <- 10^rep(0:(length(sci.prefixes)-1) * 3, each = 3)[logRegion]; 14 | conv.val <- val / conv.div; 15 | conv.val[val == 0] <- 0; 16 | conv.units[val == 0] <- unit; 17 | return(sprintf("%s %s",conv.val,conv.units)); 18 | } 19 | 20 | fib.divs <- round(10^((0:4)/5) * 2) * 0.5; ## splits log decades into 5 21 | 22 | seqName <- sub("^.*?_(.*).png","\\1",fname); 23 | seqLen <- sub("_.*$","",fname); 24 | seqStart <- 0; 25 | seqEnd <- 0; 26 | if(grepl("-", seqLen)){ 27 | seqStart <- as.numeric(sub("-.*$","",seqLen)); 28 | seqEnd <- as.numeric(sub("^.*?-","",seqLen)); 29 | seqLen <- seqEnd - seqStart + 1; 30 | } else { 31 | seqLen <- as.numeric(seqLen); 32 | seqStart <- 1; 33 | seqEnd <- seqLen; 34 | } 35 | 36 | print(c(seqName, seqLen, seqStart, seqEnd)); 37 | 38 | png(sprintf("featurePlot_%s.png", seqName), 39 | width=1800, height=600, pointsize=12); 40 | #svg(sprintf("featurePlot_%s.svg", seqName), 41 | # width=12.8, height=7.2, pointsize=12); 42 | par(mar=c(5,6,3,1.5), cex.axis=1.5, cex.lab=1.5, cex.main=2); 43 | plot(NA, main=sprintf("Feature profile (%s)", seqName), 44 | xlab="Sequence Location (kbp)", 45 | ylab="", log="y", xlim=c(seqStart,seqEnd)/1000, 46 | ylim=c(1,seqLen), yaxt="n"); 47 | rasterImage(img, xleft=seqStart/1000, xright=seqEnd/1000, 48 | ybottom=1, ytop=seqLen); 49 | drMax <- ceiling(log10(seqLen)); 50 | axis(2, at= 10^(0:drMax), las=2, lwd=3, cex.axis=1.5, labels=valToSci(10^(0:drMax))); 51 | axis(2, at= rep(1:9, each=drMax+1) * 10^(0:drMax), labels=FALSE); 52 | abline(h=10^(0:drMax), col="#80808050", lwd = 3); 53 | mtext("Feature distance (bp)", 2, line=4.5, cex=1.5); 54 | legend(x = "bottom", 55 | fill=c("#9000a0","#8b0000", 56 | "#fdc086","#ff7f00", 57 | "#00a090","#0000ff", 58 | "#a09000","#00a000"), 59 | legend=c("Repeat (L)", "Repeat (R)", 60 | "Comp (L)", "Comp (R)", 61 | "RevComp (L)", "RevComp (R)", 62 | "Reverse (L)", "Reverse (R)"), 63 | bg="#FFFFFF", horiz=FALSE, inset=0.01, ncol=4); 64 | invisible(dev.off()); 65 | -------------------------------------------------------------------------------- /markercsv2linkage.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # csv2merlin.pl -- Copy genotype data and pedigree information 4 | # to create MERLIN-compatible QTDT files 5 | 6 | # Author: David Eccles (gringer), 2013 7 | 8 | use warnings; 9 | use strict; 10 | 11 | use Text::CSV; 12 | 13 | sub usage { 14 | print("usage: ./csv2merlin.pl [options]\n"); 15 | print("\nConvert CSV format to MERLIN-QTDT\n"); 16 | print("\nOther Options:\n"); 17 | print("-help : Only display this help message\n"); 18 | print("\n"); 19 | } 20 | 21 | my @files = (); 22 | 23 | my $pedFileName = ""; 24 | my $gtFileName = ""; 25 | 26 | # extract command line arguments 27 | while(@ARGV){ 28 | my $argument = shift @ARGV; 29 | if(-f $argument){ # file existence check 30 | if(!$pedFileName){ 31 | $pedFileName = $argument; 32 | #printf(STDERR "Setting PED file name to '%s'\n", $pedFileName); 33 | } elsif (!$gtFileName) { 34 | $gtFileName = $argument; 35 | #printf(STDERR "Setting genotype file ". 36 | # "name to '%s'\n", $gtFileName); 37 | } else { 38 | print(STDERR "Error: only two files can be specified\n"); 39 | usage(); 40 | exit(1); 41 | } 42 | } else { 43 | if($argument eq "-help"){ 44 | usage(); 45 | exit(0); 46 | } 47 | } 48 | } 49 | 50 | if(!$pedFileName || !$gtFileName){ 51 | print(STDERR "Error: *two* files must be specified, cannot continue\n"); 52 | usage(); 53 | exit(1); 54 | } 55 | 56 | my $csv = Text::CSV->new ( { binary => 1 } ) # should set binary attribute. 57 | or die "Cannot use CSV: ".Text::CSV->error_diag (); 58 | 59 | my $maxGTCount = 0; 60 | my %gtLines = (); 61 | open(my $gtFile, "<", $gtFileName) or die("Cannot open genotype file"); 62 | while(my $row = $csv->getline($gtFile)){ 63 | my @F = @{$row}; 64 | my $id = shift(@F); 65 | if(scalar(@F) > $maxGTCount){ 66 | $maxGTCount = scalar(@F); 67 | } 68 | map{$_ = ($_)?$_:"0/0"} @F; 69 | $gtLines{$id} = join(" ",@F)."\n"; 70 | } 71 | close($gtFile); 72 | 73 | my $blankLine = " 0/0" x $maxGTCount . "\n"; 74 | 75 | open(my $pedFile, "<", $pedFileName) or die("Cannot open PED file"); 76 | while(<$pedFile>){ 77 | chomp; 78 | print $_." "; 79 | my @F = split(/\s+/, $_, 3); 80 | if($gtLines{$F[1]}){ 81 | print($gtLines{$F[1]}); 82 | } else { 83 | print($blankLine); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /fastx-grep.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | ## fastx-grep -- search for a pattern in the sequence or sequence name 6 | 7 | use Getopt::Long qw(:config auto_help pass_through); 8 | 9 | my $quiet = 0; 10 | my $filterSeqs = ""; 11 | my $filterIDs = ""; 12 | my $reverse = 0; 13 | 14 | GetOptions("filter=s" => \$filterSeqs, "idfilter=s" => \$filterIDs, 15 | "reverse|v!" => \$reverse, "quiet!" => \$quiet) or 16 | die("Error in command line arguments"); 17 | 18 | # unknown commands are treated as identifiers 19 | my @files = (); 20 | while(@ARGV){ 21 | my $arg = shift(@ARGV); 22 | if(-e $arg){ 23 | push(@files, $arg); 24 | } else { 25 | $filterSeqs .= "|$arg"; 26 | } 27 | } 28 | @ARGV = @files; 29 | 30 | if($filterSeqs){ 31 | $filterSeqs =~ s/^\|//; 32 | $filterSeqs = "($filterSeqs)"; 33 | if($reverse){ 34 | printf(STDERR "Filter sequence (excluded): $filterSeqs\n"); 35 | } else { 36 | printf(STDERR "Filter sequence: $filterSeqs\n"); 37 | } 38 | } 39 | 40 | if($filterIDs){ 41 | $filterIDs =~ s/^\|//; 42 | $filterIDs = "($filterIDs)"; 43 | if($reverse){ 44 | printf(STDERR "Filter ID (excluded): $filterIDs\n"); 45 | } else { 46 | printf(STDERR "Filter ID: $filterIDs\n"); 47 | } 48 | } 49 | 50 | 51 | my $inQual = 0; # false 52 | my $seqID = ""; 53 | my $qualID = ""; 54 | my $seq = ""; 55 | my $qual = ""; 56 | while(<>){ 57 | chomp; 58 | chomp; 59 | if(!$inQual){ 60 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 61 | my $newSeqID = $2; 62 | my $newShortID = $3; 63 | if($seqID && (!$filterSeqs || ($reverse xor ($seq =~ /$filterSeqs/))) 64 | && (!$filterIDs || ($reverse xor ($seqID =~ /$filterIDs/)))){ 65 | if($qual){ 66 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 67 | } else { 68 | printf(">%s\n%s\n", $seqID, $seq); 69 | } 70 | } 71 | $seq = ""; 72 | $qual = ""; 73 | $seqID = $newSeqID; 74 | } elsif(/^\+(.*)$/) { 75 | $inQual = 1; # true 76 | $qualID = $1; 77 | } else { 78 | $seq .= $_; 79 | } 80 | } else { 81 | $qual .= $_; 82 | if(length($qual) >= length($seq)){ 83 | $inQual = 0; # false 84 | } 85 | } 86 | } 87 | 88 | if($seqID && (!$filterSeqs || ($reverse xor ($seq =~ /$filterSeqs/))) 89 | && (!$filterIDs || ($reverse xor ($seqID =~ /$filterIDs/)))){ 90 | if($qual){ 91 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 92 | } else { 93 | printf(">%s\n%s\n", $seqID, $seq); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /quantile_subset.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## quantile_subset.pl -- filters out items from a sorted list (with 4 | ## repeated values in the first field) that are the closest to the 5 | ## desired quantiles 6 | 7 | ## For example, if there are 100 values, then a request for the 5th 8 | ## and 95th quantiles will produce the 5th and 95th value respectively. 9 | 10 | ## usage: cat | quantile_subset.pl -quantiles 0.05,0.95 11 | 12 | use warnings; 13 | use strict; 14 | 15 | use Getopt::Long qw(:config auto_help pass_through); 16 | 17 | my @values = (); 18 | my $marker = ""; 19 | 20 | my @quantiles = (0.05,0.95); 21 | my $quantileStr = ""; 22 | 23 | GetOptions("quantiles=s" => \$quantileStr) or 24 | die("Error in command line arguments"); 25 | 26 | if($quantileStr){ 27 | @quantiles = split(/,/, $quantileStr); 28 | } 29 | 30 | while(<>){ 31 | chomp; 32 | my @F = split(/,/, $_); 33 | my $lastLine = $_; 34 | if($marker ne $F[0]){ 35 | if(@values){ 36 | foreach my $quantile (@quantiles){ 37 | if($quantile == 1){ 38 | printf("%s\n", $values[$#values]); 39 | } else { 40 | my $rankPos = $quantile * $#values; 41 | my $rankInt = int($rankPos); 42 | my $rankFrac = $rankPos - $rankInt; 43 | ## standard quantile calculation doesn't make sense for 44 | ## fractional locations when actual position is in between 45 | ## two arbitrary text fields, so choose 0.5 as a threshold 46 | ## [e.g. SNP at chromosome 4, location 30 Mb vs 47 | ## SNP at chromosome 7, location 10 Mb] 48 | if($rankFrac < 0.5){ 49 | printf("%s\n", $values[$rankInt]); 50 | } else { 51 | printf("%s\n", $values[$rankInt+1]); 52 | } 53 | } 54 | } 55 | @values = (); 56 | } 57 | } 58 | $marker = $F[0]; 59 | if(!/[0-9]$/){ 60 | ## Write header line(s) [end with non-numeric character] out directly 61 | printf("%s\n", $lastLine); 62 | } else { 63 | push(@values, $lastLine); 64 | } 65 | } 66 | 67 | if(@values){ 68 | foreach my $quantile (@quantiles){ 69 | if($quantile == 1){ 70 | printf("%s\n", $values[$#values]); 71 | } else { 72 | my $rankPos = $quantile * $#values; 73 | my $rankInt = int($rankPos); 74 | my $rankFrac = $rankPos - $rankInt; 75 | if($rankFrac < 0.5){ 76 | printf("%s\n", $values[$rankInt]); 77 | } else { 78 | printf("%s\n", $values[$rankInt+1]); 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /power_analysis.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscripts 2 | data.df <- 3 | read.csv("Power_Data_30_min_kWh.csv", stringsAsFactors=FALSE); 4 | 5 | colnames(data.df) <- c("Date","Time","Usage"); 6 | 7 | data.df$Date <- as.POSIXct(data.df$Date, format="%d/%m/%Y"); 8 | data.df$Time <- as.POSIXct(data.df$Time, format="%I:%M:%S %p"); 9 | 10 | ## Graph of power usage by half-hour (aggregated over all days), with median 11 | png("PowerUsageDayAggregate.png", width=1280, height=640, pointsize=24); 12 | par(xaxs="i", mar=c(4,4,0.5,0.5), cex.axis=0.8); 13 | smoothScatter(y=log10(data.df$Usage), 14 | x= as.numeric(data.df$Time), 15 | pch=21, bg="#00000010", 16 | col="#00000010", xaxt="n", yaxt="n", ylab = "Usage (kWh)", 17 | xlab = "Time", nbin=c(48*4,128), bandwidth=c(1200,.04), nrpoints=0); 18 | axis(1, at=sort(unique(data.df$Time)), cex.axis=0.71, las=2, 19 | labels=strftime(sort(unique(data.df$Time)), format="%H:%M %p")); 20 | axis(2, at=log10(1:9 * rep(10^(-1:1),each=9)), 21 | labels=(1:9 * rep(10^(-1:1),each=9)), las=1); 22 | day.aggregate <- tapply(data.df$Usage, data.df$Time, median); 23 | points(spline(x=as.POSIXct(names(day.aggregate)),y=log10(day.aggregate)), 24 | type = "l"); 25 | points(x=as.POSIXct(names(day.aggregate)),y=log10(day.aggregate), lwd=2); 26 | invisible(dev.off()); 27 | 28 | ## Graph of daily power usage, with median daily usage per month 29 | png("PowerUsageByMonth.png", width=1280, height=640, pointsize=24); 30 | par(mar=c(4,4,0.5,0.5), cex.axis=0.8); 31 | plot(x=as.POSIXct(names(day.sum)), y=day.sum, xaxt="n", xlab="", 32 | ylab = "Daily Consumption (kWh)", col="#00000020", pch=21, 33 | bg="#00000020", cex=0.5); 34 | axis(1, at=as.POSIXct(sprintf("201%d-%02d-01",rep(5:7,each=11), 2:12)), 35 | label=strftime(sprintf("201%d-%02d-01",rep(5:7,each=11), 2:12), 36 | format="%b"), las=2); 37 | axis(1, at=as.POSIXct(sprintf("201%d-%02d-01",rep(5:7,each=1), 1)), 38 | label=strftime(sprintf("201%d-%02d-01",rep(5:7,each=1), 1), 39 | format="%Y-%b"), las=2); 40 | day.sum <- tapply(data.df$Usage, data.df$Date, sum); 41 | month.med <- tapply(day.sum, strftime(names(day.sum), format="%Y-%b"), median); 42 | names(month.med) <- sub("$","-15",names(month.med)); 43 | month.med <- month.med[order(as.POSIXct(names(month.med), format="%Y-%b-%d"))]; 44 | points(x=as.POSIXct(names(month.med), format="%Y-%b-%d"), y=month.med, 45 | pch=21, bg="black"); 46 | points(spline(x=as.POSIXct(names(month.med), format="%Y-%b-%d"), y=month.med, 47 | n=length(month.med) * 10), 48 | type="l"); 49 | invisible(dev.off()); 50 | -------------------------------------------------------------------------------- /sam-senseflagger.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## sam-sense_flagger.pl -- re-flag a SAM file using sense and 4 | ## anti-sense mapping information to create a strand-specific 5 | ## output. Query-reverse sequences are set to read #1, while 6 | ## Query-forward are set to read #2 7 | 8 | use Getopt::Long qw(:config auto_help pass_through); 9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 10 | 11 | my $fwdFileName = ""; 12 | my $revFileName = ""; 13 | my $prefix = "split"; 14 | my $filter = 0; ## false 15 | my $filterFwd = 0; ## false 16 | my $filterRev = 0; ## false 17 | 18 | GetOptions("fwdids=s" => \$fwdFileName, "revids=s" => \$revFileName, 19 | "filter!" => \$filter, "xfwd!" => \$filterFwd, 20 | "xrev!" => \$filterRev, ) or 21 | die("Error in command line arguments"); 22 | 23 | if(!$fwdFileName || !$revFileName){ 24 | die("Error: 'fwdFileName' and 'revFileName' have not been defined"); 25 | } 26 | 27 | my %fwdIDs; 28 | my %revIDs; 29 | 30 | open(my $fwdFile, "<", $fwdFileName) or die("Error loading foward ID file"); 31 | while(<$fwdFile>){ 32 | chomp; 33 | $fwdIDs{$_} = $_; 34 | } 35 | close($fwdFile); 36 | 37 | open(my $revFile, "<", $revFileName) or die("Error loading reverse ID file"); 38 | while(<$revFile>){ 39 | chomp; 40 | $revIDs{$_} = $_; 41 | } 42 | close($fwdFile); 43 | 44 | my $pos = -1; 45 | my $seqName = ""; 46 | my $bestFlags = ""; 47 | my $bestID = ""; 48 | my $bestSeq = ""; 49 | my $bestQual = ""; 50 | my $bestLine = ""; 51 | my $seenCount = 0; 52 | 53 | sub printSeq { 54 | my ($id, $seq, $qual) = @_; 55 | if($id){ 56 | printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual); 57 | } 58 | } 59 | 60 | while(<>){ 61 | if(/^@/){ 62 | print; 63 | next; 64 | } 65 | chomp; 66 | my @F = split(/\t/); 67 | my $flag = $F[1]; 68 | my $revMap = $flag & 0x10; # mapping orientation to *reference* 69 | my $sense = 0; 70 | my $antisense = 0; 71 | my ($fwdRead, $revRead) = (0, 0); # false 72 | $flag = $flag &= ~0xC3; # clear 0x80, 0x40, 0x01, and 0x03 flags 73 | if($revIDs{$F[0]}){ ## reverse-mapped to expected *query* 74 | $flag |= 0x43; 75 | $fwdRead = $revMap; 76 | $revRead = !$revMap; 77 | } elsif($fwdIDs{$F[0]}){ ## forward-mapped to expected *query* 78 | $flag |= 0x83; 79 | $fwdRead = !$revMap; 80 | $revRead = $revMap; 81 | } else{ 82 | $flag |= 0x200; 83 | } 84 | if(!$filter | !(($flag & 0x200) || ($flag & 0x100) || ($flag & 0x800))){ 85 | $F[1] = $flag; 86 | if(($filterFwd && $fwdRead) || ($filterRev && $revRead) || 87 | !($filterFwd | $filterRev)){ 88 | print(join("\t",@F)."\n"); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /fastx-rc.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | ## fastx-rc -- reverse complement a sequence 6 | 7 | use Getopt::Long qw(:config auto_help pass_through); 8 | 9 | my $quiet = 0; 10 | my $preserve = 0; 11 | 12 | sub rc { 13 | my ($seq) = @_; 14 | $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/; 15 | # work on masked sequences as well 16 | $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/; 17 | return(scalar(reverse($seq))); 18 | } 19 | 20 | GetOptions("quiet!" => \$quiet, "preserve!" => \$preserve) or 21 | die("Error in command line arguments"); 22 | 23 | # unknown commands are treated as identifiers 24 | my @files = (); 25 | while(@ARGV){ 26 | my $arg = shift(@ARGV); 27 | if(-f $arg){ 28 | push(@files, $arg); 29 | } 30 | } 31 | @ARGV = @files; 32 | 33 | my $inQual = 0; # false 34 | my $seqID = ""; 35 | my $qualID = ""; 36 | my $seq = ""; 37 | my $qual = ""; 38 | while(<>){ 39 | chomp; 40 | chomp; 41 | if(!$inQual){ 42 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 43 | my $newSeqID = $2; 44 | my $newShortID = $3; 45 | if($seqID){ 46 | if($preserve){ 47 | if($qual){ 48 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 49 | } else { 50 | $seq =~ s/\n//g; 51 | $seq =~ s/(.{70})/$1\n/g; 52 | $seq =~ s/\n$//; 53 | printf(">%s\n%s\n", $seqID, $seq); 54 | } 55 | } 56 | if($qual){ 57 | printf("@%s [RC]\n%s\n+\n%s\n", $seqID, rc($seq), scalar(reverse($qual))); 58 | } else { 59 | $seq =~ s/\n//g; 60 | $seq = rc($seq); 61 | $seq =~ s/(.{70})/$1\n/g; 62 | $seq =~ s/\n$//; 63 | printf(">%s [RC]\n%s\n", $seqID, $seq); 64 | } 65 | } 66 | $seq = ""; 67 | $qual = ""; 68 | $seqID = $newSeqID; 69 | } elsif(/^\+(.*)$/) { 70 | $inQual = 1; # true 71 | $qualID = $1; 72 | } else { 73 | $seq .= $_; 74 | } 75 | } else { 76 | $qual .= $_; 77 | if(length($qual) >= length($seq)){ 78 | $inQual = 0; # false 79 | } 80 | } 81 | } 82 | 83 | if($seqID){ 84 | if($preserve){ 85 | if($qual){ 86 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 87 | } else { 88 | $seq =~ s/\n//g; 89 | $seq =~ s/(.{70})/$1\n/g; 90 | $seq =~ s/\n$//; 91 | printf(">%s\n%s\n", $seqID, $seq); 92 | } 93 | } 94 | if($qual){ 95 | printf("@%s [RC]\n%s\n+\n%s\n", $seqID, rc($seq), scalar(reverse($qual))); 96 | } else { 97 | $seq =~ s/\n//g; 98 | $seq = rc($seq); 99 | $seq =~ s/(.{70})/$1\n/g; 100 | $seq =~ s/\n$//; 101 | printf(">%s [RC]\n%s\n", $seqID, $seq); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /fastx-isofilter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | ## fastx-isofilter.pl -- Extract the longest isoform (or ORF) from transcripts 6 | 7 | use Getopt::Long qw(:config auto_help pass_through); 8 | 9 | my $quiet = 0; 10 | my $trimString = "_i[0-9]+\$"; 11 | my $orfMode = 0; 12 | 13 | GetOptions("trim=s" => \$trimString, "orf!" => \$orfMode, 14 | "quiet!" => \$quiet) or 15 | die("Error in command line arguments"); 16 | 17 | if($orfMode){ 18 | $trimString = "_[0-9]+\$"; 19 | } 20 | 21 | # unknown commands are treated as identifiers 22 | my @files = (); 23 | while(@ARGV){ 24 | my $arg = shift(@ARGV); 25 | if(-f $arg){ 26 | push(@files, $arg); 27 | } else { 28 | $trimString .= "|$arg"; 29 | } 30 | } 31 | @ARGV = @files; 32 | 33 | # use stdin if no files supplied 34 | if(!@ARGV){ 35 | @ARGV = '-' unless (-t STDIN); 36 | } 37 | 38 | if($trimString){ 39 | $trimString =~ s/^\|//; 40 | $trimString = "($trimString)"; 41 | } 42 | 43 | my $inQual = 0; # false 44 | my $seqID = ""; 45 | my $fullID = ""; 46 | my $qualID = ""; 47 | my $seq = ""; 48 | my $qual = ""; 49 | my %fastXStrs = (); 50 | my %fastXLengths = (); 51 | while(<>){ 52 | chomp; 53 | chomp; 54 | if(!$inQual){ 55 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 56 | my $newFullID = $2; 57 | my $newSeqID = $3; 58 | $newSeqID =~ s/$trimString//; 59 | if($seqID && (!$fastXLengths{$seqID} || ($fastXLengths{$seqID} < length($seq)))){ 60 | $fastXLengths{$seqID} = length($seq); 61 | if(!$qual){ 62 | $seq =~ s/(.{100})/$1\n/g; 63 | $seq =~ s/\n$//; 64 | } 65 | $fastXStrs{$seqID} = ($qual) ? 66 | sprintf("@%s\n%s\n+\n%s\n", $fullID, $seq, $qual) : 67 | sprintf(">%s\n%s\n", $fullID, $seq); 68 | } 69 | $seq = ""; 70 | $qual = ""; 71 | $seqID = $newSeqID; 72 | $fullID = $newFullID; 73 | } elsif(/^\+(.*)$/) { 74 | $inQual = 1; # true 75 | $qualID = $1; 76 | } else { 77 | $seq .= $_; 78 | } 79 | } else { 80 | $qual .= $_; 81 | if(length($qual) >= length($seq)){ 82 | $inQual = 0; # false 83 | } 84 | } 85 | } 86 | 87 | if($seqID && (!$fastXLengths{$seqID} || ($fastXLengths{$seqID} < length($seq)))){ 88 | $fastXLengths{$seqID} = length($seq); 89 | if(!$qual){ 90 | $seq =~ s/(.{100})/$1\n/g; 91 | $seq =~ s/\n$//; 92 | } 93 | $fastXStrs{$seqID} = ($qual) ? 94 | sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) : 95 | sprintf(">%s\n%s\n", $seqID, $seq); 96 | } 97 | 98 | foreach my $pat (sort {$fastXLengths{$b} <=> $fastXLengths{$a}} (keys(%fastXStrs))){ 99 | print($fastXStrs{$pat}); 100 | } 101 | -------------------------------------------------------------------------------- /exchange_rate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use Pod::Usage; 7 | use Getopt::Long qw(:config auto_version auto_help pass_through); 8 | use Time::localtime; 9 | 10 | use HTML::TreeBuilder::XPath; 11 | 12 | =head1 DESCRIPTION 13 | 14 | Retrieve exchange rates for foreign currencies, either a specified 15 | date in the past (from Oanda), or for today (from TSB Bank) 16 | 17 | =head1 SYNOPSIS 18 | 19 | exchange_rate.pl [] 20 | 21 | Today's date will be used if the date is not specified 22 | 23 | =cut 24 | 25 | my %monthNums = ( Jan=>1, Feb=>2, Mar=>3, Apr=>4, 26 | May=>5, Jun=>6, Jul=>7, Aug=>8, 27 | Sep=>9, Oct=>10, Nov=>11, Dec=>12); 28 | my @numMonths = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ); 29 | 30 | my $todayStr = # today 31 | sprintf("%04d-%03s-%02d", 32 | localtime->year + 1900, $numMonths[localtime->mon], localtime->mday); 33 | 34 | my %options = (date => $todayStr); 35 | 36 | GetOptions(\%options, "date=s") or pod2usage(); 37 | 38 | if((@ARGV) && ($ARGV[0] =~ /20..-([A-Z][a-z]{2}|\d{2})-\d{2}/)){ 39 | $options{"date"} = shift(@ARGV); 40 | } 41 | if(!@ARGV){ 42 | pod2usage("Currency must be specified"); 43 | } 44 | 45 | my $currency = shift(@ARGV); 46 | 47 | $options{"date"} =~ s/(20..-)(\d{2})(-\d{2})/$1.$numMonths[$2-1].$3/e; 48 | 49 | my $todayDate = $options{"date"} eq $todayStr; 50 | 51 | ##printf("TSB Bank NZ;%s\n","0.9629"); 52 | ##exit(0); 53 | 54 | printf(STDERR "Getting exchange rate information for %s%s\n", $options{"date"}, 55 | ( $todayDate ? " [today]" : "")); 56 | 57 | if($todayDate){ 58 | printf(STDERR "[Fetching NZD/%s rate for %s from xe.com]\n", 59 | $currency, $options{"date"}); 60 | my $tree = HTML::TreeBuilder-> 61 | new_from_url('http://www.xe.com/currencyconverter/convert/?Amount=1&From=NZD&To='.$currency) 62 | or die("Cannot load URL"); 63 | for my $elt ($tree->findnodes('//span[@class="uccResultUnit"]')){ 64 | printf("xe.com;%0.4f\n", $elt->attr("data-amount")); 65 | } 66 | } else { 67 | my $date = $options{"date"}; 68 | $date =~ s/20(..)-(...)-(\d{2})/$1."\/".$monthNums{$2}."\/".$3/e; 69 | my $url = sprintf("http://www.oanda.com/currency/". 70 | "historical-rates-classic?date_fmt=jp&". 71 | "date=%s&date1=%s&exch=NZD&expr=%s&". 72 | "margin_fixed=0&format=CSV&redirected=1", 73 | $date,$date,$currency); 74 | printf(STDERR "[Fetching NZD/%s rate for %s from oanda.com]\n", 75 | $currency, $options{"date"}); 76 | printf(STDERR "[$url]\n"); 77 | # print(STDERR "[$url]\n"); 78 | my $tree = HTML::TreeBuilder->new_from_url($url); 79 | my $rate = $tree->findvalue('//pre'); 80 | $rate =~ s/^.*?,//; 81 | print("oanda.com;$rate\n"); 82 | } 83 | -------------------------------------------------------------------------------- /fastx-sample.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | use File::Temp qw(:seekable); 8 | use Encode qw(encode_utf8); 9 | 10 | my $idFileName = ""; 11 | my $maxCount = 0; 12 | my $trim = 0; 13 | 14 | GetOptions("count=i" => \$maxCount, ) or 15 | die("Error in command line arguments"); 16 | 17 | if(!$maxCount){ 18 | die("Error: Reservoir sampling needs a maximum number of reads (-c )\n"); 19 | } else { 20 | printf(STDERR "Reservoir sampling reads to output at most %d reads:", 21 | $maxCount); 22 | } 23 | 24 | sub processReads{ 25 | my ($seq, $qual, $maxCount, $readsRead, $readsProcessed, $readStore) = @_; 26 | $readsProcessed++; 27 | my $swapPos = ($readsProcessed <= $maxCount) ? ($readsProcessed-1) : int(rand($readsProcessed)); 28 | if($swapPos < $maxCount){ 29 | my $outLines = ""; 30 | if($qual){ 31 | $outLines = sprintf("@%012d\n%s\n+\n%s\n", $readsRead, $seq, $qual); 32 | } else { 33 | $outLines = sprintf(">%012d\n%s\n", $readsRead, $seq); 34 | } 35 | ${$readStore}{$swapPos} = $outLines; 36 | } 37 | return $readsProcessed; 38 | } 39 | 40 | my $inQual = 0; # false 41 | my $seqID = ""; 42 | my $qualID = ""; 43 | my $seq = ""; 44 | my $qual = ""; 45 | my $readsRead = 0; 46 | my $readsProcessed = 0; 47 | my $dotsPrinted = 0; 48 | my %readStore = (); 49 | 50 | while(<>){ 51 | chomp; 52 | chomp; 53 | if(!$inQual){ 54 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 55 | my $newSeqID = $2; 56 | my $newShortID = $3; 57 | if($seqID){ 58 | if($maxCount && ($readsRead % 10000 == 0)){ 59 | if($dotsPrinted % 50 == 0){ 60 | if($readsRead > 1000){ 61 | printf(STDERR " (%d reads processed)", $readsRead); 62 | } 63 | printf(STDERR "\n "); 64 | } 65 | print(STDERR "."); 66 | $dotsPrinted++; 67 | } 68 | $readsProcessed = 69 | processReads($seq, $qual, $maxCount, ++$readsRead, $readsProcessed, \%readStore); 70 | } 71 | $seq = ""; 72 | $qual = ""; 73 | $seqID = $newSeqID; 74 | } elsif(/^\+(.*)$/) { 75 | $inQual = 1; # true 76 | $qualID = $1; 77 | } else { 78 | $seq .= $_; 79 | } 80 | } else { 81 | $qual .= $_; 82 | if(length($qual) >= length($seq)){ 83 | $inQual = 0; # false 84 | } 85 | } 86 | } 87 | 88 | if($seqID){ 89 | $readsProcessed = 90 | processReads($seq, $qual, $maxCount, ++$readsRead, $readsProcessed, \%readStore); 91 | } 92 | 93 | printf(STDERR "\ndone (%d reads processed from %d total reads)\n", 94 | $readsProcessed, $readsRead); 95 | 96 | foreach my $id (keys(%readStore)){ 97 | print $readStore{$id}; 98 | } 99 | -------------------------------------------------------------------------------- /pedtodot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Read a GAS or LINKAGE format pedigree, return a digraph in the dot language 4 | # call dot to make pedigree drawing 5 | # 6 | AWK=$(which awk) 7 | DOTEXE=$(which dot) 8 | NEATOEXE=$(which neato) 9 | 10 | for fil in $* 11 | do 12 | for ped in `$AWK '!/^[!#]/ {print $1}' $fil | sort -u` 13 | do 14 | echo "Pedigree $ped" 15 | $AWK -v ped=$ped ' 16 | BEGIN { shape["f"]="box,regular=1" 17 | shape["1"]="box,regular=1" 18 | shape["m"]="circle" 19 | shape["2"]="circle" 20 | shape["u"]="diamond" 21 | shape["0"]="diamond" 22 | shade["y"]="grey" 23 | shade["2"]="grey" 24 | shade["n"]="white" 25 | shade["1"]="white" 26 | shade["x"]="white" 27 | shade["0"]="white" 28 | } 29 | !/^[!#]/ && $1==ped { 30 | sex[$2]=$5 31 | aff[$2]="x" ; if ($6 ~ /[012nyx]/) aff[$2]=$6 32 | if($3!="x" && $3!="0") { 33 | marriage[$3,$4]++ 34 | child[$3,$4,marriage[$3,$4]]=$2 35 | } 36 | } 37 | END { print "digraph Ped_" ped " {" 38 | # print "# page =\"11,8.5\" ;" 39 | print "node [shape=diamond] ;" 40 | print "ratio =\"auto\" ;" 41 | print "mincross = 2.0 ;" 42 | print "label=\"Pedigree " ped "\" ;" 43 | print "rotate=0 ;" 44 | for(s in sex) { 45 | print "\"" s "\" [shape=" shape[sex[s]] "," \ 46 | " style=filled,fillcolor=" shade[aff[s]] "] ;" 47 | } 48 | for(m in marriage) { 49 | n=split(m,par,"\034") 50 | mating_t="\"t_" par[1] "x" par[2] "\"" 51 | mating_b="\"b_" par[1] "x" par[2] "\"" 52 | print mating_t "[shape=diamond,style=filled," \ 53 | "label=\"\",height=.1,width=.1] ;" 54 | print mating_b "[shape=diamond,style=filled," \ 55 | "label=\"\",height=.1,width=.1] ;" 56 | print "\"" par[1] "\" -> " mating_t " [dir=none, weight=1, penwidth=3.0] ;" 57 | print "\"" par[2] "\" -> " mating_t " [dir=none, weight=1, penwidth=3.0] ;" 58 | print mating_t " -> " mating_b " [dir=none, weight=1, penwidth=3.0] ;" 59 | for(k=1;k<=marriage[par[1],par[2]];k++) { 60 | print mating_b " -> \"" child[par[1],par[2],k] "\"" \ 61 | " [dir=none, weight=2] ;" 62 | } 63 | } 64 | print "}" 65 | }' $fil > $ped.dot 66 | # echo "running ${DOTEXE} -Tsvg ${ped}.dot -o ${ped}_dot.svg" 67 | ${DOTEXE} -Tpdf ${ped}.dot -o ${ped}_dot.pdf 68 | # ${NEATOEXE} -Tsvg ${ped}.dot -o ${ped}_neato.svg 69 | done 70 | done 71 | -------------------------------------------------------------------------------- /mpileupDC.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | 7 | my $startName = ""; 8 | my $startPos = -1; 9 | my $lastPos = -1; 10 | my $lastDesc = ""; 11 | 12 | my $covAdjust = 5; 13 | my $printedHeader = 0; # false 14 | my $accuracyDP = 0; 15 | my $DCOnly = 0; # false 16 | my $threshold = 0; # Log2 threshold to report DC (otherwise DC is set to 0) 17 | my $header = 1; # true 18 | 19 | my $compare = ""; 20 | 21 | GetOptions("adjustment=i" => $covAdjust, "header!" => $header, 22 | "compare=s" => \$compare, "dpaccuracy=i" => \$accuracyDP, 23 | "threshold=f" => \$threshold, "onlydc!" => \$DCOnly ) or 24 | die("Error in command line arguments"); 25 | 26 | my @comp1 = (); 27 | my @comp2 = (); 28 | my @comps = (); 29 | 30 | while(<>){ 31 | chomp; 32 | if(!$_){ 33 | next; 34 | } 35 | my ($refName, $pos, $refAllele, $cov, $bases, $qual, $rest) = 36 | split(/\t/, $_, 7); 37 | my $skip = ($bases =~ tr/<>//); 38 | my @adjCovs = ($cov - $skip); 39 | while($rest){ 40 | ($cov, $bases, $qual, $rest) = split(/\t/, $rest, 4); 41 | $skip = ($bases =~ tr/<>//); 42 | push(@adjCovs, $cov - $skip); 43 | } 44 | if(!$printedHeader){ 45 | ## set up comparisons 46 | if(!$compare){ 47 | for(my $x = 0; $x < $#adjCovs; $x++){ 48 | for(my $y = $x+1; $y <= $#adjCovs; $y++){ 49 | push(@comp1, $x+1); 50 | push(@comp2, $y+1); 51 | push(@comps, sprintf("%d,%d", $x+1, $y+1)); 52 | } 53 | } 54 | } else { 55 | while($compare =~ s/([0-9]+?)[-,]([0-9]+?)[;,\s]?//){ 56 | push(@comp1, $1); 57 | push(@comp2, $2); 58 | push(@comps, "${1},${2}"); 59 | } 60 | } 61 | if($header){ 62 | if($DCOnly){ 63 | print(join("\t", "##Ref", "Start", "End", @comps)."\n"); 64 | } else { 65 | print(join("\t", "##Ref", "Start", "End", 66 | (1..($#adjCovs+1)), @comps)."\n"); 67 | } 68 | } 69 | $printedHeader = 1; # true 70 | } 71 | my @DCov = (); 72 | for(my $i = 0; $i <= $#comp1; $i++){ 73 | my $x = $comp1[$i]-1; 74 | my $y = $comp2[$i]-1; 75 | my $val = (log($adjCovs[$x]+$covAdjust) - 76 | log($adjCovs[$y]+$covAdjust)) / log(2); 77 | push(@DCov, sprintf("%0.${accuracyDP}f", 78 | abs($val) < $threshold ? 0 : $val)); 79 | } 80 | my $descLine = ($DCOnly) ? 81 | join("\t", @DCov) : 82 | join("\t", @adjCovs, @DCov); 83 | $descLine =~ s/-0/0/g; 84 | if(($refName ne $startName) || ($descLine ne $lastDesc)){ 85 | # print sequence (if any) 86 | if($startName){ 87 | print(join("\t", $startName, $startPos-1, $lastPos, $lastDesc)."\n"); 88 | } 89 | $startName = $refName; 90 | $startPos = $pos; 91 | $lastDesc = $descLine; 92 | } 93 | $lastPos = $pos; 94 | } 95 | 96 | if($startName){ 97 | print(join("\t", $startName, $startPos-1, $lastPos, $lastDesc)."\n"); 98 | } 99 | -------------------------------------------------------------------------------- /fastx-cgmasker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | ## fastx-hplength.pl -- get statistics on homopolymers in a fastq/fasta file 6 | 7 | use Getopt::Long qw(:config auto_help pass_through); 8 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 9 | 10 | my $quiet = 0; 11 | my $mode = "CG"; 12 | 13 | GetOptions("quiet!" => \$quiet, "mode=s" => \$mode) or 14 | die("Error in command line arguments"); 15 | 16 | # unknown commands are treated as identifiers 17 | my @files = (); 18 | while(@ARGV){ 19 | my $arg = shift(@ARGV); 20 | if(-e $arg){ 21 | push(@files, $arg); 22 | } 23 | } 24 | @ARGV = @files; 25 | 26 | # use stdin if no files supplied 27 | if(!@ARGV){ 28 | @ARGV = '-' unless (-t STDIN); 29 | } 30 | 31 | my %hpCounts = (); 32 | 33 | my $baseCount = 0; 34 | my $inQual = 0; # false 35 | my $seqID = ""; 36 | my $qualID = ""; 37 | my $seq = ""; 38 | my $qual = ""; 39 | 40 | foreach my $file (@ARGV) { 41 | # This little gunzip dance makes sure the script can handle both 42 | # gzip-compressed and uncompressed input, regardless of whether 43 | # or not it is piped 44 | my $z = new IO::Uncompress::Gunzip($file, "transparent", 1) 45 | or die "gunzip failed: $GunzipError\n"; 46 | while(<$z>){ 47 | s/\s+$//; # remove ending whitespace 48 | if (!$inQual) { 49 | if (/^(>|@)((.+?)( .*?\s*)?)$/) { 50 | my $newSeqID = $2; 51 | my $newShortID = $3; 52 | $baseCount += length($seq); 53 | my $cur = ""; 54 | my $cchr = ""; 55 | if($mode eq "CG"){ 56 | $seq =~ s/CG/XX/gi; 57 | $seq =~ tr/TtCcGg/AaAaAa/; 58 | $seq =~ s/XX/CG/gi; 59 | } 60 | if($seqID){ 61 | if($qual){ 62 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 63 | } else { 64 | $seq =~ s/(.{100})/$1\n/g; 65 | $seq =~ s/\n$//; 66 | printf(">%s\n%s\n", $seqID, $seq); 67 | } 68 | } 69 | $seq = ""; 70 | $qual = ""; 71 | $seqID = $newSeqID; 72 | } elsif (/^\+(.*)$/) { 73 | $inQual = 1; # true 74 | $qualID = $1; 75 | } else { 76 | $seq .= uc($_); 77 | } 78 | } else { 79 | $qual .= $_; 80 | if (length($qual) >= length($seq)) { 81 | $inQual = 0; # false 82 | } 83 | } 84 | } 85 | close($z); 86 | } 87 | 88 | $mode = uc($mode); 89 | 90 | $baseCount += length($seq); 91 | my $cur = ""; 92 | my $cchr = ""; 93 | if($mode eq "CG"){ 94 | $seq =~ s/CG/XX/gi; 95 | $seq =~ tr/TtCcGg/AaAaAa/; 96 | $seq =~ s/XX/CG/gi; 97 | } 98 | if($seqID){ 99 | if($qual){ 100 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 101 | } else { 102 | $seq =~ s/(.{100})/$1\n/g; 103 | $seq =~ s/\n$//; 104 | printf(">%s\n%s\n", $seqID, $seq); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /addKraken2Silva.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # addKraken2Silva.pl -- modifies SILVA FASTA files to add in 4 | # embl taxIDs for the Kraken database and translate U to T 5 | 6 | # Author: David Eccles (gringer), 2015 7 | 8 | use strict; 9 | use warnings; 10 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 11 | 12 | sub usage { 13 | print(STDERR "usage: ./addKraken2Silva.pl \n"); 14 | print(STDERR "\nmodifies SILVA FASTA files to add in embl taxIDs for the Kraken database\n"); 15 | print(STDERR "\nOther Options:\n"); 16 | print(STDERR "-help : Only display this help message\n"); 17 | print(STDERR "\n"); 18 | } 19 | 20 | my $mapFileName = 0; # false 21 | 22 | my @files = (); 23 | 24 | # extract command line arguments 25 | while(@ARGV){ 26 | my $argument = shift @ARGV; 27 | if(-f $argument){ # file existence check 28 | if(!$mapFileName){ 29 | $mapFileName = $argument; 30 | } else { 31 | push(@files, $argument); 32 | } 33 | } else { 34 | if($argument eq "-help"){ 35 | usage(); 36 | exit(0); 37 | } else { 38 | print(STDERR "Error: command line parameter '$argument' not understood\n"); 39 | usage(); 40 | exit(1); 41 | } 42 | } 43 | } 44 | 45 | @ARGV = @files; 46 | 47 | if(!$mapFileName){ 48 | print(STDERR "Error: No valid map file given\n"); 49 | usage(); 50 | exit(1); 51 | } 52 | 53 | my $mapFile = 0; 54 | $mapFile = new IO::Uncompress::Gunzip "$mapFileName" or 55 | die "Unable to open $mapFileName\n"; 56 | 57 | my %seqMap = (); 58 | 59 | print(STDERR "Reading from map file..."); 60 | 61 | my $mapLinesCounter = 0; 62 | 63 | while(<$mapFile>){ 64 | chomp; 65 | my $seqID = 0; 66 | my $taxID = 0; 67 | if($_ =~ /^(.*?)\s/){ 68 | $seqID = $1; 69 | } 70 | if($_ =~ /\s([0-9]+)$/){ 71 | $taxID = $1; 72 | } 73 | if($seqID && $taxID){ 74 | $seqMap{$seqID} = $taxID; 75 | } 76 | if($mapLinesCounter++ > 100000){ 77 | $mapLinesCounter = 0; 78 | print(STDERR "."); 79 | } 80 | } 81 | 82 | printf(STDERR " found %d mappings from sequence IDs to taxa\n", scalar(keys(%seqMap))); 83 | 84 | my $seqID = ""; 85 | my $seq = ""; 86 | while(<>){ 87 | chomp; 88 | if(/^>(([^\.]+).*?)(( |$).*)$/){ 89 | $seqID = $1; 90 | my $seqBase = $2; 91 | my $rest = $3; 92 | ##print(STDERR "base: $seqBase, id:$seqID, rest:$rest\n"); 93 | my $taxID = $seqMap{$seqBase}; 94 | if($taxID){ 95 | $seqID.= "|kraken:taxid|".$taxID; 96 | printf(">%s%s\n", $seqID, $rest); 97 | } else { 98 | print(STDERR "Warning: Sequence '$seqID' skipped: no reference found in map file\n"); 99 | $seqID = ""; 100 | } 101 | $seq = ""; 102 | } elsif($seqID) { 103 | $_ =~ tr/U/T/; 104 | print($_."\n"); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /edgebeeguildfinder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Edgebee guild finder -- given a list of user names, 4 | # work out what edgebee guild those users come from. Output is csv, 5 | # containing User, Game, and Guild 6 | 7 | # also found at http://pastebin.com/W6sYanuQ 8 | 9 | # Author: David Eccles (gringer) 10 | 11 | # usage: echo -e "user1\nuser2" | ./edgebeeguildfinder.py 12 | 13 | import cookielib, urllib2 14 | import fileinput 15 | import re 16 | import sys 17 | import json 18 | import csv 19 | 20 | userNames = [] 21 | addedUsers = set() 22 | 23 | for line in fileinput.input(): 24 | if(line.startswith("{")): 25 | jsonObj = json.loads(line) 26 | if(('result' in jsonObj) and ('players' in jsonObj['result'])): 27 | for userName in map(lambda x: x['name'],jsonObj['result']['players']): 28 | if(not userName in addedUsers): 29 | userNames.append(userName) 30 | addedUsers.add(userName) 31 | else: 32 | userName = line.rstrip() 33 | if(not userName in addedUsers): 34 | userNames.append(userName) 35 | addedUsers.add(userName) 36 | 37 | cj = cookielib.CookieJar() 38 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 39 | f = opener.open('http://www.edgebee.com/signin?' + 40 | 'username=gringerscripts&password=gringerscripts&remember=1') 41 | 42 | cr = csv.writer(sys.stdout) 43 | cr.writerow(['User','Game','Guild','LastOnline','Registered']) 44 | for user in userNames: 45 | foundSandP = False 46 | try: 47 | f = opener.open('http://www.edgebee.com/user?name=%s' % (user)) 48 | except: 49 | continue 50 | inItem = False 51 | gameName = None 52 | extraStat = None 53 | extraStats = {} 54 | extraStats['lastOn'] = None 55 | extraStats['regOn'] = None 56 | extraStats['guildName'] = None 57 | for line in f.read().splitlines(True): 58 | line = line.rstrip() 59 | if('playerListItem' in line): 60 | inItem = True 61 | gameName = None 62 | extraStats['guildName'] = None 63 | if(inItem): 64 | if('h2' in line): 65 | gameName = re.compile(r'<[^>]+>').sub('', line).lstrip() 66 | if('h4' in line): 67 | inItem = False 68 | elif('h4' in line): 69 | if('Registered on' in line): 70 | extraStat = 'regOn' 71 | elif('Last online' in line): 72 | extraStat = 'lastOn' 73 | else: 74 | extraStat = None 75 | elif((extraStat is not None) and ('
' in line)): 76 | extraStats[extraStat] = re.compile(r'<[^>]+>').sub('', line).lstrip() 77 | elif('Guild:' in line): 78 | guildName = re.compile(r'<[^>]+>').sub('', line).lstrip() 79 | guildName = guildName.replace('Guild: ','') 80 | extraStats['guildName'] = guildName 81 | if(('
' in line) and (gameName is not None)): 82 | cr.writerow([user, gameName, extraStats['guildName'], 83 | extraStats['lastOn'],extraStats['regOn']]) 84 | -------------------------------------------------------------------------------- /shufflefastx.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use List::Util qw(shuffle); 7 | 8 | my $idFileName = ""; 9 | my $quiet = 0; 10 | my $wrap = 1; 11 | 12 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet, 13 | "wrap!" => \$wrap) or 14 | die("Error in command line arguments"); 15 | 16 | my %idsToGet = (); 17 | 18 | # unknown commands are treated as identifiers 19 | my @files = (); 20 | while(@ARGV){ 21 | my $arg = shift(@ARGV); 22 | if(-f $arg){ 23 | push(@files, $arg); 24 | } else { 25 | $idsToGet{$arg} = 1; 26 | } 27 | } 28 | @ARGV = @files; 29 | 30 | if($idFileName){ 31 | # read sequence IDs from input file 32 | open(my $idFile, "<", $idFileName); 33 | while(<$idFile>){ 34 | chomp; 35 | s/^[>@]//; 36 | $idsToGet{$_} = 1; 37 | } 38 | close($idFile); 39 | } 40 | 41 | my $numToFind = scalar(keys(%idsToGet)); 42 | 43 | if(!$quiet && $numToFind){ 44 | printf(STDERR "Read %d identifiers\n", $numToFind); 45 | } 46 | 47 | 48 | my $inQual = 0; # false 49 | my $seqID = ""; 50 | my $qualID = ""; 51 | my $seq = ""; 52 | my $qual = ""; 53 | while(<>){ 54 | chomp; 55 | chomp; 56 | if(!$inQual){ 57 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 58 | my $newSeqID = $2; 59 | my $newShortID = $3; 60 | if($seqID){ 61 | my $newSeq = ""; 62 | my @shuffleLocs = (shuffle (0 .. length($seq))); 63 | foreach my $rp (@shuffleLocs){ 64 | $newSeq .= substr($seq,$rp,1); 65 | } 66 | if($qual){ 67 | my $newQual = ""; 68 | foreach my $rp (@shuffleLocs){ 69 | $newQual .= substr($qual,$rp,1); 70 | } 71 | printf("@"."shuffled_%s\n%s\n+\n%s\n", $seqID, $newSeq, $qual); 72 | } else { 73 | if($wrap){ 74 | $newSeq =~ s/(.{70})/$1\n/g; 75 | $newSeq =~ s/\s+$//; 76 | } 77 | printf(">shuffled_%s\n%s\n", $seqID, $newSeq); 78 | } 79 | } 80 | $seq = ""; 81 | $qual = ""; 82 | if(!$numToFind || 83 | exists($idsToGet{$newSeqID}) || exists($idsToGet{$newShortID})){ 84 | $seqID = $newSeqID; 85 | } else { 86 | $seqID = ""; 87 | } 88 | } elsif(/^\+(.*)$/) { 89 | $inQual = 1; # true 90 | $qualID = $1; 91 | } else { 92 | $seq .= $_; 93 | } 94 | } else { 95 | $qual .= $_; 96 | if(length($qual) >= length($seq)){ 97 | $inQual = 0; # false 98 | } 99 | } 100 | } 101 | 102 | if ($seqID) { 103 | my $newSeq = ""; 104 | my @shuffleLocs = (shuffle (0 .. length($seq))); 105 | foreach my $rp (@shuffleLocs) { 106 | $newSeq .= substr($seq,$rp,1); 107 | } 108 | if ($qual) { 109 | my $newQual = ""; 110 | foreach my $rp (@shuffleLocs) { 111 | $newQual .= substr($qual,$rp,1); 112 | } 113 | printf("@"."shuffled_%s\n%s\n+\n%s\n", $seqID, $newSeq, $qual); 114 | } else { 115 | if ($wrap) { 116 | $newSeq =~ s/(.{70})/$1\n/g; 117 | $newSeq =~ s/\s+$//; 118 | } 119 | printf(">shuffled_%s\n%s\n", $seqID, $newSeq); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /fastx-sort.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | 7 | my $quiet = 0; 8 | my $searchPattern = ""; ## "(^.*\$)"; 9 | my $numeric = 0; 10 | my $length = 0; 11 | my $r = 0; 12 | 13 | GetOptions("reverse|r!" => \$r, "pattern=s" => \$searchPattern, 14 | "quiet!" => \$quiet, 15 | "numeric|n!" => \$numeric, "length!" => \$length) or 16 | die("Error in command line arguments"); 17 | 18 | if($length){ 19 | $numeric = 1; 20 | print(STDERR "Numeric sort based on sequence length\n"); 21 | } elsif($numeric){ 22 | $numeric = 1; 23 | print(STDERR "Numeric sort\n"); 24 | } 25 | 26 | if($r){ 27 | print(STDERR "Reversing sort direction\n"); 28 | } 29 | 30 | # Complain about non-file command line argument 31 | my @files = (); 32 | while(@ARGV){ 33 | my $arg = shift(@ARGV); 34 | if(-e $arg){ 35 | push(@files, $arg); 36 | } else { 37 | die("Unknown argument: $arg"); 38 | } 39 | } 40 | @ARGV = @files; 41 | 42 | my %fastXStrs = (); 43 | 44 | my $inQual = 0; # false 45 | my $seqID = ""; 46 | my $qualID = ""; 47 | my $seq = ""; 48 | my $qual = ""; 49 | while(<>){ 50 | chomp; 51 | chomp; 52 | if(!$inQual){ 53 | if(/^(>|@)(.*)$/){ 54 | my $newSeqID = $2; 55 | if($seqID){ 56 | if(!$searchPattern || ($seqID =~ /($searchPattern)/)){ 57 | my $matchPattern = ($2) ? $2 : $1; 58 | my $key = (!$searchPattern) ? 59 | ($length ? (-length($seq)) : $seqID) : $matchPattern; 60 | if(!$qual){ 61 | $seq =~ s/(.{100})/$1\n/g; 62 | $seq =~ s/\n$//; 63 | } 64 | $fastXStrs{$key} .= ($qual) ? 65 | sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) : 66 | sprintf(">%s\n%s\n", $seqID, $seq); 67 | } else { 68 | printf(STDERR "Warning: No match for pattern '$searchPattern' for sequence '$seqID'\n"); 69 | } 70 | } 71 | $seq = ""; 72 | $qual = ""; 73 | $seqID = $newSeqID; 74 | } elsif(/^\+(.*)$/) { 75 | $inQual = 1; # true 76 | $qualID = $1; 77 | } else { 78 | $seq .= $_; 79 | } 80 | } else { 81 | $qual .= $_; 82 | if(length($qual) >= length($seq)){ 83 | $inQual = 0; # false 84 | } 85 | } 86 | } 87 | 88 | if ($seqID) { 89 | if (!$searchPattern || ($seqID =~ /($searchPattern)/)) { 90 | my $matchPattern = ($2) ? $2 : $1; 91 | my $key = (!$searchPattern) ? 92 | ($length ? (-length($seq)) : $seqID) : $matchPattern; 93 | if (!$qual) { 94 | $seq =~ s/(.{100})/$1\n/g; 95 | $seq =~ s/\n$//; 96 | } 97 | $fastXStrs{$key} .= ($qual) ? 98 | sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) : 99 | sprintf(">%s\n%s\n", $seqID, $seq); 100 | } else { 101 | printf(STDERR "Warning: No match for pattern '$searchPattern' for sequence '$seqID'\n"); 102 | } 103 | } 104 | 105 | printf(STDERR "Seen %d keys\n", scalar(keys(%fastXStrs))); 106 | 107 | if($numeric){ 108 | foreach my $pat (sort {($r?$b:$a) <=> ($r?$a:$b)} (keys(%fastXStrs))){ 109 | print($fastXStrs{$pat}); 110 | } 111 | } else { 112 | foreach my $pat (sort {($r?$b:$a) cmp ($r?$a:$b)} (keys(%fastXStrs))){ 113 | print($fastXStrs{$pat}); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /GenABEL2GIANT.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | use IO::Compress::Gzip qw(gzip $GzipError); 8 | 9 | my ($snpName, $mafName, @giantNames) = @ARGV; 10 | 11 | my %chipSNPs = (); 12 | my %MAFs = (); 13 | 14 | print(STDERR "Reading SNP file..."); 15 | my $snpFile = new IO::Uncompress::Gunzip("$snpName") or 16 | die("Unable to open $snpName\n"); 17 | my $read = 0; 18 | while(<$snpFile>){ 19 | chomp; 20 | $chipSNPs{$_} = 1; 21 | if($read++ > 10000){ 22 | print(STDERR "."); 23 | $read = 0; 24 | } 25 | } 26 | close($snpFile); 27 | print(STDERR " done.\n"); 28 | 29 | print(STDERR "Reading MAF file..."); 30 | my $mafFile = new IO::Uncompress::Gunzip("$mafName") or 31 | die("Unable to open $mafName\n"); 32 | $read = 0; 33 | while(<$mafFile>){ 34 | chomp; 35 | if(/^Marker/){ 36 | next; 37 | } 38 | my ($marker, $sex, $base, $MAF) = split(/,/); 39 | $MAFs{$sex}{$marker}="$base,$MAF"; 40 | if($read++ > 300000){ 41 | print(STDERR "."); 42 | $read = 0; 43 | } 44 | } 45 | close($mafFile); 46 | print(STDERR " done.\n"); 47 | 48 | 49 | foreach my $giantName (@giantNames){ 50 | my $sexCode = ""; 51 | if ($giantName =~ /NI_([fmc]).*?_IMPUTE2/) { 52 | $sexCode = ($1 eq "c") ? "a" : $1; 53 | } 54 | my $giantFile = new IO::Uncompress::Gunzip("$giantName") or 55 | die("Unable to open $giantName\n"); 56 | print(STDERR "Reading GIANT file [$giantName]..."); 57 | $read = 0; 58 | my $of = new IO::Compress::Gzip "GIANT_${giantName}" or 59 | die "Unable to open GIANT_${giantName} for writing\n"; 60 | print($of "chrom,markername,strand,n,effect_allele,". 61 | "other_allele,eaf,imputation_type,chi2.1df,beta,". 62 | "se,p\n"); 63 | while (<$giantFile>) { 64 | chomp; 65 | tr/\"//d; 66 | if (substr($_,0,1) eq ",") { 67 | next; 68 | } 69 | my ($markername, $chrom, $pos, $strand, $A1, $A2, $n, $beta, $se, 70 | $chi2_1df, $P1df, $p, $effAB, $effBB, $chi2_2df, $P2df, 71 | $effect_allele) = split(/,/); 72 | my ($maa,$maf) = split(/,/, $MAFs{$sexCode}{$markername}); 73 | if (($maf == 1) || ($maf == 0)) { 74 | next; # skip homozygous / missing SNPs 75 | } 76 | my $eaf = $maf; 77 | my $other_allele = $A1; 78 | if ($maa ne $effect_allele) { 79 | $eaf = 1 - $maf; 80 | } 81 | $eaf = sprintf("%0.4f", $eaf); 82 | if ($other_allele eq $effect_allele) { 83 | $other_allele = $A2; 84 | } 85 | my $imputation_type = 4; 86 | if ($chipSNPs{$markername}) { 87 | $imputation_type = 0; 88 | } 89 | if($chi2_1df ne "NA"){ 90 | $chi2_1df = sprintf("%0.4f", $chi2_1df); 91 | } 92 | if($beta ne "NA"){ 93 | $beta = sprintf("%0.4f", $beta); 94 | } 95 | if($se ne "NA"){ 96 | $se = sprintf("%0.6f", $se); 97 | } 98 | if($p ne "NA"){ 99 | $p = sprintf("%0.4f", $p); 100 | } 101 | print($of "$chrom,$markername,$strand,$n,$effect_allele,". 102 | "$other_allele,$eaf,$imputation_type,$chi2_1df,$beta,". 103 | "$se,$p\n"); 104 | if($read++ > 400000){ 105 | print(STDERR "."); 106 | $read = 0; 107 | } 108 | } 109 | close($of); 110 | close($giantFile); 111 | print(STDERR " done.\n"); 112 | } 113 | -------------------------------------------------------------------------------- /maf2gfa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## Filters out non-end and identical LAST results to simulate overlap alignments 4 | 5 | use warnings; 6 | use strict; 7 | 8 | use Getopt::Long qw(:config auto_help pass_through); 9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 10 | 11 | my $seqFileName = ""; 12 | my $ignoreSelf = 0; 13 | 14 | my $endLeniency = 100; ## base pairs from end to allow 15 | my $containFrac = 0.9; ## minimum containment fraction 16 | my $containIdent = 0.9; ## minimum containment identity 17 | 18 | my %quals = (); 19 | 20 | my $qSeq = ""; 21 | my $qStart = 0; 22 | my $qEnd = 0; 23 | my $qLen = 0; 24 | my $qStrand = ""; 25 | my $qMatchLen = 0; 26 | my $qName = ""; 27 | my $tSeq = ""; 28 | my $tStart = 0; 29 | my $tEnd = 0; 30 | my $tMatchLen = 0; 31 | my $tLen = 0; 32 | my $tName = ""; 33 | 34 | my $lineBuffer = ""; 35 | my $GFAspec = 1; 36 | 37 | my %matches = (); 38 | 39 | if($GFAspec == 2){ 40 | print("H\tVN:Z:2.0\n"); 41 | } else { 42 | print("H\tVN:Z:bogart/edges\n"); 43 | } 44 | 45 | my $fastaFileName = shift(@ARGV); 46 | if(!(-e "${fastaFileName}.fai")){ 47 | print(STDERR "Generating index file... "); 48 | system(("samtools", "faidx", $fastaFileName)); 49 | print("done\n"); 50 | } 51 | 52 | open(my $fastaFile, "<", "${fastaFileName}.fai"); 53 | while(<$fastaFile>){ 54 | chomp; 55 | my @F = split(/\s+/); 56 | if($GFAspec == 2){ 57 | printf("S\t%s\t%d\t*\n", $F[0], $F[1]); 58 | } else { 59 | printf("S\t%s\t*\tLN:i:%d\n", $F[0], $F[1]); 60 | } 61 | } 62 | close($fastaFile); 63 | 64 | while(<>){ 65 | if(/^$/){ 66 | next; 67 | } 68 | if(!/^[as]/){ 69 | next; 70 | } 71 | $lineBuffer .= $_; 72 | my @F = split(/\s+/); 73 | if($F[0] eq "a"){ 74 | $qSeq = ""; 75 | $tSeq = ""; 76 | } elsif($F[0] eq "s"){ 77 | if($tSeq){ 78 | $qName = $F[1]; 79 | $qStart = $F[2]; 80 | $qMatchLen = $F[3]; 81 | $qEnd = $qStart + $qMatchLen; 82 | $qStrand = $F[4]; 83 | $qLen = $F[5]; 84 | $qSeq = $F[6]; 85 | if($GFAspec == 2){ 86 | if(($qSeq ne $tSeq) && 87 | (($qStart < $endLeniency) || ($tStart < $endLeniency) || 88 | ($qEnd > ($qLen-$endLeniency)) || ($tEnd > ($tLen-$endLeniency)))){ 89 | print(join("\t",("E","${tName}+","${qName}${qStrand}",$tStart,$tEnd, 90 | $qStart,$qEnd))."\n"); 91 | } 92 | } else { 93 | if($qSeq ne $tSeq){ 94 | if(($tEnd > ($tLen-$endLeniency)) && ($qStart < $endLeniency)){ 95 | print(join("\t",("L",$tName,"+",$qName,$qStrand,"*"))."\n"); 96 | } 97 | if(($qEnd > ($qLen-$endLeniency)) && ($tStart < $endLeniency)){ 98 | print(join("\t",("L",$qName,$qStrand,$tName,"+","*"))."\n"); 99 | } 100 | if(($qMatchLen / $qLen) >= $containFrac){ 101 | print(join("\t",("C",$tName,"+",$qName,$qStrand,"*"))."\n"); 102 | } 103 | if(($tMatchLen / $tLen) >= $containFrac){ 104 | print(join("\t",("C",$qName,"+",$tName,$qStrand,"*"))."\n"); 105 | } 106 | } 107 | } 108 | $lineBuffer = ""; 109 | } else { 110 | $tName = $F[1]; 111 | $tStart = $F[2]; 112 | $tMatchLen = $F[3]; 113 | $tEnd = $tStart + $tMatchLen; 114 | $tLen = $F[5]; 115 | $tSeq = $F[6]; 116 | } 117 | } 118 | } 119 | 120 | -------------------------------------------------------------------------------- /nbmt-translate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | ## Translate mitochondrial sequence of Nb mitochondria 3 | ## (mostly because UAG is *not* a stop codon) 4 | use warnings; 5 | use strict; 6 | 7 | ## codon usage table -- consider proportion of mitochondrial codons 8 | 9 | ## for x in A C G T; do for y in A C G T; do for z in A C G T; do echo -n "${x}${y}${z} => \"" | perl -pe 'tr/T/U/'; transeq <(echo -e ">1\n${x}${y}${z}") /dev/stdout 2>/dev/null | perl -pe 's/$/ \",/;' | grep -v '^>'; done; done; done 10 | 11 | my %transTable = ## 3-letter codes are for translations from mtDNA tRNAs 12 | ( AAA => "(K)", 13 | AAC => "(N)", 14 | AAG => "(K)", 15 | AAU => "(N)", 16 | ACA => "(T)", 17 | ACC => "(T)", 18 | ACG => "Arg", 19 | ACU => "(T)", 20 | AGA => "(S)", 21 | AGC => "(S)", 22 | AGG => "(S)", 23 | AGU => "(S)", 24 | AUA => "(M)", 25 | AUC => "(I)", 26 | AUG => "(M)", 27 | AUU => "(I)", 28 | CAA => "(Q)", 29 | CAC => "(H)", 30 | CAG => "(Q)", 31 | CAU => "Met", 32 | CCA => "(P)", 33 | CCC => "(P)", 34 | CCG => "(P)", 35 | CCU => "(P)", 36 | CGA => "(R)", 37 | CGC => "(R)", 38 | CGG => "(R)", 39 | CGU => "(R)", 40 | CUA => "(L)", 41 | CUC => "(L)", 42 | CUG => "(L)", 43 | CUU => "(L)", 44 | GAA => "Phe", 45 | GAC => "(D)", 46 | GAG => "(E)", 47 | GAU => "Ile", 48 | GCA => "Cys", 49 | GCC => "(A)", 50 | GCG => "(A)", 51 | GCU => "(A)", 52 | GGA => "(G)", 53 | GGC => "(G)", 54 | GGG => "(G)", 55 | GGU => "(G)", 56 | GUA => "Tyr", 57 | GUC => "Asp", 58 | GUG => "His", 59 | GUU => "Asn", 60 | UAA => "L/*", ## was * (mtDNA/Leu) 61 | UAC => "Val", 62 | UAG => "Leu", 63 | UAU => "(Y)", 64 | UCA => "Trp", 65 | UCC => "Gly", 66 | UCG => "(S)", 67 | UCU => "Ser", 68 | UGA => "(W)", 69 | UGC => "Ala", 70 | UGG => "Pro", 71 | UGU => "Thr", 72 | UUA => "(L)", 73 | UUC => "Glu", 74 | UUG => "Gln", 75 | UUU => "Lys", 76 | 77 | ); 78 | 79 | my $seq = ""; 80 | my $seqID = ""; 81 | while(<>){ 82 | chomp; 83 | if(/^>(.*)$/){ 84 | my $newID = $1; 85 | if($seq){ 86 | printf(">%s\n", $seqID); 87 | while($seq =~ s/^(.{1,60})//){ 88 | my $seqHead = $1; 89 | for(my $i=0; $i < length($seqHead); $i+=3){ 90 | my $codon = substr($seqHead, $i, 3); 91 | $codon =~ tr/T/U/; 92 | if($transTable{$codon}){ 93 | print($transTable{$codon}); 94 | } else { 95 | print("???"); 96 | } 97 | } 98 | print("\n".("|----:----" x 6)."\n"); 99 | printf("%s\n", $seqHead); 100 | } 101 | } 102 | $seq = ""; 103 | $seqID = $newID; 104 | } else { 105 | $seq .= $_; 106 | } 107 | } 108 | if($seq){ 109 | printf(">%s\n", $seqID); 110 | my $pos = 0; 111 | while($seq =~ s/^(.{1,60})//){ 112 | my $seqHead = $1; 113 | for(my $i=0; $i < length($seqHead); $i+=3){ 114 | my $codon = substr($seqHead, $i, 3); 115 | $codon =~ tr/T/U/; 116 | if($transTable{$codon}){ 117 | print($transTable{$codon}); 118 | } else { 119 | print("???"); 120 | } 121 | } 122 | print("\n "); 123 | for(my $i=1; $i < (length($seqHead)-3); $i+=3){ 124 | my $codon = substr($seqHead, $i, 3); 125 | $codon =~ tr/T/U/; 126 | if($transTable{$codon}){ 127 | print($transTable{$codon}); 128 | } else { 129 | print("???"); 130 | } 131 | } 132 | print("\n "); 133 | for(my $i=2; $i < (length($seqHead)-3); $i+=3){ 134 | my $codon = substr($seqHead, $i, 3); 135 | $codon =~ tr/T/U/; 136 | if($transTable{$codon}){ 137 | print($transTable{$codon}); 138 | } else { 139 | print("???"); 140 | } 141 | } 142 | print("\n".("|----:----" x 6)."\n"); 143 | printf("%s\n", $seqHead); 144 | printf("%-10s",$pos); 145 | for(my $posInc = 10; $posInc < 60; $posInc += 10){ 146 | printf("%-10s",substr(sprintf("%10s",$pos+$posInc),8,2)); 147 | } 148 | $pos += 60; 149 | print("\n\n"); 150 | } 151 | } 152 | 153 | ## TAGTA- 154 | -------------------------------------------------------------------------------- /fastx-length.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | 8 | sub SIConvert{ 9 | my ($val) = @_; 10 | my @largePrefixes = ("k", "M", "G"); 11 | my $pID = 0; 12 | my $prefix = ""; 13 | my $changed=0; 14 | while(($val > 1000) && ($pID < $#largePrefixes)){ 15 | $prefix = $largePrefixes[$pID]; 16 | $val /= 1000; 17 | $pID++; 18 | } 19 | return(sprintf("%.12g %s", $val, $prefix)); 20 | } 21 | 22 | my $showMD5 = 0; # false 23 | my $showFName = 0; # false 24 | 25 | GetOptions("md5!" => \$showMD5, "fname!" => \$showFName, ) or 26 | die("Error in command line arguments"); 27 | 28 | if($showMD5){ 29 | use Digest::MD5 qw(md5_hex); ## only attempt to load if md5 is requested 30 | } 31 | 32 | # use stdin if no files supplied 33 | if(!@ARGV){ 34 | @ARGV = '-' unless (-t STDIN); 35 | } 36 | 37 | my @lengths = (); 38 | my $inQual = 0; # false 39 | my $seqID = ""; 40 | my $qualID = ""; 41 | my $seq = ""; 42 | my $qual = ""; 43 | my $seqFName = ""; 44 | foreach my $file (@ARGV) { 45 | # This little gunzip dance makes sure the script can handle both 46 | # gzip-compressed and uncompressed input, regardless of whether 47 | # or not it is piped 48 | my $z = new IO::Uncompress::Gunzip($file, "transparent", 1) 49 | or die "gunzip failed: $GunzipError\n"; 50 | while(<$z>){ 51 | chomp;chomp;chomp; 52 | if (!$inQual) { 53 | if (/^(>|@)((.+?)( .*?\s*)?)$/) { 54 | my $newSeqID = $2; 55 | my $newShortID = $3; 56 | if ($seqID) { 57 | print(length($seq)); 58 | if($showMD5){ 59 | print(" ".md5_hex($seq)); 60 | } 61 | print(" ${seqID}"); 62 | if($showFName){ 63 | print(" ".$seqFName); 64 | } 65 | print("\n"); 66 | push(@lengths, length($seq)); 67 | } 68 | $seq = ""; 69 | $qual = ""; 70 | $seqID = $newSeqID; 71 | $seqFName = $file; 72 | } elsif (/^\+(.*)$/) { 73 | $inQual = 1; # true 74 | $qualID = $1; 75 | } else { 76 | $seq .= $_; 77 | } 78 | } else { 79 | $qual .= $_; 80 | if (length($qual) >= length($seq)) { 81 | $inQual = 0; # false 82 | } 83 | } 84 | } 85 | } 86 | 87 | if ($seqID) { 88 | print(length($seq)); 89 | if($showMD5){ 90 | print(" ".md5_hex($seq)); 91 | } 92 | print(" ${seqID}"); 93 | if($showFName){ 94 | print(" ".$seqFName); 95 | } 96 | print("\n"); 97 | push(@lengths, length($seq)); 98 | } 99 | 100 | ## calculate statistics 101 | @lengths = sort {$b <=> $a} (@lengths); 102 | my $sum = 0; 103 | my @cumLengths = map {$sum += $_} (@lengths); 104 | 105 | my $L50LengthNum = 0; 106 | while($cumLengths[$L50LengthNum] < ($sum * 0.5)){ 107 | $L50LengthNum++; 108 | } 109 | 110 | my $L90LengthNum = 0; 111 | while($cumLengths[$L90LengthNum] < ($sum * 0.9)){ 112 | $L90LengthNum++; 113 | } 114 | 115 | my $L10LengthNum = 0; 116 | while($cumLengths[$L10LengthNum] < ($sum * 0.1)){ 117 | $L10LengthNum++; 118 | } 119 | 120 | printf(STDERR "Total sequences: %d\n", scalar(@lengths)); 121 | printf(STDERR "Total length: %sb\n", SIConvert($sum)); 122 | printf(STDERR "Longest sequence: %sb\n", SIConvert($lengths[0])); 123 | printf(STDERR "Shortest sequence: %sb\n", SIConvert($lengths[$#lengths])); 124 | printf(STDERR "Mean Length: %sb\n", SIConvert(sprintf("%d", ($sum) / scalar(@lengths)))); 125 | printf(STDERR "Median Length: %sb\n", SIConvert($lengths[$#lengths / 2])); 126 | printf(STDERR "N10: %d sequences; L10: %sb\n", 127 | ($L10LengthNum+1), SIConvert($lengths[$L10LengthNum])); 128 | printf(STDERR "N50: %d sequences; L50: %sb\n", 129 | ($L50LengthNum+1), SIConvert($lengths[$L50LengthNum])); 130 | printf(STDERR "N90: %d sequences; L90: %sb\n", 131 | ($L90LengthNum+1), SIConvert($lengths[$L90LengthNum])); 132 | -------------------------------------------------------------------------------- /expenseTemplate.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,a4paper,twoside]{letter} 2 | \usepackage{graphicx} 3 | \usepackage{fp} 4 | \usepackage{charter} 5 | 6 | % length changes to get header in the right place 7 | \setlength{\voffset}{1.5cm} 8 | \setlength{\headheight}{2cm} 9 | \setlength{\headsep}{0pt} 10 | 11 | \usepackage[margin=1cm,bottom=3cm]{geometry} 12 | \usepackage{multirow} 13 | \usepackage{fancyhdr} 14 | \usepackage{lastpage} % provides the number of the last page 15 | \usepackage{framed} 16 | \usepackage{color} 17 | 18 | \pagestyle{fancy} 19 | 20 | \newcommand{\myHeader}{ 21 | 22 | %% header section (including business logo, name, address) 23 | \begin{tabular}{llr} 24 | \begin{minipage}[b]{0.4\textwidth} 25 | \includegraphics[height=2cm]{@@@logoLocation@@@} 26 | \end{minipage} & 27 | \begin{minipage}[b]{0.55\textwidth} 28 | \flushright% 29 | {\Huge \textsc{gringene} Bioinformatics}\\ 30 | \vspace{6pt}% 31 | {\Large bioinformatics@gringene.org}\\ 32 | \vspace{3pt}% 33 | 5 Debra Way, Karori, Wellington 6012\\ 34 | \vfill% 35 | \end{minipage}% 36 | \end{tabular} 37 | 38 | \rule{\linewidth}{0.5mm} 39 | 40 | } 41 | 42 | \fancyhead{} 43 | \fancyhead[CO,CE]{\myHeader} 44 | \fancyfoot{} 45 | \fancyfoot[CO,CE]{Page \thepage\ of \pageref{LastPage}} 46 | \renewcommand{\headrulewidth}{0pt} 47 | \renewcommand{\footrulewidth}{0pt} 48 | 49 | \renewcommand*{\thefootnote}{\alph{footnote}} 50 | 51 | \definecolor{lightsalmon}{rgb}{1,0.63,0.48} 52 | 53 | 54 | \begin{document} 55 | 56 | %% Address section (client name / address, invoice type) 57 | 58 | \hspace{\fill} 59 | 60 | \begin{tabular}{lr} 61 | \begin{minipage}[c]{0.55\textwidth} 62 | @@@clientName@@@\\%%CLIENTNAME 63 | @@@clientAddress@@@ 64 | \end{minipage} & 65 | \begin{minipage}[c]{0.4\textwidth} 66 | \begin{center} 67 | \begin{framed} 68 | \huge{\textbf{Expense Claim}} 69 | \end{framed} 70 | \end{center} 71 | \end{minipage}% 72 | \end{tabular} 73 | 74 | \rule{\linewidth}{1pt} 75 | 76 | \leftskip=10pt 77 | 78 | Summary of expenses claimed by David Eccles for \emph{@@@clientBusinessName@@@} 79 | (see attached invoices for more details): 80 | 81 | \begin{flushleft} 82 | 83 | \setlength{\tabcolsep}{1.5pt} 84 | \hspace{30pt} 85 | \begin{tabular}{lrl} 86 | \textbf{Claim Date} & \multicolumn{2}{l}{@@@invDate@@@}\\ 87 | \textbf{Job ID} & \multicolumn{2}{l}{@@@jobID@@@}\\ 88 | \textbf{Job Description}\hspace{6pt}\vspace{\fill} & 89 | \multicolumn{2}{p{10cm}}{\raggedright{@@@jobDesc@@@}}\\ 90 | \\ 91 | \begin{tabular}{llrl} 92 | \textbf{Description} & \textbf{Category} & %% 93 | \textbf{Date} & \textbf{Amount}\\ 94 | \hline\\ 95 | @@@expenseLines@@@\\ 96 | \hline\\ 97 | Total @@@payType@@@s & & @@@totalUnits@@@\\%%UNITLINE 98 | \end{tabular} 99 | 100 | 101 | \cline{1-2}\\ 102 | \textbf{Expense Total} & \textbf{@@@tAmt@@@} & NZD\\ 103 | \cline{1-2}\\%%DUEAMOUNT 104 | \textbf{Amount due} & \textbf{@@@TAmt@@@} & NZD\\%%DUEAMOUNT 105 | & & \begin{picture}(0,0)%%PAIDEXPENSES 106 | \put(65,30){\rotatebox[origin=c]{45}{%%PAIDEXPENSES 107 | \textcolor{lightsalmon}{\fbox{\Huge{PAID}}}}}%%PAIDEXPENSES 108 | \end{picture}%%PAIDEXPENSES 109 | \end{tabular} 110 | 111 | \end{flushleft} 112 | \leftskip=10pt 113 | 114 | \addvspace{\fill} 115 | \addvspace{6pt} 116 | 117 | Thank you for your business. 118 | 119 | \addvspace{12pt} 120 | 121 | David Eccles 122 | 123 | \addvspace{6pt} 124 | 125 | \addvspace{\fill}%%PAIDEXPENSES 126 | 127 | \filbreak%%DUEEXPENSES 128 | 129 | \leftskip=0pt 130 | 131 | \rule{\linewidth}{0.5mm}%%DUEEXPENSES 132 | 133 | \begin{center}%%DUEEXPENSES 134 | \Large{\textbf{Payment Details}}\\ %%DUEEXPENSES 135 | \end{center}%%DUEEXPENSES 136 | 137 | \rule{\linewidth}{0.5mm}%%DUEEXPENSES 138 | 139 | \leftskip=10pt 140 | 141 | @@@bankMessage@@@%%DUEEXPENSES 142 | 143 | \hspace{30pt}%%DUEEXPENSES 144 | \begin{tabular}{ll}%%DUEEXPENSES 145 | @@@accountDetails@@@%%DUEEXPENSES 146 | \end{tabular}%%DUEEXPENSES 147 | 148 | \end{document} 149 | -------------------------------------------------------------------------------- /maf_bcsplit.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## Splits up a FASTA/FASTQ file based on the location of identified sequences (e.g. adapter/barcode sequences) 4 | ## [currently only identifies the location of the sequences] 5 | 6 | use warnings; 7 | use strict; 8 | 9 | use Getopt::Long qw(:config auto_help pass_through); 10 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 11 | 12 | my $seqFileName = ""; 13 | my $ignoreSelf = 0; 14 | my $includeQuery = 0; 15 | my $includeTarget = 0; 16 | 17 | GetOptions("seqfile=s" => \$seqFileName, "overlap!" => \$ignoreSelf, 18 | "query!" => \$includeQuery, "target!" => \$includeTarget) or 19 | die("Error in command line arguments"); 20 | 21 | my %seqs = (); 22 | my %quals = (); 23 | 24 | # if(!$seqFileName){ 25 | # die("Error: read sequence file must be specified, '-seqfile '"); 26 | # } 27 | 28 | # ## read in sequences 29 | # my $inQual = 0; # false 30 | # my $seqID = ""; 31 | # my $qualID = ""; 32 | 33 | # my $seqFile = new IO::Uncompress::Gunzip "$seqFileName" or 34 | # die "Unable to open $seqFileName\n"; 35 | # while(<$seqFile>){ 36 | # chomp; 37 | # chomp; 38 | # if(!$inQual){ 39 | # if(/^(>|@)((.+?)( .*?\s*)?)$/){ 40 | # my $newSeqID = $2; 41 | # my $newShortID = $3; 42 | # $seqID = $newShortID; 43 | # if($seqID){ 44 | # $seqs{$seqID} = ""; 45 | # $quals{$seqID} = ""; 46 | # } 47 | # } elsif(/^\+(.*)$/) { 48 | # $inQual = 1; # true 49 | # $qualID = $1; 50 | # if(($qualID ne "") && ($qualID ne $seqID)){ 51 | # warn("Sequence ID and Qual ID do not match: $seqID; $qualID"); 52 | # } 53 | # } else { 54 | # $seqs{$seqID} .= $_; 55 | # } 56 | # } else { 57 | # $quals{$seqID} .= $_; 58 | # my $lq = length($quals{$seqID}); 59 | # my $ls = length($seqs{$seqID}); 60 | # if($lq >= $ls){ 61 | # $inQual = 0; # false 62 | # if($lq != $ls){ 63 | # warn(sprintf("Sequence and Qual length do not match: $seqID (%d; %d)", 64 | # $ls, $lq)); 65 | # } 66 | # } 67 | # } 68 | # } 69 | # close($seqFile); 70 | 71 | if(keys(%seqs)){ 72 | printf(STDERR "Read in %d sequences\n", scalar(keys(%seqs))); 73 | } 74 | 75 | my $qSeq = ""; 76 | my $qStart = 0; 77 | my $qEnd = 0; 78 | my $qLen = 0; 79 | my $qStrand = ""; 80 | my $qMatchLen = 0; 81 | my $qName = ""; 82 | my $tSeq = ""; 83 | my $tStart = 0; 84 | my $tEnd = 0; 85 | my $tMatchLen = 0; 86 | my $tLen = 0; 87 | my $tName = ""; 88 | 89 | my %matches = (); 90 | 91 | print("query,target,dir,qS,qE,qML,qL,qPct,tS,tE,tML,tL,tPct"); 92 | print(",qStr") if($includeQuery); 93 | print(",tStr") if($includeTarget); 94 | print("\n"); 95 | 96 | while(<>){ 97 | if(!/^[as]/){ 98 | next; 99 | } 100 | my @F = split(/\s+/); 101 | if($F[0] eq "a"){ 102 | $qSeq = ""; 103 | $tSeq = ""; 104 | } elsif($F[0] eq "s"){ 105 | if($tSeq){ 106 | $qName = $F[1]; 107 | $qStart = $F[2]; 108 | $qMatchLen = $F[3]; 109 | $qEnd = $qStart + $qMatchLen; 110 | $qStrand = $F[4]; 111 | $qLen = $F[5]; 112 | $qSeq = $F[6]; 113 | if($qStrand eq "-"){ ## correct for reverse complement 114 | $qEnd = $qLen - $qStart; 115 | $qStart = $qEnd - $qMatchLen; 116 | } 117 | my $matchLine = 118 | sprintf("%s,%s,%s,%d,%d,%d,%d,%0.2f,%d,%d,%d,%d,%0.2f", 119 | $qName, $tName, $qStrand, 120 | $qStart, $qEnd, $qMatchLen, $qLen, ($qMatchLen / $qLen) * 100, 121 | $tStart, $tEnd, $tMatchLen, $tLen, ($tMatchLen / $tLen) * 100); 122 | if(!$ignoreSelf || ($qName ne $tName)){ 123 | print("${matchLine}"); 124 | print(",${qSeq}") if($includeQuery); 125 | print(",${tSeq}") if($includeTarget); 126 | print("\n"); 127 | } 128 | $matches{$qName}{$qStart} .= ":matchLine"; 129 | } else { 130 | $tName = $F[1]; 131 | $tStart = $F[2]; 132 | $tMatchLen = $F[3]; 133 | $tEnd = $tStart + $tMatchLen; 134 | $tLen = $F[5]; 135 | $tSeq = $F[6]; 136 | } 137 | } 138 | } 139 | 140 | -------------------------------------------------------------------------------- /rsfilter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # rsfilter.pl -- hunts (quickly) for a set of markers in one pass of a 4 | # file. 5 | 6 | # Note: marker names *must not* be the same as an existing file 7 | 8 | # Author: David Eccles (gringer), 2008 9 | 10 | # This program tries to be fast and memory-efficient, so decisions are 11 | # pushed to outer loops. Using the ordered option, '-o', retains 12 | # genotype lines in memory for the specified markers until all input 13 | # has been read, which can consume large amounts of memory when the 14 | # requested marker set is large. 15 | 16 | use strict; 17 | use warnings; 18 | 19 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 20 | 21 | sub usage { 22 | print("usage: ./rsfilter.pl ... [options] < \n"); 23 | print("\nOther Options:\n"); 24 | print("-r : invert filter (i.e. select markers to exclude)\n"); 25 | print("-o : order by selection\n"); 26 | print("-- : no more files on command line (treat file names as markers)\n"); 27 | print("\n"); 28 | } 29 | 30 | my %markers = (); 31 | my $complement = 0; # false # determines if complement, rather 32 | # than intersect, should be chosen 33 | my $order = 0; # false # order markers by given ordering 34 | my @markerorder = (); 35 | 36 | my @inFiles = (); 37 | 38 | my $filesFinished = 0; # false 39 | 40 | while(@ARGV){ 41 | my $arg = shift(@ARGV); 42 | if(!$filesFinished && (-e $arg)){ 43 | if(!keys(%markers)){ 44 | print(STDERR "Retrieving marker names from $arg..."); 45 | my $markFile = 0; 46 | my $fileName = $arg; 47 | $markFile = new IO::Uncompress::Gunzip "$fileName" or 48 | die "Unable to open $fileName\n"; 49 | while(<$markFile>){ 50 | if(/^\"?(.*?)\"?[\s,]+/){ 51 | my $marker = $1; 52 | # print(STDERR "adding marker $marker\n"); 53 | $markers{$marker} = 1; 54 | push(@markerorder, $marker); 55 | } 56 | } 57 | print(STDERR keys(%markers)." marker names extracted\n"); 58 | } else { 59 | push(@inFiles, $arg); 60 | } 61 | } elsif ($arg =~ /^-r/){ 62 | $complement = 1; # true 63 | print(STDERR "Markers will be excluded, rather than filtered\n"); 64 | } elsif ($arg =~ /^-o/){ 65 | $complement = 0; # false 66 | $order = 1; # true 67 | print(STDERR "Markers will output in specified order\n"); 68 | } elsif ($arg =~ /^--/){ 69 | $filesFinished = 1; # true 70 | print(STDERR "No more files will be read\n"); 71 | } elsif ($arg =~ /^-help/){ 72 | usage(); 73 | exit(0); 74 | } else { 75 | $markers{$arg} = 1; 76 | push(@markerorder, $arg); 77 | } 78 | } 79 | 80 | @ARGV = @inFiles; 81 | 82 | my %markerlines = (); 83 | 84 | #print keys(%markers)." marker names extracted\n"; 85 | 86 | if($complement){ # tests are in outer loop to slightly reduce processor effort 87 | while (<>){ 88 | my $line = $_; 89 | if($line =~ /^(\"?.*?\"?)[\s,]+/){ 90 | if (!$markers{$1}){ 91 | print($line); 92 | } 93 | } 94 | } 95 | } else { 96 | if(!$order){ 97 | while (<>){ 98 | my $line = $_; 99 | if($line =~ /^(\"?.*?\"?)[\s,]+/){ 100 | if ($markers{$1}){ 101 | print($line); 102 | } 103 | } 104 | } 105 | } else { 106 | while (<>){ 107 | my $line = $_; 108 | if($line =~ /^(\"?.*?\"?)[\s,]+/){ 109 | if ($markers{$1}){ 110 | my $marker = $1; 111 | $markerlines{$1} = $line; 112 | } 113 | } 114 | } 115 | foreach(@markerorder){ 116 | my $marker = $_; 117 | if($markerlines{$marker}){ 118 | print($markerlines{$marker}); 119 | } 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /fastx-rlength.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | #use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error); 7 | #use IO::File; 8 | 9 | sub rc { 10 | my ($seq) = @_; 11 | $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/; 12 | # work on masked sequences as well 13 | $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/; 14 | return(scalar(reverse($seq))); 15 | } 16 | 17 | sub rev { 18 | my ($seq) = @_; 19 | return(scalar(reverse($seq))); 20 | } 21 | 22 | sub printStats { 23 | my ($seq, $seqID, $trim, $kmerLength) = @_; 24 | my $len = length($seq); 25 | my $sseq = ""; 26 | if($seqID && (length($seq) > $trim) && (length($seq) > $kmerLength)){ 27 | my $countTotal = 0; 28 | my $countMax = 0; 29 | my $maxKmer = ""; 30 | my %rptPos = (); 31 | my %allGapCounts = (); 32 | my %minGaps = (); 33 | my $revCount = 1; 34 | my $rcCount = 1; 35 | for(my $p = 0; ($p + $kmerLength) <= $len; $p++){ 36 | $sseq = substr($seq, $p, $kmerLength); 37 | if(exists($rptPos{$sseq})){ 38 | my $gap = $p - $rptPos{$sseq}; 39 | $allGapCounts{$gap}++; 40 | if(!exists($minGaps{$sseq}) || ($minGaps{$sseq} > $gap)){ 41 | $minGaps{$sseq} = $gap; 42 | } 43 | } 44 | if(exists($rptPos{rev($sseq)})){ 45 | $revCount++; 46 | } 47 | if(exists($rptPos{rc($sseq)})){ 48 | $rcCount++; 49 | } 50 | $rptPos{$sseq} = $p; 51 | } 52 | if($revCount == 1){ 53 | $revCount = 0; 54 | } 55 | if($rcCount == 1){ 56 | $rcCount = 0; 57 | } 58 | my $numKmers = scalar(keys(%rptPos)); 59 | my $kmerRatio = $numKmers/($len - $kmerLength + 1); 60 | my $numRepeats = scalar(keys(%minGaps)); 61 | my @gaps = sort {$a <=> $b} (values(%minGaps)); 62 | my $medianGap = (@gaps) ? $gaps[$#gaps / 2] : 0; 63 | my $medianCount = 0; 64 | my $modalGap = 0; 65 | my $modalCount = 0; 66 | my $rangeCountMed = 0; 67 | my $rangeCountMod = 0; 68 | if($medianGap){ 69 | my %gapCounts = (); 70 | foreach my $gap (@gaps){ 71 | $gapCounts{$gap}++; 72 | } 73 | $medianCount = ${allGapCounts{$medianGap}}; 74 | my @modalSort = sort {$allGapCounts{$b} <=> $allGapCounts{$a}} (@gaps); 75 | $modalGap = $modalSort[0]; 76 | $modalCount = $allGapCounts{$modalGap}; 77 | for(my $gP = int($medianGap * 0.99); ($gP <= ($medianGap / 0.99)); 78 | $gP++){ 79 | $rangeCountMed += $allGapCounts{$gP} if($allGapCounts{$gP}); 80 | } 81 | for(my $gP = int($modalGap * 0.99); ($gP <= ($modalGap / 0.99)); 82 | $gP++){ 83 | $rangeCountMod += $allGapCounts{$gP} if($allGapCounts{$gP}); 84 | } 85 | } 86 | printf("%8d %0.3f %6d %6d %5d %5d %6d %5d %5d %6d %6d %6d %s\n", 87 | $len, $kmerRatio, 88 | $numRepeats, 89 | $countTotal, 90 | $medianCount, 91 | $rangeCountMed, 92 | $medianGap, 93 | $modalCount, 94 | $rangeCountMod, 95 | $modalGap, 96 | $revCount, 97 | $rcCount, 98 | $seqID); 99 | } 100 | } 101 | 102 | my $trim = 0; 103 | my $kmerLength = 17; ## number of bases in hash keys 104 | 105 | GetOptions("trim=s" => \$trim) or 106 | die("Error in command line arguments"); 107 | 108 | my $inQual = 0; # false 109 | my $seqID = ""; 110 | my $qualID = ""; 111 | my $seq = ""; 112 | my $qual = ""; 113 | my $buffer = ""; 114 | printf("%8s %5s %6s %6s %5s %5s %6s %5s %5s %6s %6s %6s %s\n", 115 | "length", "kRat", "cntRep", "cntTot", 116 | "medCt", "RCMed", "medGap", "modCt", "RCMod", 117 | "modGap", "rvCnt", "rcCnt", "SeqID"); 118 | while(<>){ 119 | chomp; 120 | chomp; 121 | if(!$inQual){ 122 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 123 | my $newSeqID = $2; 124 | my $newShortID = $3; 125 | printStats($seq, $seqID, $trim, $kmerLength); 126 | $seq = ""; 127 | $qual = ""; 128 | $buffer = ""; 129 | $seqID = $newSeqID; 130 | } elsif(/^\+(.*)$/) { 131 | $inQual = 1; # true 132 | $qualID = $1; 133 | } else { 134 | $seq .= $_; 135 | } 136 | } else { 137 | $qual .= $_; 138 | if(length($qual) >= length($seq)){ 139 | $inQual = 0; # false 140 | } 141 | } 142 | } 143 | 144 | printStats($seq, $seqID, $trim, $kmerLength); 145 | -------------------------------------------------------------------------------- /fastx-hplength.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | ## fastx-hplength.pl -- get statistics on homopolymers in a fastq/fasta file 6 | 7 | use Getopt::Long qw(:config auto_help pass_through); 8 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 9 | 10 | my $quiet = 0; 11 | my $mode = "ACGT"; 12 | 13 | GetOptions("quiet!" => \$quiet, "mode=s" => \$mode) or 14 | die("Error in command line arguments"); 15 | 16 | # unknown commands are treated as identifiers 17 | my @files = (); 18 | while(@ARGV){ 19 | my $arg = shift(@ARGV); 20 | if(-e $arg){ 21 | push(@files, $arg); 22 | } 23 | } 24 | @ARGV = @files; 25 | 26 | # use stdin if no files supplied 27 | if(!@ARGV){ 28 | @ARGV = '-' unless (-t STDIN); 29 | } 30 | 31 | my %hpCounts = (); 32 | 33 | my $baseCount = 0; 34 | my $inQual = 0; # false 35 | my $seqID = ""; 36 | my $qualID = ""; 37 | my $seq = ""; 38 | my $qual = ""; 39 | 40 | foreach my $file (@ARGV) { 41 | # This little gunzip dance makes sure the script can handle both 42 | # gzip-compressed and uncompressed input, regardless of whether 43 | # or not it is piped 44 | my $z = new IO::Uncompress::Gunzip($file, "transparent", 1) 45 | or die "gunzip failed: $GunzipError\n"; 46 | while(<$z>){ 47 | s/\s+$//; # remove ending whitespace 48 | if (!$inQual) { 49 | if (/^(>|@)((.+?)( .*?\s*)?)$/) { 50 | my $newSeqID = $2; 51 | my $newShortID = $3; 52 | $baseCount += length($seq); 53 | my $cur = ""; 54 | my $cchr = ""; 55 | if($mode ne "ACGT"){ 56 | $seq =~ tr/ACGT/ABBB/ if ($mode eq "B"); 57 | $seq =~ tr/ACGT/DCDD/ if ($mode eq "D"); 58 | $seq =~ tr/ACGT/HHGH/ if ($mode eq "H"); 59 | $seq =~ tr/ACGT/VVVT/ if ($mode eq "V"); 60 | $seq =~ tr/ACGT/RYRY/ if (($mode eq "RY") || ($mode eq "YR")); 61 | $seq =~ tr/ACGT/WSSW/ if (($mode eq "WS") || ($mode eq "SW")); 62 | $seq =~ tr/ACGT/MMKK/ if (($mode eq "MK" || $mode eq "KM")); 63 | } 64 | grep { # collect homopolymers 65 | if($_ ne $cchr){ 66 | $hpCounts{$cur}++ if($cur); 67 | $cur = $cchr = $_; 68 | } else { 69 | $cur .= $cchr; 70 | } 71 | } split(//, $seq); 72 | $hpCounts{$cur}++ if($cur); # collect remaining homopolymer (if any) 73 | $seq = ""; 74 | $qual = ""; 75 | $seqID = $newSeqID; 76 | } elsif (/^\+(.*)$/) { 77 | $inQual = 1; # true 78 | $qualID = $1; 79 | } else { 80 | $seq .= uc($_); 81 | } 82 | } else { 83 | $qual .= $_; 84 | if (length($qual) >= length($seq)) { 85 | $inQual = 0; # false 86 | } 87 | } 88 | } 89 | close($z); 90 | } 91 | 92 | $mode = uc($mode); 93 | 94 | $baseCount += length($seq); 95 | my $cur = ""; 96 | my $cchr = ""; 97 | if($mode ne "ACGT"){ 98 | $seq =~ tr/ACGT/ABBB/ if ($mode eq "B"); 99 | $seq =~ tr/ACGT/DCDD/ if ($mode eq "D"); 100 | $seq =~ tr/ACGT/HHGH/ if ($mode eq "H"); 101 | $seq =~ tr/ACGT/VVVT/ if ($mode eq "V"); 102 | $seq =~ tr/ACGT/RYRY/ if (($mode eq "RY") || ($mode eq "YR")); 103 | $seq =~ tr/ACGT/WSSW/ if (($mode eq "WS") || ($mode eq "SW")); 104 | $seq =~ tr/ACGT/MMKK/ if (($mode eq "MK" || $mode eq "KM")); 105 | } 106 | grep { # collect homopolymers 107 | if($_ ne $cchr){ 108 | $hpCounts{$cur}++ if($cur); 109 | $cur = $cchr = $_; 110 | } else { 111 | $cur .= $cchr; 112 | } 113 | } split(//, $seq); 114 | $hpCounts{$cur}++ if($cur); # collect remaining homopolymer (if any) 115 | 116 | my $cumCount = 0; 117 | foreach my $hpChar (sort {length($a) <=> length($b) || $a cmp $b} 118 | (keys(%hpCounts))){ 119 | my $hpCount = $hpCounts{$hpChar}; 120 | my $hpBaseCount = $hpCount * length($hpChar); 121 | $cumCount += $hpBaseCount; 122 | printf("%10d %10d ( %6.2f%% / %6.2f%% ) %10s %s %d\n", 123 | $hpCount, $hpBaseCount, 124 | $hpBaseCount * 100 / $baseCount, 125 | $cumCount * 100 / $baseCount, 126 | (length($hpChar) < 10 ? $hpChar : substr($hpChar,0,1)."........."), 127 | (length($hpChar) < 10 ? ":" : "x" ), length($hpChar)); 128 | } 129 | 130 | printf(STDERR "Total sequence length: %d\n", $baseCount) unless $quiet; 131 | -------------------------------------------------------------------------------- /fmod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import array 5 | import struct 6 | import wave 7 | import csv 8 | from math import sin, asin, pi, log, exp, sqrt 9 | 10 | ## Carry out frequency modulation; vary the frequency of a wave based 11 | ## on a signal 12 | ## 13 | ## Here is the rough process: 14 | # 1) find out the phase of the current frequency at the previous position 15 | # 2) calculate the new amplitude by adding one time slice to that phase 16 | 17 | def fmod(outFile, signal, minFreq, maxFreq, oldRate, newRate, 18 | speed=1.0, volume=0.8, logScale=True): 19 | oldRate = oldRate * speed 20 | logRange = log(maxFreq) - log(minFreq) 21 | linRange = maxFreq - minFreq 22 | meanSig = sum(signal) / len(signal) 23 | madSig = sum(map(lambda x: abs(x - meanSig), signal)) / len(signal) 24 | minSig = meanSig - madSig * 4; 25 | maxSig = meanSig + madSig * 4; 26 | if(min(signal) > minSig): 27 | minSig = min(signal) 28 | if(max(signal) < maxSig): 29 | maxSig = max(signal) 30 | sys.stderr.write("Min: %f, Max: %f, Signal: %d\n" % 31 | (minSig, maxSig, len(signal))) 32 | ## limit signal to within MAD range, scale to 0..100 33 | signal = map(lambda x: float(0) if (x < minSig) else 34 | (float(99) if x > maxSig else 35 | float(99) * (x - minSig) / (maxSig - minSig)), signal) 36 | newFreqs = (map( 37 | lambda x: exp((log(x + 1) / log(100)) * logRange + log(minFreq)), 38 | signal)) if logScale else ( 39 | map(lambda x: (x/100) * linRange + minFreq, signal)) 40 | # number of sound samples per signal sample 41 | newPerOld = (float(newRate) / (float(oldRate))) 42 | amp = [0] * int(len(signal) * newPerOld) 43 | # start in first quadrant [0..pi/2) 44 | quadrant = 0 45 | oldPhase = 0 46 | oldAmplitude = 0 47 | for s in xrange(len(signal)-1): 48 | ## work out phase of previous signal 49 | if(quadrant == 0): 50 | oldPhase = asin(oldAmplitude) 51 | if(quadrant == 1): 52 | oldPhase = pi - asin(oldAmplitude) 53 | if(quadrant == 2): 54 | oldPhase = pi - asin(oldAmplitude) 55 | if(quadrant == 3): 56 | oldPhase = 2 * pi + asin(oldAmplitude) 57 | newFreq = newFreqs[s+1] 58 | ## determine phase step (for input) 59 | sigPhaseStep = (newFreq * 2 * pi / oldRate) 60 | ## determine phase step (for output) 61 | outPhaseStep = (newFreq * 2 * pi / newRate) 62 | ## determine new phase 63 | newPhase = (oldPhase + sigPhaseStep) % (2 * pi) 64 | ## calculate new quadrant 65 | quadrant = int(newPhase / (pi/2)) 66 | ## determine new amplitude 67 | oldAmplitude = sin(newPhase) 68 | ## write signal to file 69 | sStart = int(s*newPerOld); 70 | sEnd = int((s+1)*newPerOld); 71 | packedSamples = map( 72 | lambda x: struct.pack('h',int(sin(oldPhase + x*outPhaseStep) * 73 | volume * 32767)), 74 | xrange(sEnd-sStart)) 75 | fmodOut.writeframes(''.join(packedSamples)) 76 | 77 | rate=int(sys.argv[2]) 78 | 79 | if(".csv" in sys.argv[1]): 80 | with open(sys.argv[1]) as csvfile: 81 | myreader = csv.reader(csvfile, delimiter=",", quotechar='"') 82 | data = array.array('f') 83 | for row in myreader: 84 | data.append(float(row[1])) 85 | outRate = 44100 86 | fmodOut = wave.open('out.wav', 'w') 87 | fmodOut.setparams((1, 2, outRate, 0, 'NONE', 'not compressed')) 88 | fmod(outFile=fmodOut, signal=data, minFreq=50, maxFreq=1000, 89 | speed=1.0, 90 | oldRate=rate, newRate=outRate, volume=0.1) 91 | fmodOut.close() 92 | pass 93 | else: 94 | with open(sys.argv[1], "rb") as f: 95 | outRate = 44100 96 | inData = f.read() 97 | if(len(inData) % 2 == 1): 98 | inData = inData[:-1] 99 | sys.stderr.write("Input length: %d\n" % len(inData)) 100 | data = array.array('H', inData) 101 | #print(",".join(map(str,newData[:50]))) 102 | fmodOut = wave.open('out.wav', 'w') 103 | fmodOut.setparams((1, 2, outRate, 0, 'NONE', 'not compressed')) 104 | fmod(outFile=fmodOut, signal=data, minFreq=200, maxFreq=1000, 105 | speed=1.0, 106 | oldRate=rate, newRate=outRate, volume=0.1) 107 | fmodOut.close() 108 | 109 | #with open ("out.raw", "wb") as soundFile: 110 | # soundFile.write(newData) 111 | -------------------------------------------------------------------------------- /svgImport.r: -------------------------------------------------------------------------------- 1 | library(gridSVG); 2 | library(grid); 3 | library(XML); 4 | 5 | x = c(0, 0.5, 1, 0.5) 6 | y = c(0.5, 1, 0.5, 0) 7 | grid.newpage() 8 | grid.polygon(x,y, name="goodshape") 9 | pat <- pattern(linesGrob(gp=gpar(col="black",lwd=3)), 10 | width = unit(5, "mm"), height = unit(5, "mm"), 11 | dev.width = 1, dev.height = 1) 12 | # Registering pattern 13 | registerPatternFill("pat", pat) 14 | # Applying pattern fill 15 | grid.patternFill("goodshape", label = "pat") 16 | grid.export("test-pattern.svg") 17 | 18 | svg.data <- xmlParse("dna_linear_DS_v4_backbone.svg"); 19 | 20 | xmlGetAttr(svg.data,"viewBox") 21 | 22 | 23 | svginfo <- getNodeSet(svg.data, "//*[name()='svg']")[[1]]; 24 | svgVB <- xmlGetAttr(svginfo, "viewBox"); 25 | svgW <- xmlGetAttr(svginfo, "width"); 26 | svgH <- xmlGetAttr(svginfo, "height"); 27 | if(!is.null(svgVB)){ 28 | svgVB <- as.numeric(strsplit(svgVB,"\\s+")[[1]]); 29 | } else if(!is.null(svgW)){ 30 | svgVB <- c(0,0,svgW,svgH); 31 | } 32 | 33 | svg.paths <- xpathApply(svg.data, '//svg:path', fun=function(x){ 34 | list(d=xmlGetAttr(x, "d"), style=xmlGetAttr(x, "style")); 35 | }); 36 | 37 | grid.newpage(); 38 | 39 | for(myPath in svg.paths){ 40 | ## see https://stackoverflow.com/questions/4246077/matching-numbers-with-regular-expressions-only-digits-and-commas/4247184 41 | numRE <- "(([+-]?)(?=\\d|\\.\\d)\\d*(\\.\\d*)?([Ee]([+-]?\\d+))?)"; 42 | pathChunks <- 43 | unlist(regmatches(myPath$d, 44 | gregexpr(paste0("([MmLl]?\\s*",numRE, 45 | "[,\\s]",numRE,"|[zZ])"), perl=TRUE, 46 | myPath$d))); 47 | ## not yet implemented: 48 | ## # HhVv - Horiz / vert lines # CcSs - Curve (cubic bezier) 49 | ## # QqTt - Curve (quadratic) # Aa - Elliptical arc 50 | blankLineChunks <- grep("^[^MmLlHhVvCcSsQqTtAaZz]", pathChunks); 51 | pathChunks[blankLineChunks] <- paste0("#",pathChunks[blankLineChunks]); 52 | ## remove filler whitespace (if any) 53 | pathChunks <- sub(paste0("^(.)\\s+",numRE),"\\1\\2", pathChunks, perl=TRUE); 54 | path.df <- data.frame(chunk = pathChunks, stringsAsFactors=FALSE); 55 | path.df$command <- substr(path.df$chunk,1,1); 56 | path.df$remainder <- sub("^.","",path.df$chunk); 57 | path.df$posX <- as.numeric(sub(paste0("^",numRE,"(.*)$"),"\\1", 58 | path.df$remainder, perl=TRUE)); 59 | path.df$posY <- as.numeric(sub(paste0("^",numRE,"(,|\\s+)"),"", 60 | path.df$remainder, perl=TRUE)); 61 | #if(grepl("e",myPath$d)){ 62 | # break; 63 | #} 64 | command.rle <- rle(path.df$command); 65 | anonPoss <- which(command.rle$values == "#"); 66 | ## [8.3.2] "If a moveto is followed by multiple pairs of coordinates, the 67 | ## subsequent pairs are treated as implicit lineto commands." 68 | command.rle$values[anonPoss] <- 69 | ifelse(command.rle$values[anonPoss-1] == "M", "L", 70 | ifelse(command.rle$values[anonPoss-1] == "m", "l", "#")); 71 | path.df$command <- inverse.rle(command.rle); 72 | #if(path.df$command[1] == "M"){ 73 | # next; 74 | #} 75 | penX <- 0; penY <- 0; startX <- 0; startY <- 0; 76 | absPoss <- matrix(NA,nrow=nrow(path.df), ncol=2); 77 | for(li in seq_along(path.df$command)){ 78 | cmd <- path.df$command[li]; 79 | px <- path.df$posX[li]; py <- path.df$posY[li]; 80 | if((cmd == "m") || (cmd == "M")){ 81 | startX <- px; startY <- py; 82 | } else if(cmd == "l"){ 83 | px <- px + penX; py <- py + penY; 84 | } else if((cmd == "z") || (cmd == "Z")){ 85 | px <- startX; py <- startY; 86 | } 87 | absPoss[li,] <- c(px, py); 88 | penX <- px; penY <- py; 89 | } 90 | path.df[,c("absX","absY")] <- absPoss; 91 | path.df$absX <- (path.df$absX - svgVB[1]) / (svgVB[3] - svgVB[1]); 92 | path.df$absY <- 1 - (path.df$absY - svgVB[2]) / (svgVB[4] - svgVB[2]); 93 | pathStyle <- myPath$style; 94 | pathFill <- sub("^fill:(.*)[;$]","\\1", 95 | regmatches(pathStyle,gregexpr("fill:.*?[;$]",pathStyle))); 96 | pathStroke <- sub("^stroke:(.*)[;$]","\\1", 97 | regmatches(pathStyle,gregexpr("stroke:.*?[;$]",pathStyle))); 98 | if((tail(path.df$command,1) == "z") || (tail(path.df$command,1) == "Z")){ 99 | grid.polygon(x=head(path.df$absX,-1), y=head(path.df$absY,-1), 100 | gp=gpar(col=pathStroke,fill=pathFill)); 101 | } else { 102 | grid.lines(x=path.df$absX, y=path.df$absY, 103 | gp=gpar(col=pathStroke,fill=pathFill)); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /tped2trios.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # tped2trios.pl -- Convert plink tped/tfam format file to a BEAGLE 4 | # unphased trio data file. If a second file is provided indicating the 5 | # genotyped individuals, then only trios containing two (or three) 6 | # genotyped individuals will be output. 7 | 8 | # Author: David Eccles (gringer), 2012 9 | 10 | use strict; 11 | use warnings; 12 | 13 | sub usage { 14 | print("usage: ./tped2trios.pl [id list file] [options]\n"); 15 | print("\nConvert plink tped/tfam file to BEAGLE trio file\n"); 16 | print("\nOther Options:\n"); 17 | print("-3 : ensure all trio members are genotyped (instead of 2)\n"); 18 | print("-help : Only display this help message\n"); 19 | print("\n"); 20 | } 21 | 22 | my @files = (); 23 | 24 | my $tpedFileName = ""; 25 | my $listFileName = ""; 26 | my $trioMin = 2; 27 | 28 | # extract command line arguments 29 | while(@ARGV){ 30 | my $argument = shift @ARGV; 31 | if(-f $argument){ # file existence check 32 | if(!$tpedFileName){ 33 | $tpedFileName = $argument; 34 | printf(STDERR "Setting TPED file name to '%s'\n", $tpedFileName); 35 | } elsif (!$listFileName) { 36 | $listFileName = $argument; 37 | printf(STDERR "Setting genotyped individual list file ". 38 | "name to '%s'\n", $listFileName); 39 | } else { 40 | print(STDERR "Error: only two files can be specified\n"); 41 | usage(); 42 | exit(1); 43 | } 44 | } else { 45 | if($argument eq "-3"){ 46 | $trioMin = 3; 47 | print(STDERR "All trio individuals output will be in list file\n"); 48 | } 49 | if($argument eq "-help"){ 50 | usage(); 51 | exit(0); 52 | } 53 | } 54 | } 55 | 56 | if(!$tpedFileName){ 57 | print(STDERR "Error: no files specified, cannot continue\n"); 58 | usage(); 59 | exit(1); 60 | } 61 | 62 | my $tfamFileName = $tpedFileName; 63 | $tfamFileName =~ s/\.tped$/\.tfam/; 64 | 65 | if(!(-f $tfamFileName)){ 66 | print(STDERR "Error: TFAM file associated with TPED file does not exist\n"); 67 | printf(STDERR " - expecting '%s'\n", $tfamFileName); 68 | usage(); 69 | exit(1); 70 | } 71 | 72 | if(!$listFileName){ 73 | $listFileName = $tfamFileName; 74 | } 75 | 76 | my %genotypedIndivs = (); 77 | # get IDs for people who have been genotyped 78 | my $lineNum = 0; 79 | open(my $listFile, "<", $listFileName) or die("Cannot open $listFileName"); 80 | while(<$listFile>){ 81 | chomp; 82 | my @F = split(/\s+/, $_); 83 | # assume field 0 is family, field 1 is individual 84 | $genotypedIndivs{$F[1]."@".$F[0]} = $lineNum++; 85 | } 86 | close($listFile); 87 | printf(STDERR "Added %d genotyped IDs\n", scalar(keys(%genotypedIndivs))); 88 | 89 | my %tfamIndivs = (); 90 | my @trioIDs = (); 91 | # get locations/IDs and parents for indviduals in TFAM file 92 | $lineNum = 0; 93 | open(my $tfamFile, "<", $tfamFileName) or die("Cannot open $tfamFileName"); 94 | while(<$tfamFile>){ 95 | chomp; 96 | my @F = split(/\s+/, $_); 97 | # assume field 0 is family, field 1 is individual 98 | # beagle output should be parent, parent, offspring 99 | my @IDs = ($F[3]."@".$F[0], $F[2]."@".$F[0], $F[1]."@".$F[0]); 100 | # print(join(";",@IDs)."\n"); 101 | $tfamIndivs{$IDs[2]} = $lineNum++; 102 | if(scalar(grep {$genotypedIndivs{$_}} @IDs) >= $trioMin){ 103 | push(@trioIDs, @IDs); 104 | } 105 | } 106 | close($tfamFile); 107 | 108 | foreach my $id (@trioIDs){ 109 | if(!defined($tfamIndivs{$id})){ 110 | printf("column ID for $id is undefined, bailing out\n"); 111 | usage(); 112 | exit(1); 113 | # $genotypedIndivs{$id} = -1; 114 | } 115 | } 116 | 117 | # convert trio IDs into column numbers 118 | my @trioColumns = 119 | map {($tfamIndivs{$_} * 2, $tfamIndivs{$_} * 2 + 1)} @trioIDs; 120 | 121 | # print first line (list of IDs) 122 | print("I ind\@fam ".join(" ", map {($_,$_)} @trioIDs)."\n"); 123 | 124 | open(my $tpedFile, "<", $tpedFileName) or die("Cannot open $tpedFileName"); 125 | print(STDERR "Writing output (one '.' per 1,000 lines)..."); 126 | my $lineCount = 0; 127 | while(<$tpedFile>){ 128 | chomp; 129 | my @F = split(/\s+/, $_); 130 | push(@F, 0, 0); 131 | shift(@F); # remove chromosome number 132 | my $marker = shift(@F); # extract marker name 133 | shift(@F);shift(@F); # remove map position / base location 134 | # print out marker line with trio genotypes 135 | printf("M $marker %s\n", join(" ", map {$F[$_]} @trioColumns)); 136 | if($lineCount++ % 1000 == 0){ 137 | print(STDERR "."); 138 | } 139 | } 140 | print(STDERR " done!\n"); 141 | -------------------------------------------------------------------------------- /vcf2fq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Author: David Eccles (gringer) 2012 [based on vcf2fq by lh3] 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Std; 8 | 9 | &vcf2fq(); 10 | exit; 11 | 12 | sub vcf2fq { 13 | my %opts = (d=>3, D=>100000, Q=>10, L=>100, f=>""); 14 | getopts('d:D:Q:L:f:', \%opts); 15 | die(qq/ 16 | Usage: vcfutils.pl vcf2fq [options] 17 | 18 | Options: -d INT minimum depth [$opts{d}] 19 | -D INT maximum depth [$opts{D}] 20 | -Q INT min RMS mapQ [$opts{Q}] 21 | -L INT min INDEL Qual [$opts{L}] 22 | -f FASTA file with reference sequence(s) [$opts{f}] 23 | \n/) if (@ARGV == 0 && -t STDIN); 24 | 25 | my ($last_chr, $seq, $qual, $last_pos, @gaps); 26 | my $_Q = $opts{Q}; 27 | my $_L = $opts{L}; 28 | my $_d = $opts{d}; 29 | my $_D = $opts{D}; 30 | 31 | my %baseSeqs = (); 32 | 33 | if($opts{f}){ 34 | open(my $fastaFile, "<", $opts{f}) or die("unable to open reference sequence file: ".$opts{f}); 35 | my $seqID = ""; 36 | $seq = ""; 37 | while(<$fastaFile>){ 38 | chomp; 39 | if(/^>([^ ]+)/){ 40 | $baseSeqs{$seqID} = $seq unless ($seqID eq ""); 41 | $seqID = $1; 42 | $seq = ""; 43 | } else { 44 | $seq .= $_; 45 | } 46 | } 47 | $baseSeqs{$seqID} = $seq unless ($seqID eq ""); 48 | close($fastaFile); 49 | } 50 | 51 | my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', 52 | GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); 53 | 54 | $last_chr = ''; 55 | while (<>) { 56 | next if (/^#/); 57 | my @t = split; 58 | if ($last_chr ne $t[0]) { 59 | &v2q_post_process($last_chr, \$seq, \$qual, \@gaps) if ($last_chr); 60 | ($last_chr, $last_pos) = ($t[0], 0); 61 | $seq = $qual = ''; 62 | if(defined($baseSeqs{$t[0]})){ 63 | $seq .= $baseSeqs{$t[0]}; 64 | $qual .= '~' x length($seq); 65 | } 66 | @gaps = (); 67 | } 68 | die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); 69 | if ($t[1] > length($seq)) { 70 | $seq .= 'n' x ($t[1] - $last_pos); 71 | $qual .= '!' x ($t[1] - $last_pos); 72 | } 73 | if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference 74 | my ($ref, $alt) = ($t[3], $1); 75 | my ($b, $q); 76 | $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); 77 | if ($q < 0) { 78 | $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; 79 | $b = ($_ < .5 || $alt eq '.')? $ref : $alt; 80 | $q = -$q; 81 | } else { 82 | $b = $het{"$ref$alt"}; 83 | $b ||= 'N'; 84 | } 85 | $b = lc($b); 86 | $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); 87 | $q = int($q + 33 + .499); 88 | $q = chr($q <= 126? $q : 126); 89 | substr($seq,$t[1]-1,1) = $b; 90 | substr($qual,$t[1]-1,1) = $q; 91 | } elsif (($t[4] ne '.') && ($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && 92 | ($t[5] > $_L)) { 93 | # an INDEL 94 | my $fq = 126; 95 | if($t[7] =~ /MQ=(\d+)/){ 96 | $fq = $1; 97 | } 98 | push(@gaps, [$t[1], length($t[3]), (split(/,/,$t[4],2))[0], $fq]); 99 | } 100 | $last_pos = $t[1]; 101 | } 102 | &v2q_post_process($last_chr, \$seq, \$qual, \@gaps); 103 | } 104 | 105 | sub v2q_post_process { 106 | my ($chr, $seq, $qual, $gaps) = @_; 107 | for my $g (@$gaps) { 108 | # blank out replaced sequence (there was an off-by-1 error in the original code) 109 | substr($$seq, ($g->[0]-1), $g->[1]) = '-' x ($g->[1]); 110 | substr($$qual, ($g->[0]-1), $g->[1]) = ' ' x ($g->[1]); 111 | } 112 | my $newSeq = ''; 113 | my $newQual = ''; 114 | my $seqPos = 1; 115 | my $indelOffset = 0; 116 | for my $g (@$gaps) { 117 | $newSeq .= substr($$seq,$seqPos-1,($g->[0])-$seqPos); 118 | $newQual .= substr($$qual,$seqPos-1,($g->[0])-$seqPos); 119 | $indelOffset = $indelOffset - (($g->[0])-$seqPos); 120 | $indelOffset = 0 if ($indelOffset < 0); 121 | $seqPos = $g->[0]; 122 | $newSeq .= lc(substr($g->[2],$indelOffset)) unless ($indelOffset > length($g->[2])); 123 | my $q = int($g->[3] + 33 + .499); 124 | $q = chr($q <= 126? $q : 126); 125 | $newQual .= $q x (length($g->[2]) - $indelOffset) unless ($indelOffset > length($g->[2])); 126 | $indelOffset = $g->[1] unless ($indelOffset > $g->[1]); 127 | } 128 | if($seqPos < length($$seq)){ 129 | $newSeq .= substr($$seq,$seqPos); 130 | $newQual .= substr($$qual,$seqPos); 131 | } 132 | $newSeq =~ tr/\-//d; 133 | $newQual =~ tr/ //d; 134 | print "\@$chr\n"; &v2q_print_str(\$newSeq); 135 | print "+\n"; &v2q_print_str(\$newQual); 136 | } 137 | 138 | sub v2q_print_str { 139 | my ($s) = @_; 140 | my $l = length($$s); 141 | for (my $i = 0; $i < $l; $i += 60) { 142 | print substr($$s, $i, 60), "\n"; 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastn_params.html: -------------------------------------------------------------------------------- 1 |

Program Parameters

2 | 3 |
4 | General Parameters 5 | 6 |

7 | 8 | 22 |

23 | 24 |

25 | 26 | 27 |

28 | 29 |

30 | 31 | 32 |

33 | 34 |

35 | 36 | 43 |

44 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 52 |
53 | Scoring Parameters 54 |

55 | 56 | 65 |

66 | 67 |

68 | 69 | 79 |

80 | 81 |

82 | 83 | 96 |

97 |

98 | 99 | 107 |

108 | 109 |
110 | 111 | 114 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastp_params.html: -------------------------------------------------------------------------------- 1 |

Program Parameters

2 | 3 |
4 | General Parameters 5 | 6 |

7 | 8 | 22 |

23 | 24 |

25 | 26 | 27 |

28 | 29 |

30 | 31 | 32 |

33 | 34 |

35 | 36 | 43 |

44 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 52 |
53 | Scoring Parameters 54 |

55 | 56 | 65 |

66 | 67 |

68 | 69 | 79 |

80 | 81 |

82 | 83 | 96 |

97 |

98 | 99 | 107 |

108 | 109 |
110 | 111 | 114 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/blastx_params.html: -------------------------------------------------------------------------------- 1 |

Program Parameters

2 | 3 |
4 | General Parameters 5 | 6 |

7 | 8 | 22 |

23 | 24 |

25 | 26 | 27 |

28 | 29 |

30 | 31 | 32 |

33 | 34 |

35 | 36 | 43 |

44 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 52 |
53 | Scoring Parameters 54 |

55 | 56 | 65 |

66 | 67 |

68 | 69 | 79 |

80 | 81 |

82 | 83 | 96 |

97 |

98 | 99 | 107 |

108 | 109 |
110 | 111 | 114 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastn_params.html: -------------------------------------------------------------------------------- 1 |

Program Parameters

2 | 3 |
4 | General Parameters 5 | 6 |

7 | 8 | 22 |

23 | 24 |

25 | 26 | 27 |

28 | 29 |

30 | 31 | 32 |

33 | 34 |

35 | 36 | 43 |

44 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 52 |
53 | Scoring Parameters 54 |

55 | 56 | 65 |

66 | 67 |

68 | 69 | 79 |

80 | 81 |

82 | 83 | 96 |

97 |

98 | 99 | 107 |

108 | 109 |
110 | 111 | 114 | -------------------------------------------------------------------------------- /webblast/cgi-bin/templates/tblastx_params.html: -------------------------------------------------------------------------------- 1 |

Program Parameters

2 | 3 |
4 | General Parameters 5 | 6 |

7 | 8 | 22 |

23 | 24 |

25 | 26 | 27 |

28 | 29 |

30 | 31 | 32 |

33 | 34 |

35 | 36 | 43 |

44 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 52 |
53 | Scoring Parameters 54 |

55 | 56 | 65 |

66 | 67 |

68 | 69 | 79 |

80 | 81 |

82 | 83 | 96 |

97 |

98 | 99 | 107 |

108 | 109 |
110 | 111 | 114 | -------------------------------------------------------------------------------- /fastx-fetch.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | 8 | my $idFileName = ""; 9 | my $quiet = 0; 10 | my $minLength = 1; 11 | my $maxLength = 10 ** 12; # 1 Tbp 12 | my $count = -1; 13 | my $invert = 0; # invert logic 14 | my $trim = 0; 15 | my $trimString = ""; 16 | 17 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet, 18 | "reverse|v!" => \$invert, "trim=i" => \$trim, 19 | "minLength=i" => \$minLength, "maxLength=i" => \$maxLength, 20 | "count=i" => \$count, "nametrim=s" => \$trimString, ) or 21 | die("Error in command line arguments"); 22 | 23 | my %idsToGet = (); 24 | if($trim){ 25 | $minLength = $minLength + $trim * 2; 26 | $maxLength = $maxLength + $trim * 2; 27 | } 28 | 29 | if($trimString){ 30 | printf(STDERR "Will remove text matching '(%s)' from the sequence IDs", 31 | $trimString); 32 | $trimString =~ s/^\|//; 33 | $trimString = "($trimString)"; 34 | } 35 | 36 | # unknown commands are treated as identifiers 37 | my @files = (); 38 | while(@ARGV){ 39 | my $arg = shift(@ARGV); 40 | if(-e $arg){ 41 | push(@files, $arg); 42 | } else { 43 | $idsToGet{$arg} = 1; 44 | } 45 | } 46 | @ARGV = @files; 47 | 48 | # use stdin if no files supplied 49 | if(!@ARGV){ 50 | @ARGV = '-' unless (-t STDIN); 51 | } 52 | 53 | if($idFileName){ 54 | # read sequence IDs from input file 55 | if(!$quiet){ 56 | printf(STDERR "Attempting to read from input file ($idFileName)\n"); 57 | } 58 | my $idFile = new IO::Uncompress::Gunzip "$idFileName" or 59 | die "Unable to open $idFileName\n"; 60 | while(<$idFile>){ 61 | chomp; 62 | s/^[>@]//; 63 | s/\s.*$//; 64 | $idsToGet{$_} = 1; 65 | } 66 | close($idFile); 67 | } 68 | 69 | if(!$quiet){ 70 | printf(STDERR "Read %d identifiers\n", scalar(keys(%idsToGet))); 71 | } 72 | 73 | if(!$quiet && $invert){ 74 | printf(STDERR "Excluding IDs, rather than selecting\n"); 75 | } else { 76 | ## Stop when all IDs have been seen 77 | if(($count == -1) && (keys(%idsToGet))){ 78 | $count = scalar(keys(%idsToGet)); 79 | } 80 | } 81 | 82 | my $inQual = 0; # false 83 | my $seqID = ""; 84 | my $qualID = ""; 85 | my $seq = ""; 86 | my $qual = ""; 87 | foreach my $file (@ARGV) { 88 | # This little gunzip dance makes sure the script can handle both 89 | # gzip-compressed and uncompressed input, regardless of whether 90 | # or not it is piped 91 | my $z = new IO::Uncompress::Gunzip($file, "transparent", 1) 92 | or die "gunzip failed: $GunzipError\n"; 93 | while(<$z>){ 94 | chomp; 95 | chomp; 96 | if (!$inQual) { 97 | if (/^(>|@)((.+?)( .*?\s*)?)$/) { 98 | my $newSeqID = $2; 99 | my $newShortID = $3; 100 | my $testSeqID = $newSeqID; 101 | my $testShortID = $newShortID; 102 | if($trimString){ 103 | $testShortID =~ s/$trimString//; 104 | $testSeqID =~ s/$trimString//; 105 | } 106 | if ($seqID && (length($seq) >= $minLength) && (length($seq) <= $maxLength)) { 107 | if ($trim > 0) { 108 | $seq = substr($seq, $trim, length($seq)-($trim * 2)); 109 | if ($qual) { 110 | $qual = substr($qual, $trim, length($qual)-($trim * 2)); 111 | } 112 | } 113 | if ($qual) { 114 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 115 | } else { 116 | $seq =~ s/(.{100})/$1\n/g; 117 | $seq =~ s/\n$//; 118 | printf(">%s\n%s\n", $seqID, $seq); 119 | } 120 | if (--$count == 0) { 121 | $seqID = ""; 122 | last; 123 | } 124 | } 125 | $seq = ""; 126 | $qual = ""; 127 | if ((!(keys(%idsToGet)) || exists($idsToGet{$testSeqID}) || exists($idsToGet{$testShortID})) xor $invert) { 128 | delete $idsToGet{$testSeqID}; 129 | delete $idsToGet{$testShortID}; 130 | $seqID = $newSeqID; 131 | } else { 132 | $seqID = ""; 133 | } 134 | } elsif (/^\+(.*)$/) { 135 | $inQual = 1; # true 136 | $qualID = $1; 137 | } else { 138 | $seq .= $_; 139 | } 140 | } else { 141 | $qual .= $_; 142 | if (length($qual) >= length($seq)) { 143 | $inQual = 0; # false 144 | } 145 | } 146 | } 147 | } 148 | 149 | if($seqID && (length($seq) >= $minLength) && (length($seq) <= $maxLength)){ 150 | if($trim > 0){ 151 | $seq = substr($seq, $trim, length($seq)-($trim * 2)); 152 | if($qual){ 153 | $qual = substr($qual, $trim, length($qual)-($trim * 2)); 154 | } 155 | } 156 | if($qual){ 157 | printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual); 158 | } else { 159 | $seq =~ s/(.{100})/$1\n/g; 160 | $seq =~ s/\n$//; 161 | printf(">%s\n%s\n", $seqID, $seq); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /readthrough.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import HTSeq 3 | import string 4 | from Bio.Seq import Seq 5 | from Bio.Alphabet import generic_dna 6 | import re 7 | import sys 8 | 9 | def writeSeqTrans(name, exons, seq, interval): 10 | if(interval.strand == "-"): 11 | #print(">%s (rc)\n%s" % (name, seq)) 12 | seq = string.replace(seq,"|","") 13 | translation = str(Seq(seq, generic_dna).reverse_complement().translate()) 14 | endText = " from RC strand" 15 | else: 16 | #print(">%s\n%s" % (name, seq)) 17 | seq = string.replace(seq,"|","") 18 | translation = str(Seq(seq, generic_dna).translate()) 19 | endText = "" 20 | translation = string.replace(translation, "*", "X", 1) 21 | translation = re.sub("\\*.*$","*",translation) 22 | if(translation.endswith("X")): 23 | endText += ", no sequence beyond stop codon" 24 | translation = string.replace(translation, "X", "*", 1) 25 | throughDist = 0 26 | elif(not translation.endswith("*")): 27 | endText += ", no additional stop codons found" 28 | throughDist = 0 29 | else: 30 | throughDist = translation.find("*") - translation.find("X") 31 | endText += ", stop codon distance: %d" % throughDist 32 | name = "%05d_%s" % (throughDist, name) 33 | if(numExons == 1): 34 | print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation)) 35 | else: 36 | print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation)) 37 | 38 | def writeSeqMrna(name, exons, seq, interval): 39 | if(interval.strand == "-"): 40 | #print(">%s (rc)\n%s" % (name, seq)) 41 | seq = string.replace(seq,"|","") 42 | translation = str(Seq(seq, generic_dna).reverse_complement().translate()) 43 | endText = " from RC strand" 44 | else: 45 | #print(">%s\n%s" % (name, seq)) 46 | seq = string.replace(seq,"|","") 47 | translation = str(Seq(seq, generic_dna).translate()) 48 | endText = "" 49 | translation = string.replace(translation, "*", "X", 1) 50 | translation = re.sub("\\*.*$","*",translation) 51 | if(translation.endswith("X")): 52 | endText += ", no sequence beyond stop codon" 53 | translation = string.replace(translation, "X", "*", 1) 54 | throughDist = 0 55 | elif(not translation.endswith("*")): 56 | endText += ", no additional stop codons found" 57 | throughDist = 0 58 | else: 59 | throughDist = translation.find("*") - translation.find("X") 60 | endText += ", stop codon distance: %d" % throughDist 61 | name = "%05d_%s" % (throughDist, name) 62 | if(numExons == 1): 63 | print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation)) 64 | else: 65 | print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation)) 66 | 67 | 68 | # load Scer reference genome 69 | scerFile = open('saccharomyces_cerevisiae.gff', 'r') 70 | gffLines = () 71 | fastaLines = () 72 | hitFasta = False 73 | for(line in scerFile): 74 | if(line.startswith('##FASTA')): 75 | hitFasta = True 76 | if(not hitFasta): 77 | gfflines.append(line) 78 | else if(not line.startswith('#')): 79 | fastaLines.append(line) 80 | 81 | 82 | gtfFile = HTSeq.GFF_Reader(gffLines) 83 | # load all sequences into memory 84 | sequences = dict() 85 | for s in HTSeq.FastaReader(fastaLines): 86 | s.seq = string.replace(s.seq,"\r","") 87 | sequences[s.name] = s 88 | 89 | lastSequence = "" 90 | lastName = "" 91 | lastInterval = None 92 | extend = 50 # number of base pairs to extend 93 | readDistance = 5000 94 | numExons = 0 95 | numFeatures = 0 96 | sys.stderr.write("\n") 97 | for feature in gtfFile: 98 | sys.stderr.write("\rFeatures read: %d" % numFeatures) 99 | numFeatures += 1 100 | if(feature.type != "CDS"): 101 | continue 102 | if(feature.name != lastName): 103 | if((lastInterval != None) and (lastInterval.strand != "-")): 104 | # add readDistance bases to end of sequence 105 | lastSequence += sequences[lastInterval.chrom].seq[lastInterval.end:lastInterval.end+readDistance] 106 | # write out sequence / translation 107 | if(lastName != ""): 108 | writeSeqTrans(lastName, numExons, lastSequence, lastInterval) 109 | lastSequence = "" 110 | numExons = 0 111 | if(feature.iv.strand == "-"): 112 | # add readDistance bases to start of sequence 113 | startPos = lastInterval.start - readDistance 114 | if(startPos < 0): 115 | startPos = 0 116 | lastSequence += sequences[feature.iv.chrom].seq[startPos:lastInterval.end] 117 | lastName = feature.name 118 | lastInterval = feature.iv 119 | # add current interval to sequence 120 | lastSequence += sequences[feature.iv.chrom].seq[feature.iv.start:feature.iv.end] 121 | lastSequence += "|" 122 | numExons += 1 123 | 124 | # write out sequence / translation 125 | writeSeqTrans(lastName, numExons, lastSequence, lastInterval) 126 | 127 | sys.stderr.write("\n") 128 | -------------------------------------------------------------------------------- /spiralign.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | setwd("~/bioinf/presentations/2017-Sep-03"); 4 | 5 | type <- "aa"; 6 | 7 | if(type == "nucl"){ 8 | ## Nucleotides 9 | library(msa); 10 | input.seqs <- readDNAStringSet("notUSCO_EOG091H04CB.tran.fa"); 11 | names(input.seqs) <- sub(" .*$","",names(input.seqs)); 12 | msa.df <- data.frame(t(as.matrix(msa(input.seqs))), stringsAsFactors=FALSE); 13 | colnames(msa.df) <- 1:ncol(msa.df); 14 | 15 | efg.cols <- c("G" = "gold", "C" = "blue", "A" = "darkgreen", 16 | "T" = "red", "-" = "grey20"); 17 | 18 | png("msa.png", width=1024, height=1024, pointsize=24); 19 | par(mar=c(0.5,0.5,0.5,0.5), bg="black"); 20 | loops <- 5; 21 | lstt <- 3; 22 | lend <- loops+lstt; 23 | ## integrate(2*pi*r,r=lstt..x) 24 | ## => pi(x²-(lstt)²) 25 | dTot <- pi*((lstt + loops)^2 - (lstt)^2); ## total "distance" travelled 26 | ## s = pi(x²-(lstt)²) 27 | ## => s/pi = x² - (lstt)² 28 | ## => x = sqrt((lstt)² + s/pi) 29 | msa.df$s <- seq(0,dTot, length.out=nrow(msa.df)); ## distance at each pos 30 | msa.df$r <- sqrt(lstt^2 + msa.df$s/pi); ## path radius at each pos 31 | msa.df$theta <- msa.df$r * 2*pi; ## traversed angle at each pos 32 | msa.df$deg <- (msa.df$theta / (2*pi)) * 360; 33 | msa.df$x <- msa.df$r * cos(msa.df$theta); 34 | msa.df$y <- msa.df$r * sin(msa.df$theta); 35 | plot(NA,xlim=c(-lend,lend), ylim=c(-lend,lend), ann=FALSE, axes=FALSE); 36 | pcex <- 0.45; 37 | for(p in seq(1,nrow(msa.df))){ 38 | pr <- msa.df$r[p]; 39 | pt <- msa.df$theta[p]; 40 | text(x=-(pr-0.25)*cos(pt), y=(pr-0.25)*sin(pt), labels="▅", 41 | srt=-msa.df$deg[p], 42 | cex=pcex*0.9, col=efg.cols[msa.df[p,1]]); 43 | text(x=-pr*cos(pt), y=pr*sin(pt), labels="▅", 44 | srt=-msa.df$deg[p], 45 | cex=pcex, col=efg.cols[msa.df[p,2]]); 46 | text(x=-(pr+0.25)*cos(pt), y=(pr+0.25)*sin(pt), labels="▅", 47 | srt=-msa.df$deg[p], 48 | cex=pcex*1.05, col=efg.cols[msa.df[p,3]]); 49 | } 50 | text(0,0.5, expression(italic(Nippostrongylus)), col="white"); 51 | text(0,0, expression(italic(brasiliensis)), col="white"); 52 | text(0,-0.5, "Fructose-1,6-bisphosphatase", col="white", cex=0.75); 53 | invisible(dev.off()); 54 | } 55 | 56 | ## Amino Acids 57 | library(msa); 58 | input.seqs <- readAAStringSet("notUSCO_EOG091H04CB.prot.fa"); 59 | names(input.seqs) <- sub(" .*$","",names(input.seqs)); 60 | msa.df <- data.frame(t(as.matrix(msa(input.seqs, order="input"))), 61 | stringsAsFactors=FALSE); 62 | msa.df <- msa.df[nrow(msa.df):1,] 63 | 64 | rasmol.cols <- c("D" = "#E60A0A", "E" = "#E60A0A", 65 | "C" = "#E6E600", "M" = "#E6E600", 66 | "K" = "#145AFF", "R" = "#145AFF", 67 | "S" = "#FA9600", "T" = "#FA9600", 68 | "F" = "#3232AA", "Y" = "#3232AA", 69 | "N" = "#00DCDC", "Q" = "#00DCDC", 70 | "G" = "#EBEBEB", 71 | "L" = "#0F820F", "V" = "#0F820F", "I" = "#0F820F", 72 | "A" = "#C8C8C8", 73 | "W" = "#B45AB4", 74 | "H" = "#8282D2", 75 | "P" = "#DC9682", 76 | "-" = "grey20", "X" = "grey20"); 77 | rasmol.cats <- tapply(names(rasmol.cols),rasmol.cols,paste,collapse=","); 78 | 79 | png("msa_aa.png", width=1024, height=1024, pointsize=24); 80 | par(mar=c(0.5,0.5,0.5,0.5), bg="black"); 81 | loops <- 2.75; 82 | lstt <- 3; 83 | lend <- loops+lstt; 84 | ## integrate(2*pi*r,r=lstt..x) 85 | ## => pi(x²-(lstt)²) 86 | dTot <- pi*((lstt + loops)^2 - (lstt)^2); ## total "distance" travelled 87 | ## s = pi(x²-(lstt)²) 88 | ## => s/pi = x² - (lstt)² 89 | ## => x = sqrt((lstt)² + s/pi) 90 | msa.df$s <- seq(0,dTot, length.out=nrow(msa.df)); ## distance at each pos 91 | msa.df$r <- sqrt(lstt^2 + msa.df$s/pi); ## path radius at each pos 92 | msa.df$theta <- msa.df$r * 2*pi; ## traversed angle at each pos 93 | msa.df$deg <- (msa.df$theta / (2*pi)) * 360; 94 | msa.df$x <- msa.df$r * cos(msa.df$theta); 95 | msa.df$y <- msa.df$r * sin(msa.df$theta); 96 | plot(NA,xlim=c(-lend,lend), ylim=c(-lend,lend), ann=FALSE, axes=FALSE); 97 | pcex <- 0.8; 98 | for(p in seq(1,nrow(msa.df))){ 99 | pr <- msa.df$r[p]; 100 | pt <- msa.df$theta[p]; 101 | pym <- length(input.seqs); 102 | pyr <- seq(-(pym-2)/(pym-1),(pym-2)/(pym-1), length.out=pym)/2; 103 | for(py in 1:length(input.seqs)){ 104 | text(x=-(pr+pyr[py])*cos(pt), y=(pr+pyr[py])*sin(pt), labels="▅", 105 | srt=-msa.df$deg[p], 106 | cex=pcex, col=rasmol.cols[msa.df[p,py]]); 107 | } 108 | } 109 | for(py in 1:length(input.seqs)){ 110 | text(x=-(max(msa.df$r)+pyr[py])*cos(max(msa.df$theta)), 111 | y=(max(msa.df$r)+pyr[py])*sin(max(msa.df$theta))-0.02, 112 | labels=names(input.seqs)[py], 113 | srt=0, pos=2, 114 | cex=0.6, col="white"); 115 | } 116 | text(0,0.5, expression(italic(Nippostrongylus)), col="white"); 117 | text(0,0, expression(italic(brasiliensis)), col="white"); 118 | text(0,-0.5, "Fructose-1,6-bisphosphatase", col="white", cex=0.75); 119 | legend("topleft", fill=names(rasmol.cats), text.col="white", 120 | legend=rasmol.cats, ncol=2, cex=0.8); 121 | invisible(dev.off()); 122 | -------------------------------------------------------------------------------- /fastx-repeatFilter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use Pod::Usage; ## uses pod documentation in usage code 7 | use Getopt::Long qw(:config auto_version auto_help); 8 | use POSIX; ## for ceil 9 | 10 | our $VERSION = "0.1"; 11 | our $DEBUG = 0; 12 | 13 | =head1 NAME 14 | 15 | fastx-repeatFilter.pl -- Repeat Match Algorithm for Long-read Evaluation 16 | 17 | =head1 SYNOPSIS 18 | 19 | ./fastx-repeatFilter.pl [options] 20 | 21 | =head2 Options 22 | 23 | =over 2 24 | 25 | =item B<-help> 26 | 27 | Only display this help message 28 | 29 | =item B<-lmin> 30 | 31 | Minimum repeat length 32 | 33 | =item B<-lmax> 34 | 35 | Maximum repeat length 36 | 37 | =item B<-length> 38 | 39 | Repeat length; sets I and I to the same value 40 | 41 | =item B<-skew> 42 | 43 | Maximum skew between repeats for matching lines 44 | 45 | =item B<-threshold> 46 | 47 | Threshold score for identifying repeat reads 48 | 49 | =item B<-fraction> 50 | 51 | Fraction of read that must pass threshold score 52 | 53 | =item B<-reverse> 54 | 55 | Remove repeat-containing reads instead of extracting them 56 | 57 | =back 58 | 59 | =head1 DESCRIPTION 60 | 61 | Identifies (filters) repeat-containing reads in a long-read dataset. 62 | 63 | =head1 METHODS 64 | 65 | something 66 | 67 | =cut 68 | 69 | =head2 processSeq(id, seq, qual, lmin, lmax, skew, threshold, fraction, reverse) 70 | 71 | Analyses the sequence I to work out if it is likely to contain a 72 | substantial proportion of repeats of length in the range I to 73 | I. Depending on the value of I and whether a repeat 74 | read was detected, either return an empty string or the sequence. 75 | 76 | =cut 77 | 78 | sub processSeq{ 79 | my ($id, $seq, $qual, $lmin, $lmax, $skew, 80 | $threshold, $fraction, $reverse) = @_; 81 | if(!$id || (length($seq) < $lmin)){ 82 | if($reverse){ 83 | if($qual){ 84 | print("@${id}\n${seq}\n+\n${qual}\n"); 85 | } else { 86 | print(">${id}\n${seq}\n"); 87 | } 88 | return; 89 | } 90 | } 91 | foreach my $len ($lmin..$lmax){ 92 | my @scores = (); 93 | for(my $spos = $skew; ($spos+2*$len+$skew+1) < length($seq); $spos += $len){ 94 | my $maxScore = 0; 95 | foreach my $ofs (-$skew..$skew){ 96 | my $score=0; 97 | foreach my $c (0..($len-1)){ 98 | if(substr($seq,$c+$spos,1) eq substr($seq,$spos+$c+$len+$ofs,1)){ 99 | $score++; 100 | } 101 | } 102 | if($score > $maxScore){ 103 | $maxScore = $score; 104 | } 105 | # printf("---\n%s\n%s\n--- [%d+%d, %0.2f]\n", 106 | # substr($seq,$spos,$len), 107 | # substr($seq,$spos+$len+$ofs,$len), 108 | # $spos, $ofs, $score); 109 | } 110 | push(@scores, $maxScore); 111 | # printf("%3d %0.2f\n", $spos, $maxScore / $len); 112 | } 113 | @scores = (sort {$b <=> $a} (@scores))[0..($#scores * $fraction)]; 114 | print("${len},$scores[0],$scores[$#scores]\n"); 115 | } 116 | #printf("Length %d, min score with fraction %0.2f: %0.2f\n", 117 | # $len, $fraction, $scores[$#scores]/$len); 118 | } 119 | 120 | 121 | #################################################### 122 | # Command line parsing and verification starts here 123 | #################################################### 124 | 125 | my $argLine = join(" ",@ARGV); 126 | 127 | my $options = 128 | { 129 | "length" => 100, 130 | "lmin" => -1, 131 | "lmax" => -1, 132 | "skew" => 3, 133 | "threshold" => 0.8, 134 | "fraction" => 0.5, 135 | "reverse" => 0 136 | }; 137 | 138 | GetOptions($options, 139 | 'length|l=i', 140 | 'lmin=i', 141 | 'lmax=i', 142 | 'skew=i', 143 | 'threshold=f', 144 | 'fraction=i', 145 | 'reverse|v!', 146 | 'debug!' => \$DEBUG, 147 | ) or pod2usage(1); 148 | 149 | if($options->{"skew"} >= $options->{"length"}){ 150 | die("Skew must be less than repeat length"); 151 | } 152 | 153 | if(($options->{lmin} == -1) || ($options->{lmax} == -1)){ 154 | $options->{lmin} = $options->{length}; 155 | $options->{lmax} = $options->{length}; 156 | } 157 | 158 | my $inQual = 0; # false 159 | my $seqID = ""; 160 | my $qualID = ""; 161 | my $seq = ""; 162 | my $qual = ""; 163 | while(<>){ 164 | chomp; 165 | chomp; 166 | if(!$inQual){ 167 | if(/^(>|@)((.+?)( .*?\s*)?)$/){ 168 | my $newSeqID = $2; 169 | my $newShortID = $3; 170 | if($seqID){ 171 | processSeq($seqID, $seq, $qual, 172 | $options->{"lmin"}, $options->{"lmax"}, 173 | $options->{"skew"}, 174 | $options->{"threshold"}, $options->{"fraction"}, 175 | $options->{"reverse"}); 176 | } 177 | $seq = ""; 178 | $qual = ""; 179 | $seqID = $newSeqID; 180 | } elsif(/^\+(.*)$/) { 181 | $inQual = 1; # true 182 | $qualID = $1; 183 | } else { 184 | $seq .= $_; 185 | } 186 | } else { 187 | $qual .= $_; 188 | if(length($qual) >= length($seq)){ 189 | $inQual = 0; # false 190 | } 191 | } 192 | } 193 | 194 | if($seqID){ 195 | processSeq($seqID, $seq, $qual, 196 | $options->{"lmin"}, $options->{"lmax"}, $options->{"skew"}, 197 | $options->{"threshold"}, $options->{"fraction"}, 198 | $options->{"reverse"}); 199 | } 200 | -------------------------------------------------------------------------------- /fastx-interleave.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use warnings; 3 | use strict; 4 | 5 | use Getopt::Long qw(:config auto_help pass_through); 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError); 7 | 8 | my $singleLine = 0; 9 | my $minLength = 0; 10 | my $maxCount = 0; # use reservoir sampling to randomise reads 11 | 12 | my %seqFiles = (); 13 | my @seqFileOrder = (); 14 | 15 | GetOptions("singleLine!" => \$singleLine, "minLength=i" => \$minLength, 16 | "count=i" => \$maxCount) or 17 | die("Error in command line arguments"); 18 | 19 | while(@ARGV){ 20 | my $argument = shift(@ARGV); 21 | if(-e $argument){ # file existence check 22 | $seqFiles{$argument} = 1; 23 | push(@seqFileOrder, $argument); 24 | } else { 25 | die("Error in command line arguments: $argument"); 26 | } 27 | } 28 | 29 | foreach my $seqFileName (keys(%seqFiles)){ 30 | my $tmpFile = new IO::Uncompress::Gunzip "$seqFileName" or 31 | die "Unable to open $seqFileName\n"; 32 | $seqFiles{$seqFileName} = $tmpFile; 33 | } 34 | 35 | my $numFiles = scalar(@seqFileOrder); 36 | 37 | if($numFiles < 2){ 38 | die("Too few input files"); 39 | } else { 40 | printf(STDERR "Interleaving %d input files\n", $numFiles); 41 | } 42 | 43 | if($minLength){ 44 | printf(STDERR "Only outputting read groups for reads of at least %d bases\n", $minLength); 45 | } 46 | 47 | if($maxCount){ 48 | printf(STDERR "Reservoir sampling to output at most %d read groups:", 49 | $maxCount); 50 | } 51 | 52 | my @inQual = (0) x $numFiles; # false 53 | my @seqID = ("") x $numFiles; 54 | my @qualID = ("") x $numFiles; 55 | my @seq = ("") x $numFiles; 56 | my @qual = ("") x $numFiles; 57 | my @printable = ("") x $numFiles; 58 | my $lineCount = 0; 59 | my $inFile = 0; 60 | my $recordsRead = 0; 61 | my $dotsPrinted = 0; 62 | 63 | my @printReservoir = (); 64 | 65 | for(my @lines = map { 66 | $inFile = $seqFiles{$_}; my $res = <$inFile>; $res} @seqFileOrder; 67 | grep {$_} @lines; # stop if all input is invalid 68 | @lines = map { 69 | $inFile = $seqFiles{$_}; my $res = <$inFile>; $res} @seqFileOrder){ 70 | $lineCount++; 71 | for(my $i = 0; $i < $numFiles; $i++){ 72 | if(!$lines[$i]){ 73 | next; 74 | } 75 | my $line = $lines[$i]; 76 | chomp $line; chomp $line; 77 | #printf(STDERR "Line $lineCount,$i: $line\n"); 78 | if ($line =~ /^\s+$/) { 79 | next; 80 | } 81 | if (!$inQual[$i]) { 82 | if($line =~ /^(>|@)(.*)$/){ 83 | my $newSeqID = $2; 84 | if($seqID[$i]){ 85 | if($printable[$i]){ 86 | print(STDERR "Warning: double print for file $i\n"); 87 | } 88 | if($qual[$i]){ 89 | $printable[$i] .= 90 | sprintf("@%s\n%s\n+\n%s\n", $seqID[$i], $seq[$i], $qual[$i]); 91 | } else { 92 | $printable[$i] .= 93 | sprintf(">%s\n%s\n", $seqID[$i], $seq[$i]); 94 | } 95 | if(length($seq[$i]) < $minLength){ 96 | $printable[$i] = "#"; 97 | } 98 | } 99 | $qual[$i] = ""; 100 | $seq[$i] = ""; 101 | # printf(STDERR "Line $lineCount,$i: ". 102 | # "setting SeqID for $i to $newSeqID\n"); 103 | $seqID[$i] = $newSeqID; 104 | } elsif ($line =~ /^\+(.*)$/) { 105 | if(!$seqID[$i]){ 106 | die("[QID ] no sequence ID for $i on line $lineCount\nline: ".$line); 107 | } 108 | $inQual[$i] = 1; # true 109 | $qualID[$i] = $1; 110 | $qual[$i] = ""; 111 | } else { 112 | if(!$seqID[$i]){ 113 | die("[SEQ ] no sequence ID for $i on line $lineCount\nline: ".$line); 114 | } 115 | $seq[$i] .= $line; 116 | } 117 | } else { 118 | if(!$seqID[$i]){ 119 | die("[QUAL] no sequence ID for $i on line $lineCount\nline: ".$line); 120 | } 121 | $qual[$i] .= $line; 122 | if (length($qual[$i]) >= length($seq[$i])) { 123 | $inQual[$i] = 0; # false 124 | } 125 | } 126 | } # end loop over lines from files 127 | if(scalar(grep {$_} @printable) == $numFiles){ # print if all can be printed 128 | if(grep {/^#/} @printable){ 129 | ## reset / clear all if any are non-printable 130 | for(my $i = 0; $i < $numFiles; $i++){ 131 | $printable[$i] = ""; 132 | } 133 | } else { 134 | if($maxCount && ($recordsRead % 10000 == 0)){ 135 | if($dotsPrinted % 50 == 0){ 136 | if($recordsRead > 1000){ 137 | printf(STDERR " (%d read groups processed)", $recordsRead); 138 | } 139 | printf(STDERR "\n "); 140 | } 141 | print(STDERR "."); 142 | $dotsPrinted++; 143 | } 144 | $recordsRead++; 145 | my $linesToAdd = ""; 146 | for(my $i = 0; $i < $numFiles; $i++){ 147 | $linesToAdd .= $printable[$i]; 148 | $printable[$i] = ""; 149 | } 150 | if(!$maxCount){ 151 | print($linesToAdd); 152 | } elsif($maxCount >= $recordsRead){ 153 | push(@printReservoir, $linesToAdd); 154 | } else { 155 | my $swapPos = rand($recordsRead); 156 | if($swapPos < $maxCount){ 157 | $printReservoir[$swapPos] = $linesToAdd; 158 | } 159 | } 160 | } 161 | } 162 | } # end loop over files 163 | 164 | foreach my $seqFileName (keys(%seqFiles)){ 165 | close($seqFiles{$seqFileName}); 166 | } 167 | 168 | if($maxCount){ 169 | printf(STDERR "\ndone (%d read groups processed)\n", $recordsRead); 170 | print(join("",@printReservoir)); 171 | } 172 | --------------------------------------------------------------------------------