├── webblast
    ├── cgi-bin
    │   ├── templates
    │   │   ├── blastn_results.html
    │   │   ├── blastp_results.html
    │   │   ├── blastx_results.html
    │   │   ├── tblastn_results.html
    │   │   ├── tblastx_results.html
    │   │   ├── sorttable.js
    │   │   ├── footer.html
    │   │   ├── site_defaults.csv
    │   │   ├── blastn_defaults.csv
    │   │   ├── blastp_defaults.csv
    │   │   ├── blastx_defaults.csv
    │   │   ├── tblastn_defaults.csv
    │   │   ├── tblastx_defaults.csv
    │   │   ├── blastn_query.html
    │   │   ├── blastp_query.html
    │   │   ├── blastx_query.html
    │   │   ├── tblastn_query.html
    │   │   ├── tblastx_query.html
    │   │   ├── header.html
    │   │   ├── blastn_params.html
    │   │   ├── blastp_params.html
    │   │   ├── blastx_params.html
    │   │   ├── tblastn_params.html
    │   │   └── tblastx_params.html
    │   └── db
    │   │   ├── ZEV_mmusmtDNA_bphagelambda.nhr
    │   │   ├── ZEV_mmusmtDNA_bphagelambda.nin
    │   │   └── ZEV_mmusmtDNA_bphagelambda.nsq
    └── style
    │   ├── css
    │       ├── tabs_bg.png
    │       ├── menu_active.png
    │       ├── menu_active.xcf
    │       ├── tabs_norm_left.png
    │       ├── tabs_norm_right.png
    │       ├── tabs_norm_left_on.png
    │       ├── tabs_norm_right_on.png
    │       ├── tabs_custom_corners_topleft.png
    │       ├── tabs_custom_corners_bottomleft.png
    │       ├── tabs_custom_corners_topright.png
    │       ├── tabs_custom_corners_bottomright.png
    │       ├── tabs_custom_corners_rightborder.png
    │       ├── main_area.css
    │       ├── tabs_style.css
    │       └── side_menu.css
    │   └── project_logo.png
├── affytools
    ├── Affy2SimpleGT.jar
    ├── Affy2SimpleGT.class
    ├── manifest_source.txt
    ├── .classpath
    └── .project
├── weblabeler
    └── room_label
    │   └── footer.html
├── README.md
├── aggregate.pl
├── transpose.pl
├── RCVS_filter.pl
├── bootstrap2rank.sh
├── blastx_boundaries.pl
├── last-rmident.pl
├── paf2mat.pl
├── obo2csv.pl
├── fastqcat.pl
├── gfa_cleaner.pl
├── fastx2fastm.pl
├── fastq2fasta.pl
├── align2csv.pl
├── chunkFilter.pl
├── gtcounts2MAF.pl
├── fourievar.r
├── fastm2fastx.pl
├── fasta-nsplit.pl
├── pileupPropView.r
├── mitochondrial_descendants.pl
├── minimus2_OCA.sh
├── mergeFast5events.pl
├── sam2mapped.pl
├── sam2LongestBase.pl
├── fastx-qualHist.pl
├── labid2UUID.pl
├── posAggregate.pl
├── vcf_normalise.pl
├── gtsubset.pl
├── tped2gtcounts.pl
├── vcf2simplegt.pl
├── vcf_windowSorter.pl
├── bam2proportion.sh
├── fastx-annotate.pl
├── dotplotAnnotator.r
├── markercsv2linkage.pl
├── fastx-grep.pl
├── quantile_subset.pl
├── power_analysis.r
├── sam-senseflagger.pl
├── fastx-rc.pl
├── fastx-isofilter.pl
├── exchange_rate.pl
├── fastx-sample.pl
├── pedtodot.sh
├── mpileupDC.pl
├── fastx-cgmasker.pl
├── addKraken2Silva.pl
├── edgebeeguildfinder.py
├── shufflefastx.pl
├── fastx-sort.pl
├── GenABEL2GIANT.pl
├── maf2gfa.pl
├── nbmt-translate.pl
├── fastx-length.pl
├── expenseTemplate.tex
├── maf_bcsplit.pl
├── rsfilter.pl
├── fastx-rlength.pl
├── fastx-hplength.pl
├── fmod.py
├── svgImport.r
├── tped2trios.pl
├── vcf2fq.pl
├── fastx-fetch.pl
├── readthrough.py
├── spiralign.r
├── fastx-repeatFilter.pl
└── fastx-interleave.pl


/webblast/cgi-bin/templates/blastn_results.html:
--------------------------------------------------------------------------------
1 | <h1>Results</h1>
2 | %(results)
3 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastp_results.html:
--------------------------------------------------------------------------------
1 | <h1>Results</h1>
2 | %(results)
3 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastx_results.html:
--------------------------------------------------------------------------------
1 | <h1>Results</h1>
2 | %(results)
3 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastn_results.html:
--------------------------------------------------------------------------------
1 | <h1>Results</h1>
2 | %(results)
3 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastx_results.html:
--------------------------------------------------------------------------------
1 | <h1>Results</h1>
2 | %(results)
3 | 


--------------------------------------------------------------------------------
/affytools/Affy2SimpleGT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/affytools/Affy2SimpleGT.jar


--------------------------------------------------------------------------------
/affytools/Affy2SimpleGT.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/affytools/Affy2SimpleGT.class


--------------------------------------------------------------------------------
/webblast/style/css/tabs_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_bg.png


--------------------------------------------------------------------------------
/webblast/style/project_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/project_logo.png


--------------------------------------------------------------------------------
/webblast/style/css/menu_active.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/menu_active.png


--------------------------------------------------------------------------------
/webblast/style/css/menu_active.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/menu_active.xcf


--------------------------------------------------------------------------------
/affytools/manifest_source.txt:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Class-Path: affy2simplegt.jar
3 | Main-Class: affytools.Affy2SimpleGT
4 | 


--------------------------------------------------------------------------------
/webblast/style/css/tabs_norm_left.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_left.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_norm_right.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_right.png


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/sorttable.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/templates/sorttable.js


--------------------------------------------------------------------------------
/webblast/style/css/tabs_norm_left_on.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_left_on.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_norm_right_on.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_norm_right_on.png


--------------------------------------------------------------------------------
/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nhr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nhr


--------------------------------------------------------------------------------
/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nin


--------------------------------------------------------------------------------
/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nsq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/cgi-bin/db/ZEV_mmusmtDNA_bphagelambda.nsq


--------------------------------------------------------------------------------
/webblast/style/css/tabs_custom_corners_topleft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_topleft.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_custom_corners_bottomleft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_bottomleft.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_custom_corners_topright.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_topright.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_custom_corners_bottomright.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_bottomright.png


--------------------------------------------------------------------------------
/webblast/style/css/tabs_custom_corners_rightborder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gringer/bioinfscripts/HEAD/webblast/style/css/tabs_custom_corners_rightborder.png


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/footer.html:
--------------------------------------------------------------------------------
1 | <p><strong>SessionID: </strong>%(sessionID)</p>
2 | </div> <!-- tabarea -->
3 | </div> <!-- tabcontent -->
4 | </form>
5 | </body>
6 | </html>
7 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/site_defaults.csv:
--------------------------------------------------------------------------------
1 | site_name,SITE_<br>NAME
2 | gbrowse_patterns,mmus_mtDNA:<a href="/fgb2/gbrowse/mmus/?name=%s:%d..%d&h_region=%s:%d..%d@cornsilk">%s</a>
3 | 


--------------------------------------------------------------------------------
/weblabeler/room_label/footer.html:
--------------------------------------------------------------------------------
1 | <!-- <p><strong>SessionID: </strong>%(sessionID)</p> -->
2 | </div> <!-- tabarea -->
3 | </div> <!-- tabcontent -->
4 | </form>
5 | </body>
6 | </html>
7 | 


--------------------------------------------------------------------------------
/affytools/.classpath:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <classpath>
3 | 	<classpathentry kind="src" path=""/>
4 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
5 | 	<classpathentry kind="output" path=""/>
6 | </classpath>
7 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastn_defaults.csv:
--------------------------------------------------------------------------------
 1 | results,There are no results
 2 | runBlast,''
 3 | queryDB,''
 4 | MAX_NUM_SEQ,100
 5 | SHORT_QUERY_ADJUST,checked
 6 | EXPECT,10
 7 | WORD_SIZE,7
 8 | HSP_RANGE_MAX,0
 9 | MATRIX_NAME,""
10 | MATCH_SCORES,"2,-3"
11 | GAPCOSTS,"5 2"
12 | COMPOSITION_BASED_STATISTICS,""
13 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastp_defaults.csv:
--------------------------------------------------------------------------------
 1 | results,There are no results
 2 | runBlast,''
 3 | queryDB,''
 4 | MAX_NUM_SEQ,100
 5 | SHORT_QUERY_ADJUST,checked
 6 | EXPECT,10
 7 | WORD_SIZE,7
 8 | HSP_RANGE_MAX,0
 9 | MATRIX_NAME,""
10 | MATCH_SCORES,"2,-3"
11 | GAPCOSTS,"5 2"
12 | COMPOSITION_BASED_STATISTICS,""
13 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastx_defaults.csv:
--------------------------------------------------------------------------------
 1 | results,There are no results
 2 | runBlast,''
 3 | queryDB,''
 4 | MAX_NUM_SEQ,100
 5 | SHORT_QUERY_ADJUST,checked
 6 | EXPECT,10
 7 | WORD_SIZE,7
 8 | HSP_RANGE_MAX,0
 9 | MATRIX_NAME,""
10 | MATCH_SCORES,"2,-3"
11 | GAPCOSTS,"5 2"
12 | COMPOSITION_BASED_STATISTICS,""
13 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastn_defaults.csv:
--------------------------------------------------------------------------------
 1 | results,There are no results
 2 | runBlast,''
 3 | queryDB,''
 4 | MAX_NUM_SEQ,100
 5 | SHORT_QUERY_ADJUST,checked
 6 | EXPECT,10
 7 | WORD_SIZE,7
 8 | HSP_RANGE_MAX,0
 9 | MATRIX_NAME,""
10 | MATCH_SCORES,"2,-3"
11 | GAPCOSTS,"5 2"
12 | COMPOSITION_BASED_STATISTICS,""
13 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastx_defaults.csv:
--------------------------------------------------------------------------------
 1 | results,There are no results
 2 | runBlast,''
 3 | queryDB,''
 4 | MAX_NUM_SEQ,100
 5 | SHORT_QUERY_ADJUST,checked
 6 | EXPECT,10
 7 | WORD_SIZE,7
 8 | HSP_RANGE_MAX,0
 9 | MATRIX_NAME,""
10 | MATCH_SCORES,"2,-3"
11 | GAPCOSTS,"5 2"
12 | COMPOSITION_BASED_STATISTICS,""
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | bioinfscripts
2 | =============
3 | 
4 | Bioinformatics scripts created or modified by David Eccles (gringer)
5 | 
6 | Note: these scripts are now maintained on [gitlab](https://gitlab.com/gringer/bioinfscripts). If you want to use the most recent version of these scripts (including *many* additional features and bugfixes), download from there.
7 | 


--------------------------------------------------------------------------------
/aggregate.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $keyName = "";
 7 | my @valueAggregate = ();
 8 | 
 9 | while(<>){
10 |   chomp;
11 |   my @F = split(",", $_);
12 |   if($F[0] ne $keyName){
13 |     if($keyName){
14 |       printf("${keyName},%d\n", scalar(@valueAggregate));
15 |     }
16 |     $keyName = $F[0];
17 |     @valueAggregate = ();
18 |   }
19 |   push(@valueAggregate, $F[1]);
20 | }
21 | 


--------------------------------------------------------------------------------
/affytools/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>affytools</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.eclipse.jdt.core.javanature</nature>
16 | 	</natures>
17 | </projectDescription>
18 | 


--------------------------------------------------------------------------------
/transpose.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | use List::MoreUtils qw(pairwise);
 7 | 
 8 | my @rows = ();
 9 | my $delim = "\t";
10 | 
11 | while(<>){
12 |     chomp;
13 |     if(/(\s)/){
14 | 	$delim = $1;
15 |     }
16 |     my @F = split(/\s+/);
17 |     if(!@rows){
18 | 	@rows = @F;
19 |     } else {
20 | 	@rows = pairwise {$a . $delim . $b} @rows, @F;
21 |     }
22 | }
23 | 
24 | foreach my $row (@rows){
25 |     print($row."\n");
26 | }
27 | 


--------------------------------------------------------------------------------
/RCVS_filter.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | my $snpFileName = shift(@ARGV);
 6 | open(my $snpFile, "<", $snpFileName);
 7 | 
 8 | my %inMap = ();
 9 | while(<$snpFile>){
10 |     chomp;
11 |     $inMap{$_} = 1;
12 | }
13 | close($snpFile);
14 | 
15 | $inMap{"dbSNP RS ID"} = 1;
16 | 
17 | while(<>){
18 |     if(/^#/){
19 |         print;
20 |     } else {
21 |         my @fields = split(/\t/);
22 |         if($inMap{$fields[14]}){
23 |             print;
24 |         }
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastn_query.html:
--------------------------------------------------------------------------------
 1 | <h2>Query Database</h2>
 2 | 
 3 | <select name="queryDB">
 4 |   %(databases)
 5 | </select>
 6 | 
 7 | 
 8 | <h2>Target Sequence</h2>
 9 | 
10 | <p>Enter a <em>%(inputType)</em> sequence in FASTA format: </p>
11 | 
12 | <textarea name="inputText" rows="10" cols="71" wrap="soft">%(inputText)</textarea>
13 | 
14 | <div>
15 |   Or upload a file: <input type="file" name="inputFile" size="30" />
16 | </div>
17 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
18 |   <p>BLAST</p>
19 | </button>
20 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastp_query.html:
--------------------------------------------------------------------------------
 1 | <h2>Query Database</h2>
 2 | 
 3 | <select name="queryDB">
 4 |   %(databases)
 5 | </select>
 6 | 
 7 | 
 8 | <h2>Target Sequence</h2>
 9 | 
10 | <p>Enter a <em>%(inputType)</em> sequence in FASTA format: </p>
11 | 
12 | <textarea name="inputText" rows="10" cols="71" wrap="soft">%(inputText)</textarea>
13 | 
14 | <div>
15 |   Or upload a file: <input type="file" name="inputFile" size="30" />
16 | </div>
17 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
18 |   <p>BLAST</p>
19 | </button>
20 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastx_query.html:
--------------------------------------------------------------------------------
 1 | <h2>Query Database</h2>
 2 | 
 3 | <select name="queryDB">
 4 |   %(databases)
 5 | </select>
 6 | 
 7 | 
 8 | <h2>Target Sequence</h2>
 9 | 
10 | <p>Enter a <em>%(inputType)</em> sequence in FASTA format: </p>
11 | 
12 | <textarea name="inputText" rows="10" cols="71" wrap="soft">%(inputText)</textarea>
13 | 
14 | <div>
15 |   Or upload a file: <input type="file" name="inputFile" size="30" />
16 | </div>
17 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
18 |   <p>BLAST</p>
19 | </button>
20 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastn_query.html:
--------------------------------------------------------------------------------
 1 | <h2>Query Database</h2>
 2 | 
 3 | <select name="queryDB">
 4 |   %(databases)
 5 | </select>
 6 | 
 7 | 
 8 | <h2>Target Sequence</h2>
 9 | 
10 | <p>Enter a <em>%(inputType)</em> sequence in FASTA format: </p>
11 | 
12 | <textarea name="inputText" rows="10" cols="71" wrap="soft">%(inputText)</textarea>
13 | 
14 | <div>
15 |   Or upload a file: <input type="file" name="inputFile" size="30" />
16 | </div>
17 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
18 |   <p>BLAST</p>
19 | </button>
20 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastx_query.html:
--------------------------------------------------------------------------------
 1 | <h2>Query Database</h2>
 2 | 
 3 | <select name="queryDB">
 4 |   %(databases)
 5 | </select>
 6 | 
 7 | 
 8 | <h2>Target Sequence</h2>
 9 | 
10 | <p>Enter a <em>%(inputType)</em> sequence in FASTA format: </p>
11 | 
12 | <textarea name="inputText" rows="10" cols="71" wrap="soft">%(inputText)</textarea>
13 | 
14 | <div>
15 |   Or upload a file: <input type="file" name="inputFile" size="30" />
16 | </div>
17 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
18 |   <p>BLAST</p>
19 | </button>
20 | 


--------------------------------------------------------------------------------
/bootstrap2rank.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/sh
2 | 
3 | pv bootstrap100_anzgene200_vs_1kg200.csv.gz | zcat | sort -t',' -k 2,2n -k 3,3rn | perl -F',' -lane 'if($run != $F[1]){$run = $F[1]; $rank=1; $lastVal=0; $lastRank=0} if($F[1] ne "bs.run"){$nextVal = $F[2]; $F[2] = ($F[2] == $lastVal) ? $lastRank : $rank; $lastRank = $F[2]; $lastVal = $nextVal; $rank++} print join(",",@F)' | sort -t',' -k 1,1 -k 3,3n | perl -F',' -lane 'if($marker ne $F[0]){if($marker){print $lastLine; print $_} $marker = $F[0]} {$lastLine = $_} END{ print $lastLine}' | gzip > maxminRank_bootstrap100_anzgene200_vs_1kg200.csv.gz
4 | 


--------------------------------------------------------------------------------
/blastx_boundaries.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my %starts = ();
 7 | my %ends = ();
 8 | 
 9 | while(<>){
10 |   chomp;
11 |   my @F = split(/\s+/);
12 |   my $qs=$F[6];
13 |   my $qe=$F[7];
14 |   if($qe < $qs){
15 |     ($qs, $qe) = ($qe, $qs);
16 |   }
17 |   my $sig = $F[0]."-".$F[1];
18 |   if(!$starts{$sig} || ($starts{$sig} > $qs)){
19 |     $starts{$sig} = $qs
20 |   }
21 |   if(!$ends{$sig} || ($ends{$sig} < $qe)){
22 |     $ends{$sig} = $qe
23 |   }
24 | }
25 | 
26 | foreach my $key (keys(%starts)){
27 |   if($key =~ /(.*)-(.*)/){
28 |     my $target = $1;
29 |     my $query = $2;
30 |     printf("%s %s:%d-%d\n", $query, $target, $starts{$key}, $ends{$key});
31 |   }
32 | }
33 | 


--------------------------------------------------------------------------------
/webblast/style/css/main_area.css:
--------------------------------------------------------------------------------
 1 | a {
 2 |   text-decoration: none;
 3 |   color: black;
 4 | }
 5 | 
 6 | a:hover {
 7 |   text-decoration: underline;
 8 | }
 9 | 
10 | #tabcontent, #contentarea {
11 |    overflow: auto;
12 |    position: fixed;
13 |    top: 38px;
14 |    left: 173px;
15 |    right: 5px;
16 |    width: auto;
17 |    height: auto;
18 |    bottom: 5px;
19 |    padding: 8px;
20 | }
21 | 
22 | #tabcontent {
23 |    margin-top: 15px;
24 |    margin-bottom: 15px;
25 |    margin-right: 15px;
26 |    margin-left: 7px;
27 |    background-attachment: scroll;
28 | }
29 | 
30 | #contentarea {
31 |     border: 1px solid black;
32 |     /* border-radius: 5px; */
33 |     box-shadow: 2px -2px 4px rgba(0,0,0,0.5);
34 |     background: white;
35 | }
36 | 
37 | #contentarea p { text-align: justify}


--------------------------------------------------------------------------------
/last-rmident.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | my @lineBuffer = ();
 4 | my @ids = ();
 5 | 
 6 | while(<>){
 7 |   my $line = $_;
 8 |   if(/^([a-z])\s(.*?)\s/){
 9 |     my ($flag, $id) = ($1, $2);
10 |     if($flag eq "s"){
11 |         push(@ids, $id);
12 |     } elsif($flag eq "a"){ # new alignment
13 |       if((scalar(@ids) != 2) || ($ids[0] ne $ids[1])){
14 |         print(join("",@lineBuffer));
15 |       }
16 |       # if(@ids){
17 |       #   printf("a '%s' '%s' %d %d\n", $ids[0], $ids[1],
18 |       #          scalar(@lineBuffer), scalar(@ids));
19 |       # } else {
20 |       #   print("a\n");
21 |       # }
22 |       @lineBuffer = ();
23 |       @ids = ();
24 |     }
25 |   }
26 |   push(@lineBuffer, $line);
27 | }
28 | 
29 | if((scalar(@ids) != 2) || ($ids[0] ne $ids[1])){
30 |   print(join("",@lineBuffer));
31 | }
32 | 


--------------------------------------------------------------------------------
/paf2mat.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $read = "";
 7 | my @tigs = ();
 8 | my %links = ();
 9 | while(<>){
10 |   my @F = split(/\s/);
11 |   if($F[0] eq $read){
12 |     push(@tigs, $F[5]);
13 |   } else {
14 |     foreach my $t1 (@tigs){
15 |       foreach my $t2 (@tigs){
16 |         $links{$t1}{$t2}++;
17 |         $links{$t2}{$t1}++;
18 |       }
19 |     }
20 |     $read = $F[0];
21 |     @tigs = ();
22 |   }
23 | }
24 | 
25 | foreach my $t1 (@tigs){
26 |   foreach my $t2 (@tigs){
27 |     $links{$t1}{$t2}++;
28 |   }
29 | }
30 | 
31 | print("tig1,tig2,count\n");
32 | foreach my $t1 (sort(keys(%links))){
33 |   foreach my $t2 (sort(keys(%{$links{$t1}}))){
34 |     my $line = sprintf("%s,%s,%d\n", $t1, $t2, $links{$t1}{$t2});
35 |     $line =~ s/Consensus_//g;
36 |     print $line;
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/obo2csv.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $category = "";
 7 | my %data = ();
 8 | 
 9 | print "id,is_a,name,def\n";
10 | 
11 | while(<>){
12 |   chomp;
13 |   if(/^\[(.*?)\]/){
14 |     if($category eq "Term"){
15 |       printf("%s,%s,\"%s\",%s\n",
16 |              $data{"id"},
17 |              $data{"is_a"} ? $data{"is_a"} : "",
18 |              $data{"name"} ? $data{"name"} : "",
19 |              $data{"def"} ? $data{"def"} : "");
20 |     }
21 |     %data = ();
22 |     $category = $1;
23 |   } elsif(/^([^:]+): (.*)$/){
24 |     my $field = $1;
25 |     my $value = $2;
26 |     $value =~ s/\[.*?\]//g;
27 |     if($field eq "is_a"){
28 |       $value =~ s/ .*$//;
29 |     }
30 |     if($category eq "Term"){
31 |       $data{$field} = ($data{$field}) ? ($data{$field}.";".$value) : $value;
32 |     }
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/fastqcat.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_version auto_help pass_through); # for option parsing
 6 | 
 7 | my $inQual = 0; # false
 8 | my $seqID = "";
 9 | my $qualID = "";
10 | my $seq = "";
11 | my $qual = "";
12 | my $minLen = 0;
13 | 
14 | GetOptions('minlength=i' => \$minLen) or
15 |   die("Error in command line arguments");
16 | 
17 | while(<>){
18 |   chomp;
19 |   chomp;
20 |   if(!$inQual){
21 |     if(/^@(.+)$/){
22 |       my $newSeqID = $1;
23 |       if($seqID && (length($seq) >= $minLen)){
24 |         printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
25 |       }
26 |       $seq = "";
27 |       $qual = "";
28 |       $seqID = $newSeqID;
29 |     } elsif(/^\+(.*)$/) {
30 |       $inQual = 1; # true
31 |       $qualID = $1;
32 |     } else {
33 |       $seq .= $_;
34 |     }
35 |   } else {
36 |     $qual .= $_;
37 |     if(length($qual) >= length($seq)){
38 |       $inQual = 0; # false
39 |     }
40 |   }
41 | }
42 | 
43 | if($seqID && (length($seq) >= $minLen)){
44 |   printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
45 | }
46 | 


--------------------------------------------------------------------------------
/gfa_cleaner.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## gfa_cleaner.pl -- removes duplicate links and links that include
 4 | ## vertexes with high numbers of outgoing or incoming edges
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | 
 9 | my %seen = ();
10 | my %counts = ();
11 | my %displayBuffer = ();
12 | 
13 | my $maxCount = 4;
14 | 
15 | while(<>){
16 |   if(/^[^L]/){
17 |     print;
18 |     next;
19 |   }
20 |   my $line = $_;
21 |   chomp;
22 |   my @F = split(/\t/);
23 |   my $matchFwd = join(";",@F[(1,2,3,4)]);
24 |   my $strA = join(";",@F[(1,2)]);
25 |   grep {tr/\-\+/\+\-/} @F[(4,2)];
26 |   my $matchRev = join(";",@F[(3,4,1,2)]);
27 |   my $strB = join(";",@F[(3,4)]); ## after sign flip
28 |   if(!$seen{$matchFwd}){
29 |     $counts{$strA}++;
30 |     $counts{$strB}++;
31 |     $seen{$matchFwd} = 1;
32 |     $seen{$matchRev} = 1;
33 |     $displayBuffer{$line}{$strA} = 1;
34 |     $displayBuffer{$line}{$strB} = 1;
35 |   }
36 | }
37 | 
38 | foreach my $line (sort(keys(%displayBuffer))){
39 |   my @strs = keys(%{$displayBuffer{$line}});
40 |   if(grep {$counts{$_} > $maxCount} @strs){
41 |     next;
42 |   }
43 |   print($line);
44 | }
45 | 


--------------------------------------------------------------------------------
/webblast/style/css/tabs_style.css:
--------------------------------------------------------------------------------
 1 | #tabbar {
 2 |    z-index: 1;
 3 |    font-size: 93%;
 4 |    position: fixed;
 5 |    top: 5px;
 6 |    width: auto;
 7 |    height: 43px;
 8 |    left: 175px;
 9 |    right: 92px;
10 |    bottom: 26px;
11 |    padding: 0px;
12 |    margin: 0px;
13 | }
14 | 
15 | #tabbar ul {
16 |    margin: 0;
17 |    padding:5px 5px;
18 |    list-style:none;
19 | }
20 | 
21 | #tabbar button {
22 |     border-style: none;
23 |     background-color: transparent;
24 | }
25 | 
26 | #tabbar p {
27 |    color: gray;
28 |    font-weight: bold;
29 |    margin: 2px;
30 | }
31 | 
32 | #tabbar li {
33 |    display: block;
34 |    float:left;
35 |    border-radius: 5px 5px 0 0;
36 |    box-shadow: 1px -1px 1px rgba(0,0,0,0.5);
37 |    border: 1px solid black;
38 |    border-bottom: 1px solid white;
39 |    background: white;
40 |    margin: 5px 5px 0 0;
41 |    padding:0px 2px 0px 0px;
42 | }
43 | 
44 | #tabbar .tabon p { 
45 |     color: black;
46 | }
47 | 
48 | #tabbar .taboff {
49 |     border-bottom: 1px solid black;
50 | }
51 | 
52 | #tabbar .tabdisabled {
53 |     background: rgba(192,192,192,1);
54 |     visibility: hidden;
55 | }
56 | 


--------------------------------------------------------------------------------
/fastx2fastm.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $inQual = 0; # false
 7 | my $seqID = "";
 8 | my $qualID = "";
 9 | my $seq = "";
10 | my $qual = "";
11 | while(<>){
12 |   if(!$inQual){
13 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
14 |       my $newSeqID = $2;
15 |       if($seq){
16 | 	$seq =~ s/\n/\n /g;
17 | 	$seq =~ s/ $//;
18 | 	printf("#seq:%s\n", $seqID);
19 | 	printf(" %s", $seq);
20 | 	if($qual){
21 | 	  $qual =~ s/\n/\n /g;
22 | 	  $qual =~ s/ $//;
23 | 	  printf("#qual:%s\n", $seqID);
24 | 	  printf(" %s", $qual);
25 | 	}
26 |       }
27 |       $seq = "";
28 |       $qual = "";
29 |       $seqID = $newSeqID;
30 |       chomp $seqID;
31 |       chomp $seqID;
32 |     } elsif(/^\+(.*)$/) {
33 |       $inQual = 1; # true
34 |       $qualID = $1;
35 |     } else {
36 |       $seq .= $_;
37 |     }
38 |   } else {
39 |     $qual .= $_;
40 |     if(length($qual) >= length($seq)){
41 |       $inQual = 0; # false
42 |     }
43 |   }
44 | }
45 | 
46 | if($seqID){
47 |   printf("#seq:%s\n", $seqID);
48 |   printf(" %s", $seq);
49 |   if($qual){
50 |     printf("#qual:%s\n", $seqID);
51 |     printf(" %s", $qual);
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/fastq2fasta.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_help pass_through);
 6 | 
 7 | my $minLength = 0;
 8 | my $singleLine = 0;
 9 | 
10 | GetOptions("minLength=i" => \$minLength, "singleLine!" => \$singleLine) or
11 |   die("Error in command line arguments");
12 | 
13 | my $inQual = 0; # false
14 | my $seqID = "";
15 | my $qualID = "";
16 | my $seq = "";
17 | my $qual = "";
18 | while(<>){
19 |   chomp;
20 |   chomp;
21 |   if(/^\s+$/){
22 |     next;
23 |   }
24 |   if(!$inQual){
25 |     if(/^@(.+)$/){
26 |       $seqID = $1;
27 |       $seq = "";
28 |     } elsif(/^\+(.*)$/) {
29 |       $inQual = 1; # true
30 |       $qualID = $1;
31 |       $qual = "";
32 |       if(length($seq) > $minLength){
33 |         my $printedSeq = $seq;
34 |         if(!$singleLine){
35 |           $printedSeq =~ s/(.{60})/$1\n/g;
36 |           $printedSeq =~ s/\n$//g;
37 |         }
38 |         printf(">%s\n%s\n", $seqID, $printedSeq);
39 |       }
40 |     } else {
41 |       $seq .= $_;
42 |     }
43 |   } else {
44 |     $qual .= $_;
45 |     if(length($qual) >= length($seq)){
46 |       $inQual = 0; # false
47 |     }
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/align2csv.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | print("SeqID,RefID,Pos,Ref,Alt\n");
 7 | 
 8 | while(<>){
 9 |   if(/^#/){
10 |     next;
11 |   }
12 |   chomp;
13 |   my ($nameR, $nameQ, $score, $seqRef, $seqDiff, $seqQry) = split(/\t/, $_);
14 |   my $refPos = 0;
15 |   my $ins = ""; # false
16 |   my $del = ""; # false
17 |   my $indelPos = -1;
18 |   for(my $i = 0; $i < length($seqRef); $i++){
19 |     my $r = substr($seqRef, $i, 1);
20 |     my $d = substr($seqDiff, $i, 1);
21 |     my $q = substr($seqQry, $i, 1);
22 |     if($r ne "-"){
23 |       $refPos++;
24 |       if(($q ne "-") && ($ins || $del)){
25 |         printf("%s,%s,%05d,%s,%s\n", $nameQ, $nameR, $indelPos, $del, $ins);
26 |         $ins = ""; # false
27 |         $del = ""; # false
28 |         $indelPos = -1;
29 |       }
30 |     }
31 |     if($r eq "-"){
32 |       $ins .= $q;
33 |       if($indelPos == -1){
34 |         $indelPos = $refPos+1;
35 |       }
36 |     }
37 |     if($q eq "-"){
38 |       $del .= $r;
39 |       if($indelPos == -1){
40 |         $indelPos = $refPos;
41 |       }
42 |     }
43 |     if(!($ins || $del) && ($r ne $q)){
44 |       printf("%s,%s,%05d,%s,%s\n", $nameQ, $nameR, $refPos, $r, $q);
45 |     }
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/chunkFilter.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $mw = 10; # match window
 7 | my $maxChunkSize = 5000; # limit chunk size
 8 | 
 9 | my $lastContig="";
10 | my $start=-($mw+1);
11 | my $length=-($mw+1);
12 | 
13 | my @lineBuffer = ();
14 | 
15 | while(<>){
16 |   chomp;
17 |   my @F = split(/ /);
18 |   my $newStart = $F[1];
19 |   my $newEnd = $F[1] + $F[2];
20 |   my $oldEnd = $start + $length;
21 |   if(($F[0] eq $lastContig) && ($length < ($maxChunkSize-$mw)) &&
22 |      (($start - $mw) < $newStart && ($oldEnd + $mw) > $newStart) ||
23 |      (($oldEnd + $mw) > $newEnd && ($start - $mw) < $newEnd)){
24 |     $start = $newStart if ($start > $newStart);
25 |     $newEnd = $oldEnd if ($oldEnd > $newEnd);
26 |     $F[1] = $start;
27 |     $F[2] = $newEnd - $start;
28 |   } else {
29 |     push(@lineBuffer,"");
30 |     my $sig = join(" ", ($lastContig, $start, $length));
31 |     print(join(" $sig\n", @lineBuffer));
32 |     @lineBuffer = ();
33 |     $lastContig = "";
34 |     $start = -($mw+1);
35 |     $length = -($mw+1);
36 |   }
37 |   push(@lineBuffer, $_);
38 |   ($lastContig, $start, $length) = @F;
39 | }
40 | 
41 | my $sig = join(" ", ($lastContig, $start, $length));
42 | push(@lineBuffer,"");
43 | print(join(" $sig\n", @lineBuffer));
44 | 


--------------------------------------------------------------------------------
/gtcounts2MAF.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | use List::Util;
 7 | 
 8 | sub processCounts{
 9 |   my ($marker, $countRef) = @_;
10 |   if($marker eq ""){
11 |     return;
12 |   }
13 |   #print(STDERR "Processing $marker\n");
14 |   foreach my $sex (keys(%{$countRef})){
15 |     my %tCounts = %{$countRef->{$sex}};
16 |     my $total = 0;
17 |     my $minA = "0";
18 |     my $minC = undef;
19 |     while(my ($a, $b) = each %tCounts) {
20 |       if(!defined($minC) || ($b < $minC)){
21 |         $minC = $b;
22 |         $minA = $a;
23 |       }
24 |       $total += $b;
25 |     }
26 |     my $minF = ($total > 0) ? $minC / $total : 0;
27 |     printf("%s,%s,%s,%0.4f\n", $marker, $sex, $minA, $minF);
28 |   }
29 | }
30 | 
31 | my $lastMarker = "";
32 | my $counts = {};
33 | 
34 | while(<>){
35 |   chomp;
36 |   #print $_."\n";
37 |   my ($marker, $sex, $type, $bases, $count) = split(/,/);
38 |   if($marker ne $lastMarker){
39 |     processCounts($lastMarker, $counts);
40 |     $counts = {"m" => {}, "f" => {}, "a" => {}};
41 |     $lastMarker = $marker;
42 |   }
43 |   #print(join(";",%{$counts})."\n");
44 |   if(($type ne "a") || ($bases eq "0")){
45 |     next;
46 |   }
47 |   $counts->{$sex}->{$bases} = $count;
48 | }
49 | processCounts($lastMarker, $counts);
50 | 


--------------------------------------------------------------------------------
/fourievar.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscript
 2 | 
 3 | if(length(commandArgs(TRUE)) < 1){
 4 |     cat("Error: no fasta file specified\n");
 5 |     cat("syntax: ./fouriervar.r <input.fa>\n");
 6 |     quit(save="no");
 7 | }
 8 | 
 9 | library(Biostrings, quietly=TRUE, warn.conflicts=FALSE, verbose=FALSE);
10 | 
11 | fuzz <- 2;
12 | doPlot <- FALSE;
13 | 
14 | #fileName <- "/bioinf/presentations/2017-Sep-03/tig00022132_rpt0.fa";
15 | fileName <- commandArgs(TRUE)[1];
16 | seqs <- readDNAStringSet(fileName);
17 | 
18 | print(seqs);
19 | 
20 | for(si in 1:length(seqs)){
21 |     mySeq <- as.vector(seqs[[si]]);
22 |     slen <- length(mySeq);
23 |     fLimit <- 1000;
24 |     fLimit <- min(slen-1, fLimit);
25 |     
26 |     baseCplx <- complex(real=c(A=1, C=0, G=-1, T=0)[mySeq],
27 |                         imaginary=c(A=0, C=1, G=0, T=-1)[mySeq]);
28 |     
29 |     spectrum <- sapply(2:fLimit, function(cycle){
30 |         sum(head(baseCplx, -cycle) == tail(baseCplx, -cycle)) / (slen-cycle);
31 |     });
32 | 
33 |     png("out.png");
34 |     plot(spectrum);
35 |     invisible(dev.off());
36 | 
37 |     names(spectrum) <- 2:fLimit;
38 |     spectrum <- (spectrum - mean(spectrum)) / sd(spectrum);
39 |     spectrum <- spectrum[order(-spectrum)];
40 | 
41 |     print(head(cbind(spectrum, qProb = -10 * log10(1-pnorm(spectrum))),20));
42 | }
43 | 


--------------------------------------------------------------------------------
/fastm2fastx.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $fastqMode = 0; # false
 7 | my $inQual = 0;
 8 | my $inSeq = 0;
 9 | 
10 | my $seqID = "";
11 | my $seq = "";
12 | my $qual = "";
13 | 
14 | while(<>){
15 |   if(/^#seq(:(.*$))?/){
16 |     my $newSeqID = $2;
17 |     $inQual = 0; # false
18 |     $inSeq = 1; # true
19 |     if($seqID){
20 |       if($fastqMode){
21 | 	printf("@%s\n%s+\n%s", $seqID, $seq, $qual);
22 |       } else {
23 | 	printf(">%s\n%s", $seqID, $seq);
24 |       }
25 |     }
26 |     $seqID = $newSeqID;
27 |     $seq = "";
28 |   } elsif(/^#qual(:(.*$))?/){
29 |     my $newSeqID = $2;
30 |     if($newSeqID && ($newSeqID ne $seqID)){
31 |       warn(sprintf("Quality ID [%s] and sequence ID [%s] do not match.".
32 | 		   " Setting ID to sequence ID [%s]", 
33 | 		   $newSeqID, $seqID, $seqID));
34 |     }
35 |     $fastqMode = 1;
36 |     $inSeq = 0; # false
37 |     $inQual = 1; # true
38 |   } elsif(/^#/){ # some other unknown tag
39 |     $inSeq = 0; # false
40 |     $inQual = 0; # false
41 |   }
42 |   if(/^ (.*$)/){
43 |     if($inSeq){
44 |       $seq .= $1 . "\n";
45 |     }
46 |     if($inQual){
47 |       $qual .= $1 . "\n";
48 |     }
49 |   }
50 | }
51 | 
52 | if($seqID){
53 |   if($fastqMode){
54 |     printf("@%s\n%s+\n%s", $seqID, $seq, $qual);
55 |   } else {
56 |     printf(">%s\n%s", $seqID, $seq);
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/fasta-nsplit.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | my $seq = "";
 6 | my $shortSeqID = "";
 7 | my $seqID = "";
 8 | my $keep = 0;
 9 | my $cumLength = 0;
10 | while(<>){
11 |   chomp;
12 |   if(/^>((.+?)( .*?\s*)?)$/){
13 |     my $newID = $1;
14 |     my $newShortID = $2;
15 |     if($seq){
16 |       my $inc = 0;
17 |       while($seq =~ s/(NNNN+)(.*)//){
18 | 	my $nStretch = $1;
19 |         my $newSeq = $2;
20 |         printf(">%s.%s\n%s\n", $seqID, $inc++, $seq) if ($seq);
21 | 	$cumLength += length($seq);
22 | 	printf(STDERR "%s\t%d\t%d\n", $shortSeqID, $cumLength, 
23 | 	       $cumLength + length($nStretch));
24 | 	$cumLength += length($nStretch);
25 |         $seq = $newSeq;
26 |       }
27 |       printf(">%s\n%s\n", $seqID, $seq) if ($seq);
28 |     }
29 |     $seq = "";
30 |     $shortSeqID = $newShortID;
31 |     $seqID = $newID;
32 |     $cumLength = 0;
33 |   } else {
34 |     $seq .= $_;
35 |   }
36 | }
37 | if($seq){
38 |   my $inc = 0;
39 |   while($seq =~ s/(NNNN+)(.*)//){
40 |     my $nStretch = $1;
41 |     my $newSeq = $2;
42 |     printf(">%s.%s\n%s\n", $seqID, $inc++, $seq) if ($seq);
43 |     $cumLength += length($seq);
44 |     printf(STDERR "%s\t%d\t%d\n", $shortSeqID, $cumLength, 
45 | 	   $cumLength + length($nStretch));
46 |     $cumLength += length($nStretch);
47 |     $seq = $newSeq;
48 |   }
49 |   printf(">%s\n%s\n", $seqID, $seq) if ($seq);
50 | }
51 | 


--------------------------------------------------------------------------------
/pileupPropView.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscript
 2 | 
 3 | pileupFileName <- commandArgs(TRUE)[1];
 4 | pos.start <- as.numeric(commandArgs(TRUE)[2]);
 5 | pos.end <- as.numeric(commandArgs(TRUE)[3]);
 6 |                         
 7 | data.prop.df <- subset(
 8 |     read.csv(pileupFileName),
 9 |     (Position >= pos.start) & (Position <= pos.end));
10 | 
11 | prop.plot <- function(data.df){
12 |     par(mar=c(5,5,0.5,1));
13 |     res <- barplot(t(as.matrix(data.df[,c("A","C","G","T","d","pR","i")]) *
14 |                      data.df$Coverage),
15 |                    ylim=c(0,max(data.df$Coverage*1.4)),
16 |                    xlim=c(0,nrow(data.df)*1.1),
17 |                    xaxt="n", xlab = "Mitochondrial Genome Location",
18 |                    ylab="Read Coverage", border=NA, space=0,
19 |                    col=c("darkgreen","blue","black","red",
20 |                          "steelblue","grey90","grey60"));
21 |     legend("right", horiz=FALSE, legend=c("A","C","G","T","Del","Ref","Ins"),
22 |            fill=c("darkgreen","blue","black","red",
23 |                   "steelblue","grey90","grey60"));
24 |     tckPoss <- pretty(data.df$Position);
25 |     axis(1, at=res[match(tckPoss, data.df$Position)],
26 |          labels=tckPoss);
27 | }
28 | 
29 | png(paste0(sub("\\..*$","",pileupFileName),"_",pos.start,"-",pos.end,".png"),
30 |     width=1366, height=718, pointsize=24);
31 | prop.plot(data.prop.df);
32 | dummy <- dev.off();
33 | 


--------------------------------------------------------------------------------
/mitochondrial_descendants.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my %foundingLine = ();
 7 | my %found = ();
 8 | my %maternalIDs = ();
 9 | my %sex = ();
10 | 
11 | while (<>) {
12 |     my @F = split(/\s+/);
13 |     $maternalIDs{$F[1]} = $F[3];
14 |     $sex{$F[1]} = $F[4];
15 |     if(($F[2] == 0) && ($F[3] == 0)){
16 | 	$foundingLine{$F[1]}{$F[1]} = 1;
17 |     }
18 | }
19 | 
20 | print(STDERR "Loaded all IDs\n");
21 | 
22 | my $changed = 1;
23 | 
24 | print(STDERR "Finding mitochondrial descendants...");
25 | while($changed){
26 |     $changed = 0;
27 |     foreach my $line (keys(%foundingLine)){
28 | 	my %lineInds = %{$foundingLine{$line}};
29 | 	foreach my $ind (keys(%lineInds)){
30 | 	    if(!$found{$ind}){
31 | 		#printf(STDERR "Finding mitochondrial descendants of <%d>\n", $ind);
32 | 		foreach my $ind2 (keys(%maternalIDs)){
33 | 		    if($maternalIDs{$ind2} == $ind){
34 | 			$foundingLine{$line}{$ind2} = 1;
35 | 		    }
36 | 		}
37 | 		$found{$ind} = 1;
38 | 		$changed = 1;
39 | 	    }
40 | 	}
41 |     }
42 | }
43 | 
44 | print(STDERR " stored all descendants\n");
45 | 
46 | foreach my $line (sort {$a <=> $b} (keys(%foundingLine))){
47 |     if(scalar(keys(%{$foundingLine{$line}})) > 1){
48 | 	printf("Mitochondrial descendants of <%s>:\n", $line);
49 | 	my %lineInds = %{$foundingLine{$line}};
50 | 	foreach my $ind (sort {$a <=> $b} (keys(%lineInds))){
51 | 	    printf(" $ind %d\n", $sex{$ind});
52 | 	}
53 |     }
54 | }
55 | 


--------------------------------------------------------------------------------
/minimus2_OCA.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | if [ $# -lt 1 ]
 4 |     then echo "Error: no arguments have been specified"
 5 |     echo "usage <path1/sample1.fasta.gz> [path2/sample2.fasta] [-nomerge]"
 6 |     exit
 7 | fi
 8 | 
 9 | ffn1=$1
10 | d1=$(dirname $(readlink -e ${ffn1}))
11 | f1=$(basename ${ffn1})
12 | y1=$(basename ${f1} .gz)
13 | y1=$(basename ${y1} .gzip)
14 | y1=$(basename ${y1} .fasta)
15 | y1=$(basename ${y1} .fa)
16 | mergeName=${y1}_mm2
17 | refCount=$(zgrep -c '^>' ${d1}/${f1})
18 | d2=""
19 | f2=""
20 | if [ $# -gt 1 ]
21 |     then ffn2=$2
22 |     d2=$(dirname $(readlink -e ${ffn2}))
23 |     f2=$(basename ${ffn2})
24 |     y2=$(basename ${f2} .gz)
25 |     y2=$(basename ${y2} .gzip)
26 |     y2=$(basename ${y2} .fasta)
27 |     y2=$(basename ${y2} .fa)
28 |     mergeName=${y1}_${y2}_mm2
29 |     if [ $3 != "-nomerge" ]
30 |         then echo "-- merging ${y2} into ${y1} --"
31 |         else echo "-- carrying out all-vs-all overlap on ${y1} + ${y2} --"
32 |     fi
33 |     else echo "-- carrying out all-vs-all overlap on ${y1} --"
34 | fi
35 | mkdir -p ${d1}/${mergeName}
36 | cd ${d1}/${mergeName}
37 | zcat -f ${d1}/${f1} > ${mergeName}.seq
38 | if [ $# -gt 1 ]
39 |     then zcat -f ${d2}/${f2} >> ${mergeName}.seq
40 | fi
41 | echo "currently in $(pwd)"
42 | ~/bin/amos/toAmos -s ${mergeName}.seq -o ${mergeName}.afg
43 | if [ $# -gt 2 ]
44 | then if [ $3 = "-nomerge" ]
45 |     then ~/bin/amos/minimus2 ${mergeName}
46 |     else ~/bin/amos/minimus2 ${mergeName} -D REFCOUNT=${refCount}
47 |     fi
48 | else ~/bin/amos/minimus2 ${mergeName} -D REFCOUNT=${refCount}
49 | fi
50 | echo "-- DONE --"
51 | 


--------------------------------------------------------------------------------
/mergeFast5events.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my $model = "";
 7 | my $cumLength = 0;
 8 | my $cumTotalLength = 0;
 9 | my $meanSig = 0;
10 | my $bpPos = 0;
11 | my $lastMove = 0;
12 | my $lastStart = 0;
13 | 
14 | my %colnums = ();
15 | 
16 | while(<>){
17 |   chomp;
18 |   my @F = split(",", $_);
19 |   if($F[5] !~ /[0-9]/){
20 |     my $colNum = 0;
21 |     grep {$colnums{$_} = $colNum++} @F;
22 |     print("channel,mux,read,model_state,move,bppos,mean,start,startSecs,length\n");
23 |     next;
24 |   }
25 |   ## convert start/length to samples, assuming 4kHz sampling
26 |   @F[($colnums{start},$colnums{length})] =
27 |     map({$_ * $F[$colnums{sampleRate}]} @F[($colnums{start},$colnums{length})]);
28 |   if($F[$colnums{model_state}] ne $model){
29 |     if($model){
30 |       print(join(",",(@F[($colnums{runID},$colnums{channel},$colnums{mux})],
31 |                       $model,$lastMove,$bpPos,sprintf('%0.2f',$meanSig),
32 |                       sprintf('%.0f', $lastStart),
33 |                       sprintf('%0.4f', $lastStart/$F[$colnums{sampleRate}]),
34 |                       sprintf('%.0f',$cumLength)
35 |                      )),"\n");
36 |     }
37 |     $model = $F[$colnums{model_state}];
38 |     $lastMove = $F[$colnums{move}];
39 |     $lastStart = $F[$colnums{start}];
40 |     $bpPos += $F[$colnums{move}];
41 |     $cumLength = 0;
42 |     $meanSig = 0;
43 |   }
44 |   $meanSig = ($meanSig * $cumLength + $F[$colnums{mean}] * $F[$colnums{length}]) / ($cumLength + $F[$colnums{length}]);
45 |   $cumLength = $cumLength + $F[$colnums{length}];
46 |   $cumLength = int($cumLength);
47 |   $cumTotalLength += $cumLength;
48 | }
49 | 


--------------------------------------------------------------------------------
/sam2mapped.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## sam2mapped -- created mapped sequences (as fasta files) for aligned regions
 4 | 
 5 | my $pos = -1;
 6 | my $seqName = "";
 7 | my $bestFlags = "";
 8 | my $bestID = "";
 9 | my $bestSeq = "";
10 | my $bestQual = "";
11 | my $bestLine = "";
12 | my $seenCount = 0;
13 | 
14 | my $addSeq = 40; # amount of sequence to add to the start and end
15 | 
16 | my $output = "fa"; # can be "fa"
17 | 
18 | sub printFQ {
19 |   my ($id, $seq, $qual) = @_;
20 |   if($id){
21 |     printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual);
22 |   }
23 | }
24 | 
25 | sub printFA {
26 |   my ($id, $seq, $qual) = @_;
27 |   if($id){
28 |     printf(">%s\n%s\n", $id, $seq);
29 |   }
30 | }
31 | 
32 | while(<>){
33 |   if(/^@/){
34 |     next;
35 |   }
36 |   my $line = $_;
37 |   chomp;
38 |   my @F = split(/\t/);
39 |   my $refPos = $F[3];
40 |   my $cigar = $F[5];
41 |   $cigar =~ s/[0-9]S$//;
42 |   my $seq = $F[9];
43 |   my $qual = $F[10];
44 |   my $startTrim = 0;
45 |   my $matchLen = 0;
46 |   while($cigar =~ s/^([0-9]+)([MIDNSHP=X])//){
47 |     my $subLen = $1;
48 |     my $op = $2;
49 |     if($op eq "S"){
50 |       if($subLen > $addSeq){
51 |         $seq = substr($seq, $subLen - $addSeq);
52 |         $qual = substr($qual, $subLen - $addSeq);
53 |       }
54 |     }
55 |     if($op =~ /[M=XI]/){
56 |       $matchLen += $subLen;
57 |     }
58 |   }
59 |   $seq = substr($seq, 0, $matchLen + $addSeq);
60 |   $qual = substr($qual, 0, $matchLen + $addSeq);
61 |   if($seq){
62 |     printFA($F[0], $seq, $qual);
63 |   }
64 | }
65 | if($output eq "fastq"){
66 |   printSeq($bestID, $bestSeq, $bestQual);
67 | } elsif($output eq "sam"){
68 |   print($bestLine);
69 | }
70 | 


--------------------------------------------------------------------------------
/sam2LongestBase.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## sam2LongestBase -- extracts a random sequence from the longest sequences at each base
 4 | 
 5 | my $pos = -1;
 6 | my $seqName = "";
 7 | my $bestFlags = "";
 8 | my $bestID = "";
 9 | my $bestSeq = "";
10 | my $bestQual = "";
11 | my $bestLine = "";
12 | my $seenCount = 0;
13 | 
14 | my $output = "sam"; # can be "sam" or "fastq"
15 | 
16 | sub printSeq {
17 |   my ($id, $seq, $qual) = @_;
18 |   if($id){
19 |     printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual);
20 |   }
21 | }
22 | 
23 | while(<>){
24 |   if(/^@/){
25 |     print;
26 |     next;
27 |   }
28 |   my $line = $_;
29 |   chomp;
30 |   my @F = split(/\t/);
31 |   if(($F[2] ne $seqName) || ($F[3] != $pos) || (length($bestSeq) <= length($F[9]))){
32 |     if(length($bestSeq) == length($F[9])){
33 |       ## reservoir sampling with a reservoir size of 1
34 |       ## See https://en.wikipedia.org/wiki/Reservoir_sampling
35 |       ## * with probability 1/i, keep the new item instead of the current item
36 |       $seenCount++;
37 |       if(!rand($seenCount)){
38 |         ## i.e. if rand($seenCount) == 0, then continue with replacement
39 |         next;
40 |       }
41 |     } else {
42 |       $seenCount = 1;
43 |     }
44 |     if(($F[2] ne $seqName) || ($F[3] != $pos)){
45 |       if($output eq "fastq"){
46 |         printSeq($bestID, $bestSeq, $bestQual);
47 |       } elsif($output eq "sam"){
48 |         print($bestLine);
49 |       }
50 |     }
51 |     $seqName = $F[2];
52 |     $pos = $F[3];
53 |     $bestLine = $line;
54 |     $bestID = $F[0];
55 |     $bestFlags = $F[1];
56 |     $bestSeq = $F[9];
57 |     $bestQual = $F[10];
58 |   }
59 | }
60 | if($output eq "fastq"){
61 |   printSeq($bestID, $bestSeq, $bestQual);
62 | } elsif($output eq "sam"){
63 |   print($bestLine);
64 | }
65 | 


--------------------------------------------------------------------------------
/fastx-qualHist.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_help pass_through);
 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 7 | 
 8 | my $idFileName = "";
 9 | my $quiet = 0;
10 | my $base = 33;
11 | 
12 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet,
13 |            "base" => \$base ) or
14 |   die("Error in command line arguments");
15 | 
16 | # unknown commands are treated as identifiers
17 | my @files = ();
18 | while(@ARGV){
19 |   my $arg = shift(@ARGV);
20 |   if(-f $arg){
21 |     push(@files, $arg);
22 |   }
23 | }
24 | @ARGV = @files;
25 | 
26 | my %qualCounts = ();
27 | 
28 | my $baseCount = 0;
29 | 
30 | my $inQual = 0; # false
31 | my $seqID = "";
32 | my $qualID = "";
33 | my $seq = "";
34 | my $qual = "";
35 | while(<>){
36 |   chomp;
37 |   chomp;
38 |   if(!$inQual){
39 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
40 |       my $newSeqID = $2;
41 |       my $newShortID = $3;
42 |       $seq = "";
43 |       $qual = "";
44 |       $seqID = $newSeqID;
45 |     } elsif(/^\+(.*)$/) {
46 |       $inQual = 1; # true
47 |       $qualID = $1;
48 |     } else {
49 |       $seq .= $_;
50 |     }
51 |   } else {
52 |     grep {$qualCounts{$_}++} split(//,$_);
53 |     $baseCount += length($_);
54 |     $qual .= $_;
55 |     if(length($qual) >= length($seq)){
56 |       $inQual = 0; # false
57 |     }
58 |   }
59 | }
60 | 
61 | my $cumCount = 0;
62 | foreach my $qualChar (sort(keys(%qualCounts))){
63 |   $cumCount += $qualCounts{$qualChar};
64 |   printf("%s [%2d]: %6d (%6.2f%% / %6.2f%% )\n",
65 |          $qualChar, ord($qualChar) - $base,
66 |          $qualCounts{$qualChar}, $qualCounts{$qualChar} * 100 / $baseCount,
67 |         $cumCount * 100 / $baseCount);
68 | }
69 | 
70 | printf("Total sequence length: %d\n", $baseCount);
71 | 


--------------------------------------------------------------------------------
/labid2UUID.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | use Text::CSV;
 7 | 
 8 | open(my $idLookupFile, "<", "/home/gringer/bioinf/GU-2012-Apr-01-RLMB/Miles/individuals/NI_UUID_Ped_2012-Oct-23.csv") or die("Cannot open lookup file");
 9 | 
10 | my $csv = Text::CSV->new ({ binary => 1, eol => $/ });
11 | 
12 | my %col = ();
13 | my $headerRow = $csv->getline($idLookupFile);
14 | my $fieldCount = 0;
15 | foreach my $field (@$headerRow){
16 |   $col{$field} = $fieldCount++;
17 | }
18 | 
19 | my %dataReplacement = ();
20 | my $fileType = "tfam";
21 | 
22 | while (my $row = $csv->getline($idLookupFile)){
23 |   my @fields = @$row;
24 |   if($fields[$col{"LAB_ID"}] ne "NA"){
25 |     my $labid = $fields[$col{"LAB_ID"}];
26 |     my $uuid = $fields[$col{"UUID"}];
27 |     my $gender = $fields[$col{"Gender"}];
28 |     my $genderVal = ($gender eq "Male")?1:($gender eq "Female")?2:0;
29 |     if(($fileType eq "tfam") || ($fileType eq "ped")){
30 |       $dataReplacement{$labid} = 
31 |           join(" ",1,$uuid,
32 |                ($fields[$col{"patID"}] eq "NA")?0:$fields[$col{"patID"}],
33 |                ($fields[$col{"matID"}] eq "NA")?0:$fields[$col{"matID"}],
34 |                $genderVal);
35 |     } else {
36 |       $dataReplacement{$labid} = $uuid;
37 |     }
38 |   }
39 | }
40 | close($idLookupFile);
41 | 
42 | while(<>){
43 |   if(($fileType eq "tfam") || ($fileType eq "ped")){
44 |     my $output = $_;
45 |     my @fields = split(/\s+/,$output,6);
46 |     if(exists($dataReplacement{$fields[1]})){
47 |       # try the 'individual' column first
48 |       $output = $dataReplacement{$fields[1]}." ".$fields[5];
49 |     } elsif(exists($dataReplacement{$fields[0]})){
50 |       # try the 'family' column
51 |       $output = $dataReplacement{$fields[0]}." ".$fields[5];
52 |     }
53 |     print($output);
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/posAggregate.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## Aggregates adjacent LAST matches into a single result/match line
 4 | 
 5 | use warnings;
 6 | use strict;
 7 | 
 8 | my ($oldquery, $oldtarget, $olddir, $oldqs, $oldqe, $oldqml,
 9 |     $oldql, $oldqpct, $oldts, $oldte, $oldtml, $oldtl, $oldtpct) =
10 |   ("") x 13;
11 | 
12 | while(<>){
13 |   chomp;
14 |   my ($query, $target, $dir, $qs, $qe, $qml,
15 |     $ql, $qpct, $ts, $te, $tml, $tl, $tpct) =
16 |       split(/,/);
17 |   if(/^query/){
18 |     print($_."\n");
19 |     next;
20 |   }
21 |   if(($query eq $oldquery) && ($target eq $oldtarget) &&
22 |      ($olddir eq $dir) &&
23 |      ((($dir eq "+") && ($oldts <= $ts)) ||
24 |       (($dir eq "-") && ($oldte >= $te)))){
25 |     $qs = $oldqs;
26 |     $qml += $oldqml;
27 |     $qpct = (($oldqpct * $oldqml) + ($qpct * $qml)) / ($oldqml + $qml);
28 |     $ts = $oldts if ($dir eq "+");
29 |     $te = $oldte if ($dir eq "-");
30 |     $tml += $oldtml;
31 |     $tpct = (($oldtpct * $oldtml) + ($tpct * $tml)) / ($oldtml + $tml);
32 |   } elsif($oldquery ne "") {
33 |     $qpct = sprintf("%0.2f", $qpct);
34 |     $tpct = sprintf("%0.2f", $tpct);
35 |     print(join(",",($oldquery, $oldtarget, $olddir,
36 |                     $oldqs, $oldqe, $oldqml, $oldql, $oldqpct,
37 |                     $oldts, $oldte, $oldtml, $oldtl, $oldtpct))."\n");
38 |   }
39 |   ($oldquery, $oldtarget, $olddir, $oldqs, $oldqe, $oldqml,
40 |    $oldql, $oldqpct, $oldts, $oldte, $oldtml, $oldtl, $oldtpct) =
41 |      ($query, $target, $dir, $qs, $qe, $qml,
42 |       $ql, $qpct, $ts, $te, $tml, $tl, $tpct);
43 | }
44 | 
45 | $oldqpct = sprintf("%0.2f", $oldqpct);
46 | $oldtpct = sprintf("%0.2f", $oldtpct);
47 | print(join(",",($oldquery, $oldtarget, $olddir,
48 |                 $oldqs, $oldqe, $oldqml, $oldql, $oldqpct,
49 |                 $oldts, $oldte, $oldtml, $oldtl, $oldtpct))."\n");
50 | 


--------------------------------------------------------------------------------
/vcf_normalise.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## vcf_normalise.pl -- removes VCF information from all but the
 4 | ## highest coverage position, considering SNPs/INDELs separately
 5 | 
 6 | ## RSRS    12      .       TATCA   TCATCA,TCAATCA  0       .       INDEL;IDV=13;IMF=0.0634146;DP=92629;...
 7 | ## RSRS    12      .       T       C,A,G   0       .       DP=92629;...
 8 | ## RSRS    12      .       T       C,G,A   0       .       DP=7490;...
 9 | ## RSRS    12      .       T       TT      0       .       INDEL;IDV=1;IMF=0.2;DP=7490;...
10 | ## =>
11 | ## RSRS    12      .       TATCA   TCATCA,TCAATCA  0       .       INDEL;IDV=13;IMF=0.0634146;DP=92629;...
12 | ## RSRS    12      .       T       C,A,G   0       .       DP=92629;...
13 | 
14 | use warnings;
15 | use strict;
16 | 
17 | my $indelCache = "";
18 | my $snpCache = "";
19 | my $bestSnpCov = 0;
20 | my $bestIndelCov = 0;
21 | my $oldChr = "";
22 | my $oldLoc = -1;
23 | my $chr = "";
24 | my $loc = 0;
25 | my $indel = "";
26 | my $cov = 0;
27 | 
28 | while(<>){
29 |     if(/^#/){
30 | 	print;
31 | 	next;
32 |     }
33 |     my $line = $_;
34 |     if($line =~ /^(.*?)\t([0-9]+)\t.*?(INDEL.*?;)?DP=([0-9]+)/){
35 | 	($chr, $loc, $indel, $cov) = ($1, $2, $3, $4);
36 |     }
37 |     if(($chr ne $oldChr) || ($loc != $oldLoc)){
38 | 	print($indelCache);
39 | 	print($snpCache);
40 | 	if($indel){
41 | 	    $indelCache = $line;
42 | 	    $bestIndelCov = $cov;
43 | 	    $snpCache = "";
44 | 	    $bestSnpCov = 0;
45 | 	} else {
46 | 	    $indelCache = "";
47 | 	    $bestIndelCov = 0;
48 | 	    $snpCache = $line;
49 | 	    $bestSnpCov = $cov;
50 | 	}
51 | 	$oldChr = $chr;
52 | 	$oldLoc = $loc;
53 |     } elsif($indel){
54 | 	if($cov > $bestIndelCov){
55 | 	    $indelCache = $line;
56 | 	    $bestIndelCov = $cov;
57 | 	}
58 |     } else{
59 | 	if($cov > $bestSnpCov){
60 | 	    $snpCache = $line;
61 | 	    $bestSnpCov = $cov;
62 | 	}
63 |     }
64 | }
65 | 
66 | print($indelCache);
67 | print($snpCache);
68 | 


--------------------------------------------------------------------------------
/gtsubset.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # gtsubset.pl -- subsets a simplegt file using an individual list
 4 | 
 5 | use warnings;
 6 | use strict;
 7 | use FileHandle;
 8 | use Getopt::Long qw(:config auto_version auto_help pass_through);
 9 | 
10 | my $idFileName = "";
11 | my %ids = ();
12 | 
13 | GetOptions(
14 |            'ids=s' => \$idFileName,
15 |           );
16 | 
17 | my @fileArgs = ();
18 | 
19 | foreach my $arg (@ARGV){
20 |     if(-f $arg){
21 | 	$idFileName = $arg;
22 |     } else {
23 | 	$ids{$arg} = 1;
24 |     }
25 | }
26 | 
27 | @ARGV = @fileArgs;
28 | 
29 | if($idFileName){
30 |   print(STDERR "Reading in id file... ");
31 |   open(my $idFile, "< $idFileName")
32 |   or die("cannot open $idFileName for reading");
33 |   while(<$idFile>){
34 |       chomp;
35 |       s/(\s|,).*$//;
36 |       $ids{$_} = 1;
37 |   }
38 |   close($idFile);
39 |   print(STDERR "done!\n");
40 | }
41 | 
42 | if(!%ids){
43 |     print(STDERR "Error: no IDs specified. Cannot continue.\n");
44 |     exit(1);
45 | }
46 | 
47 | my %filteredIDs = ();
48 | my @idOrder = ();
49 | my @colOrder = ();
50 | 
51 | while(<>){
52 |   chomp;
53 |   if(/^#/){
54 |     s/<Individual.*?://g;
55 |     s/##//g;
56 |     s/>//g;
57 |     s/^\s+//;
58 |     s/\s+$//;
59 |     my @origIDs = split(/\s+/);
60 |     for(my $i = 0; $i <= $#origIDs; $i++){
61 |       if($ids{$origIDs[$i]}){
62 |         $filteredIDs{$origIDs[$i]} = $i;
63 |         push(@idOrder, $origIDs[$i]);
64 |         push(@colOrder, $i);
65 |       }
66 |     }
67 |     if(!(@idOrder)){
68 | 	print(STDERR "Error: no specified IDs found. Cannot continue.\n");
69 | 	exit(1);
70 |     }
71 |     if(scalar(@idOrder) < scalar(keys(%ids))){
72 | 	printf(STDERR "Warning: some ID values were not found:\n");
73 | 	for my $id (keys(%ids)){
74 | 	    if(!$filteredIDs{$id}){
75 | 		print(STDERR " $id");
76 | 	    }
77 | 	}
78 | 	print(STDERR "\n");
79 |     }
80 |     printf("## <Individual/Column IDs: %s > ##\n", join(" ",@idOrder));
81 |     next;
82 |   }
83 |   my ($marker, @gts) = split(/\s+/);
84 |   printf("%-15s %s\n", $marker, join(" ", @gts[@colOrder]));
85 | }
86 | 
87 | 


--------------------------------------------------------------------------------
/tped2gtcounts.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | my ($tfamFileName) = shift(@ARGV);
 7 | 
 8 | my @maleGTs = ();
 9 | my @femaleGTs = ();
10 | 
11 | #print(STDERR "Processing TFAM file...");
12 | open(my $tfamFile, "<", $tfamFileName) or die("Cannot open TFAM file '$tfamFileName'");
13 | my $tfamPos = 0;
14 | my $lastGTPos = -1;
15 | while(<$tfamFile>){
16 |     chomp;
17 |     my ($iid, $fid, $mat, $pat, $sex, $pheno) = split(/\s+/);
18 |     if($sex == 1){
19 | 	push(@maleGTs, $tfamPos);
20 |     } elsif($sex == 2){
21 | 	push(@femaleGTs, $tfamPos);
22 |     }
23 |     $tfamPos++;
24 |     $lastGTPos++;
25 | }
26 | close($tfamFile);
27 | #print(STDERR " done\n");
28 | 
29 | my @maleAlleles = map {($_*2, $_*2+1)} @maleGTs;
30 | my @femaleAlleles = map {($_*2, $_*2+1)} @femaleGTs;
31 | 
32 | my $processed = 0;
33 | 
34 | print(STDERR ".");
35 | #printf("Marker,Sex,Type,Bases,Count\n");
36 | my %counts = ();
37 | while(<>){
38 |     chomp;
39 |     my $line = $_;
40 |     my ($chr, $marker, $mapPos, $genPos, @alleles) = split(/\s+/, $_);
41 |     my @genotypes = map {@alleles[$_*2].@alleles[$_*2+1]} (0..$lastGTPos);
42 |     %counts = ();
43 |     grep {
44 | 	my $gt = $_;
45 | 	my $a1 = substr($gt,0,1);
46 | 	my $a2 = substr($gt,1,1);
47 | 	$counts{"a"}{$gt}++;
48 | 	$counts{"a"}{$a1}++;
49 | 	$counts{"a"}{$a2}++;
50 | 	$counts{"m"}{$gt}++;
51 | 	$counts{"m"}{$a1}++;
52 | 	$counts{"m"}{$a2}++;
53 |     } @genotypes[@maleGTs];
54 |     grep {
55 | 	my $gt = $_;
56 | 	my $a1 = substr($gt,0,1);
57 | 	my $a2 = substr($gt,1,1);
58 | 	$counts{"a"}{$gt}++;
59 | 	$counts{"a"}{$a1}++;
60 | 	$counts{"a"}{$a2}++;
61 | 	$counts{"f"}{$gt}++;
62 | 	$counts{"f"}{$a1}++;
63 | 	$counts{"f"}{$a2}++;
64 |     } @genotypes[@femaleGTs];
65 |     foreach my $sex ("a","m","f"){
66 | 	foreach my $bases (keys(%{$counts{$sex}})){
67 | 	    printf("%s,%s,%s,%s,%d\n", $marker, $sex,
68 | 		   (length($bases) == 1) ? "a" : "g",
69 | 		   $bases,
70 | 		   $counts{$sex}{$bases});
71 | 	}
72 |     }
73 |     $processed++;
74 |     if($processed > 1000){
75 | 	print(STDERR ".");
76 | 	$processed = 0;
77 |     }
78 | }
79 | #print(STDERR " done\n");
80 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/header.html:
--------------------------------------------------------------------------------
 1 | <html><head>
 2 | <link rel="stylesheet" href="../style/css/tabs_style.css" />
 3 | <link rel="stylesheet" href="../style/css/side_menu.css" />
 4 | <link rel="stylesheet" href="../style/css/main_area.css" />
 5 | <title>%(site_name) BLAST Search (%(program))</title>
 6 | <script type="text/javascript" src="templates/sorttable.js"></script>
 7 | </head><body>
 8 | <form method="post" action="%(request_uri)" enctype="multipart/form-data">
 9 | <div id="menu_area">
10 | <div id="logo_area">
11 | <img alt="Project Logo" id="logo" src="../style/project_logo.png"
12 |      width="100" height="100" />
13 | </div>
14 | <div id="menu">
15 | <h1>%(site_name) %(program)</h1>
16 | <ul class="rootMenu">
17 | <li>Other Program
18 | <ul>
19 | <li>
20 |   <button type="submit" class="programButton" name="selectProgram" value="blastn">
21 |     <p>blastn</p>
22 |   </button>
23 | </li>
24 | <li>
25 |   <button type="submit" class="programButton" name="selectProgram" value="blastp">
26 |     <p>blastp</p>
27 |   </button>
28 | </li>
29 | <li>
30 |   <button type="submit" class="programButton" name="selectProgram" value="blastx">
31 |     <p>blastx</p>
32 |   </button>
33 | </li>
34 | <li>
35 |   <button type="submit" class="programButton" name="selectProgram" value="tblastn">
36 |     <p>tblastn</p>
37 |   </button>
38 | </li>
39 | <li>
40 |   <button type="submit" class="programButton" name="selectProgram" value="tblastx">
41 |     <p>tblastx</p>
42 |   </button>
43 | </li>
44 | </ul>
45 | </li>
46 | </ul>
47 | </div>
48 | </div>
49 | <div id="tabarea">
50 | <div id="tabbar">
51 | <ul>
52 | <li id="t_query" class="%(class_query)">
53 |   <button type="submit" name="selectTab" value="query">
54 |     <p>Query</p>
55 |   </button>
56 | </li>
57 | <li id="t_params" class="%(class_params)">
58 |   <button type="submit" name="selectTab" value="params">
59 |     <p>Parameters</p>
60 |   </button>
61 | </li>
62 | <li id="t_results" class="%(class_results)">
63 |   <button type="submit" name="selectTab" value="results">
64 |     <p>Results</p>
65 |   </button>
66 | </li>
67 | </ul>
68 | </div>
69 | </div>
70 | <div id="contentarea">
71 | <div id="tabcontent">
72 | 


--------------------------------------------------------------------------------
/vcf2simplegt.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | ## vcf2simplegt.pl -- convert from VCF file to simplegt format
 7 | 
 8 | use Getopt::Long qw(:config auto_help pass_through);
 9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
10 | 
11 | my %colNums = ();
12 | 
13 | my $idFileName = "";
14 | my @idOrder = ();
15 | my $idsSpecified = 0; # false
16 | 
17 | my %excludeCols = (
18 |   "#CHROM" => 1, "ID" => 1, "POS" => 1,  "REF" => 1,
19 |   "ALT" => 1, "QUAL" => 1,  "FILTER" => 1,
20 |   "INFO" => 1, "FORMAT" => 1);
21 | 
22 | GetOptions("idFile=s" => \$idFileName) or
23 |     die("Error in command line arguments");
24 | 
25 | if($idFileName){
26 |   print(STDERR "Retrieving id names from $idFileName...");
27 |   $idsSpecified = 1; # true
28 |   my $idFile = 0;
29 |   $idFile = new IO::Uncompress::Gunzip "$idFileName" or
30 |       die "Unable to open $idFileName\n";
31 |   while(<$idFile>){
32 |     if(/^\"?(.*?)\"?[\s,]+/){
33 |       my $id = $1;
34 |       push(@idOrder, $id);
35 |     }
36 |   }
37 |   close($idFile);
38 |   print(STDERR scalar(@idOrder)." id names extracted\n");
39 | }
40 | 
41 | my $nextAlleleNum = 0;
42 | my %alleleNums = ();
43 | my @alleles = ();
44 | 
45 | while(<>){
46 |   if(/^##/){
47 |     next;
48 |   }
49 |   chomp;
50 |   my @F = split(/\t/, $_);
51 |   if(/(^|\s)ID(\s|$)/){
52 |     my $colNum = 0;
53 |     foreach my $colName (@F){
54 | 	$colNums{$colName} = $colNum;
55 | 	if(!$idsSpecified && (!$excludeCols{$colName})){
56 | 	    push(@idOrder, $colName);
57 | 	}
58 | 	$colNum++;
59 |     }
60 |     @idOrder = grep {defined($colNums{$_})} @idOrder;
61 |     printf("## <Individual/Column IDs: %s > ##\n",
62 | 	join(" ", @idOrder));
63 |     next;
64 |   } elsif(!defined($colNums{"ID"})){
65 |       die("IDs have not been defined / found");
66 |   }
67 |   ## by this time, @idOrder should be populated with column IDs
68 |   ## identify alleles
69 |   @alleles = ($F[$colNums{"REF"}], split(/,/, $F[$colNums{"ALT"}]), ".");
70 |   grep {$_ =~ s/\./$#alleles/ge} @alleles;
71 |   printf("%-15s %s\n", $F[$colNums{"ID"}],
72 | 	 join(" ",map {join("",@alleles[split(/\|/, $_)])} @F[@colNums{@idOrder}]));
73 | }
74 | 


--------------------------------------------------------------------------------
/vcf_windowSorter.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # vcf_windowSorter.pl -- sorts a VCF file by position within a
 4 | # specified window size. The effect of this is to convert a file
 5 | # containing overlapping gene regions (with a given window of overlap)
 6 | # into a sorted file that can be indexed.
 7 | 
 8 | use warnings;
 9 | use strict;
10 | 
11 | my $windowSize = 50000;
12 |  my $currentChr = "";
13 | my $minPos = -1;
14 | my $maxPos = -1;
15 | 
16 | my %posCache = ();
17 | my @storedPoss = ();
18 | 
19 | while(<>){
20 |     my $line = $_;
21 |     if(!/^#/){
22 | 	my ($chr, $pos) = (/^(.*?)\t([0-9]+)/);
23 | 	if($chr ne $currentChr){
24 | 	    # spit out cache
25 | 	    for my $oldPos (@storedPoss){
26 | 		print($posCache{$oldPos});
27 | 	    }
28 | 	    # reset min/max values and clear cache
29 | 	    $currentChr = $chr;
30 | 	    $minPos = -1;
31 | 	    $maxPos = -1;
32 | 	    %posCache = ();
33 | 	    @storedPoss = ();
34 | 	}
35 | 	#print(join(":",($chr, $pos))."\n");
36 | 	if($pos > ($minPos + $windowSize)){
37 | 	    #printf(STDERR "Pos: $pos, MinPos: $minPos, MaxPos: $maxPos, ArraySize: %d\n", scalar(@storedPoss));
38 | 	    while(@storedPoss && ($storedPoss[0] <= ($pos - $windowSize))){
39 | 		# spit out first bit of cache
40 | 		my $oldPos = shift(@storedPoss);
41 | 		# extract and remove position information from cache
42 | 		print(delete($posCache{$oldPos}));
43 | 	    }
44 | 	    if(!(@storedPoss)){ # no remaining values, so no maximum value
45 | 		$maxPos = -1;
46 | 	    }
47 | 	}
48 | 	if(!exists($posCache{$pos})){ # avoid inserting positions multiple times
49 | 	    # pos might need to slot into array, so check with max value
50 | 	    if($pos < $maxPos){
51 | 		# this is expensive, but hopefully doesn't happen too often
52 | 		@storedPoss = sort({$a <=> $b} ($pos, @storedPoss));
53 | 		# no change to maxPos
54 | 	    } else {
55 | 		push(@storedPoss, $pos);
56 | 		$maxPos = $pos;
57 | 	    }
58 | 	}
59 | 	# store line in cache
60 | 	$posCache{$pos} .= $line;
61 | 	# regenerate minimum value; storedPoss should have at least 1 element
62 | 	$minPos = $storedPoss[0];
63 |     } else {
64 | 	print($_);
65 |     }
66 | }
67 | 
68 | # print out remaining cached locations
69 | for my $oldPos (@storedPoss){
70 |     print($posCache{$oldPos});
71 | }
72 | 


--------------------------------------------------------------------------------
/bam2proportion.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | readType=$1; # IonTorrent, Nanopore, Illumina
 4 | bamFile=$2;
 5 | regFile=$3; # /data/all/david/diagnostic/design/IAD31669_Submitted.bed
 6 | refFile=$(samtools view -H ${bamFile} | grep '\.fa' | perl -pe 's#^.*-f #/mnt/ihbi_ngs/iontorrent-kgq4#;s/(\.fa(sta)?).*$/$1/');
 7 | 
 8 | echo "BAM File: '${bamFile}'" > /dev/stderr;
 9 | echo "Reference File: '${refFile}'" > /dev/stderr;
10 | echo "Region File: '${regFile}'" > /dev/stderr;
11 | 
12 | pileupOpts="";
13 | 
14 | if [ ${readType} = "Nanopore" ] ; then
15 |     # initial assumption: similar to IonTorrent, with lower base quality and lower INDEL coefficient
16 |     pileupOpts="-d 10000 -L 10000 -Q 5 -h 20 -o 10 -e 17 -m 10 -f ${refFile} ${bamFile}";
17 | fi
18 | 
19 | if [ ${readType} = "Illumina" ] ; then
20 |     # Illumina should work with default options
21 |     pileupOpts="-f ${refFile} ${bamFile}";
22 | fi
23 | 
24 | # https://www.edgebio.com/variant-calling-ion-torrent-data
25 | 
26 | if [ ${readType} = "IonTorrent" ] ; then
27 |     pileupOpts="-d 10000 -L 10000 -Q 7 -h 50 -o 10 -e 17 -m 10 -f ${refFile} ${bamFile}";
28 | fi
29 | 
30 | # for Variant annotation with annovar
31 | # http://www.cureffi.org/2012/09/07/an-alternative-exome-sequencing-pipeline-using-bowtie2-and-samtools/
32 | 
33 | if [ -z ${regFile} ]; then
34 |     #echo "samtools mpileup ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'" > /dev/stderr
35 |     samtools mpileup ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'
36 | else
37 |     #echo "samtools mpileup -l ${regFile} ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'" > /dev/stderr
38 |     samtools mpileup -l ${regFile} ${pileupOpts} | ~/scripts/mpileup2Proportion.pl -m 10 | perl -pe 's/ +/,/g'
39 | fi
40 | 
41 | # for x in IonXpress_0*.bam; do echo -n ${x} "..."; ~/scripts/bam2proportion.sh IonTorrent ${x} /data/all/david/diagnostic/design/IAD31669_Submitted.bed > proportion_Diagnostics_AmpliSeq_QUT_NGS11_98_099/$(basename ${x} .bam).pileupprop.csv; echo "done"; done
42 | # for x in IonXpress_0*.bam; do echo -n ${x} "..."; ~/scripts/bam2proportion.sh IonTorrent ${x} > proportion_Shani_Mito_Run18_71_082/$(basename ${x} .bam).pileupprop.csv; echo "done"; done
43 | 
44 | exit 0
45 | 


--------------------------------------------------------------------------------
/fastx-annotate.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_help pass_through);
 6 | 
 7 | my $idFileName = "";
 8 | my $quiet = 0;
 9 | 
10 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet) or
11 |   die("Error in command line arguments");
12 | 
13 | my %idsToAnnotate = ();
14 | 
15 | # unknown commands are treated as identifiers
16 | my @files = ();
17 | while(@ARGV){
18 |   my $arg = shift(@ARGV);
19 |   if(-f $arg){
20 |     push(@files, $arg);
21 |   } else {
22 |     $idsToAnnotate{$arg} = shift(@ARGV);
23 |   }
24 | }
25 | @ARGV = @files;
26 | 
27 | if($idFileName){
28 |   # read sequence IDs from input file
29 |   printf(STDERR "Reading from file '$idFileName'\n");
30 |   open(my $idFile, "<", $idFileName);
31 |   while(<$idFile>){
32 |     chomp;
33 |     s/^[>@]//;
34 |     s/\"//g;
35 |     my @F = split(/,/,$_,2);
36 |     $idsToAnnotate{$F[0]} = $F[1];
37 |   }
38 |   close($idFile);
39 | }
40 | 
41 | if(!$quiet){
42 |   printf(STDERR "Read %d identifiers\n", scalar(keys(%idsToAnnotate)));
43 | }
44 | 
45 | my $inQual = 0; # false
46 | my $seqID = "";
47 | my $qualID = "";
48 | my $seq = "";
49 | my $qual = "";
50 | while(<>){
51 |   chomp;
52 |   chomp;
53 |   if(!$inQual){
54 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
55 |       my $newSeqID = $2;
56 |       my $newShortID = $3;
57 |       if($seqID){
58 |         if($qual){
59 |           printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
60 |         } else {
61 |           printf(">%s\n%s\n", $seqID, $seq);
62 |         }
63 |       }
64 |       $seq = "";
65 |       $qual = "";
66 |       if(exists($idsToAnnotate{$newSeqID})){
67 |         $seqID = $newSeqID . " " . $idsToAnnotate{$newSeqID};
68 |       } elsif(exists($idsToAnnotate{$newShortID})){
69 |         $seqID = $newShortID . " " . $idsToAnnotate{$newShortID};
70 |       } else {
71 |         $seqID = $newSeqID;
72 |       }
73 |     } elsif(/^\+(.*)$/) {
74 |       $inQual = 1; # true
75 |       $qualID = $1;
76 |     } else {
77 |       $seq .= $_;
78 |     }
79 |   } else {
80 |     $qual .= $_;
81 |     if(length($qual) >= length($seq)){
82 |       $inQual = 0; # false
83 |     }
84 |   }
85 | }
86 | 
87 | if($seqID){
88 |   if($qual){
89 |     printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
90 |   } else {
91 |     printf(">%s\n%s\n", $seqID, $seq);
92 |   }
93 | }
94 | 


--------------------------------------------------------------------------------
/webblast/style/css/side_menu.css:
--------------------------------------------------------------------------------
  1 | #menu_area {
  2 |    position: fixed;
  3 |    padding:0;
  4 |    margin:0;
  5 |    left:0;
  6 |    top:0;
  7 |    width: 170px;
  8 |    height: 100%;
  9 | }
 10 | 
 11 | #logo_area {
 12 |    position: absolute;
 13 |    padding:0;
 14 |    left:0;
 15 |    width: 130px;
 16 |    height: 130px;
 17 |    top: 0px;
 18 |    color: black;
 19 |    margin: 0px;
 20 |    text-decoration: none;
 21 | }
 22 | 
 23 | #logo {
 24 |    padding: 10px;
 25 | }
 26 | 
 27 | /* Menu Text */
 28 | #menu {
 29 |    color: black;
 30 |    margin-top: 130px;
 31 |    margin-left: 10px;
 32 |    font-family: "Bitstream Vera Sans", Verdana, Helvetica, Arial, sans-serif;
 33 | }
 34 | 
 35 | /* Gives padding to sub menu items (because they have links <a>) */
 36 | #menu a {background-color: transparent; padding: 7px; }
 37 | 
 38 | /* Aligns menu better */
 39 | #menu ul {top: 200px;  padding: 0; margin: 0; font-size: 1.05em; }
 40 | 
 41 | /* Sub-menu background area - making it hidden */
 42 | #menu > ul ul { display: none ; }
 43 | 
 44 | /* Sub-Menu Area */
 45 | #menu > ul li:hover > ul {
 46 |    width:200px;
 47 |    top: 1.25em;
 48 |    margin-left: -115px;
 49 |    margin-top: 12px;
 50 |    display: block;
 51 |    position: absolute;
 52 |    left: 100%;
 53 | }
 54 | 
 55 | /* Main Menu selector emphasis */
 56 | #menu > ul li:hover {
 57 |    width:150px;
 58 |    text-align: left;
 59 |    z-index: 10000;
 60 |    cursor: pointer;
 61 |    background: url("menu_active.png");
 62 | }
 63 | 
 64 | /* Main Menu selector emphasis */
 65 | #menu ul li ul li:hover {
 66 |    width:100px;
 67 |    text-align: left;
 68 |    z-index: 10000;
 69 |    cursor: pointer;
 70 | }
 71 | /* Pop-out menu colour */
 72 | #menu > ul ul li {
 73 |    width:200px;
 74 |    cursor: pointer;
 75 |    line-height: 1.05em;
 76 | }
 77 | 
 78 | /* Sub-menu selector area decleration */
 79 | #menu > ul ul li:hover {
 80 |    width:165px;
 81 | }
 82 | 
 83 | /* Sub-menu text */
 84 | #menu > ul ul li a {
 85 |    display: block;
 86 |    color: black;
 87 |    text-decoration: none;  }
 88 | 
 89 | /* Menu text spacing */
 90 | #menu ul li {
 91 |    width:100%;
 92 |    list-style-type: none;
 93 |    position: relative;
 94 |    line-height: 2.0em;
 95 | }
 96 | 
 97 | #menu { padding-left: 5px;}
 98 | 
 99 | #menu p {
100 |     margin: 2px;
101 | }
102 | 
103 | .programButton {
104 |     border-style: none;
105 |     background-color: transparent;
106 | }


--------------------------------------------------------------------------------
/dotplotAnnotator.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscript
 2 | 
 3 | library(png);
 4 | 
 5 | fname <- commandArgs(TRUE);
 6 | img <- readPNG(fname);
 7 | 
 8 | valToSci <- function(val, unit = ""){
 9 |     sci.prefixes <- c("", "k", "M", "G", "T", "P", "E", "Z", "Y");
10 |     units <- rep(paste(sci.prefixes,unit,sep=""), each=3);
11 |     logRegion <- floor(log10(val))+1;
12 |     conv.units <- units[logRegion];
13 |     conv.div <- 10^rep(0:(length(sci.prefixes)-1) * 3, each = 3)[logRegion];
14 |     conv.val <- val / conv.div;
15 |     conv.val[val == 0] <- 0;
16 |     conv.units[val == 0] <- unit;
17 |     return(sprintf("%s %s",conv.val,conv.units));
18 | }
19 | 
20 | fib.divs <- round(10^((0:4)/5) * 2) * 0.5; ## splits log decades into 5
21 | 
22 | seqName <- sub("^.*?_(.*).png","\\1",fname);
23 | seqLen <- sub("_.*$","",fname);
24 | seqStart <- 0;
25 | seqEnd <- 0;
26 | if(grepl("-", seqLen)){
27 |     seqStart <- as.numeric(sub("-.*$","",seqLen));
28 |     seqEnd <- as.numeric(sub("^.*?-","",seqLen));
29 |     seqLen <- seqEnd - seqStart + 1;
30 | } else {
31 |     seqLen <- as.numeric(seqLen);
32 |     seqStart <- 1;
33 |     seqEnd <- seqLen;
34 | }
35 | 
36 | print(c(seqName, seqLen, seqStart, seqEnd));
37 | 
38 | png(sprintf("featurePlot_%s.png", seqName),
39 |     width=1800, height=600, pointsize=12);
40 | #svg(sprintf("featurePlot_%s.svg", seqName),
41 | #    width=12.8, height=7.2, pointsize=12);
42 | par(mar=c(5,6,3,1.5), cex.axis=1.5, cex.lab=1.5, cex.main=2);
43 | plot(NA, main=sprintf("Feature profile (%s)", seqName),
44 |      xlab="Sequence Location (kbp)",
45 |      ylab="", log="y", xlim=c(seqStart,seqEnd)/1000,
46 |      ylim=c(1,seqLen), yaxt="n");
47 | rasterImage(img, xleft=seqStart/1000, xright=seqEnd/1000,
48 |             ybottom=1, ytop=seqLen);
49 | drMax <- ceiling(log10(seqLen));
50 | axis(2, at= 10^(0:drMax), las=2, lwd=3, cex.axis=1.5, labels=valToSci(10^(0:drMax)));
51 | axis(2, at= rep(1:9, each=drMax+1) * 10^(0:drMax), labels=FALSE);
52 | abline(h=10^(0:drMax), col="#80808050", lwd = 3);
53 | mtext("Feature distance (bp)", 2, line=4.5, cex=1.5);
54 | legend(x = "bottom",
55 |        fill=c("#9000a0","#8b0000",
56 |               "#fdc086","#ff7f00",
57 |               "#00a090","#0000ff",
58 |               "#a09000","#00a000"),
59 |     legend=c("Repeat (L)",  "Repeat (R)",
60 |              "Comp (L)",    "Comp (R)",
61 |              "RevComp (L)", "RevComp (R)",
62 |              "Reverse (L)", "Reverse (R)"),
63 |     bg="#FFFFFF", horiz=FALSE, inset=0.01, ncol=4);
64 | invisible(dev.off());
65 | 


--------------------------------------------------------------------------------
/markercsv2linkage.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # csv2merlin.pl -- Copy genotype data and pedigree information
 4 | # to create MERLIN-compatible QTDT files
 5 | 
 6 | # Author: David Eccles (gringer), 2013 <bioinformatics@gringer.org>
 7 | 
 8 | use warnings;
 9 | use strict;
10 | 
11 | use Text::CSV;
12 | 
13 | sub usage {
14 |   print("usage: ./csv2merlin.pl <PED file> <genotype file> [options]\n");
15 |   print("\nConvert CSV format to MERLIN-QTDT\n");
16 |   print("\nOther Options:\n");
17 |   print("-help        : Only display this help message\n");
18 |   print("\n");
19 | }
20 | 
21 | my @files = ();
22 | 
23 | my $pedFileName = "";
24 | my $gtFileName = "";
25 | 
26 | # extract command line arguments
27 | while(@ARGV){
28 |     my $argument = shift @ARGV;
29 |     if(-f $argument){ # file existence check
30 |         if(!$pedFileName){
31 |             $pedFileName = $argument;
32 |             #printf(STDERR "Setting PED file name to '%s'\n", $pedFileName);
33 |         } elsif (!$gtFileName) {
34 |             $gtFileName = $argument;
35 |             #printf(STDERR "Setting genotype file ".
36 |             #       "name to '%s'\n", $gtFileName);
37 |         } else {
38 |             print(STDERR "Error: only two files can be specified\n");
39 |             usage();
40 |             exit(1);
41 |         }
42 |     } else {
43 |         if($argument eq "-help"){
44 |             usage();
45 |             exit(0);
46 |         }
47 |     }
48 | }
49 | 
50 | if(!$pedFileName || !$gtFileName){
51 |     print(STDERR "Error: *two* files must be specified, cannot continue\n");
52 |     usage();
53 |     exit(1);
54 | }
55 | 
56 | my $csv = Text::CSV->new ( { binary => 1 } )  # should set binary attribute.
57 |    or die "Cannot use CSV: ".Text::CSV->error_diag ();
58 | 
59 | my $maxGTCount = 0;
60 | my %gtLines = ();
61 | open(my $gtFile, "<", $gtFileName) or die("Cannot open genotype file");
62 | while(my $row = $csv->getline($gtFile)){
63 |   my @F = @{$row};
64 |   my $id = shift(@F);
65 |   if(scalar(@F) > $maxGTCount){
66 |     $maxGTCount = scalar(@F);
67 |   }
68 |   map{$_ = ($_)?$_:"0/0"} @F;
69 |   $gtLines{$id} = join(" ",@F)."\n";
70 | }
71 | close($gtFile);
72 | 
73 | my $blankLine = " 0/0" x $maxGTCount . "\n";
74 | 
75 | open(my $pedFile, "<", $pedFileName) or die("Cannot open PED file");
76 | while(<$pedFile>){
77 |   chomp;
78 |   print $_." ";
79 |   my @F = split(/\s+/, $_, 3);
80 |   if($gtLines{$F[1]}){
81 |     print($gtLines{$F[1]});
82 |   } else {
83 |     print($blankLine);
84 |   }
85 | }
86 | 


--------------------------------------------------------------------------------
/fastx-grep.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | ## fastx-grep -- search for a pattern in the sequence or sequence name
 6 | 
 7 | use Getopt::Long qw(:config auto_help pass_through);
 8 | 
 9 | my $quiet = 0;
10 | my $filterSeqs = "";
11 | my $filterIDs = "";
12 | my $reverse = 0;
13 | 
14 | GetOptions("filter=s" => \$filterSeqs, "idfilter=s" => \$filterIDs,
15 |            "reverse|v!" => \$reverse, "quiet!" => \$quiet) or
16 |   die("Error in command line arguments");
17 | 
18 | # unknown commands are treated as identifiers
19 | my @files = ();
20 | while(@ARGV){
21 |   my $arg = shift(@ARGV);
22 |   if(-e $arg){
23 |     push(@files, $arg);
24 |   } else {
25 |     $filterSeqs .= "|$arg";
26 |   }
27 | }
28 | @ARGV = @files;
29 | 
30 | if($filterSeqs){
31 |   $filterSeqs =~ s/^\|//;
32 |   $filterSeqs = "($filterSeqs)";
33 |   if($reverse){
34 |     printf(STDERR "Filter sequence (excluded): $filterSeqs\n");
35 |   } else {
36 |     printf(STDERR "Filter sequence: $filterSeqs\n");
37 |   }
38 | }
39 | 
40 | if($filterIDs){
41 |   $filterIDs =~ s/^\|//;
42 |   $filterIDs = "($filterIDs)";
43 |   if($reverse){
44 |     printf(STDERR "Filter ID (excluded): $filterIDs\n");
45 |   } else {
46 |     printf(STDERR "Filter ID: $filterIDs\n");
47 |   }
48 | }
49 | 
50 | 
51 | my $inQual = 0; # false
52 | my $seqID = "";
53 | my $qualID = "";
54 | my $seq = "";
55 | my $qual = "";
56 | while(<>){
57 |   chomp;
58 |   chomp;
59 |   if(!$inQual){
60 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
61 |       my $newSeqID = $2;
62 |       my $newShortID = $3;
63 |       if($seqID && (!$filterSeqs || ($reverse xor ($seq =~ /$filterSeqs/)))
64 |         && (!$filterIDs || ($reverse xor ($seqID =~ /$filterIDs/)))){
65 |         if($qual){
66 |           printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
67 |         } else {
68 |           printf(">%s\n%s\n", $seqID, $seq);
69 |         }
70 |       }
71 |       $seq = "";
72 |       $qual = "";
73 |       $seqID = $newSeqID;
74 |     } elsif(/^\+(.*)$/) {
75 |       $inQual = 1; # true
76 |       $qualID = $1;
77 |     } else {
78 |       $seq .= $_;
79 |     }
80 |   } else {
81 |     $qual .= $_;
82 |     if(length($qual) >= length($seq)){
83 |       $inQual = 0; # false
84 |     }
85 |   }
86 | }
87 | 
88 | if($seqID && (!$filterSeqs || ($reverse xor ($seq =~ /$filterSeqs/)))
89 |    && (!$filterIDs || ($reverse xor ($seqID =~ /$filterIDs/)))){
90 |     if($qual){
91 | 	printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
92 |     } else {
93 | 	printf(">%s\n%s\n", $seqID, $seq);
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/quantile_subset.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## quantile_subset.pl -- filters out items from a sorted list (with
 4 | ##   repeated values in the first field) that are the closest to the
 5 | ##   desired quantiles
 6 | 
 7 | ## For example, if there are 100 values, then a request for the 5th
 8 | ## and 95th quantiles will produce the 5th and 95th value respectively.
 9 | 
10 | ## usage: cat <csv_file> | quantile_subset.pl -quantiles 0.05,0.95
11 | 
12 | use warnings;
13 | use strict;
14 | 
15 | use Getopt::Long qw(:config auto_help pass_through);
16 | 
17 | my @values = ();
18 | my $marker = "";
19 | 
20 | my @quantiles = (0.05,0.95);
21 | my $quantileStr = "";
22 | 
23 | GetOptions("quantiles=s" => \$quantileStr) or
24 |   die("Error in command line arguments");
25 | 
26 | if($quantileStr){
27 |   @quantiles = split(/,/, $quantileStr);
28 | }
29 | 
30 | while(<>){
31 |   chomp;
32 |   my @F = split(/,/, $_);
33 |   my $lastLine = $_;
34 |   if($marker ne $F[0]){
35 |     if(@values){
36 |       foreach my $quantile (@quantiles){
37 |         if($quantile == 1){
38 |           printf("%s\n", $values[$#values]);
39 |         } else {
40 |           my $rankPos = $quantile * $#values;
41 |           my $rankInt = int($rankPos);
42 |           my $rankFrac = $rankPos - $rankInt;
43 |           ## standard quantile calculation doesn't make sense for
44 |           ## fractional locations when actual position is in between
45 |           ## two arbitrary text fields, so choose 0.5 as a threshold
46 |           ## [e.g. SNP at chromosome 4, location 30 Mb vs
47 |           ##       SNP at chromosome 7, location 10 Mb]
48 |           if($rankFrac < 0.5){
49 |             printf("%s\n", $values[$rankInt]);
50 |           } else {
51 |             printf("%s\n", $values[$rankInt+1]);
52 |           }
53 |         }
54 |       }
55 |       @values = ();
56 |     }
57 |   }
58 |   $marker = $F[0];
59 |   if(!/[0-9]$/){
60 |     ## Write header line(s) [end with non-numeric character] out directly
61 |     printf("%s\n", $lastLine);
62 |   } else {
63 |     push(@values, $lastLine);
64 |   }
65 | }
66 | 
67 | if(@values){
68 |   foreach my $quantile (@quantiles){
69 |     if($quantile == 1){
70 |       printf("%s\n", $values[$#values]);
71 |     } else {
72 |       my $rankPos = $quantile * $#values;
73 |       my $rankInt = int($rankPos);
74 |       my $rankFrac = $rankPos - $rankInt;
75 |       if($rankFrac < 0.5){
76 |         printf("%s\n", $values[$rankInt]);
77 |       } else {
78 |         printf("%s\n", $values[$rankInt+1]);
79 |       }
80 |     }
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/power_analysis.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscripts
 2 | data.df <-
 3 |     read.csv("Power_Data_30_min_kWh.csv", stringsAsFactors=FALSE);
 4 | 
 5 | colnames(data.df) <- c("Date","Time","Usage");
 6 | 
 7 | data.df$Date <- as.POSIXct(data.df$Date, format="%d/%m/%Y");
 8 | data.df$Time <- as.POSIXct(data.df$Time, format="%I:%M:%S %p");
 9 | 
10 | ## Graph of power usage by half-hour (aggregated over all days), with median
11 | png("PowerUsageDayAggregate.png", width=1280, height=640, pointsize=24);
12 | par(xaxs="i", mar=c(4,4,0.5,0.5), cex.axis=0.8);
13 | smoothScatter(y=log10(data.df$Usage),
14 |               x= as.numeric(data.df$Time),
15 |      pch=21, bg="#00000010",
16 |      col="#00000010", xaxt="n", yaxt="n", ylab = "Usage (kWh)",
17 |      xlab = "Time", nbin=c(48*4,128), bandwidth=c(1200,.04), nrpoints=0);
18 | axis(1, at=sort(unique(data.df$Time)), cex.axis=0.71, las=2,
19 |      labels=strftime(sort(unique(data.df$Time)), format="%H:%M %p"));
20 | axis(2, at=log10(1:9 * rep(10^(-1:1),each=9)),
21 |      labels=(1:9 * rep(10^(-1:1),each=9)), las=1);
22 | day.aggregate <- tapply(data.df$Usage, data.df$Time, median);
23 | points(spline(x=as.POSIXct(names(day.aggregate)),y=log10(day.aggregate)),
24 |        type = "l");
25 | points(x=as.POSIXct(names(day.aggregate)),y=log10(day.aggregate), lwd=2);
26 | invisible(dev.off());
27 | 
28 | ## Graph of daily power usage, with median daily usage per month
29 | png("PowerUsageByMonth.png", width=1280, height=640, pointsize=24);
30 | par(mar=c(4,4,0.5,0.5), cex.axis=0.8);
31 | plot(x=as.POSIXct(names(day.sum)), y=day.sum, xaxt="n", xlab="",
32 |      ylab = "Daily Consumption (kWh)", col="#00000020", pch=21,
33 |      bg="#00000020", cex=0.5);
34 | axis(1, at=as.POSIXct(sprintf("201%d-%02d-01",rep(5:7,each=11), 2:12)),
35 |      label=strftime(sprintf("201%d-%02d-01",rep(5:7,each=11), 2:12),
36 |                     format="%b"), las=2);
37 | axis(1, at=as.POSIXct(sprintf("201%d-%02d-01",rep(5:7,each=1), 1)),
38 |      label=strftime(sprintf("201%d-%02d-01",rep(5:7,each=1), 1),
39 |                     format="%Y-%b"), las=2);
40 | day.sum <- tapply(data.df$Usage, data.df$Date, sum);
41 | month.med <- tapply(day.sum, strftime(names(day.sum), format="%Y-%b"), median);
42 | names(month.med) <- sub("$","-15",names(month.med));
43 | month.med <- month.med[order(as.POSIXct(names(month.med), format="%Y-%b-%d"))];
44 | points(x=as.POSIXct(names(month.med), format="%Y-%b-%d"), y=month.med,
45 |        pch=21, bg="black");
46 | points(spline(x=as.POSIXct(names(month.med), format="%Y-%b-%d"), y=month.med,
47 |               n=length(month.med) * 10),
48 |        type="l");
49 | invisible(dev.off());
50 | 


--------------------------------------------------------------------------------
/sam-senseflagger.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## sam-sense_flagger.pl -- re-flag a SAM file using sense and
 4 | ##   anti-sense mapping information to create a strand-specific
 5 | ##   output. Query-reverse sequences are set to read #1, while
 6 | ##   Query-forward are set to read #2
 7 | 
 8 | use Getopt::Long qw(:config auto_help pass_through);
 9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
10 | 
11 | my $fwdFileName = "";
12 | my $revFileName = "";
13 | my $prefix = "split";
14 | my $filter = 0; ## false
15 | my $filterFwd = 0; ## false
16 | my $filterRev = 0; ## false
17 | 
18 | GetOptions("fwdids=s" => \$fwdFileName, "revids=s" => \$revFileName,
19 |            "filter!" => \$filter, "xfwd!" => \$filterFwd,
20 |            "xrev!" => \$filterRev, ) or
21 |   die("Error in command line arguments");
22 | 
23 | if(!$fwdFileName || !$revFileName){
24 |   die("Error: 'fwdFileName' and 'revFileName' have not been defined");
25 | }
26 | 
27 | my %fwdIDs;
28 | my %revIDs;
29 | 
30 | open(my $fwdFile, "<", $fwdFileName) or die("Error loading foward ID file");
31 | while(<$fwdFile>){
32 |   chomp;
33 |   $fwdIDs{$_} = $_;
34 | }
35 | close($fwdFile);
36 | 
37 | open(my $revFile, "<", $revFileName) or die("Error loading reverse ID file");
38 | while(<$revFile>){
39 |   chomp;
40 |   $revIDs{$_} = $_;
41 | }
42 | close($fwdFile);
43 | 
44 | my $pos = -1;
45 | my $seqName = "";
46 | my $bestFlags = "";
47 | my $bestID = "";
48 | my $bestSeq = "";
49 | my $bestQual = "";
50 | my $bestLine = "";
51 | my $seenCount = 0;
52 | 
53 | sub printSeq {
54 |   my ($id, $seq, $qual) = @_;
55 |   if($id){
56 |     printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual);
57 |   }
58 | }
59 | 
60 | while(<>){
61 |   if(/^@/){
62 |     print;
63 |     next;
64 |   }
65 |   chomp;
66 |   my @F = split(/\t/);
67 |   my $flag = $F[1];
68 |   my $revMap = $flag & 0x10; # mapping orientation to *reference*
69 |   my $sense = 0;
70 |   my $antisense = 0;
71 |   my ($fwdRead, $revRead) = (0, 0); # false
72 |   $flag = $flag &= ~0xC3; # clear 0x80, 0x40, 0x01, and 0x03 flags
73 |   if($revIDs{$F[0]}){ ## reverse-mapped to expected *query*
74 |     $flag |= 0x43;
75 |     $fwdRead = $revMap;
76 |     $revRead = !$revMap;
77 |   } elsif($fwdIDs{$F[0]}){ ## forward-mapped to expected *query*
78 |     $flag |= 0x83;
79 |     $fwdRead = !$revMap;
80 |     $revRead = $revMap;
81 |   } else{
82 |     $flag |= 0x200;
83 |   }
84 |   if(!$filter | !(($flag & 0x200) || ($flag & 0x100) || ($flag & 0x800))){
85 |     $F[1] = $flag;
86 |     if(($filterFwd && $fwdRead) || ($filterRev && $revRead) ||
87 |        !($filterFwd | $filterRev)){
88 |       print(join("\t",@F)."\n");
89 |     }
90 |   }
91 | }
92 | 


--------------------------------------------------------------------------------
/fastx-rc.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | ## fastx-rc -- reverse complement a sequence
  6 | 
  7 | use Getopt::Long qw(:config auto_help pass_through);
  8 | 
  9 | my $quiet = 0;
 10 | my $preserve = 0;
 11 | 
 12 | sub rc {
 13 |   my ($seq) = @_;
 14 |   $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/;
 15 |   # work on masked sequences as well
 16 |   $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/;
 17 |   return(scalar(reverse($seq)));
 18 | }
 19 | 
 20 | GetOptions("quiet!" => \$quiet, "preserve!" => \$preserve) or
 21 |   die("Error in command line arguments");
 22 | 
 23 | # unknown commands are treated as identifiers
 24 | my @files = ();
 25 | while(@ARGV){
 26 |   my $arg = shift(@ARGV);
 27 |   if(-f $arg){
 28 |     push(@files, $arg);
 29 |   }
 30 | }
 31 | @ARGV = @files;
 32 | 
 33 | my $inQual = 0; # false
 34 | my $seqID = "";
 35 | my $qualID = "";
 36 | my $seq = "";
 37 | my $qual = "";
 38 | while(<>){
 39 |   chomp;
 40 |   chomp;
 41 |   if(!$inQual){
 42 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
 43 |       my $newSeqID = $2;
 44 |       my $newShortID = $3;
 45 |       if($seqID){
 46 | 	if($preserve){
 47 | 	  if($qual){
 48 | 	    printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
 49 | 	  } else {
 50 | 	    $seq =~ s/\n//g;
 51 | 	    $seq =~ s/(.{70})/$1\n/g;
 52 | 	    $seq =~ s/\n$//;
 53 | 	    printf(">%s\n%s\n", $seqID, $seq);
 54 | 	  }
 55 | 	}
 56 |         if($qual){
 57 |           printf("@%s [RC]\n%s\n+\n%s\n", $seqID, rc($seq), scalar(reverse($qual)));
 58 |         } else {
 59 |           $seq =~ s/\n//g;
 60 |           $seq = rc($seq);
 61 |           $seq =~ s/(.{70})/$1\n/g;
 62 |           $seq =~ s/\n$//;
 63 |           printf(">%s [RC]\n%s\n", $seqID, $seq);
 64 |         }
 65 |       }
 66 |       $seq = "";
 67 |       $qual = "";
 68 |       $seqID = $newSeqID;
 69 |     } elsif(/^\+(.*)$/) {
 70 |       $inQual = 1; # true
 71 |       $qualID = $1;
 72 |     } else {
 73 |       $seq .= $_;
 74 |     }
 75 |   } else {
 76 |     $qual .= $_;
 77 |     if(length($qual) >= length($seq)){
 78 |       $inQual = 0; # false
 79 |     }
 80 |   }
 81 | }
 82 | 
 83 | if($seqID){
 84 |   if($preserve){
 85 |     if($qual){
 86 |       printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
 87 |     } else {
 88 |       $seq =~ s/\n//g;
 89 |       $seq =~ s/(.{70})/$1\n/g;
 90 |       $seq =~ s/\n$//;
 91 |       printf(">%s\n%s\n", $seqID, $seq);
 92 |     }
 93 |   }
 94 |   if($qual){
 95 |     printf("@%s [RC]\n%s\n+\n%s\n", $seqID, rc($seq), scalar(reverse($qual)));
 96 |   } else {
 97 |     $seq =~ s/\n//g;
 98 |     $seq = rc($seq);
 99 |     $seq =~ s/(.{70})/$1\n/g;
100 |     $seq =~ s/\n$//;
101 |     printf(">%s [RC]\n%s\n", $seqID, $seq);
102 |   }
103 | }
104 | 


--------------------------------------------------------------------------------
/fastx-isofilter.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | ## fastx-isofilter.pl -- Extract the longest isoform (or ORF) from transcripts
  6 | 
  7 | use Getopt::Long qw(:config auto_help pass_through);
  8 | 
  9 | my $quiet = 0;
 10 | my $trimString = "_i[0-9]+\$";
 11 | my $orfMode = 0;
 12 | 
 13 | GetOptions("trim=s" => \$trimString, "orf!" => \$orfMode,
 14 |            "quiet!" => \$quiet) or
 15 |   die("Error in command line arguments");
 16 | 
 17 | if($orfMode){
 18 |   $trimString = "_[0-9]+\$";
 19 | }
 20 | 
 21 | # unknown commands are treated as identifiers
 22 | my @files = ();
 23 | while(@ARGV){
 24 |   my $arg = shift(@ARGV);
 25 |   if(-f $arg){
 26 |     push(@files, $arg);
 27 |   } else {
 28 |     $trimString .= "|$arg";
 29 |   }
 30 | }
 31 | @ARGV = @files;
 32 | 
 33 | # use stdin if no files supplied
 34 | if(!@ARGV){
 35 |   @ARGV = '-' unless (-t STDIN);
 36 | }
 37 | 
 38 | if($trimString){
 39 |   $trimString =~ s/^\|//;
 40 |   $trimString = "($trimString)";
 41 | }
 42 | 
 43 | my $inQual = 0; # false
 44 | my $seqID = "";
 45 | my $fullID = "";
 46 | my $qualID = "";
 47 | my $seq = "";
 48 | my $qual = "";
 49 | my %fastXStrs = ();
 50 | my %fastXLengths = ();
 51 | while(<>){
 52 |   chomp;
 53 |   chomp;
 54 |   if(!$inQual){
 55 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
 56 |       my $newFullID = $2;
 57 |       my $newSeqID = $3;
 58 |       $newSeqID =~ s/$trimString//;
 59 |       if($seqID && (!$fastXLengths{$seqID} || ($fastXLengths{$seqID} < length($seq)))){
 60 |         $fastXLengths{$seqID} = length($seq);
 61 |         if(!$qual){
 62 |           $seq =~ s/(.{100})/$1\n/g;
 63 |           $seq =~ s/\n$//;
 64 |         }
 65 |         $fastXStrs{$seqID} = ($qual) ?
 66 |           sprintf("@%s\n%s\n+\n%s\n", $fullID, $seq, $qual) :
 67 |           sprintf(">%s\n%s\n", $fullID, $seq);
 68 |       }
 69 |       $seq = "";
 70 |       $qual = "";
 71 |       $seqID = $newSeqID;
 72 |       $fullID = $newFullID;
 73 |     } elsif(/^\+(.*)$/) {
 74 |       $inQual = 1; # true
 75 |       $qualID = $1;
 76 |     } else {
 77 |       $seq .= $_;
 78 |     }
 79 |   } else {
 80 |     $qual .= $_;
 81 |     if(length($qual) >= length($seq)){
 82 |       $inQual = 0; # false
 83 |     }
 84 |   }
 85 | }
 86 | 
 87 | if($seqID && (!$fastXLengths{$seqID} || ($fastXLengths{$seqID} < length($seq)))){
 88 |   $fastXLengths{$seqID} = length($seq);
 89 |   if(!$qual){
 90 |     $seq =~ s/(.{100})/$1\n/g;
 91 |     $seq =~ s/\n$//;
 92 |   }
 93 |   $fastXStrs{$seqID} = ($qual) ?
 94 |     sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) :
 95 |     sprintf(">%s\n%s\n", $seqID, $seq);
 96 | }
 97 | 
 98 | foreach my $pat (sort {$fastXLengths{$b} <=> $fastXLengths{$a}} (keys(%fastXStrs))){
 99 |   print($fastXStrs{$pat});
100 | }
101 | 


--------------------------------------------------------------------------------
/exchange_rate.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use warnings;
 4 | use strict;
 5 | 
 6 | use Pod::Usage;
 7 | use Getopt::Long qw(:config auto_version auto_help pass_through);
 8 | use Time::localtime;
 9 | 
10 | use HTML::TreeBuilder::XPath;
11 | 
12 | =head1 DESCRIPTION
13 | 
14 | Retrieve exchange rates for foreign currencies, either a specified
15 | date in the past (from Oanda), or for today (from TSB Bank)
16 | 
17 | =head1 SYNOPSIS
18 | 
19 | exchange_rate.pl [<date>] <currency>
20 | 
21 | Today's date will be used if the date is not specified
22 | 
23 | =cut
24 | 
25 | my %monthNums = ( Jan=>1, Feb=>2, Mar=>3, Apr=>4,
26 |                   May=>5, Jun=>6, Jul=>7, Aug=>8,
27 |                   Sep=>9, Oct=>10, Nov=>11, Dec=>12);
28 | my @numMonths = qw(   Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );
29 | 
30 | my $todayStr = # today
31 |   sprintf("%04d-%03s-%02d",
32 |           localtime->year + 1900, $numMonths[localtime->mon], localtime->mday);
33 | 
34 | my %options = (date => $todayStr);
35 | 
36 | GetOptions(\%options, "date=s") or pod2usage();
37 | 
38 | if((@ARGV) && ($ARGV[0] =~ /20..-([A-Z][a-z]{2}|\d{2})-\d{2}/)){
39 |   $options{"date"} = shift(@ARGV);
40 | }
41 | if(!@ARGV){
42 |   pod2usage("Currency must be specified");
43 | }
44 | 
45 | my $currency = shift(@ARGV);
46 | 
47 | $options{"date"} =~ s/(20..-)(\d{2})(-\d{2})/$1.$numMonths[$2-1].$3/e;
48 | 
49 | my $todayDate = $options{"date"} eq $todayStr;
50 | 
51 | ##printf("TSB Bank NZ;%s\n","0.9629");
52 | ##exit(0);
53 | 
54 | printf(STDERR "Getting exchange rate information for %s%s\n", $options{"date"},
55 |        ( $todayDate ? " [today]" : ""));
56 | 
57 | if($todayDate){
58 |   printf(STDERR "[Fetching NZD/%s rate for %s from xe.com]\n",
59 |          $currency, $options{"date"});
60 |   my $tree = HTML::TreeBuilder->
61 |     new_from_url('http://www.xe.com/currencyconverter/convert/?Amount=1&From=NZD&To='.$currency)
62 |     or die("Cannot load URL");
63 |   for my $elt ($tree->findnodes('//span[@class="uccResultUnit"]')){
64 |     printf("xe.com;%0.4f\n", $elt->attr("data-amount"));
65 |   }
66 | } else {
67 |   my $date = $options{"date"};
68 |   $date =~ s/20(..)-(...)-(\d{2})/$1."\/".$monthNums{$2}."\/".$3/e;
69 |   my $url = sprintf("http://www.oanda.com/currency/".
70 |                     "historical-rates-classic?date_fmt=jp&".
71 |                     "date=%s&date1=%s&exch=NZD&expr=%s&".
72 |                     "margin_fixed=0&format=CSV&redirected=1",
73 |                    $date,$date,$currency);
74 |   printf(STDERR "[Fetching NZD/%s rate for %s from oanda.com]\n",
75 |          $currency, $options{"date"});
76 |   printf(STDERR "[$url]\n");
77 |   # print(STDERR "[$url]\n");
78 |   my $tree = HTML::TreeBuilder->new_from_url($url);
79 |   my $rate = $tree->findvalue('//pre');
80 |   $rate =~ s/^.*?,//;
81 |   print("oanda.com;$rate\n");
82 | }
83 | 


--------------------------------------------------------------------------------
/fastx-sample.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_help pass_through);
 6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 7 | use File::Temp qw(:seekable);
 8 | use Encode qw(encode_utf8);
 9 | 
10 | my $idFileName = "";
11 | my $maxCount = 0;
12 | my $trim = 0;
13 | 
14 | GetOptions("count=i" => \$maxCount, ) or
15 |   die("Error in command line arguments");
16 | 
17 | if(!$maxCount){
18 |   die("Error: Reservoir sampling needs a maximum number of reads (-c <count>)\n");
19 | } else {
20 |   printf(STDERR "Reservoir sampling reads to output at most %d reads:",
21 |         $maxCount);
22 | }
23 | 
24 | sub processReads{
25 |   my ($seq, $qual, $maxCount, $readsRead, $readsProcessed, $readStore) = @_;
26 |   $readsProcessed++;
27 |   my $swapPos = ($readsProcessed <= $maxCount) ? ($readsProcessed-1) : int(rand($readsProcessed));
28 |   if($swapPos < $maxCount){
29 |     my $outLines = "";
30 |     if($qual){
31 |       $outLines = sprintf("@%012d\n%s\n+\n%s\n", $readsRead, $seq, $qual);
32 |     } else {
33 |       $outLines = sprintf(">%012d\n%s\n", $readsRead, $seq);
34 |     }
35 |     ${$readStore}{$swapPos} = $outLines;
36 |   }
37 |   return $readsProcessed;
38 | }
39 | 
40 | my $inQual = 0; # false
41 | my $seqID = "";
42 | my $qualID = "";
43 | my $seq = "";
44 | my $qual = "";
45 | my $readsRead = 0;
46 | my $readsProcessed = 0;
47 | my $dotsPrinted = 0;
48 | my %readStore = ();
49 | 
50 | while(<>){
51 |   chomp;
52 |   chomp;
53 |   if(!$inQual){
54 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
55 |       my $newSeqID = $2;
56 |       my $newShortID = $3;
57 |       if($seqID){
58 |         if($maxCount && ($readsRead % 10000 == 0)){
59 |           if($dotsPrinted % 50 == 0){
60 |             if($readsRead > 1000){
61 |               printf(STDERR " (%d reads processed)", $readsRead);
62 |             }
63 |             printf(STDERR "\n  ");
64 |           }
65 |           print(STDERR ".");
66 |           $dotsPrinted++;
67 |         }
68 |         $readsProcessed =
69 |           processReads($seq, $qual, $maxCount, ++$readsRead, $readsProcessed, \%readStore);
70 |       }
71 |       $seq = "";
72 |       $qual = "";
73 |       $seqID = $newSeqID;
74 |     } elsif(/^\+(.*)$/) {
75 |       $inQual = 1; # true
76 |       $qualID = $1;
77 |     } else {
78 |       $seq .= $_;
79 |     }
80 |   } else {
81 |     $qual .= $_;
82 |     if(length($qual) >= length($seq)){
83 |       $inQual = 0; # false
84 |     }
85 |   }
86 | }
87 | 
88 | if($seqID){
89 |   $readsProcessed =
90 |           processReads($seq, $qual, $maxCount, ++$readsRead, $readsProcessed, \%readStore);
91 | }
92 | 
93 | printf(STDERR "\ndone (%d reads processed from %d total reads)\n",
94 |        $readsProcessed, $readsRead);
95 | 
96 | foreach my $id (keys(%readStore)){
97 |   print $readStore{$id};
98 | }
99 | 


--------------------------------------------------------------------------------
/pedtodot.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Read a GAS or LINKAGE format pedigree, return a digraph in the dot language 
 4 | # call dot to make pedigree drawing
 5 | # 
 6 | AWK=$(which awk)
 7 | DOTEXE=$(which dot)
 8 | NEATOEXE=$(which neato)
 9 | 
10 | for fil in $*
11 | do
12 |   for ped in `$AWK '!/^[!#]/ {print $1}' $fil | sort -u`
13 |   do
14 |      echo "Pedigree $ped"
15 |      $AWK -v ped=$ped '
16 |      BEGIN { shape["f"]="box,regular=1"
17 |              shape["1"]="box,regular=1"
18 |              shape["m"]="circle"
19 |              shape["2"]="circle"
20 |              shape["u"]="diamond"
21 |              shape["0"]="diamond"
22 |              shade["y"]="grey"
23 |              shade["2"]="grey"
24 |              shade["n"]="white"
25 |              shade["1"]="white"
26 |              shade["x"]="white"
27 |              shade["0"]="white"
28 |      }
29 |      !/^[!#]/ && $1==ped {
30 |              sex[$2]=$5
31 |              aff[$2]="x" ; if ($6 ~ /[012nyx]/) aff[$2]=$6
32 |              if($3!="x" && $3!="0") {
33 |                marriage[$3,$4]++
34 |                child[$3,$4,marriage[$3,$4]]=$2
35 |              }
36 |      }
37 |      END   { print "digraph Ped_" ped " {"
38 | #             print "# page =\"11,8.5\" ;"
39 |              print "node [shape=diamond] ;"
40 |              print "ratio =\"auto\" ;"
41 |              print "mincross = 2.0 ;"
42 |              print "label=\"Pedigree " ped "\" ;"
43 |              print "rotate=0 ;"
44 |              for(s in sex) {
45 |                print "\"" s "\" [shape=" shape[sex[s]] ","  \
46 |                      " style=filled,fillcolor=" shade[aff[s]] "] ;"
47 |              }
48 |              for(m in marriage) {
49 |                n=split(m,par,"\034")
50 |                mating_t="\"t_" par[1] "x" par[2] "\""
51 |                mating_b="\"b_" par[1] "x" par[2] "\""
52 |                print mating_t "[shape=diamond,style=filled," \
53 |                      "label=\"\",height=.1,width=.1] ;"
54 |                print mating_b "[shape=diamond,style=filled," \
55 |                      "label=\"\",height=.1,width=.1] ;"
56 |                print "\"" par[1] "\" -> " mating_t " [dir=none, weight=1, penwidth=3.0] ;"
57 |                print "\"" par[2] "\" -> " mating_t " [dir=none, weight=1, penwidth=3.0] ;"
58 |                print mating_t " -> " mating_b " [dir=none, weight=1, penwidth=3.0] ;"
59 |                for(k=1;k<=marriage[par[1],par[2]];k++) {
60 |                  print  mating_b " -> \"" child[par[1],par[2],k] "\"" \
61 |                         " [dir=none, weight=2] ;"
62 |                }
63 |              }
64 |              print "}"
65 |      }' $fil > $ped.dot
66 | #     echo "running ${DOTEXE} -Tsvg ${ped}.dot -o ${ped}_dot.svg"
67 |      ${DOTEXE} -Tpdf ${ped}.dot -o ${ped}_dot.pdf
68 | #     ${NEATOEXE} -Tsvg ${ped}.dot -o ${ped}_neato.svg
69 |   done
70 | done
71 | 


--------------------------------------------------------------------------------
/mpileupDC.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use warnings;
 3 | use strict;
 4 | 
 5 | use Getopt::Long qw(:config auto_help pass_through);
 6 | 
 7 | my $startName = "";
 8 | my $startPos = -1;
 9 | my $lastPos = -1;
10 | my $lastDesc = "";
11 | 
12 | my $covAdjust = 5;
13 | my $printedHeader = 0; # false
14 | my $accuracyDP = 0;
15 | my $DCOnly = 0; # false
16 | my $threshold = 0; # Log2 threshold to report DC (otherwise DC is set to 0)
17 | my $header = 1; # true
18 | 
19 | my $compare = "";
20 | 
21 | GetOptions("adjustment=i" => $covAdjust, "header!" => $header,
22 | 	   "compare=s" => \$compare, "dpaccuracy=i" => \$accuracyDP,
23 | 	   "threshold=f" => \$threshold, "onlydc!" => \$DCOnly ) or
24 |     die("Error in command line arguments");
25 | 
26 | my @comp1 = ();
27 | my @comp2 = ();
28 | my @comps = ();
29 | 
30 | while(<>){
31 |   chomp;
32 |   if(!$_){
33 |     next;
34 |   }
35 |   my ($refName, $pos, $refAllele, $cov, $bases, $qual, $rest) =
36 |       split(/\t/, $_, 7);
37 |   my $skip = ($bases =~ tr/<>//);
38 |   my @adjCovs = ($cov - $skip);
39 |   while($rest){
40 |     ($cov, $bases, $qual, $rest) = split(/\t/, $rest, 4);
41 |     $skip = ($bases =~ tr/<>//);
42 |     push(@adjCovs, $cov - $skip);
43 |   }
44 |   if(!$printedHeader){
45 |     ## set up comparisons
46 |     if(!$compare){
47 |       for(my $x = 0; $x < $#adjCovs; $x++){
48 | 	for(my $y = $x+1; $y <= $#adjCovs; $y++){
49 | 	  push(@comp1, $x+1);
50 | 	  push(@comp2, $y+1);
51 | 	  push(@comps, sprintf("%d,%d", $x+1, $y+1));
52 | 	}
53 |       }
54 |     } else {
55 |       while($compare =~ s/([0-9]+?)[-,]([0-9]+?)[;,\s]?//){
56 | 	push(@comp1, $1);
57 | 	push(@comp2, $2);
58 | 	push(@comps, "${1},${2}");
59 |       }
60 |     }
61 |     if($header){
62 |       if($DCOnly){
63 |         print(join("\t", "##Ref", "Start", "End", @comps)."\n");
64 |       } else {
65 |         print(join("\t", "##Ref", "Start", "End",
66 |                    (1..($#adjCovs+1)), @comps)."\n");
67 |       }
68 |     }
69 |     $printedHeader = 1; # true
70 |   }
71 |   my @DCov = ();
72 |   for(my $i = 0; $i <= $#comp1; $i++){
73 |     my $x = $comp1[$i]-1;
74 |     my $y = $comp2[$i]-1;
75 |     my $val = (log($adjCovs[$x]+$covAdjust) -
76 | 	       log($adjCovs[$y]+$covAdjust)) / log(2);
77 |     push(@DCov, sprintf("%0.${accuracyDP}f",
78 | 			abs($val) < $threshold ? 0 : $val));
79 |   }
80 |   my $descLine = ($DCOnly) ?
81 |       join("\t", @DCov) :
82 |       join("\t", @adjCovs, @DCov);
83 |   $descLine =~ s/-0/0/g;
84 |   if(($refName ne $startName) || ($descLine ne $lastDesc)){
85 |     # print sequence (if any)
86 |     if($startName){
87 |       print(join("\t", $startName, $startPos-1, $lastPos, $lastDesc)."\n");
88 |     }
89 |     $startName = $refName;
90 |     $startPos = $pos;
91 |     $lastDesc = $descLine;
92 |   }
93 |   $lastPos = $pos;
94 | }
95 | 
96 | if($startName){
97 |   print(join("\t", $startName, $startPos-1, $lastPos, $lastDesc)."\n");
98 | }
99 | 


--------------------------------------------------------------------------------
/fastx-cgmasker.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | ## fastx-hplength.pl -- get statistics on homopolymers in a fastq/fasta file
  6 | 
  7 | use Getopt::Long qw(:config auto_help pass_through);
  8 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  9 | 
 10 | my $quiet = 0;
 11 | my $mode = "CG";
 12 | 
 13 | GetOptions("quiet!" => \$quiet, "mode=s" => \$mode) or
 14 |   die("Error in command line arguments");
 15 | 
 16 | # unknown commands are treated as identifiers
 17 | my @files = ();
 18 | while(@ARGV){
 19 |   my $arg = shift(@ARGV);
 20 |   if(-e $arg){
 21 |     push(@files, $arg);
 22 |   }
 23 | }
 24 | @ARGV = @files;
 25 | 
 26 | # use stdin if no files supplied
 27 | if(!@ARGV){
 28 |   @ARGV = '-' unless (-t STDIN);
 29 | }
 30 | 
 31 | my %hpCounts = ();
 32 | 
 33 | my $baseCount = 0;
 34 | my $inQual = 0; # false
 35 | my $seqID = "";
 36 | my $qualID = "";
 37 | my $seq = "";
 38 | my $qual = "";
 39 | 
 40 | foreach my $file (@ARGV) {
 41 |   # This little gunzip dance makes sure the script can handle both
 42 |   # gzip-compressed and uncompressed input, regardless of whether
 43 |   # or not it is piped
 44 |   my $z = new IO::Uncompress::Gunzip($file, "transparent", 1)
 45 |     or die "gunzip failed: $GunzipError\n";
 46 |   while(<$z>){
 47 |     s/\s+$//; # remove ending whitespace
 48 |     if (!$inQual) {
 49 |       if (/^(>|@)((.+?)( .*?\s*)?)$/) {
 50 |         my $newSeqID = $2;
 51 |         my $newShortID = $3;
 52 |         $baseCount += length($seq);
 53 |         my $cur = "";
 54 |         my $cchr = "";
 55 |         if($mode eq "CG"){
 56 |           $seq =~ s/CG/XX/gi;
 57 |           $seq =~ tr/TtCcGg/AaAaAa/;
 58 |           $seq =~ s/XX/CG/gi;
 59 |         }
 60 |         if($seqID){
 61 |           if($qual){
 62 |             printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
 63 |           } else {
 64 |             $seq =~ s/(.{100})/$1\n/g;
 65 |             $seq =~ s/\n$//;
 66 |             printf(">%s\n%s\n", $seqID, $seq);
 67 |           }
 68 |         }
 69 |         $seq = "";
 70 |         $qual = "";
 71 |         $seqID = $newSeqID;
 72 |       } elsif (/^\+(.*)$/) {
 73 |         $inQual = 1;            # true
 74 |         $qualID = $1;
 75 |       } else {
 76 |         $seq .= uc($_);
 77 |       }
 78 |     } else {
 79 |       $qual .= $_;
 80 |       if (length($qual) >= length($seq)) {
 81 |         $inQual = 0;            # false
 82 |       }
 83 |     }
 84 |   }
 85 |   close($z);
 86 | }
 87 | 
 88 | $mode = uc($mode);
 89 | 
 90 | $baseCount += length($seq);
 91 | my $cur = "";
 92 | my $cchr = "";
 93 | if($mode eq "CG"){
 94 |   $seq =~ s/CG/XX/gi;
 95 |   $seq =~ tr/TtCcGg/AaAaAa/;
 96 |   $seq =~ s/XX/CG/gi;
 97 | }
 98 | if($seqID){
 99 |   if($qual){
100 |     printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
101 |   } else {
102 |     $seq =~ s/(.{100})/$1\n/g;
103 |     $seq =~ s/\n$//;
104 |     printf(">%s\n%s\n", $seqID, $seq);
105 |   }
106 | }
107 | 


--------------------------------------------------------------------------------
/addKraken2Silva.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # addKraken2Silva.pl -- modifies SILVA FASTA files to add in
  4 | # embl taxIDs for the Kraken database and translate U to T
  5 | 
  6 | # Author: David Eccles (gringer), 2015 <bioinformatics@gringene.org>
  7 | 
  8 | use strict;
  9 | use warnings;
 10 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 11 | 
 12 | sub usage {
 13 |   print(STDERR "usage: ./addKraken2Silva.pl <Map File> <input file>\n");
 14 |   print(STDERR "\nmodifies SILVA FASTA files to add in embl taxIDs for the Kraken database\n");
 15 |   print(STDERR "\nOther Options:\n");
 16 |   print(STDERR "-help        : Only display this help message\n");
 17 |   print(STDERR "\n");
 18 | }
 19 | 
 20 | my $mapFileName = 0; # false
 21 | 
 22 | my @files = ();
 23 | 
 24 | # extract command line arguments
 25 | while(@ARGV){
 26 |     my $argument = shift @ARGV;
 27 |     if(-f $argument){ # file existence check
 28 |         if(!$mapFileName){
 29 |             $mapFileName = $argument;
 30 |         } else {
 31 | 	    push(@files, $argument);
 32 |         }
 33 |     } else {
 34 |         if($argument eq "-help"){
 35 |             usage();
 36 |             exit(0);
 37 |         } else {
 38 | 	    print(STDERR "Error: command line parameter '$argument' not understood\n");
 39 | 	    usage();
 40 | 	    exit(1);
 41 | 	}
 42 |     }
 43 | }
 44 | 
 45 | @ARGV = @files;
 46 | 
 47 | if(!$mapFileName){
 48 |     print(STDERR "Error: No valid map file given\n");
 49 |     usage();
 50 |     exit(1);
 51 | }
 52 | 
 53 | my $mapFile = 0;
 54 | $mapFile = new IO::Uncompress::Gunzip "$mapFileName" or
 55 |     die "Unable to open $mapFileName\n";
 56 | 
 57 | my %seqMap = ();
 58 | 
 59 | print(STDERR "Reading from map file...");
 60 | 
 61 | my $mapLinesCounter = 0;
 62 | 
 63 | while(<$mapFile>){
 64 |     chomp;
 65 |     my $seqID = 0;
 66 |     my $taxID = 0;
 67 |     if($_ =~ /^(.*?)\s/){
 68 | 	$seqID = $1;
 69 |     }
 70 |     if($_ =~ /\s([0-9]+)$/){
 71 | 	$taxID = $1;
 72 |     }
 73 |     if($seqID && $taxID){
 74 | 	$seqMap{$seqID} = $taxID;
 75 |     }
 76 |     if($mapLinesCounter++ > 100000){
 77 | 	$mapLinesCounter = 0;
 78 | 	print(STDERR ".");
 79 |     }
 80 | }
 81 | 
 82 | printf(STDERR " found %d mappings from sequence IDs to taxa\n", scalar(keys(%seqMap)));
 83 | 
 84 | my $seqID = "";
 85 | my $seq = "";
 86 | while(<>){
 87 |   chomp;
 88 |   if(/^>(([^\.]+).*?)(( |$).*)$/){
 89 |       $seqID = $1;
 90 |       my $seqBase = $2;
 91 |       my $rest = $3;
 92 |       ##print(STDERR "base: $seqBase, id:$seqID, rest:$rest\n");
 93 |       my $taxID = $seqMap{$seqBase};
 94 |       if($taxID){
 95 | 	  $seqID.= "|kraken:taxid|".$taxID;
 96 | 	  printf(">%s%s\n", $seqID, $rest);
 97 |       } else {
 98 | 	  print(STDERR "Warning: Sequence '$seqID' skipped: no reference found in map file\n");
 99 | 	  $seqID = "";
100 |       }
101 |       $seq = "";
102 |   } elsif($seqID) {
103 |       $_ =~ tr/U/T/;
104 |       print($_."\n");
105 |   }
106 | }
107 | 


--------------------------------------------------------------------------------
/edgebeeguildfinder.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | # Edgebee guild finder -- given a list of user names,
 4 | # work out what edgebee guild those users come from. Output is csv,
 5 | # containing User, Game, and Guild
 6 | 
 7 | # also found at http://pastebin.com/W6sYanuQ
 8 | 
 9 | # Author: David Eccles (gringer) <coding@gringer.org>
10 | 
11 | # usage: echo -e "user1\nuser2" | ./edgebeeguildfinder.py
12 | 
13 | import cookielib, urllib2
14 | import fileinput
15 | import re
16 | import sys
17 | import json
18 | import csv
19 | 
20 | userNames = []
21 | addedUsers = set()
22 | 
23 | for line in fileinput.input():
24 |     if(line.startswith("{")):
25 |         jsonObj = json.loads(line)
26 |         if(('result' in jsonObj) and ('players' in jsonObj['result'])):
27 |             for userName in map(lambda x: x['name'],jsonObj['result']['players']):
28 |                 if(not userName in addedUsers):
29 |                     userNames.append(userName)
30 |                     addedUsers.add(userName)
31 |     else:
32 |         userName = line.rstrip()
33 |         if(not userName in addedUsers):
34 |             userNames.append(userName)
35 |             addedUsers.add(userName)
36 | 
37 | cj = cookielib.CookieJar()
38 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
39 | f = opener.open('http://www.edgebee.com/signin?' +
40 |                 'username=gringerscripts&password=gringerscripts&remember=1')
41 | 
42 | cr = csv.writer(sys.stdout)
43 | cr.writerow(['User','Game','Guild','LastOnline','Registered'])
44 | for user in userNames:
45 |     foundSandP = False
46 |     try:
47 |        f = opener.open('http://www.edgebee.com/user?name=%s' % (user))
48 |     except:
49 |        continue
50 |     inItem = False
51 |     gameName = None
52 |     extraStat = None
53 |     extraStats = {}
54 |     extraStats['lastOn'] = None
55 |     extraStats['regOn'] = None
56 |     extraStats['guildName'] = None
57 |     for line in f.read().splitlines(True):
58 |         line = line.rstrip()
59 |         if('playerListItem' in line):
60 |             inItem = True
61 |             gameName = None
62 |             extraStats['guildName'] = None
63 |         if(inItem):
64 |             if('h2' in line):
65 |                 gameName = re.compile(r'<[^>]+>').sub('', line).lstrip()
66 |             if('h4' in line):
67 |                 inItem = False
68 |         elif('h4' in line):
69 |             if('Registered on' in line):
70 |                 extraStat = 'regOn'
71 |             elif('Last online' in line):
72 |                 extraStat = 'lastOn'
73 |             else:
74 |                 extraStat = None
75 |         elif((extraStat is not None) and ('<br>' in line)):
76 |             extraStats[extraStat] = re.compile(r'<[^>]+>').sub('', line).lstrip()
77 |         elif('Guild:' in line):
78 |             guildName = re.compile(r'<[^>]+>').sub('', line).lstrip()
79 |             guildName = guildName.replace('Guild: ','')
80 |             extraStats['guildName'] = guildName
81 |         if(('<br style="clear:both"/>' in line)  and (gameName is not None)):
82 |             cr.writerow([user, gameName, extraStats['guildName'],
83 |                                 extraStats['lastOn'],extraStats['regOn']])
84 | 


--------------------------------------------------------------------------------
/shufflefastx.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | use List::Util qw(shuffle);
  7 | 
  8 | my $idFileName = "";
  9 | my $quiet = 0;
 10 | my $wrap = 1;
 11 | 
 12 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet,
 13 |           "wrap!" => \$wrap) or
 14 |   die("Error in command line arguments");
 15 | 
 16 | my %idsToGet = ();
 17 | 
 18 | # unknown commands are treated as identifiers
 19 | my @files = ();
 20 | while(@ARGV){
 21 |   my $arg = shift(@ARGV);
 22 |   if(-f $arg){
 23 |     push(@files, $arg);
 24 |   } else {
 25 |     $idsToGet{$arg} = 1;
 26 |   }
 27 | }
 28 | @ARGV = @files;
 29 | 
 30 | if($idFileName){
 31 |   # read sequence IDs from input file
 32 |   open(my $idFile, "<", $idFileName);
 33 |   while(<$idFile>){
 34 |     chomp;
 35 |     s/^[>@]//;
 36 |     $idsToGet{$_} = 1;
 37 |   }
 38 |   close($idFile);
 39 | }
 40 | 
 41 | my $numToFind = scalar(keys(%idsToGet));
 42 | 
 43 | if(!$quiet && $numToFind){
 44 |   printf(STDERR "Read %d identifiers\n", $numToFind);
 45 | }
 46 | 
 47 | 
 48 | my $inQual = 0; # false
 49 | my $seqID = "";
 50 | my $qualID = "";
 51 | my $seq = "";
 52 | my $qual = "";
 53 | while(<>){
 54 |   chomp;
 55 |   chomp;
 56 |   if(!$inQual){
 57 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
 58 |       my $newSeqID = $2;
 59 |       my $newShortID = $3;
 60 |       if($seqID){
 61 |         my $newSeq = "";
 62 |         my @shuffleLocs = (shuffle (0 .. length($seq)));
 63 |         foreach my $rp (@shuffleLocs){
 64 |           $newSeq .= substr($seq,$rp,1);
 65 |         }
 66 |         if($qual){
 67 |           my $newQual = "";
 68 |           foreach my $rp (@shuffleLocs){
 69 |             $newQual .= substr($qual,$rp,1);
 70 |           }
 71 |           printf("@"."shuffled_%s\n%s\n+\n%s\n", $seqID, $newSeq, $qual);
 72 |         } else {
 73 |           if($wrap){
 74 |             $newSeq =~ s/(.{70})/$1\n/g;
 75 |             $newSeq =~ s/\s+$//;
 76 |           }
 77 |           printf(">shuffled_%s\n%s\n", $seqID, $newSeq);
 78 |         }
 79 |       }
 80 |       $seq = "";
 81 |       $qual = "";
 82 |       if(!$numToFind ||
 83 |          exists($idsToGet{$newSeqID}) || exists($idsToGet{$newShortID})){
 84 |         $seqID = $newSeqID;
 85 |       } else {
 86 |         $seqID = "";
 87 |       }
 88 |     } elsif(/^\+(.*)$/) {
 89 |       $inQual = 1; # true
 90 |       $qualID = $1;
 91 |     } else {
 92 |       $seq .= $_;
 93 |     }
 94 |   } else {
 95 |     $qual .= $_;
 96 |     if(length($qual) >= length($seq)){
 97 |       $inQual = 0; # false
 98 |     }
 99 |   }
100 | }
101 | 
102 | if ($seqID) {
103 |   my $newSeq = "";
104 |   my @shuffleLocs = (shuffle (0 .. length($seq)));
105 |   foreach my $rp (@shuffleLocs) {
106 |     $newSeq .= substr($seq,$rp,1);
107 |   }
108 |   if ($qual) {
109 |     my $newQual = "";
110 |     foreach my $rp (@shuffleLocs) {
111 |       $newQual .= substr($qual,$rp,1);
112 |     }
113 |     printf("@"."shuffled_%s\n%s\n+\n%s\n", $seqID, $newSeq, $qual);
114 |   } else {
115 |     if ($wrap) {
116 |       $newSeq =~ s/(.{70})/$1\n/g;
117 |       $newSeq =~ s/\s+$//;
118 |     }
119 |     printf(">shuffled_%s\n%s\n", $seqID, $newSeq);
120 |   }
121 | }
122 | 


--------------------------------------------------------------------------------
/fastx-sort.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | 
  7 | my $quiet = 0;
  8 | my $searchPattern = ""; ## "(^.*\$)";
  9 | my $numeric = 0;
 10 | my $length = 0;
 11 | my $r = 0;
 12 | 
 13 | GetOptions("reverse|r!" => \$r, "pattern=s" => \$searchPattern,
 14 |            "quiet!" => \$quiet,
 15 |            "numeric|n!" => \$numeric, "length!" => \$length) or
 16 |   die("Error in command line arguments");
 17 | 
 18 | if($length){
 19 |   $numeric = 1;
 20 |   print(STDERR "Numeric sort based on sequence length\n");
 21 | } elsif($numeric){
 22 |   $numeric = 1;
 23 |   print(STDERR "Numeric sort\n");
 24 | }
 25 | 
 26 | if($r){
 27 |   print(STDERR "Reversing sort direction\n");
 28 | }
 29 | 
 30 | # Complain about non-file command line argument
 31 | my @files = ();
 32 | while(@ARGV){
 33 |   my $arg = shift(@ARGV);
 34 |   if(-e $arg){
 35 |     push(@files, $arg);
 36 |   } else {
 37 |     die("Unknown argument: $arg");
 38 |   }
 39 | }
 40 | @ARGV = @files;
 41 | 
 42 | my %fastXStrs = ();
 43 | 
 44 | my $inQual = 0; # false
 45 | my $seqID = "";
 46 | my $qualID = "";
 47 | my $seq = "";
 48 | my $qual = "";
 49 | while(<>){
 50 |   chomp;
 51 |   chomp;
 52 |   if(!$inQual){
 53 |     if(/^(>|@)(.*)$/){
 54 |       my $newSeqID = $2;
 55 |       if($seqID){
 56 |         if(!$searchPattern || ($seqID =~ /($searchPattern)/)){
 57 |           my $matchPattern = ($2) ? $2 : $1;
 58 |           my $key = (!$searchPattern) ?
 59 |             ($length ? (-length($seq)) : $seqID) : $matchPattern;
 60 |           if(!$qual){
 61 |             $seq =~ s/(.{100})/$1\n/g;
 62 |             $seq =~ s/\n$//;
 63 |           }
 64 |           $fastXStrs{$key} .= ($qual) ?
 65 |             sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) :
 66 |             sprintf(">%s\n%s\n", $seqID, $seq);
 67 |         } else {
 68 |           printf(STDERR "Warning: No match for pattern '$searchPattern' for sequence '$seqID'\n");
 69 |         }
 70 |       }
 71 |       $seq = "";
 72 |       $qual = "";
 73 |       $seqID = $newSeqID;
 74 |     } elsif(/^\+(.*)$/) {
 75 |       $inQual = 1; # true
 76 |       $qualID = $1;
 77 |     } else {
 78 |       $seq .= $_;
 79 |     }
 80 |   } else {
 81 |     $qual .= $_;
 82 |     if(length($qual) >= length($seq)){
 83 |       $inQual = 0; # false
 84 |     }
 85 |   }
 86 | }
 87 | 
 88 | if ($seqID) {
 89 |   if (!$searchPattern || ($seqID =~ /($searchPattern)/)) {
 90 |     my $matchPattern = ($2) ? $2 : $1;
 91 |     my $key = (!$searchPattern) ?
 92 |       ($length ? (-length($seq)) : $seqID) : $matchPattern;
 93 |     if (!$qual) {
 94 |       $seq =~ s/(.{100})/$1\n/g;
 95 |       $seq =~ s/\n$//;
 96 |     }
 97 |     $fastXStrs{$key} .= ($qual) ?
 98 |       sprintf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual) :
 99 |       sprintf(">%s\n%s\n", $seqID, $seq);
100 |   } else {
101 |     printf(STDERR "Warning: No match for pattern '$searchPattern' for sequence '$seqID'\n");
102 |   }
103 | }
104 | 
105 | printf(STDERR "Seen %d keys\n", scalar(keys(%fastXStrs)));
106 | 
107 | if($numeric){
108 |   foreach my $pat (sort {($r?$b:$a) <=> ($r?$a:$b)} (keys(%fastXStrs))){
109 |     print($fastXStrs{$pat});
110 |   }
111 | } else {
112 |   foreach my $pat (sort {($r?$b:$a) cmp ($r?$a:$b)} (keys(%fastXStrs))){
113 |     print($fastXStrs{$pat});
114 |   }
115 | }
116 | 


--------------------------------------------------------------------------------
/GenABEL2GIANT.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use warnings;
  4 | use strict;
  5 | 
  6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  7 | use IO::Compress::Gzip qw(gzip $GzipError);
  8 | 
  9 | my ($snpName, $mafName, @giantNames) = @ARGV;
 10 | 
 11 | my %chipSNPs = ();
 12 | my %MAFs = ();
 13 | 
 14 | print(STDERR "Reading SNP file...");
 15 | my $snpFile = new IO::Uncompress::Gunzip("$snpName") or
 16 |   die("Unable to open $snpName\n");
 17 | my $read = 0;
 18 | while(<$snpFile>){
 19 |   chomp;
 20 |   $chipSNPs{$_} = 1;
 21 |   if($read++ > 10000){
 22 |     print(STDERR ".");
 23 |     $read = 0;
 24 |   }
 25 | }
 26 | close($snpFile);
 27 | print(STDERR " done.\n");
 28 | 
 29 | print(STDERR "Reading MAF file...");
 30 | my $mafFile = new IO::Uncompress::Gunzip("$mafName") or
 31 |   die("Unable to open $mafName\n");
 32 | $read = 0;
 33 | while(<$mafFile>){
 34 |   chomp;
 35 |   if(/^Marker/){
 36 |     next;
 37 |   }
 38 |   my ($marker, $sex, $base, $MAF) = split(/,/);
 39 |   $MAFs{$sex}{$marker}="$base,$MAF";
 40 |   if($read++ > 300000){
 41 |     print(STDERR ".");
 42 |     $read = 0;
 43 |   }
 44 | }
 45 | close($mafFile);
 46 | print(STDERR " done.\n");
 47 | 
 48 | 
 49 | foreach my $giantName (@giantNames){
 50 |   my $sexCode = "";
 51 |   if ($giantName =~ /NI_([fmc]).*?_IMPUTE2/) {
 52 |     $sexCode = ($1 eq "c") ? "a" : $1;
 53 |   }
 54 |   my $giantFile = new IO::Uncompress::Gunzip("$giantName") or
 55 |     die("Unable to open $giantName\n");
 56 |   print(STDERR "Reading GIANT file [$giantName]...");
 57 |   $read = 0;
 58 |   my $of = new IO::Compress::Gzip "GIANT_${giantName}" or
 59 |     die "Unable to open GIANT_${giantName} for writing\n";
 60 |   print($of "chrom,markername,strand,n,effect_allele,".
 61 |          "other_allele,eaf,imputation_type,chi2.1df,beta,".
 62 |          "se,p\n");
 63 |   while (<$giantFile>) {
 64 |     chomp;
 65 |     tr/\"//d;
 66 |     if (substr($_,0,1) eq ",") {
 67 |       next;
 68 |     }
 69 |     my ($markername, $chrom, $pos, $strand, $A1, $A2, $n, $beta, $se,
 70 |         $chi2_1df, $P1df, $p, $effAB, $effBB, $chi2_2df, $P2df,
 71 |         $effect_allele) = split(/,/);
 72 |     my ($maa,$maf) = split(/,/, $MAFs{$sexCode}{$markername});
 73 |     if (($maf == 1) || ($maf == 0)) {
 74 |       next;                     # skip homozygous / missing SNPs
 75 |     }
 76 |     my $eaf = $maf;
 77 |     my $other_allele = $A1;
 78 |     if ($maa ne $effect_allele) {
 79 |       $eaf = 1 - $maf;
 80 |     }
 81 |     $eaf = sprintf("%0.4f", $eaf);
 82 |     if ($other_allele eq $effect_allele) {
 83 |       $other_allele = $A2;
 84 |     }
 85 |     my $imputation_type = 4;
 86 |     if ($chipSNPs{$markername}) {
 87 |       $imputation_type = 0;
 88 |     }
 89 |     if($chi2_1df ne "NA"){
 90 |       $chi2_1df = sprintf("%0.4f", $chi2_1df);
 91 |     }
 92 |     if($beta ne "NA"){
 93 |       $beta = sprintf("%0.4f", $beta);
 94 |     }
 95 |     if($se ne "NA"){
 96 |       $se = sprintf("%0.6f", $se);
 97 |     }
 98 |     if($p ne "NA"){
 99 |       $p = sprintf("%0.4f", $p);
100 |     }
101 |     print($of "$chrom,$markername,$strand,$n,$effect_allele,".
102 |            "$other_allele,$eaf,$imputation_type,$chi2_1df,$beta,".
103 |            "$se,$p\n");
104 |     if($read++ > 400000){
105 |       print(STDERR ".");
106 |       $read = 0;
107 |     }
108 |   }
109 |   close($of);
110 |   close($giantFile);
111 |   print(STDERR " done.\n");
112 | }
113 | 


--------------------------------------------------------------------------------
/maf2gfa.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | ## Filters out non-end and identical LAST results to simulate overlap alignments
  4 | 
  5 | use warnings;
  6 | use strict;
  7 | 
  8 | use Getopt::Long qw(:config auto_help pass_through);
  9 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 10 | 
 11 | my $seqFileName = "";
 12 | my $ignoreSelf = 0;
 13 | 
 14 | my $endLeniency = 100; ## base pairs from end to allow
 15 | my $containFrac = 0.9; ## minimum containment fraction
 16 | my $containIdent = 0.9; ## minimum containment identity
 17 | 
 18 | my %quals = ();
 19 | 
 20 | my $qSeq = "";
 21 | my $qStart = 0;
 22 | my $qEnd = 0;
 23 | my $qLen = 0;
 24 | my $qStrand = "";
 25 | my $qMatchLen = 0;
 26 | my $qName = "";
 27 | my $tSeq = "";
 28 | my $tStart = 0;
 29 | my $tEnd = 0;
 30 | my $tMatchLen = 0;
 31 | my $tLen = 0;
 32 | my $tName = "";
 33 | 
 34 | my $lineBuffer = "";
 35 | my $GFAspec = 1;
 36 | 
 37 | my %matches = ();
 38 | 
 39 | if($GFAspec == 2){
 40 |   print("H\tVN:Z:2.0\n");
 41 | } else {
 42 |   print("H\tVN:Z:bogart/edges\n");
 43 | }
 44 | 
 45 | my $fastaFileName = shift(@ARGV);
 46 | if(!(-e "${fastaFileName}.fai")){
 47 |   print(STDERR "Generating index file... ");
 48 |   system(("samtools", "faidx", $fastaFileName));
 49 |   print("done\n");
 50 | }
 51 | 
 52 | open(my $fastaFile, "<", "${fastaFileName}.fai");
 53 | while(<$fastaFile>){
 54 |   chomp;
 55 |   my @F = split(/\s+/);
 56 |   if($GFAspec == 2){
 57 |     printf("S\t%s\t%d\t*\n", $F[0], $F[1]);
 58 |   } else {
 59 |     printf("S\t%s\t*\tLN:i:%d\n", $F[0], $F[1]);
 60 |   }
 61 | }
 62 | close($fastaFile);
 63 | 
 64 | while(<>){
 65 |   if(/^$/){
 66 |     next;
 67 |   }
 68 |   if(!/^[as]/){
 69 |     next;
 70 |   }
 71 |   $lineBuffer .= $_;
 72 |   my @F = split(/\s+/);
 73 |   if($F[0] eq "a"){
 74 |     $qSeq = "";
 75 |     $tSeq = "";
 76 |   } elsif($F[0] eq "s"){
 77 |     if($tSeq){
 78 |       $qName = $F[1];
 79 |       $qStart = $F[2];
 80 |       $qMatchLen = $F[3];
 81 |       $qEnd = $qStart + $qMatchLen;
 82 |       $qStrand = $F[4];
 83 |       $qLen = $F[5];
 84 |       $qSeq = $F[6];
 85 |       if($GFAspec == 2){
 86 |         if(($qSeq ne $tSeq) &&
 87 |            (($qStart < $endLeniency) || ($tStart < $endLeniency) ||
 88 |             ($qEnd > ($qLen-$endLeniency)) || ($tEnd > ($tLen-$endLeniency)))){
 89 |           print(join("\t",("E","${tName}+","${qName}${qStrand}",$tStart,$tEnd,
 90 |                            $qStart,$qEnd))."\n");
 91 |         }
 92 |       } else {
 93 |         if($qSeq ne $tSeq){
 94 |           if(($tEnd > ($tLen-$endLeniency)) && ($qStart < $endLeniency)){
 95 |             print(join("\t",("L",$tName,"+",$qName,$qStrand,"*"))."\n");
 96 |           }
 97 |           if(($qEnd > ($qLen-$endLeniency)) && ($tStart < $endLeniency)){
 98 |             print(join("\t",("L",$qName,$qStrand,$tName,"+","*"))."\n");
 99 |           }
100 |           if(($qMatchLen / $qLen) >= $containFrac){
101 |             print(join("\t",("C",$tName,"+",$qName,$qStrand,"*"))."\n");
102 |           }
103 |           if(($tMatchLen / $tLen) >= $containFrac){
104 |             print(join("\t",("C",$qName,"+",$tName,$qStrand,"*"))."\n");
105 |           }
106 |         }
107 |       }
108 |       $lineBuffer = "";
109 |    } else {
110 |       $tName = $F[1];
111 |       $tStart = $F[2];
112 |       $tMatchLen = $F[3];
113 |       $tEnd = $tStart + $tMatchLen;
114 |       $tLen = $F[5];
115 |       $tSeq = $F[6];
116 |     }
117 |   }
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/nbmt-translate.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | ## Translate mitochondrial sequence of Nb mitochondria
  3 | ## (mostly because UAG is *not* a stop codon)
  4 | use warnings;
  5 | use strict;
  6 | 
  7 | ## codon usage table -- consider proportion of mitochondrial codons
  8 | 
  9 | ## for x in A C G T; do for y in A C G T; do for z in A C G T; do echo -n "${x}${y}${z} => \"" | perl -pe 'tr/T/U/'; transeq <(echo -e ">1\n${x}${y}${z}") /dev/stdout 2>/dev/null | perl -pe 's/$/  \",/;' | grep -v '^>'; done; done; done
 10 | 
 11 | my %transTable = ## 3-letter codes are for translations from mtDNA tRNAs
 12 | ( AAA => "(K)",
 13 |   AAC => "(N)",
 14 |   AAG => "(K)",
 15 |   AAU => "(N)",
 16 |   ACA => "(T)",
 17 |   ACC => "(T)",
 18 |   ACG => "Arg",
 19 |   ACU => "(T)",
 20 |   AGA => "(S)",
 21 |   AGC => "(S)",
 22 |   AGG => "(S)",
 23 |   AGU => "(S)",
 24 |   AUA => "(M)",
 25 |   AUC => "(I)",
 26 |   AUG => "(M)",
 27 |   AUU => "(I)",
 28 |   CAA => "(Q)",
 29 |   CAC => "(H)",
 30 |   CAG => "(Q)",
 31 |   CAU => "Met",
 32 |   CCA => "(P)",
 33 |   CCC => "(P)",
 34 |   CCG => "(P)",
 35 |   CCU => "(P)",
 36 |   CGA => "(R)",
 37 |   CGC => "(R)",
 38 |   CGG => "(R)",
 39 |   CGU => "(R)",
 40 |   CUA => "(L)",
 41 |   CUC => "(L)",
 42 |   CUG => "(L)",
 43 |   CUU => "(L)",
 44 |   GAA => "Phe",
 45 |   GAC => "(D)",
 46 |   GAG => "(E)",
 47 |   GAU => "Ile",
 48 |   GCA => "Cys",
 49 |   GCC => "(A)",
 50 |   GCG => "(A)",
 51 |   GCU => "(A)",
 52 |   GGA => "(G)",
 53 |   GGC => "(G)",
 54 |   GGG => "(G)",
 55 |   GGU => "(G)",
 56 |   GUA => "Tyr",
 57 |   GUC => "Asp",
 58 |   GUG => "His",
 59 |   GUU => "Asn",
 60 |   UAA => "L/*", ## was * (mtDNA/Leu)
 61 |   UAC => "Val",
 62 |   UAG => "Leu",
 63 |   UAU => "(Y)",
 64 |   UCA => "Trp",
 65 |   UCC => "Gly",
 66 |   UCG => "(S)",
 67 |   UCU => "Ser",
 68 |   UGA => "(W)",
 69 |   UGC => "Ala",
 70 |   UGG => "Pro",
 71 |   UGU => "Thr",
 72 |   UUA => "(L)",
 73 |   UUC => "Glu",
 74 |   UUG => "Gln",
 75 |   UUU => "Lys",
 76 | 
 77 | );
 78 | 
 79 | my $seq = "";
 80 | my $seqID = "";
 81 | while(<>){
 82 |   chomp;
 83 |   if(/^>(.*)$/){
 84 |     my $newID = $1;
 85 |     if($seq){
 86 |       printf(">%s\n", $seqID);
 87 |       while($seq =~ s/^(.{1,60})//){
 88 | 	  my $seqHead = $1;
 89 | 	  for(my $i=0; $i < length($seqHead); $i+=3){
 90 | 	      my $codon = substr($seqHead, $i, 3);
 91 | 	      $codon =~ tr/T/U/;
 92 | 	      if($transTable{$codon}){
 93 | 		  print($transTable{$codon});
 94 | 	      } else {
 95 | 		  print("???");
 96 | 	      }
 97 | 	  }
 98 | 	  print("\n".("|----:----" x 6)."\n");
 99 | 	  printf("%s\n", $seqHead);
100 |       }
101 |     }
102 |     $seq = "";
103 |     $seqID = $newID;
104 |   } else {
105 |     $seq .= $_;
106 |   }
107 | }
108 | if($seq){
109 |     printf(">%s\n", $seqID);
110 |     my $pos = 0;
111 |     while($seq =~ s/^(.{1,60})//){
112 | 	my $seqHead = $1;
113 | 	for(my $i=0; $i < length($seqHead); $i+=3){
114 | 	    my $codon = substr($seqHead, $i, 3);
115 | 	    $codon =~ tr/T/U/;
116 | 	    if($transTable{$codon}){
117 | 		print($transTable{$codon});
118 | 	    } else {
119 | 		print("???");
120 | 	    }
121 | 	}
122 | 	print("\n ");
123 | 	for(my $i=1; $i < (length($seqHead)-3); $i+=3){
124 | 	    my $codon = substr($seqHead, $i, 3);
125 | 	    $codon =~ tr/T/U/;
126 | 	    if($transTable{$codon}){
127 | 		print($transTable{$codon});
128 | 	    } else {
129 | 		print("???");
130 | 	    }
131 | 	}
132 | 	print("\n  ");
133 | 	for(my $i=2; $i < (length($seqHead)-3); $i+=3){
134 | 	    my $codon = substr($seqHead, $i, 3);
135 | 	    $codon =~ tr/T/U/;
136 | 	    if($transTable{$codon}){
137 | 		print($transTable{$codon});
138 | 	    } else {
139 | 		print("???");
140 | 	    }
141 | 	}
142 | 	print("\n".("|----:----" x 6)."\n");
143 | 	printf("%s\n", $seqHead);
144 | 	printf("%-10s",$pos);
145 | 	for(my $posInc = 10; $posInc < 60; $posInc += 10){
146 | 	    printf("%-10s",substr(sprintf("%10s",$pos+$posInc),8,2));
147 | 	}
148 | 	$pos += 60;
149 | 	print("\n\n");
150 |     }
151 | }
152 | 
153 | ## TAGTA-
154 | 


--------------------------------------------------------------------------------
/fastx-length.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  7 | 
  8 | sub SIConvert{
  9 |   my ($val) = @_;
 10 |   my @largePrefixes = ("k", "M", "G");
 11 |   my $pID = 0;
 12 |   my $prefix = "";
 13 |   my $changed=0;
 14 |   while(($val > 1000) && ($pID < $#largePrefixes)){
 15 |     $prefix = $largePrefixes[$pID];
 16 |     $val /= 1000;
 17 |     $pID++;
 18 |   }
 19 |   return(sprintf("%.12g %s", $val, $prefix));
 20 | }
 21 | 
 22 | my $showMD5 = 0; # false
 23 | my $showFName = 0; # false
 24 | 
 25 | GetOptions("md5!" => \$showMD5, "fname!" => \$showFName, ) or
 26 |     die("Error in command line arguments");
 27 | 
 28 | if($showMD5){
 29 |   use Digest::MD5 qw(md5_hex); ## only attempt to load if md5 is requested
 30 | }
 31 | 
 32 | # use stdin if no files supplied
 33 | if(!@ARGV){
 34 |   @ARGV = '-' unless (-t STDIN);
 35 | }
 36 | 
 37 | my @lengths = ();
 38 | my $inQual = 0; # false
 39 | my $seqID = "";
 40 | my $qualID = "";
 41 | my $seq = "";
 42 | my $qual = "";
 43 | my $seqFName = "";
 44 | foreach my $file (@ARGV) {
 45 |   # This little gunzip dance makes sure the script can handle both
 46 |   # gzip-compressed and uncompressed input, regardless of whether
 47 |   # or not it is piped
 48 |   my $z = new IO::Uncompress::Gunzip($file, "transparent", 1)
 49 |     or die "gunzip failed: $GunzipError\n";
 50 |   while(<$z>){
 51 |     chomp;chomp;chomp;
 52 |     if (!$inQual) {
 53 |       if (/^(>|@)((.+?)( .*?\s*)?)$/) {
 54 |         my $newSeqID = $2;
 55 |         my $newShortID = $3;
 56 |         if ($seqID) {
 57 |           print(length($seq));
 58 |           if($showMD5){
 59 |             print(" ".md5_hex($seq));
 60 |           }
 61 |           print(" ${seqID}");
 62 |           if($showFName){
 63 |             print(" ".$seqFName);
 64 |           }
 65 |           print("\n");
 66 |           push(@lengths, length($seq));
 67 |         }
 68 |         $seq = "";
 69 |         $qual = "";
 70 |         $seqID = $newSeqID;
 71 |         $seqFName = $file;
 72 |       } elsif (/^\+(.*)$/) {
 73 |         $inQual = 1;            # true
 74 |         $qualID = $1;
 75 |       } else {
 76 |         $seq .= $_;
 77 |       }
 78 |     } else {
 79 |       $qual .= $_;
 80 |       if (length($qual) >= length($seq)) {
 81 |         $inQual = 0;            # false
 82 |       }
 83 |     }
 84 |   }
 85 | }
 86 | 
 87 | if ($seqID) {
 88 |   print(length($seq));
 89 |   if($showMD5){
 90 |     print(" ".md5_hex($seq));
 91 |   }
 92 |   print(" ${seqID}");
 93 |   if($showFName){
 94 |     print(" ".$seqFName);
 95 |   }
 96 |   print("\n");
 97 |   push(@lengths, length($seq));
 98 | }
 99 | 
100 | ## calculate statistics
101 | @lengths = sort {$b <=> $a} (@lengths);
102 | my $sum = 0;
103 | my @cumLengths = map {$sum += $_} (@lengths);
104 | 
105 | my $L50LengthNum = 0;
106 | while($cumLengths[$L50LengthNum] < ($sum * 0.5)){
107 |   $L50LengthNum++;
108 | }
109 | 
110 | my $L90LengthNum = 0;
111 | while($cumLengths[$L90LengthNum] < ($sum * 0.9)){
112 |   $L90LengthNum++;
113 | }
114 | 
115 | my $L10LengthNum = 0;
116 | while($cumLengths[$L10LengthNum] < ($sum * 0.1)){
117 |   $L10LengthNum++;
118 | }
119 | 
120 | printf(STDERR "Total sequences: %d\n", scalar(@lengths));
121 | printf(STDERR "Total length: %sb\n", SIConvert($sum));
122 | printf(STDERR "Longest sequence: %sb\n", SIConvert($lengths[0]));
123 | printf(STDERR "Shortest sequence: %sb\n", SIConvert($lengths[$#lengths]));
124 | printf(STDERR "Mean Length: %sb\n", SIConvert(sprintf("%d", ($sum) / scalar(@lengths))));
125 | printf(STDERR "Median Length: %sb\n", SIConvert($lengths[$#lengths / 2]));
126 | printf(STDERR "N10: %d sequences; L10: %sb\n",
127 |      ($L10LengthNum+1), SIConvert($lengths[$L10LengthNum]));
128 | printf(STDERR "N50: %d sequences; L50: %sb\n",
129 |      ($L50LengthNum+1), SIConvert($lengths[$L50LengthNum]));
130 | printf(STDERR "N90: %d sequences; L90: %sb\n",
131 |      ($L90LengthNum+1), SIConvert($lengths[$L90LengthNum]));
132 | 


--------------------------------------------------------------------------------
/expenseTemplate.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[10pt,a4paper,twoside]{letter}
  2 | \usepackage{graphicx}
  3 | \usepackage{fp}
  4 | \usepackage{charter}
  5 | 
  6 | % length changes to get header in the right place
  7 | \setlength{\voffset}{1.5cm}
  8 | \setlength{\headheight}{2cm}
  9 | \setlength{\headsep}{0pt}
 10 | 
 11 | \usepackage[margin=1cm,bottom=3cm]{geometry}
 12 | \usepackage{multirow}
 13 | \usepackage{fancyhdr}
 14 | \usepackage{lastpage} % provides the number of the last page
 15 | \usepackage{framed}
 16 | \usepackage{color}
 17 | 
 18 | \pagestyle{fancy}
 19 | 
 20 | \newcommand{\myHeader}{
 21 | 
 22 | %% header section (including business logo, name, address)
 23 | \begin{tabular}{llr}
 24 |   \begin{minipage}[b]{0.4\textwidth}
 25 |     \includegraphics[height=2cm]{@@@logoLocation@@@}
 26 |   \end{minipage} &
 27 |   \begin{minipage}[b]{0.55\textwidth}
 28 |     \flushright%
 29 |     {\Huge \textsc{gringene} Bioinformatics}\\
 30 |     \vspace{6pt}%
 31 |     {\Large bioinformatics@gringene.org}\\
 32 |     \vspace{3pt}%
 33 |     5 Debra Way, Karori, Wellington 6012\\
 34 |     \vfill%
 35 |   \end{minipage}%
 36 | \end{tabular}
 37 | 
 38 | \rule{\linewidth}{0.5mm}
 39 | 
 40 | }
 41 | 
 42 | \fancyhead{}
 43 | \fancyhead[CO,CE]{\myHeader}
 44 | \fancyfoot{}
 45 | \fancyfoot[CO,CE]{Page \thepage\ of \pageref{LastPage}}
 46 | \renewcommand{\headrulewidth}{0pt}
 47 | \renewcommand{\footrulewidth}{0pt}
 48 | 
 49 | \renewcommand*{\thefootnote}{\alph{footnote}}
 50 | 
 51 | \definecolor{lightsalmon}{rgb}{1,0.63,0.48}
 52 | 
 53 | 
 54 | \begin{document}
 55 | 
 56 | %% Address section (client name / address, invoice type)
 57 | 
 58 | \hspace{\fill}
 59 | 
 60 | \begin{tabular}{lr}
 61 |   \begin{minipage}[c]{0.55\textwidth}
 62 |     @@@clientName@@@\\%%CLIENTNAME
 63 |     @@@clientAddress@@@
 64 |   \end{minipage} &
 65 |   \begin{minipage}[c]{0.4\textwidth}
 66 |     \begin{center}
 67 |       \begin{framed}
 68 |         \huge{\textbf{Expense Claim}}
 69 |       \end{framed}
 70 |     \end{center}
 71 |   \end{minipage}%
 72 | \end{tabular}
 73 | 
 74 | \rule{\linewidth}{1pt}
 75 | 
 76 | \leftskip=10pt
 77 | 
 78 | Summary of expenses claimed by David Eccles for \emph{@@@clientBusinessName@@@}
 79 | (see attached invoices for more details):
 80 | 
 81 | \begin{flushleft}
 82 | 
 83 |   \setlength{\tabcolsep}{1.5pt}
 84 |   \hspace{30pt}
 85 |   \begin{tabular}{lrl}
 86 |     \textbf{Claim Date} & \multicolumn{2}{l}{@@@invDate@@@}\\
 87 |     \textbf{Job ID} & \multicolumn{2}{l}{@@@jobID@@@}\\
 88 |     \textbf{Job Description}\hspace{6pt}\vspace{\fill} &
 89 |     \multicolumn{2}{p{10cm}}{\raggedright{@@@jobDesc@@@}}\\
 90 |     \\
 91 |     \begin{tabular}{llrl}
 92 |       \textbf{Description} & \textbf{Category} & %%
 93 |       \textbf{Date} & \textbf{Amount}\\
 94 |       \hline\\
 95 |       @@@expenseLines@@@\\
 96 |       \hline\\
 97 |       Total @@@payType@@@s & & @@@totalUnits@@@\\%%UNITLINE
 98 |     \end{tabular}
 99 | 
100 | 
101 |     \cline{1-2}\\
102 |     \textbf{Expense Total}  & \textbf{@@@tAmt@@@} & NZD\\
103 |     \cline{1-2}\\%%DUEAMOUNT
104 |     \textbf{Amount due} &  \textbf{@@@TAmt@@@} & NZD\\%%DUEAMOUNT
105 |     & & \begin{picture}(0,0)%%PAIDEXPENSES
106 |       \put(65,30){\rotatebox[origin=c]{45}{%%PAIDEXPENSES
107 |           \textcolor{lightsalmon}{\fbox{\Huge{PAID}}}}}%%PAIDEXPENSES
108 |     \end{picture}%%PAIDEXPENSES
109 |   \end{tabular}
110 | 
111 | \end{flushleft}
112 | \leftskip=10pt
113 | 
114 | \addvspace{\fill}
115 | \addvspace{6pt}
116 | 
117 | Thank you for your business.
118 | 
119 | \addvspace{12pt}
120 | 
121 | David Eccles
122 | 
123 | \addvspace{6pt}
124 | 
125 | \addvspace{\fill}%%PAIDEXPENSES
126 | 
127 | \filbreak%%DUEEXPENSES
128 | 
129 | \leftskip=0pt
130 | 
131 | \rule{\linewidth}{0.5mm}%%DUEEXPENSES
132 | 
133 | \begin{center}%%DUEEXPENSES
134 |   \Large{\textbf{Payment Details}}\\ %%DUEEXPENSES
135 | \end{center}%%DUEEXPENSES
136 | 
137 | \rule{\linewidth}{0.5mm}%%DUEEXPENSES
138 | 
139 | \leftskip=10pt
140 | 
141 | @@@bankMessage@@@%%DUEEXPENSES
142 | 
143 | \hspace{30pt}%%DUEEXPENSES
144 | \begin{tabular}{ll}%%DUEEXPENSES
145 |   @@@accountDetails@@@%%DUEEXPENSES
146 | \end{tabular}%%DUEEXPENSES
147 | 
148 | \end{document}
149 | 


--------------------------------------------------------------------------------
/maf_bcsplit.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | ## Splits up a FASTA/FASTQ file based on the location of identified sequences (e.g. adapter/barcode sequences)
  4 | ## [currently only identifies the location of the sequences]
  5 | 
  6 | use warnings;
  7 | use strict;
  8 | 
  9 | use Getopt::Long qw(:config auto_help pass_through);
 10 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 11 | 
 12 | my $seqFileName = "";
 13 | my $ignoreSelf = 0;
 14 | my $includeQuery = 0;
 15 | my $includeTarget = 0;
 16 | 
 17 | GetOptions("seqfile=s" => \$seqFileName, "overlap!" => \$ignoreSelf,
 18 |            "query!" => \$includeQuery, "target!" => \$includeTarget) or
 19 |   die("Error in command line arguments");
 20 | 
 21 | my %seqs = ();
 22 | my %quals = ();
 23 | 
 24 | # if(!$seqFileName){
 25 | #   die("Error: read sequence file must be specified, '-seqfile <file.fa>'");
 26 | # }
 27 | 
 28 | # ## read in sequences
 29 | # my $inQual = 0; # false
 30 | # my $seqID = "";
 31 | # my $qualID = "";
 32 | 
 33 | # my $seqFile = new IO::Uncompress::Gunzip "$seqFileName" or
 34 | #   die "Unable to open $seqFileName\n";
 35 | # while(<$seqFile>){
 36 | #   chomp;
 37 | #   chomp;
 38 | #   if(!$inQual){
 39 | #     if(/^(>|@)((.+?)( .*?\s*)?)$/){
 40 | #       my $newSeqID = $2;
 41 | #       my $newShortID = $3;
 42 | #       $seqID = $newShortID;
 43 | #       if($seqID){
 44 | #         $seqs{$seqID} = "";
 45 | #         $quals{$seqID} = "";
 46 | #       }
 47 | #     } elsif(/^\+(.*)$/) {
 48 | #       $inQual = 1; # true
 49 | #       $qualID = $1;
 50 | #       if(($qualID ne "") && ($qualID ne $seqID)){
 51 | #         warn("Sequence ID and Qual ID do not match: $seqID; $qualID");
 52 | #       }
 53 | #     } else {
 54 | #       $seqs{$seqID} .= $_;
 55 | #     }
 56 | #   } else {
 57 | #     $quals{$seqID} .= $_;
 58 | #     my $lq = length($quals{$seqID});
 59 | #     my $ls = length($seqs{$seqID});
 60 | #     if($lq >= $ls){
 61 | #       $inQual = 0; # false
 62 | #       if($lq != $ls){
 63 | #         warn(sprintf("Sequence and Qual length do not match: $seqID (%d; %d)",
 64 | #                     $ls, $lq));
 65 | #       }
 66 | #     }
 67 | #   }
 68 | # }
 69 | # close($seqFile);
 70 | 
 71 | if(keys(%seqs)){
 72 |   printf(STDERR "Read in %d sequences\n", scalar(keys(%seqs)));
 73 | }
 74 | 
 75 | my $qSeq = "";
 76 | my $qStart = 0;
 77 | my $qEnd = 0;
 78 | my $qLen = 0;
 79 | my $qStrand = "";
 80 | my $qMatchLen = 0;
 81 | my $qName = "";
 82 | my $tSeq = "";
 83 | my $tStart = 0;
 84 | my $tEnd = 0;
 85 | my $tMatchLen = 0;
 86 | my $tLen = 0;
 87 | my $tName = "";
 88 | 
 89 | my %matches = ();
 90 | 
 91 | print("query,target,dir,qS,qE,qML,qL,qPct,tS,tE,tML,tL,tPct");
 92 | print(",qStr") if($includeQuery);
 93 | print(",tStr") if($includeTarget);
 94 | print("\n");
 95 | 
 96 | while(<>){
 97 |   if(!/^[as]/){
 98 |     next;
 99 |   }
100 |   my @F = split(/\s+/);
101 |   if($F[0] eq "a"){
102 |     $qSeq = "";
103 |     $tSeq = "";
104 |   } elsif($F[0] eq "s"){
105 |     if($tSeq){
106 |       $qName = $F[1];
107 |       $qStart = $F[2];
108 |       $qMatchLen = $F[3];
109 |       $qEnd = $qStart + $qMatchLen;
110 |       $qStrand = $F[4];
111 |       $qLen = $F[5];
112 |       $qSeq = $F[6];
113 |       if($qStrand eq "-"){ ## correct for reverse complement
114 |         $qEnd = $qLen - $qStart;
115 |         $qStart = $qEnd - $qMatchLen;
116 |       }
117 |       my $matchLine =
118 |         sprintf("%s,%s,%s,%d,%d,%d,%d,%0.2f,%d,%d,%d,%d,%0.2f",
119 |                 $qName, $tName, $qStrand,
120 |                 $qStart, $qEnd, $qMatchLen, $qLen, ($qMatchLen / $qLen) * 100,
121 |                 $tStart, $tEnd, $tMatchLen, $tLen, ($tMatchLen / $tLen) * 100);
122 |       if(!$ignoreSelf || ($qName ne $tName)){
123 |         print("${matchLine}");
124 |         print(",${qSeq}") if($includeQuery);
125 |         print(",${tSeq}") if($includeTarget);
126 |         print("\n");
127 |       }
128 |       $matches{$qName}{$qStart} .= ":matchLine";
129 |    } else {
130 |       $tName = $F[1];
131 |       $tStart = $F[2];
132 |       $tMatchLen = $F[3];
133 |       $tEnd = $tStart + $tMatchLen;
134 |       $tLen = $F[5];
135 |       $tSeq = $F[6];
136 |     }
137 |   }
138 | }
139 | 
140 | 


--------------------------------------------------------------------------------
/rsfilter.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # rsfilter.pl -- hunts (quickly) for a set of markers in one pass of a
  4 | # file.
  5 | 
  6 | # Note: marker names *must not* be the same as an existing file
  7 | 
  8 | # Author: David Eccles (gringer), 2008 <programming@gringer.org>
  9 | 
 10 | # This program tries to be fast and memory-efficient, so decisions are
 11 | # pushed to outer loops. Using the ordered option, '-o', retains
 12 | # genotype lines in memory for the specified markers until all input
 13 | # has been read, which can consume large amounts of memory when the
 14 | # requested marker set is large.
 15 | 
 16 | use strict;
 17 | use warnings;
 18 | 
 19 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 20 | 
 21 | sub usage {
 22 |   print("usage: ./rsfilter.pl <marker1> <marker2> ... [options] < <file name>\n");
 23 |   print("\nOther Options:\n");
 24 |   print("-r : invert filter (i.e. select markers to exclude)\n");
 25 |   print("-o : order by selection\n");
 26 |   print("-- : no more files on command line (treat file names as markers)\n");
 27 |   print("\n");
 28 | }
 29 | 
 30 | my %markers = ();
 31 | my $complement = 0; # false # determines if complement, rather
 32 |                             # than intersect, should be chosen
 33 | my $order = 0; # false # order markers by given ordering
 34 | my @markerorder = ();
 35 | 
 36 | my @inFiles = ();
 37 | 
 38 | my $filesFinished = 0; # false
 39 | 
 40 | while(@ARGV){
 41 |     my $arg = shift(@ARGV);
 42 |     if(!$filesFinished && (-e $arg)){
 43 |         if(!keys(%markers)){
 44 |             print(STDERR "Retrieving marker names from $arg...");
 45 |             my $markFile = 0;
 46 |             my $fileName = $arg;
 47 |             $markFile = new IO::Uncompress::Gunzip "$fileName" or
 48 |                 die "Unable to open $fileName\n";
 49 |             while(<$markFile>){
 50 |                 if(/^\"?(.*?)\"?[\s,]+/){
 51 |                     my $marker = $1;
 52 |                     # print(STDERR "adding marker $marker\n");
 53 |                     $markers{$marker} = 1;
 54 |                     push(@markerorder, $marker);
 55 |                 }
 56 |             }
 57 |             print(STDERR keys(%markers)." marker names extracted\n");
 58 |         } else {
 59 |             push(@inFiles, $arg);
 60 |         }
 61 |     } elsif ($arg =~ /^-r/){
 62 |         $complement = 1; # true
 63 |         print(STDERR "Markers will be excluded, rather than filtered\n");
 64 |     } elsif ($arg =~ /^-o/){
 65 |         $complement = 0; # false
 66 |         $order = 1; # true
 67 |         print(STDERR "Markers will output in specified order\n");
 68 |     } elsif ($arg =~ /^--/){
 69 |         $filesFinished = 1; # true
 70 |         print(STDERR "No more files will be read\n");
 71 |     } elsif ($arg =~ /^-help/){
 72 |         usage();
 73 |         exit(0);
 74 |     } else {
 75 |         $markers{$arg} = 1;
 76 |         push(@markerorder, $arg);
 77 |     }
 78 | }
 79 | 
 80 | @ARGV = @inFiles;
 81 | 
 82 | my %markerlines = ();
 83 | 
 84 | #print keys(%markers)." marker names extracted\n";
 85 | 
 86 | if($complement){ # tests are in outer loop to slightly reduce processor effort
 87 |     while (<>){
 88 |         my $line = $_;
 89 |         if($line =~ /^(\"?.*?\"?)[\s,]+/){
 90 |             if (!$markers{$1}){
 91 |                 print($line);
 92 |             }
 93 |         }
 94 |     }
 95 | } else {
 96 |     if(!$order){
 97 |         while (<>){
 98 |             my $line = $_;
 99 | 	    if($line =~ /^(\"?.*?\"?)[\s,]+/){
100 |                 if ($markers{$1}){
101 |                     print($line);
102 |                 }
103 |             }
104 |         }
105 |     } else {
106 |         while (<>){
107 |             my $line = $_;
108 | 	    if($line =~ /^(\"?.*?\"?)[\s,]+/){
109 |                 if ($markers{$1}){
110 |                     my $marker = $1;
111 |                     $markerlines{$1} = $line;
112 |                 }
113 |             }
114 |         }
115 |         foreach(@markerorder){
116 |             my $marker = $_;
117 |             if($markerlines{$marker}){
118 |                 print($markerlines{$marker});
119 |             }
120 |         }
121 |     }
122 | }
123 | 


--------------------------------------------------------------------------------
/fastx-rlength.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | #use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error);
  7 | #use IO::File;
  8 | 
  9 | sub rc {
 10 |   my ($seq) = @_;
 11 |   $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/;
 12 |   # work on masked sequences as well
 13 |   $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/;
 14 |   return(scalar(reverse($seq)));
 15 | }
 16 | 
 17 | sub rev {
 18 |   my ($seq) = @_;
 19 |   return(scalar(reverse($seq)));
 20 | }
 21 | 
 22 | sub printStats {
 23 |   my ($seq, $seqID, $trim, $kmerLength) = @_;
 24 |   my $len = length($seq);
 25 |   my $sseq = "";
 26 |   if($seqID && (length($seq) > $trim) && (length($seq) > $kmerLength)){
 27 |     my $countTotal = 0;
 28 |     my $countMax = 0;
 29 |     my $maxKmer = "";
 30 |     my %rptPos = ();
 31 |     my %allGapCounts = ();
 32 |     my %minGaps = ();
 33 |     my $revCount = 1;
 34 |     my $rcCount = 1;
 35 |     for(my $p = 0; ($p + $kmerLength) <= $len; $p++){
 36 |       $sseq = substr($seq, $p, $kmerLength);
 37 |       if(exists($rptPos{$sseq})){
 38 | 	my $gap = $p - $rptPos{$sseq};
 39 | 	$allGapCounts{$gap}++;
 40 | 	if(!exists($minGaps{$sseq}) || ($minGaps{$sseq} > $gap)){
 41 | 	  $minGaps{$sseq} = $gap;
 42 | 	}
 43 |       }
 44 |       if(exists($rptPos{rev($sseq)})){
 45 | 	$revCount++;
 46 |       }
 47 |       if(exists($rptPos{rc($sseq)})){
 48 | 	$rcCount++;
 49 |       }
 50 |       $rptPos{$sseq} = $p;
 51 |     }
 52 |     if($revCount == 1){
 53 |       $revCount = 0;
 54 |     }
 55 |     if($rcCount == 1){
 56 |       $rcCount = 0;
 57 |     }
 58 |     my $numKmers = scalar(keys(%rptPos));
 59 |     my $kmerRatio = $numKmers/($len - $kmerLength + 1);
 60 |     my $numRepeats = scalar(keys(%minGaps));
 61 |     my @gaps = sort {$a <=> $b} (values(%minGaps));
 62 |     my $medianGap = (@gaps) ? $gaps[$#gaps / 2] : 0;
 63 |     my $medianCount = 0;
 64 |     my $modalGap = 0;
 65 |     my $modalCount = 0;
 66 |     my $rangeCountMed = 0;
 67 |     my $rangeCountMod = 0;
 68 |     if($medianGap){
 69 |       my %gapCounts = ();
 70 |       foreach my $gap (@gaps){
 71 | 	$gapCounts{$gap}++;
 72 |       }
 73 |       $medianCount = ${allGapCounts{$medianGap}};
 74 |       my @modalSort = sort {$allGapCounts{$b} <=> $allGapCounts{$a}} (@gaps);
 75 |       $modalGap = $modalSort[0];
 76 |       $modalCount = $allGapCounts{$modalGap};
 77 |       for(my $gP = int($medianGap * 0.99); ($gP <= ($medianGap / 0.99));
 78 | 	  $gP++){
 79 | 	$rangeCountMed += $allGapCounts{$gP} if($allGapCounts{$gP});
 80 |       }
 81 |       for(my $gP = int($modalGap * 0.99); ($gP <= ($modalGap / 0.99));
 82 | 	  $gP++){
 83 | 	$rangeCountMod += $allGapCounts{$gP} if($allGapCounts{$gP});
 84 |       }
 85 |     }
 86 |     printf("%8d %0.3f %6d %6d %5d %5d %6d %5d %5d %6d %6d %6d %s\n",
 87 | 	   $len, $kmerRatio, 
 88 | 	   $numRepeats,
 89 | 	   $countTotal,
 90 | 	   $medianCount,
 91 | 	   $rangeCountMed,
 92 | 	   $medianGap,
 93 | 	   $modalCount,
 94 | 	   $rangeCountMod,
 95 | 	   $modalGap,
 96 | 	   $revCount,
 97 | 	   $rcCount,
 98 | 	   $seqID);
 99 |   }
100 | }
101 | 
102 | my $trim = 0;
103 | my $kmerLength = 17; ## number of bases in hash keys
104 | 
105 | GetOptions("trim=s" => \$trim) or
106 |     die("Error in command line arguments");
107 | 
108 | my $inQual = 0; # false
109 | my $seqID = "";
110 | my $qualID = "";
111 | my $seq = "";
112 | my $qual = "";
113 | my $buffer = "";
114 | printf("%8s %5s %6s %6s %5s %5s %6s %5s %5s %6s %6s %6s %s\n",
115 |        "length", "kRat", "cntRep", "cntTot",
116 |        "medCt", "RCMed", "medGap",  "modCt", "RCMod",
117 |        "modGap", "rvCnt", "rcCnt", "SeqID");
118 | while(<>){
119 |   chomp;
120 |   chomp;
121 |   if(!$inQual){
122 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
123 |       my $newSeqID = $2;
124 |       my $newShortID = $3;
125 |       printStats($seq, $seqID, $trim, $kmerLength);
126 |       $seq = "";
127 |       $qual = "";
128 |       $buffer = "";
129 |       $seqID = $newSeqID;
130 |     } elsif(/^\+(.*)$/) {
131 |       $inQual = 1; # true
132 |       $qualID = $1;
133 |     } else {
134 |       $seq .= $_;
135 |     }
136 |   } else {
137 |     $qual .= $_;
138 |     if(length($qual) >= length($seq)){
139 |       $inQual = 0; # false
140 |     }
141 |   }
142 | }
143 | 
144 | printStats($seq, $seqID, $trim, $kmerLength);
145 | 


--------------------------------------------------------------------------------
/fastx-hplength.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | ## fastx-hplength.pl -- get statistics on homopolymers in a fastq/fasta file
  6 | 
  7 | use Getopt::Long qw(:config auto_help pass_through);
  8 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  9 | 
 10 | my $quiet = 0;
 11 | my $mode = "ACGT";
 12 | 
 13 | GetOptions("quiet!" => \$quiet, "mode=s" => \$mode) or
 14 |   die("Error in command line arguments");
 15 | 
 16 | # unknown commands are treated as identifiers
 17 | my @files = ();
 18 | while(@ARGV){
 19 |   my $arg = shift(@ARGV);
 20 |   if(-e $arg){
 21 |     push(@files, $arg);
 22 |   }
 23 | }
 24 | @ARGV = @files;
 25 | 
 26 | # use stdin if no files supplied
 27 | if(!@ARGV){
 28 |   @ARGV = '-' unless (-t STDIN);
 29 | }
 30 | 
 31 | my %hpCounts = ();
 32 | 
 33 | my $baseCount = 0;
 34 | my $inQual = 0; # false
 35 | my $seqID = "";
 36 | my $qualID = "";
 37 | my $seq = "";
 38 | my $qual = "";
 39 | 
 40 | foreach my $file (@ARGV) {
 41 |   # This little gunzip dance makes sure the script can handle both
 42 |   # gzip-compressed and uncompressed input, regardless of whether
 43 |   # or not it is piped
 44 |   my $z = new IO::Uncompress::Gunzip($file, "transparent", 1)
 45 |     or die "gunzip failed: $GunzipError\n";
 46 |   while(<$z>){
 47 |     s/\s+$//; # remove ending whitespace
 48 |     if (!$inQual) {
 49 |       if (/^(>|@)((.+?)( .*?\s*)?)$/) {
 50 |         my $newSeqID = $2;
 51 |         my $newShortID = $3;
 52 |         $baseCount += length($seq);
 53 |         my $cur = "";
 54 |         my $cchr = "";
 55 |         if($mode ne "ACGT"){
 56 |           $seq =~ tr/ACGT/ABBB/ if ($mode eq "B");
 57 |           $seq =~ tr/ACGT/DCDD/ if ($mode eq "D");
 58 |           $seq =~ tr/ACGT/HHGH/ if ($mode eq "H");
 59 |           $seq =~ tr/ACGT/VVVT/ if ($mode eq "V");
 60 |           $seq =~ tr/ACGT/RYRY/ if (($mode eq "RY") || ($mode eq "YR"));
 61 |           $seq =~ tr/ACGT/WSSW/ if (($mode eq "WS") || ($mode eq "SW"));
 62 |           $seq =~ tr/ACGT/MMKK/ if (($mode eq "MK" || $mode eq "KM"));
 63 |         }
 64 |         grep { # collect homopolymers
 65 |           if($_ ne $cchr){
 66 |             $hpCounts{$cur}++ if($cur);
 67 |             $cur = $cchr = $_;
 68 |           } else {
 69 |             $cur .= $cchr;
 70 |           }
 71 |         } split(//, $seq);
 72 |         $hpCounts{$cur}++ if($cur); # collect remaining homopolymer (if any)
 73 |         $seq = "";
 74 |         $qual = "";
 75 |         $seqID = $newSeqID;
 76 |       } elsif (/^\+(.*)$/) {
 77 |         $inQual = 1;            # true
 78 |         $qualID = $1;
 79 |       } else {
 80 |         $seq .= uc($_);
 81 |       }
 82 |     } else {
 83 |       $qual .= $_;
 84 |       if (length($qual) >= length($seq)) {
 85 |         $inQual = 0;            # false
 86 |       }
 87 |     }
 88 |   }
 89 |   close($z);
 90 | }
 91 | 
 92 | $mode = uc($mode);
 93 | 
 94 | $baseCount += length($seq);
 95 | my $cur = "";
 96 | my $cchr = "";
 97 | if($mode ne "ACGT"){
 98 |   $seq =~ tr/ACGT/ABBB/ if ($mode eq "B");
 99 |   $seq =~ tr/ACGT/DCDD/ if ($mode eq "D");
100 |   $seq =~ tr/ACGT/HHGH/ if ($mode eq "H");
101 |   $seq =~ tr/ACGT/VVVT/ if ($mode eq "V");
102 |   $seq =~ tr/ACGT/RYRY/ if (($mode eq "RY") || ($mode eq "YR"));
103 |   $seq =~ tr/ACGT/WSSW/ if (($mode eq "WS") || ($mode eq "SW"));
104 |   $seq =~ tr/ACGT/MMKK/ if (($mode eq "MK" || $mode eq "KM"));
105 | }
106 | grep { # collect homopolymers
107 |   if($_ ne $cchr){
108 |     $hpCounts{$cur}++ if($cur);
109 |     $cur = $cchr = $_;
110 |   } else {
111 |     $cur .= $cchr;
112 |   }
113 | } split(//, $seq);
114 | $hpCounts{$cur}++ if($cur); # collect remaining homopolymer (if any)
115 | 
116 | my $cumCount = 0;
117 | foreach my $hpChar (sort {length($a) <=> length($b) || $a cmp $b}
118 |                     (keys(%hpCounts))){
119 |   my $hpCount = $hpCounts{$hpChar};
120 |   my $hpBaseCount = $hpCount * length($hpChar);
121 |   $cumCount += $hpBaseCount;
122 |   printf("%10d %10d ( %6.2f%% / %6.2f%% ) %10s %s %d\n",
123 |          $hpCount,  $hpBaseCount,
124 |          $hpBaseCount * 100 / $baseCount,
125 |          $cumCount * 100 / $baseCount,
126 | 	 (length($hpChar) < 10 ? $hpChar : substr($hpChar,0,1)."........."),
127 | 	 (length($hpChar) < 10 ? ":" : "x" ), length($hpChar));
128 | }
129 | 
130 | printf(STDERR "Total sequence length: %d\n", $baseCount) unless $quiet;
131 | 


--------------------------------------------------------------------------------
/fmod.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import array
  5 | import struct
  6 | import wave
  7 | import csv
  8 | from math import sin, asin, pi, log, exp, sqrt
  9 | 
 10 | ## Carry out frequency modulation; vary the frequency of a wave based
 11 | ## on a signal
 12 | ##
 13 | ## Here is the rough process:
 14 | # 1) find out the phase of the current frequency at the previous position
 15 | # 2) calculate the new amplitude by adding one time slice to that phase
 16 | 
 17 | def fmod(outFile, signal, minFreq, maxFreq, oldRate, newRate,
 18 |          speed=1.0, volume=0.8, logScale=True):
 19 |     oldRate = oldRate * speed
 20 |     logRange = log(maxFreq) - log(minFreq)
 21 |     linRange = maxFreq - minFreq
 22 |     meanSig = sum(signal) / len(signal)
 23 |     madSig = sum(map(lambda x: abs(x - meanSig), signal)) / len(signal)
 24 |     minSig = meanSig - madSig * 4;
 25 |     maxSig = meanSig + madSig * 4;
 26 |     if(min(signal) > minSig):
 27 |         minSig = min(signal)
 28 |     if(max(signal) < maxSig):
 29 |         maxSig = max(signal)
 30 |     sys.stderr.write("Min: %f, Max: %f, Signal: %d\n" %
 31 |                      (minSig, maxSig, len(signal)))
 32 |     ## limit signal to within MAD range, scale to 0..100
 33 |     signal = map(lambda x: float(0) if (x < minSig) else
 34 |                  (float(99) if x > maxSig else
 35 |                   float(99) * (x - minSig) / (maxSig - minSig)), signal)
 36 |     newFreqs = (map(
 37 |         lambda x: exp((log(x + 1) / log(100)) * logRange + log(minFreq)),
 38 |         signal)) if logScale else (
 39 |             map(lambda x: (x/100) * linRange + minFreq, signal))
 40 |     # number of sound samples per signal sample
 41 |     newPerOld  = (float(newRate) / (float(oldRate)))
 42 |     amp = [0] * int(len(signal) * newPerOld)
 43 |     # start in first quadrant [0..pi/2)
 44 |     quadrant = 0
 45 |     oldPhase = 0
 46 |     oldAmplitude = 0
 47 |     for s in xrange(len(signal)-1):
 48 |         ## work out phase of previous signal
 49 |         if(quadrant == 0):
 50 |             oldPhase = asin(oldAmplitude)
 51 |         if(quadrant == 1):
 52 |             oldPhase = pi - asin(oldAmplitude)
 53 |         if(quadrant == 2):
 54 |             oldPhase = pi - asin(oldAmplitude)
 55 |         if(quadrant == 3):
 56 |             oldPhase = 2 * pi + asin(oldAmplitude)
 57 |         newFreq = newFreqs[s+1]
 58 |         ## determine phase step (for input)
 59 |         sigPhaseStep = (newFreq * 2 * pi / oldRate)
 60 |         ## determine phase step (for output)
 61 |         outPhaseStep = (newFreq * 2 * pi / newRate)
 62 |         ## determine new phase
 63 |         newPhase = (oldPhase + sigPhaseStep) % (2 * pi)
 64 |         ## calculate new quadrant
 65 |         quadrant = int(newPhase / (pi/2))
 66 |         ## determine new amplitude
 67 |         oldAmplitude = sin(newPhase)
 68 |         ## write signal to file
 69 |         sStart = int(s*newPerOld);
 70 |         sEnd = int((s+1)*newPerOld);
 71 |         packedSamples = map(
 72 |             lambda x: struct.pack('h',int(sin(oldPhase + x*outPhaseStep) *
 73 |                                           volume * 32767)),
 74 |             xrange(sEnd-sStart))
 75 |         fmodOut.writeframes(''.join(packedSamples))
 76 | 
 77 | rate=int(sys.argv[2])
 78 | 
 79 | if(".csv" in sys.argv[1]):
 80 |     with open(sys.argv[1]) as csvfile:
 81 |         myreader = csv.reader(csvfile, delimiter=",", quotechar='"')
 82 |         data = array.array('f')
 83 |         for row in myreader:
 84 |             data.append(float(row[1]))
 85 |         outRate = 44100
 86 |         fmodOut = wave.open('out.wav', 'w')
 87 |         fmodOut.setparams((1, 2, outRate, 0, 'NONE', 'not compressed'))
 88 |         fmod(outFile=fmodOut, signal=data, minFreq=50, maxFreq=1000,
 89 |              speed=1.0,
 90 |              oldRate=rate, newRate=outRate, volume=0.1)
 91 |         fmodOut.close()
 92 |     pass
 93 | else:
 94 |     with open(sys.argv[1], "rb") as f:
 95 |         outRate = 44100
 96 |         inData = f.read()
 97 |         if(len(inData) % 2 == 1):
 98 |             inData = inData[:-1]
 99 |         sys.stderr.write("Input length: %d\n" % len(inData))
100 |         data = array.array('H', inData)
101 |         #print(",".join(map(str,newData[:50])))
102 |         fmodOut = wave.open('out.wav', 'w')
103 |         fmodOut.setparams((1, 2, outRate, 0, 'NONE', 'not compressed'))
104 |         fmod(outFile=fmodOut, signal=data, minFreq=200, maxFreq=1000,
105 |              speed=1.0,
106 |              oldRate=rate, newRate=outRate, volume=0.1)
107 |         fmodOut.close()
108 | 
109 |     #with open ("out.raw", "wb") as soundFile:
110 |     #    soundFile.write(newData)
111 | 


--------------------------------------------------------------------------------
/svgImport.r:
--------------------------------------------------------------------------------
  1 | library(gridSVG);
  2 | library(grid);
  3 | library(XML);
  4 | 
  5 | x = c(0, 0.5, 1, 0.5)
  6 | y = c(0.5, 1, 0.5, 0)
  7 | grid.newpage()
  8 | grid.polygon(x,y, name="goodshape")
  9 | pat <- pattern(linesGrob(gp=gpar(col="black",lwd=3)),
 10 |   width = unit(5, "mm"), height = unit(5, "mm"),
 11 |   dev.width = 1, dev.height = 1)
 12 | # Registering pattern
 13 | registerPatternFill("pat", pat)
 14 | # Applying pattern fill
 15 | grid.patternFill("goodshape", label = "pat")
 16 | grid.export("test-pattern.svg")
 17 | 
 18 | svg.data <- xmlParse("dna_linear_DS_v4_backbone.svg");
 19 | 
 20 | xmlGetAttr(svg.data,"viewBox")
 21 | 
 22 | 
 23 | svginfo <- getNodeSet(svg.data, "//*[name()='svg']")[[1]];
 24 | svgVB <- xmlGetAttr(svginfo, "viewBox");
 25 | svgW <- xmlGetAttr(svginfo, "width");
 26 | svgH <- xmlGetAttr(svginfo, "height");
 27 | if(!is.null(svgVB)){
 28 |     svgVB <- as.numeric(strsplit(svgVB,"\\s+")[[1]]);
 29 | } else if(!is.null(svgW)){
 30 |     svgVB <- c(0,0,svgW,svgH);
 31 | }
 32 | 
 33 | svg.paths <- xpathApply(svg.data, '//svg:path', fun=function(x){
 34 |     list(d=xmlGetAttr(x, "d"), style=xmlGetAttr(x, "style"));
 35 | });
 36 | 
 37 | grid.newpage();
 38 | 
 39 | for(myPath in svg.paths){
 40 |     ## see https://stackoverflow.com/questions/4246077/matching-numbers-with-regular-expressions-only-digits-and-commas/4247184
 41 |     numRE <- "(([+-]?)(?=\\d|\\.\\d)\\d*(\\.\\d*)?([Ee]([+-]?\\d+))?)";
 42 |     pathChunks <-
 43 |         unlist(regmatches(myPath$d,
 44 |                           gregexpr(paste0("([MmLl]?\\s*",numRE,
 45 |                                           "[,\\s]",numRE,"|[zZ])"), perl=TRUE,
 46 |                                    myPath$d)));
 47 |     ## not yet implemented:
 48 |     ## # HhVv - Horiz / vert lines     # CcSs - Curve (cubic bezier)
 49 |     ## # QqTt - Curve (quadratic)      # Aa   - Elliptical arc 
 50 |     blankLineChunks <- grep("^[^MmLlHhVvCcSsQqTtAaZz]", pathChunks);
 51 |     pathChunks[blankLineChunks] <- paste0("#",pathChunks[blankLineChunks]);
 52 |      ## remove filler whitespace (if any)
 53 |     pathChunks <- sub(paste0("^(.)\\s+",numRE),"\\1\\2", pathChunks, perl=TRUE);
 54 |     path.df <- data.frame(chunk = pathChunks, stringsAsFactors=FALSE);
 55 |     path.df$command <- substr(path.df$chunk,1,1);
 56 |     path.df$remainder <- sub("^.","",path.df$chunk);
 57 |     path.df$posX <- as.numeric(sub(paste0("^",numRE,"(.*)$"),"\\1",
 58 |                                    path.df$remainder, perl=TRUE));
 59 |     path.df$posY <- as.numeric(sub(paste0("^",numRE,"(,|\\s+)"),"",
 60 |                                    path.df$remainder, perl=TRUE));
 61 |     #if(grepl("e",myPath$d)){
 62 |     #    break;
 63 |     #}
 64 |     command.rle <- rle(path.df$command);
 65 |     anonPoss <- which(command.rle$values == "#");
 66 |     ## [8.3.2] "If a moveto is followed by multiple pairs of coordinates, the
 67 |     ## subsequent pairs are treated as implicit lineto commands."
 68 |     command.rle$values[anonPoss] <-
 69 |         ifelse(command.rle$values[anonPoss-1] == "M", "L",
 70 |         ifelse(command.rle$values[anonPoss-1] == "m", "l", "#"));
 71 |     path.df$command <- inverse.rle(command.rle);
 72 |     #if(path.df$command[1] == "M"){
 73 |     #    next;
 74 |     #}
 75 |     penX <- 0;  penY <- 0;  startX <- 0;  startY <- 0;
 76 |     absPoss <- matrix(NA,nrow=nrow(path.df), ncol=2);
 77 |     for(li in seq_along(path.df$command)){
 78 |         cmd <- path.df$command[li];
 79 |         px <- path.df$posX[li]; py <- path.df$posY[li];
 80 |         if((cmd == "m") || (cmd == "M")){
 81 |             startX <- px; startY <- py;
 82 |         } else if(cmd == "l"){
 83 |             px <- px + penX; py <- py + penY;
 84 |         } else if((cmd == "z") || (cmd == "Z")){
 85 |             px <- startX; py <- startY;
 86 |         }
 87 |         absPoss[li,] <- c(px, py);
 88 |         penX <- px; penY <- py;
 89 |     }
 90 |     path.df[,c("absX","absY")] <- absPoss;
 91 |     path.df$absX <- (path.df$absX - svgVB[1]) / (svgVB[3] - svgVB[1]);
 92 |     path.df$absY <- 1 - (path.df$absY - svgVB[2]) / (svgVB[4] - svgVB[2]);
 93 |     pathStyle <- myPath$style;
 94 |     pathFill <- sub("^fill:(.*)[;$]","\\1",
 95 |                     regmatches(pathStyle,gregexpr("fill:.*?[;$]",pathStyle)));
 96 |     pathStroke <- sub("^stroke:(.*)[;$]","\\1",
 97 |                       regmatches(pathStyle,gregexpr("stroke:.*?[;$]",pathStyle)));
 98 |     if((tail(path.df$command,1) == "z") || (tail(path.df$command,1) == "Z")){
 99 |         grid.polygon(x=head(path.df$absX,-1), y=head(path.df$absY,-1),
100 |                      gp=gpar(col=pathStroke,fill=pathFill));
101 |     } else {
102 |         grid.lines(x=path.df$absX, y=path.df$absY,
103 |                      gp=gpar(col=pathStroke,fill=pathFill));
104 |     }
105 | }
106 | 


--------------------------------------------------------------------------------
/tped2trios.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # tped2trios.pl -- Convert plink tped/tfam format file to a BEAGLE
  4 | # unphased trio data file. If a second file is provided indicating the
  5 | # genotyped individuals, then only trios containing two (or three)
  6 | # genotyped individuals will be output.
  7 | 
  8 | # Author: David Eccles (gringer), 2012 <bioinformatics@gringer.org>
  9 | 
 10 | use strict;
 11 | use warnings;
 12 | 
 13 | sub usage {
 14 |   print("usage: ./tped2trios.pl <tped file> [id list file] [options]\n");
 15 |   print("\nConvert plink tped/tfam file to BEAGLE trio file\n");
 16 |   print("\nOther Options:\n");
 17 |   print("-3      : ensure all trio members are genotyped (instead of 2)\n");
 18 |   print("-help   : Only display this help message\n");
 19 |   print("\n");
 20 | }
 21 | 
 22 | my @files = ();
 23 | 
 24 | my $tpedFileName = "";
 25 | my $listFileName = "";
 26 | my $trioMin = 2;
 27 | 
 28 | # extract command line arguments
 29 | while(@ARGV){
 30 |     my $argument = shift @ARGV;
 31 |     if(-f $argument){ # file existence check
 32 |         if(!$tpedFileName){
 33 |             $tpedFileName = $argument;
 34 |             printf(STDERR "Setting TPED file name to '%s'\n", $tpedFileName);
 35 |         } elsif (!$listFileName) {
 36 |             $listFileName = $argument;
 37 |             printf(STDERR "Setting genotyped individual list file ".
 38 |                    "name to '%s'\n", $listFileName);
 39 |         } else {
 40 |             print(STDERR "Error: only two files can be specified\n");
 41 |             usage();
 42 |             exit(1);
 43 |         }
 44 |     } else {
 45 |         if($argument eq "-3"){
 46 |           $trioMin = 3;
 47 |             print(STDERR "All trio individuals output will be in list file\n");
 48 |         }
 49 |         if($argument eq "-help"){
 50 |             usage();
 51 |             exit(0);
 52 |         }
 53 |     }
 54 | }
 55 | 
 56 | if(!$tpedFileName){
 57 |     print(STDERR "Error: no files specified, cannot continue\n");
 58 |     usage();
 59 |     exit(1);
 60 | }
 61 | 
 62 | my $tfamFileName = $tpedFileName;
 63 | $tfamFileName =~ s/\.tped$/\.tfam/;
 64 | 
 65 | if(!(-f $tfamFileName)){
 66 |     print(STDERR "Error: TFAM file associated with TPED file does not exist\n");
 67 |     printf(STDERR " - expecting '%s'\n", $tfamFileName);
 68 |     usage();
 69 |     exit(1);
 70 | }
 71 | 
 72 | if(!$listFileName){
 73 |     $listFileName = $tfamFileName;
 74 | }
 75 | 
 76 | my %genotypedIndivs = ();
 77 | # get IDs for people who have been genotyped
 78 | my $lineNum = 0;
 79 | open(my $listFile, "<", $listFileName) or die("Cannot open $listFileName");
 80 | while(<$listFile>){
 81 |     chomp;
 82 |     my @F = split(/\s+/, $_);
 83 |     # assume field 0 is family, field 1 is individual
 84 |     $genotypedIndivs{$F[1]."@".$F[0]} = $lineNum++;
 85 | }
 86 | close($listFile);
 87 | printf(STDERR "Added %d genotyped IDs\n", scalar(keys(%genotypedIndivs)));
 88 | 
 89 | my %tfamIndivs = ();
 90 | my @trioIDs = ();
 91 | # get locations/IDs and parents for indviduals in TFAM file
 92 | $lineNum = 0;
 93 | open(my $tfamFile, "<", $tfamFileName) or die("Cannot open $tfamFileName");
 94 | while(<$tfamFile>){
 95 |     chomp;
 96 |     my @F = split(/\s+/, $_);
 97 |     # assume field 0 is family, field 1 is individual
 98 |     # beagle output should be parent, parent, offspring
 99 |     my @IDs = ($F[3]."@".$F[0], $F[2]."@".$F[0], $F[1]."@".$F[0]);
100 | #    print(join(";",@IDs)."\n");
101 |     $tfamIndivs{$IDs[2]} = $lineNum++;
102 |     if(scalar(grep {$genotypedIndivs{$_}} @IDs) >= $trioMin){
103 |         push(@trioIDs, @IDs);
104 |     }
105 | }
106 | close($tfamFile);
107 | 
108 | foreach my $id (@trioIDs){
109 |     if(!defined($tfamIndivs{$id})){
110 |         printf("column ID for $id is undefined, bailing out\n");
111 |         usage();
112 |         exit(1);
113 |         # $genotypedIndivs{$id} = -1;
114 |     }
115 | }
116 | 
117 | # convert trio IDs into column numbers
118 | my @trioColumns = 
119 |     map {($tfamIndivs{$_} * 2, $tfamIndivs{$_} * 2 + 1)} @trioIDs;
120 | 
121 | # print first line (list of IDs)
122 | print("I ind\@fam ".join(" ", map {($_,$_)} @trioIDs)."\n");
123 | 
124 | open(my $tpedFile, "<", $tpedFileName) or die("Cannot open $tpedFileName");
125 | print(STDERR "Writing output (one '.' per 1,000 lines)...");
126 | my $lineCount = 0;
127 | while(<$tpedFile>){
128 |     chomp;
129 |     my @F = split(/\s+/, $_);
130 |     push(@F, 0, 0);
131 |     shift(@F); # remove chromosome number
132 |     my $marker = shift(@F); # extract marker name
133 |     shift(@F);shift(@F); # remove map position / base location
134 |     # print out marker line with trio genotypes
135 |     printf("M $marker %s\n", join(" ", map {$F[$_]} @trioColumns));
136 |     if($lineCount++ % 1000 == 0){
137 |         print(STDERR ".");
138 |     }
139 | }
140 | print(STDERR " done!\n");
141 | 


--------------------------------------------------------------------------------
/vcf2fq.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Author: David Eccles (gringer) 2012 [based on vcf2fq by lh3]
  4 | 
  5 | use strict;
  6 | use warnings;
  7 | use Getopt::Std;
  8 | 
  9 | &vcf2fq();
 10 | exit;
 11 | 
 12 | sub vcf2fq {
 13 |   my %opts = (d=>3, D=>100000, Q=>10, L=>100, f=>"");
 14 |   getopts('d:D:Q:L:f:', \%opts);
 15 |   die(qq/
 16 | Usage:   vcfutils.pl vcf2fq [options] <all-site.vcf>
 17 | 
 18 | Options: -d INT    minimum depth                   [$opts{d}]
 19 |          -D INT    maximum depth                   [$opts{D}]
 20 |          -Q INT    min RMS mapQ                    [$opts{Q}]
 21 |          -L INT    min INDEL Qual                  [$opts{L}]
 22 |          -f FASTA  file with reference sequence(s) [$opts{f}]
 23 | \n/) if (@ARGV == 0 && -t STDIN);
 24 | 
 25 |   my ($last_chr, $seq, $qual, $last_pos, @gaps);
 26 |   my $_Q = $opts{Q};
 27 |   my $_L = $opts{L};
 28 |   my $_d = $opts{d};
 29 |   my $_D = $opts{D};
 30 | 
 31 |   my %baseSeqs = ();
 32 | 
 33 |   if($opts{f}){
 34 |     open(my $fastaFile, "<", $opts{f}) or die("unable to open reference sequence file: ".$opts{f});
 35 |     my $seqID = "";
 36 |     $seq = "";
 37 |     while(<$fastaFile>){
 38 |       chomp;
 39 |       if(/^>([^ ]+)/){
 40 |         $baseSeqs{$seqID} = $seq unless ($seqID eq "");
 41 |         $seqID = $1;
 42 |         $seq = "";
 43 |       } else {
 44 |         $seq .= $_;
 45 |       }
 46 |     }
 47 |     $baseSeqs{$seqID} = $seq unless ($seqID eq "");
 48 |     close($fastaFile);
 49 |   }
 50 | 
 51 |   my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
 52 | 			 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
 53 | 
 54 |   $last_chr = '';
 55 |   while (<>) {
 56 | 	next if (/^#/);
 57 | 	my @t = split;
 58 | 	if ($last_chr ne $t[0]) {
 59 | 	  &v2q_post_process($last_chr, \$seq, \$qual, \@gaps) if ($last_chr);
 60 | 	  ($last_chr, $last_pos) = ($t[0], 0);
 61 | 	  $seq = $qual = '';
 62 |           if(defined($baseSeqs{$t[0]})){
 63 |             $seq .= $baseSeqs{$t[0]};
 64 |             $qual .= '~' x length($seq);
 65 |           }
 66 | 	  @gaps = ();
 67 | 	}
 68 | 	die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
 69 | 	if ($t[1] > length($seq)) {
 70 |           $seq .=  'n' x ($t[1] - $last_pos);
 71 |           $qual .= '!' x ($t[1] - $last_pos);
 72 | 	}
 73 | 	if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
 74 | 	  my ($ref, $alt) = ($t[3], $1);
 75 | 	  my ($b, $q);
 76 | 	  $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
 77 | 	  if ($q < 0) {
 78 | 		$_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
 79 | 		$b = ($_ < .5 || $alt eq '.')? $ref : $alt;
 80 | 		$q = -$q;
 81 | 	  } else {
 82 | 		$b = $het{"$ref$alt"};
 83 | 		$b ||= 'N';
 84 | 	  }
 85 | 	  $b = lc($b);
 86 | 	  $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
 87 | 	  $q = int($q + 33 + .499);
 88 | 	  $q = chr($q <= 126? $q : 126);
 89 | 	  substr($seq,$t[1]-1,1) = $b;
 90 | 	  substr($qual,$t[1]-1,1) = $q;
 91 | 	} elsif (($t[4] ne '.') && ($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) &&
 92 |             ($t[5] > $_L)) {
 93 |           # an INDEL
 94 |           my $fq = 126;
 95 |           if($t[7] =~ /MQ=(\d+)/){
 96 |             $fq = $1;
 97 |           }
 98 | 	  push(@gaps, [$t[1], length($t[3]), (split(/,/,$t[4],2))[0], $fq]);
 99 | 	}
100 | 	$last_pos = $t[1];
101 |   }
102 |   &v2q_post_process($last_chr, \$seq, \$qual, \@gaps);
103 | }
104 | 
105 | sub v2q_post_process {
106 |   my ($chr, $seq, $qual, $gaps) = @_;
107 |   for my $g (@$gaps) {
108 |     # blank out replaced sequence (there was an off-by-1 error in the original code)
109 |     substr($$seq, ($g->[0]-1), $g->[1]) = '-' x ($g->[1]);
110 |     substr($$qual, ($g->[0]-1), $g->[1]) = ' ' x ($g->[1]);
111 |   }
112 |   my $newSeq = '';
113 |   my $newQual = '';
114 |   my $seqPos = 1;
115 |   my $indelOffset = 0;
116 |   for my $g (@$gaps) {
117 |     $newSeq .= substr($$seq,$seqPos-1,($g->[0])-$seqPos);
118 |     $newQual .= substr($$qual,$seqPos-1,($g->[0])-$seqPos);
119 |     $indelOffset = $indelOffset - (($g->[0])-$seqPos);
120 |     $indelOffset = 0 if ($indelOffset < 0);
121 |     $seqPos = $g->[0];
122 |     $newSeq .= lc(substr($g->[2],$indelOffset)) unless ($indelOffset > length($g->[2]));
123 |     my $q = int($g->[3] + 33 + .499);
124 |     $q = chr($q <= 126? $q : 126);
125 |     $newQual .= $q x (length($g->[2]) - $indelOffset) unless ($indelOffset > length($g->[2]));
126 |     $indelOffset = $g->[1] unless ($indelOffset > $g->[1]);
127 |   }
128 |   if($seqPos < length($$seq)){
129 |     $newSeq .= substr($$seq,$seqPos);
130 |     $newQual .= substr($$qual,$seqPos);
131 |   }
132 |   $newSeq =~ tr/\-//d;
133 |   $newQual =~ tr/ //d;
134 |   print "\@$chr\n"; &v2q_print_str(\$newSeq);
135 |   print "+\n"; &v2q_print_str(\$newQual);
136 | }
137 | 
138 | sub v2q_print_str {
139 |   my ($s) = @_;
140 |   my $l = length($$s);
141 |   for (my $i = 0; $i < $l; $i += 60) {
142 | 	print substr($$s, $i, 60), "\n";
143 |   }
144 | }
145 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastn_params.html:
--------------------------------------------------------------------------------
  1 | <h2>Program Parameters</h2>
  2 | <!-- structure taken from NCBI blast, with extra help / default candy removed -->
  3 | <fieldset>
  4 |   <legend>General Parameters</legend>
  5 | 
  6 |   <p>
  7 |     <label for="numseq">Max target sequences</label>
  8 |     <select name="MAX_NUM_SEQ" id="numseq"
  9 |             alt = "Select the maximum number of aligned sequences to display">
 10 |       <option value="%(MAX_NUM_SEQ)" selected="selected">%(MAX_NUM_SEQ)</option>
 11 |       <option disabled="disabled">-----</option>
 12 |       <option value="10">10</option>
 13 |       <option value="50">50</option>
 14 |       <option value="100">100</option>
 15 |       <option value="250">250</option>
 16 |       <option value="500">500</option>
 17 |       <option value="1000">1000</option>
 18 |       <option value="5000">5000</option>
 19 |       <option value="10000">10000</option>
 20 |       <option value="20000">20000</option>
 21 |     </select>
 22 |   </p>
 23 | 
 24 |   <p>
 25 |     <input type="checkbox" name="SHORT_QUERY_ADJUST" id="adjparam" checked="%(SHORT_QUERY_ADJUST)" />
 26 |     <label for="adjparam">Automatically adjust parameters for short input sequences</label>
 27 |   </p>
 28 | 
 29 |   <p>
 30 |     <label for="expect">Expect threshold</label>
 31 |     <input name="EXPECT" id="expect" size="10" type="text" value="%(EXPECT)" />
 32 |   </p>
 33 | 
 34 |   <p>
 35 |     <label for="wordsize">Word size</label>
 36 |     <select name="WORD_SIZE" id="wordsize" defVal="11" >
 37 |       <option value="%(WORD_SIZE)" selected="selected">%(WORD_SIZE)</option>
 38 |       <option disabled="disabled">-----</option>
 39 |       <option value="7">7</option>
 40 |       <option value="11">11</option>
 41 |       <option value="15">15</option>
 42 |     </select>
 43 |   </p>
 44 | 
 45 |   <!-- <p> -->
 46 |   <!--   <label for="hsp_range_max">Max matches in a query range</label> -->
 47 |   <!--   <input name="HSP_RANGE_MAX" id="hsp_range_max" size="10" type="text" value="0" defVal="0" /> -->
 48 |   <!-- </p> -->
 49 | 
 50 | </fieldset>
 51 | 
 52 | <fieldset>
 53 |   <legend>Scoring Parameters</legend>
 54 |   <p>
 55 |     <label for="matrixName">Matrix</label>
 56 |     <select name="MATRIX_NAME" id="matrixName" defVal="">
 57 |       <option value="%(MATRIX_NAME)" selected="selected">%(MATRIX_NAME)</option>
 58 |       <option disabled="disabled">-----</option>
 59 |       <option value="PAM30">PAM30</option>
 60 |       <option value="PAM70">PAM70</option>
 61 |       <option value="BLOSUM80">BLOSUM80</option>
 62 |       <option value="BLOSUM62">BLOSUM62</option>
 63 |       <option value="BLOSUM45">BLOSUM45</option>
 64 |     </select>
 65 |   </p>
 66 | 
 67 |   <p>
 68 |     <label for="matchscores">Match/Mismatch Scores</label>
 69 |     <select name="MATCH_SCORES" id="matchscores">
 70 |       <option value="%(MATCH_SCORES)" selected="selected">%(MATCH_SCORES)</option>
 71 |       <option disabled="disabled">-----</option>
 72 |       <option value="1,-2">1,-2</option>
 73 |       <option value="1,-3">1,-3</option>
 74 |       <option value="1,-4">1,-4</option>
 75 |       <option value="2,-3">2,-3</option>
 76 |       <option value="4,-5">4,-5</option>
 77 |       <option value="1,-1">1,-1</option>
 78 |     </select>
 79 |   </p>
 80 | 
 81 |   <p>
 82 |     <label for="gapcosts">Gap Costs</label>
 83 |     <select name="GAPCOSTS" id="gapcosts" defVal="5 2" ><!-- default depends on MatchScores selected - doen in javascript -->
 84 |       <option value="%(GAPCOSTS)" selected="selected">%(GAPCOSTS)</option>
 85 |       <option disabled="disabled">-----</option>
 86 |       <option value = "0 0">Linear</option>
 87 |       <option value = "4 4">Existence:  4 Extension: 4</option>
 88 |       <option value = "2 4">Existence:  2 Extension: 4</option>
 89 |       <option value = "0 4">Existence:  0 Extension: 4</option>
 90 |       <option value = "3 3">Existence:  3 Extension: 3</option>
 91 |       <option value = "6 2">Existence:  6 Extension: 2</option>
 92 |       <option value = "5 2">Existence:  5 Extension: 2</option>
 93 |       <option value = "4 2">Existence: 4 Extension: 2</option>
 94 |       <option value = "2 2">Existence:  2 Extension: 2</option>
 95 |     </select>
 96 |   </p>
 97 |   <p>
 98 |     <label for="compbasedstat">Compositional adjustments</label>
 99 |     <select name="COMPOSITION_BASED_STATISTICS" id="compbasedstat" defVal="">
100 |       <option value="%(COMPOSITION_BASED_STATISTICS)" selected="selected">%(COMPOSITION_BASED_STATISTICS)</option>
101 |       <option disabled="disabled">-----</option>
102 |       <option value="0">No adjustment</option>
103 |       <option value="1">Composition-based statistics</option>
104 |       <option value="2">Conditional compositional score matrix adjustment</option>
105 |       <option value="3">Universal compositional score matrix adjustment</option>
106 |     </select>
107 |   </p>
108 | 
109 | </fieldset>
110 | 
111 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
112 |   <p>BLAST</p>
113 | </button>
114 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastp_params.html:
--------------------------------------------------------------------------------
  1 | <h2>Program Parameters</h2>
  2 | <!-- structure taken from NCBI blast, with extra help / default candy removed -->
  3 | <fieldset>
  4 |   <legend>General Parameters</legend>
  5 | 
  6 |   <p>
  7 |     <label for="numseq">Max target sequences</label>
  8 |     <select name="MAX_NUM_SEQ" id="numseq"
  9 |             alt = "Select the maximum number of aligned sequences to display">
 10 |       <option value="%(MAX_NUM_SEQ)" selected="selected">%(MAX_NUM_SEQ)</option>
 11 |       <option disabled="disabled">-----</option>
 12 |       <option value="10">10</option>
 13 |       <option value="50">50</option>
 14 |       <option value="100">100</option>
 15 |       <option value="250">250</option>
 16 |       <option value="500">500</option>
 17 |       <option value="1000">1000</option>
 18 |       <option value="5000">5000</option>
 19 |       <option value="10000">10000</option>
 20 |       <option value="20000">20000</option>
 21 |     </select>
 22 |   </p>
 23 | 
 24 |   <p>
 25 |     <input type="checkbox" name="SHORT_QUERY_ADJUST" id="adjparam" checked="%(SHORT_QUERY_ADJUST)" />
 26 |     <label for="adjparam">Automatically adjust parameters for short input sequences</label>
 27 |   </p>
 28 | 
 29 |   <p>
 30 |     <label for="expect">Expect threshold</label>
 31 |     <input name="EXPECT" id="expect" size="10" type="text" value="%(EXPECT)" />
 32 |   </p>
 33 | 
 34 |   <p>
 35 |     <label for="wordsize">Word size</label>
 36 |     <select name="WORD_SIZE" id="wordsize" defVal="11" >
 37 |       <option value="%(WORD_SIZE)" selected="selected">%(WORD_SIZE)</option>
 38 |       <option disabled="disabled">-----</option>
 39 |       <option value="7">7</option>
 40 |       <option value="11">11</option>
 41 |       <option value="15">15</option>
 42 |     </select>
 43 |   </p>
 44 | 
 45 |   <!-- <p> -->
 46 |   <!--   <label for="hsp_range_max">Max matches in a query range</label> -->
 47 |   <!--   <input name="HSP_RANGE_MAX" id="hsp_range_max" size="10" type="text" value="0" defVal="0" /> -->
 48 |   <!-- </p> -->
 49 | 
 50 | </fieldset>
 51 | 
 52 | <fieldset>
 53 |   <legend>Scoring Parameters</legend>
 54 |   <p>
 55 |     <label for="matrixName">Matrix</label>
 56 |     <select name="MATRIX_NAME" id="matrixName" defVal="">
 57 |       <option value="%(MATRIX_NAME)" selected="selected">%(MATRIX_NAME)</option>
 58 |       <option disabled="disabled">-----</option>
 59 |       <option value="PAM30">PAM30</option>
 60 |       <option value="PAM70">PAM70</option>
 61 |       <option value="BLOSUM80">BLOSUM80</option>
 62 |       <option value="BLOSUM62">BLOSUM62</option>
 63 |       <option value="BLOSUM45">BLOSUM45</option>
 64 |     </select>
 65 |   </p>
 66 | 
 67 |   <p>
 68 |     <label for="matchscores">Match/Mismatch Scores</label>
 69 |     <select name="MATCH_SCORES" id="matchscores">
 70 |       <option value="%(MATCH_SCORES)" selected="selected">%(MATCH_SCORES)</option>
 71 |       <option disabled="disabled">-----</option>
 72 |       <option value="1,-2">1,-2</option>
 73 |       <option value="1,-3">1,-3</option>
 74 |       <option value="1,-4">1,-4</option>
 75 |       <option value="2,-3">2,-3</option>
 76 |       <option value="4,-5">4,-5</option>
 77 |       <option value="1,-1">1,-1</option>
 78 |     </select>
 79 |   </p>
 80 | 
 81 |   <p>
 82 |     <label for="gapcosts">Gap Costs</label>
 83 |     <select name="GAPCOSTS" id="gapcosts" defVal="5 2" ><!-- default depends on MatchScores selected - doen in javascript -->
 84 |       <option value="%(GAPCOSTS)" selected="selected">%(GAPCOSTS)</option>
 85 |       <option disabled="disabled">-----</option>
 86 |       <option value = "0 0">Linear</option>
 87 |       <option value = "4 4">Existence:  4 Extension: 4</option>
 88 |       <option value = "2 4">Existence:  2 Extension: 4</option>
 89 |       <option value = "0 4">Existence:  0 Extension: 4</option>
 90 |       <option value = "3 3">Existence:  3 Extension: 3</option>
 91 |       <option value = "6 2">Existence:  6 Extension: 2</option>
 92 |       <option value = "5 2">Existence:  5 Extension: 2</option>
 93 |       <option value = "4 2">Existence: 4 Extension: 2</option>
 94 |       <option value = "2 2">Existence:  2 Extension: 2</option>
 95 |     </select>
 96 |   </p>
 97 |   <p>
 98 |     <label for="compbasedstat">Compositional adjustments</label>
 99 |     <select name="COMPOSITION_BASED_STATISTICS" id="compbasedstat" defVal="">
100 |       <option value="%(COMPOSITION_BASED_STATISTICS)" selected="selected">%(COMPOSITION_BASED_STATISTICS)</option>
101 |       <option disabled="disabled">-----</option>
102 |       <option value="0">No adjustment</option>
103 |       <option value="1">Composition-based statistics</option>
104 |       <option value="2">Conditional compositional score matrix adjustment</option>
105 |       <option value="3">Universal compositional score matrix adjustment</option>
106 |     </select>
107 |   </p>
108 | 
109 | </fieldset>
110 | 
111 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
112 |   <p>BLAST</p>
113 | </button>
114 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/blastx_params.html:
--------------------------------------------------------------------------------
  1 | <h2>Program Parameters</h2>
  2 | <!-- structure taken from NCBI blast, with extra help / default candy removed -->
  3 | <fieldset>
  4 |   <legend>General Parameters</legend>
  5 | 
  6 |   <p>
  7 |     <label for="numseq">Max target sequences</label>
  8 |     <select name="MAX_NUM_SEQ" id="numseq"
  9 |             alt = "Select the maximum number of aligned sequences to display">
 10 |       <option value="%(MAX_NUM_SEQ)" selected="selected">%(MAX_NUM_SEQ)</option>
 11 |       <option disabled="disabled">-----</option>
 12 |       <option value="10">10</option>
 13 |       <option value="50">50</option>
 14 |       <option value="100">100</option>
 15 |       <option value="250">250</option>
 16 |       <option value="500">500</option>
 17 |       <option value="1000">1000</option>
 18 |       <option value="5000">5000</option>
 19 |       <option value="10000">10000</option>
 20 |       <option value="20000">20000</option>
 21 |     </select>
 22 |   </p>
 23 | 
 24 |   <p>
 25 |     <input type="checkbox" name="SHORT_QUERY_ADJUST" id="adjparam" checked="%(SHORT_QUERY_ADJUST)" />
 26 |     <label for="adjparam">Automatically adjust parameters for short input sequences</label>
 27 |   </p>
 28 | 
 29 |   <p>
 30 |     <label for="expect">Expect threshold</label>
 31 |     <input name="EXPECT" id="expect" size="10" type="text" value="%(EXPECT)" />
 32 |   </p>
 33 | 
 34 |   <p>
 35 |     <label for="wordsize">Word size</label>
 36 |     <select name="WORD_SIZE" id="wordsize" defVal="11" >
 37 |       <option value="%(WORD_SIZE)" selected="selected">%(WORD_SIZE)</option>
 38 |       <option disabled="disabled">-----</option>
 39 |       <option value="7">7</option>
 40 |       <option value="11">11</option>
 41 |       <option value="15">15</option>
 42 |     </select>
 43 |   </p>
 44 | 
 45 |   <!-- <p> -->
 46 |   <!--   <label for="hsp_range_max">Max matches in a query range</label> -->
 47 |   <!--   <input name="HSP_RANGE_MAX" id="hsp_range_max" size="10" type="text" value="0" defVal="0" /> -->
 48 |   <!-- </p> -->
 49 | 
 50 | </fieldset>
 51 | 
 52 | <fieldset>
 53 |   <legend>Scoring Parameters</legend>
 54 |   <p>
 55 |     <label for="matrixName">Matrix</label>
 56 |     <select name="MATRIX_NAME" id="matrixName" defVal="">
 57 |       <option value="%(MATRIX_NAME)" selected="selected">%(MATRIX_NAME)</option>
 58 |       <option disabled="disabled">-----</option>
 59 |       <option value="PAM30">PAM30</option>
 60 |       <option value="PAM70">PAM70</option>
 61 |       <option value="BLOSUM80">BLOSUM80</option>
 62 |       <option value="BLOSUM62">BLOSUM62</option>
 63 |       <option value="BLOSUM45">BLOSUM45</option>
 64 |     </select>
 65 |   </p>
 66 | 
 67 |   <p>
 68 |     <label for="matchscores">Match/Mismatch Scores</label>
 69 |     <select name="MATCH_SCORES" id="matchscores">
 70 |       <option value="%(MATCH_SCORES)" selected="selected">%(MATCH_SCORES)</option>
 71 |       <option disabled="disabled">-----</option>
 72 |       <option value="1,-2">1,-2</option>
 73 |       <option value="1,-3">1,-3</option>
 74 |       <option value="1,-4">1,-4</option>
 75 |       <option value="2,-3">2,-3</option>
 76 |       <option value="4,-5">4,-5</option>
 77 |       <option value="1,-1">1,-1</option>
 78 |     </select>
 79 |   </p>
 80 | 
 81 |   <p>
 82 |     <label for="gapcosts">Gap Costs</label>
 83 |     <select name="GAPCOSTS" id="gapcosts" defVal="5 2" ><!-- default depends on MatchScores selected - doen in javascript -->
 84 |       <option value="%(GAPCOSTS)" selected="selected">%(GAPCOSTS)</option>
 85 |       <option disabled="disabled">-----</option>
 86 |       <option value = "0 0">Linear</option>
 87 |       <option value = "4 4">Existence:  4 Extension: 4</option>
 88 |       <option value = "2 4">Existence:  2 Extension: 4</option>
 89 |       <option value = "0 4">Existence:  0 Extension: 4</option>
 90 |       <option value = "3 3">Existence:  3 Extension: 3</option>
 91 |       <option value = "6 2">Existence:  6 Extension: 2</option>
 92 |       <option value = "5 2">Existence:  5 Extension: 2</option>
 93 |       <option value = "4 2">Existence: 4 Extension: 2</option>
 94 |       <option value = "2 2">Existence:  2 Extension: 2</option>
 95 |     </select>
 96 |   </p>
 97 |   <p>
 98 |     <label for="compbasedstat">Compositional adjustments</label>
 99 |     <select name="COMPOSITION_BASED_STATISTICS" id="compbasedstat" defVal="">
100 |       <option value="%(COMPOSITION_BASED_STATISTICS)" selected="selected">%(COMPOSITION_BASED_STATISTICS)</option>
101 |       <option disabled="disabled">-----</option>
102 |       <option value="0">No adjustment</option>
103 |       <option value="1">Composition-based statistics</option>
104 |       <option value="2">Conditional compositional score matrix adjustment</option>
105 |       <option value="3">Universal compositional score matrix adjustment</option>
106 |     </select>
107 |   </p>
108 | 
109 | </fieldset>
110 | 
111 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
112 |   <p>BLAST</p>
113 | </button>
114 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastn_params.html:
--------------------------------------------------------------------------------
  1 | <h2>Program Parameters</h2>
  2 | <!-- structure taken from NCBI blast, with extra help / default candy removed -->
  3 | <fieldset>
  4 |   <legend>General Parameters</legend>
  5 | 
  6 |   <p>
  7 |     <label for="numseq">Max target sequences</label>
  8 |     <select name="MAX_NUM_SEQ" id="numseq"
  9 |             alt = "Select the maximum number of aligned sequences to display">
 10 |       <option value="%(MAX_NUM_SEQ)" selected="selected">%(MAX_NUM_SEQ)</option>
 11 |       <option disabled="disabled">-----</option>
 12 |       <option value="10">10</option>
 13 |       <option value="50">50</option>
 14 |       <option value="100">100</option>
 15 |       <option value="250">250</option>
 16 |       <option value="500">500</option>
 17 |       <option value="1000">1000</option>
 18 |       <option value="5000">5000</option>
 19 |       <option value="10000">10000</option>
 20 |       <option value="20000">20000</option>
 21 |     </select>
 22 |   </p>
 23 | 
 24 |   <p>
 25 |     <input type="checkbox" name="SHORT_QUERY_ADJUST" id="adjparam" checked="%(SHORT_QUERY_ADJUST)" />
 26 |     <label for="adjparam">Automatically adjust parameters for short input sequences</label>
 27 |   </p>
 28 | 
 29 |   <p>
 30 |     <label for="expect">Expect threshold</label>
 31 |     <input name="EXPECT" id="expect" size="10" type="text" value="%(EXPECT)" />
 32 |   </p>
 33 | 
 34 |   <p>
 35 |     <label for="wordsize">Word size</label>
 36 |     <select name="WORD_SIZE" id="wordsize" defVal="11" >
 37 |       <option value="%(WORD_SIZE)" selected="selected">%(WORD_SIZE)</option>
 38 |       <option disabled="disabled">-----</option>
 39 |       <option value="7">7</option>
 40 |       <option value="11">11</option>
 41 |       <option value="15">15</option>
 42 |     </select>
 43 |   </p>
 44 | 
 45 |   <!-- <p> -->
 46 |   <!--   <label for="hsp_range_max">Max matches in a query range</label> -->
 47 |   <!--   <input name="HSP_RANGE_MAX" id="hsp_range_max" size="10" type="text" value="0" defVal="0" /> -->
 48 |   <!-- </p> -->
 49 | 
 50 | </fieldset>
 51 | 
 52 | <fieldset>
 53 |   <legend>Scoring Parameters</legend>
 54 |   <p>
 55 |     <label for="matrixName">Matrix</label>
 56 |     <select name="MATRIX_NAME" id="matrixName" defVal="">
 57 |       <option value="%(MATRIX_NAME)" selected="selected">%(MATRIX_NAME)</option>
 58 |       <option disabled="disabled">-----</option>
 59 |       <option value="PAM30">PAM30</option>
 60 |       <option value="PAM70">PAM70</option>
 61 |       <option value="BLOSUM80">BLOSUM80</option>
 62 |       <option value="BLOSUM62">BLOSUM62</option>
 63 |       <option value="BLOSUM45">BLOSUM45</option>
 64 |     </select>
 65 |   </p>
 66 | 
 67 |   <p>
 68 |     <label for="matchscores">Match/Mismatch Scores</label>
 69 |     <select name="MATCH_SCORES" id="matchscores">
 70 |       <option value="%(MATCH_SCORES)" selected="selected">%(MATCH_SCORES)</option>
 71 |       <option disabled="disabled">-----</option>
 72 |       <option value="1,-2">1,-2</option>
 73 |       <option value="1,-3">1,-3</option>
 74 |       <option value="1,-4">1,-4</option>
 75 |       <option value="2,-3">2,-3</option>
 76 |       <option value="4,-5">4,-5</option>
 77 |       <option value="1,-1">1,-1</option>
 78 |     </select>
 79 |   </p>
 80 | 
 81 |   <p>
 82 |     <label for="gapcosts">Gap Costs</label>
 83 |     <select name="GAPCOSTS" id="gapcosts" defVal="5 2" ><!-- default depends on MatchScores selected - doen in javascript -->
 84 |       <option value="%(GAPCOSTS)" selected="selected">%(GAPCOSTS)</option>
 85 |       <option disabled="disabled">-----</option>
 86 |       <option value = "0 0">Linear</option>
 87 |       <option value = "4 4">Existence:  4 Extension: 4</option>
 88 |       <option value = "2 4">Existence:  2 Extension: 4</option>
 89 |       <option value = "0 4">Existence:  0 Extension: 4</option>
 90 |       <option value = "3 3">Existence:  3 Extension: 3</option>
 91 |       <option value = "6 2">Existence:  6 Extension: 2</option>
 92 |       <option value = "5 2">Existence:  5 Extension: 2</option>
 93 |       <option value = "4 2">Existence: 4 Extension: 2</option>
 94 |       <option value = "2 2">Existence:  2 Extension: 2</option>
 95 |     </select>
 96 |   </p>
 97 |   <p>
 98 |     <label for="compbasedstat">Compositional adjustments</label>
 99 |     <select name="COMPOSITION_BASED_STATISTICS" id="compbasedstat" defVal="">
100 |       <option value="%(COMPOSITION_BASED_STATISTICS)" selected="selected">%(COMPOSITION_BASED_STATISTICS)</option>
101 |       <option disabled="disabled">-----</option>
102 |       <option value="0">No adjustment</option>
103 |       <option value="1">Composition-based statistics</option>
104 |       <option value="2">Conditional compositional score matrix adjustment</option>
105 |       <option value="3">Universal compositional score matrix adjustment</option>
106 |     </select>
107 |   </p>
108 | 
109 | </fieldset>
110 | 
111 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
112 |   <p>BLAST</p>
113 | </button>
114 | 


--------------------------------------------------------------------------------
/webblast/cgi-bin/templates/tblastx_params.html:
--------------------------------------------------------------------------------
  1 | <h2>Program Parameters</h2>
  2 | <!-- structure taken from NCBI blast, with extra help / default candy removed -->
  3 | <fieldset>
  4 |   <legend>General Parameters</legend>
  5 | 
  6 |   <p>
  7 |     <label for="numseq">Max target sequences</label>
  8 |     <select name="MAX_NUM_SEQ" id="numseq"
  9 |             alt = "Select the maximum number of aligned sequences to display">
 10 |       <option value="%(MAX_NUM_SEQ)" selected="selected">%(MAX_NUM_SEQ)</option>
 11 |       <option disabled="disabled">-----</option>
 12 |       <option value="10">10</option>
 13 |       <option value="50">50</option>
 14 |       <option value="100">100</option>
 15 |       <option value="250">250</option>
 16 |       <option value="500">500</option>
 17 |       <option value="1000">1000</option>
 18 |       <option value="5000">5000</option>
 19 |       <option value="10000">10000</option>
 20 |       <option value="20000">20000</option>
 21 |     </select>
 22 |   </p>
 23 | 
 24 |   <p>
 25 |     <input type="checkbox" name="SHORT_QUERY_ADJUST" id="adjparam" checked="%(SHORT_QUERY_ADJUST)" />
 26 |     <label for="adjparam">Automatically adjust parameters for short input sequences</label>
 27 |   </p>
 28 | 
 29 |   <p>
 30 |     <label for="expect">Expect threshold</label>
 31 |     <input name="EXPECT" id="expect" size="10" type="text" value="%(EXPECT)" />
 32 |   </p>
 33 | 
 34 |   <p>
 35 |     <label for="wordsize">Word size</label>
 36 |     <select name="WORD_SIZE" id="wordsize" defVal="11" >
 37 |       <option value="%(WORD_SIZE)" selected="selected">%(WORD_SIZE)</option>
 38 |       <option disabled="disabled">-----</option>
 39 |       <option value="7">7</option>
 40 |       <option value="11">11</option>
 41 |       <option value="15">15</option>
 42 |     </select>
 43 |   </p>
 44 | 
 45 |   <!-- <p> -->
 46 |   <!--   <label for="hsp_range_max">Max matches in a query range</label> -->
 47 |   <!--   <input name="HSP_RANGE_MAX" id="hsp_range_max" size="10" type="text" value="0" defVal="0" /> -->
 48 |   <!-- </p> -->
 49 | 
 50 | </fieldset>
 51 | 
 52 | <fieldset>
 53 |   <legend>Scoring Parameters</legend>
 54 |   <p>
 55 |     <label for="matrixName">Matrix</label>
 56 |     <select name="MATRIX_NAME" id="matrixName" defVal="">
 57 |       <option value="%(MATRIX_NAME)" selected="selected">%(MATRIX_NAME)</option>
 58 |       <option disabled="disabled">-----</option>
 59 |       <option value="PAM30">PAM30</option>
 60 |       <option value="PAM70">PAM70</option>
 61 |       <option value="BLOSUM80">BLOSUM80</option>
 62 |       <option value="BLOSUM62">BLOSUM62</option>
 63 |       <option value="BLOSUM45">BLOSUM45</option>
 64 |     </select>
 65 |   </p>
 66 | 
 67 |   <p>
 68 |     <label for="matchscores">Match/Mismatch Scores</label>
 69 |     <select name="MATCH_SCORES" id="matchscores">
 70 |       <option value="%(MATCH_SCORES)" selected="selected">%(MATCH_SCORES)</option>
 71 |       <option disabled="disabled">-----</option>
 72 |       <option value="1,-2">1,-2</option>
 73 |       <option value="1,-3">1,-3</option>
 74 |       <option value="1,-4">1,-4</option>
 75 |       <option value="2,-3">2,-3</option>
 76 |       <option value="4,-5">4,-5</option>
 77 |       <option value="1,-1">1,-1</option>
 78 |     </select>
 79 |   </p>
 80 | 
 81 |   <p>
 82 |     <label for="gapcosts">Gap Costs</label>
 83 |     <select name="GAPCOSTS" id="gapcosts" defVal="5 2" ><!-- default depends on MatchScores selected - doen in javascript -->
 84 |       <option value="%(GAPCOSTS)" selected="selected">%(GAPCOSTS)</option>
 85 |       <option disabled="disabled">-----</option>
 86 |       <option value = "0 0">Linear</option>
 87 |       <option value = "4 4">Existence:  4 Extension: 4</option>
 88 |       <option value = "2 4">Existence:  2 Extension: 4</option>
 89 |       <option value = "0 4">Existence:  0 Extension: 4</option>
 90 |       <option value = "3 3">Existence:  3 Extension: 3</option>
 91 |       <option value = "6 2">Existence:  6 Extension: 2</option>
 92 |       <option value = "5 2">Existence:  5 Extension: 2</option>
 93 |       <option value = "4 2">Existence: 4 Extension: 2</option>
 94 |       <option value = "2 2">Existence:  2 Extension: 2</option>
 95 |     </select>
 96 |   </p>
 97 |   <p>
 98 |     <label for="compbasedstat">Compositional adjustments</label>
 99 |     <select name="COMPOSITION_BASED_STATISTICS" id="compbasedstat" defVal="">
100 |       <option value="%(COMPOSITION_BASED_STATISTICS)" selected="selected">%(COMPOSITION_BASED_STATISTICS)</option>
101 |       <option disabled="disabled">-----</option>
102 |       <option value="0">No adjustment</option>
103 |       <option value="1">Composition-based statistics</option>
104 |       <option value="2">Conditional compositional score matrix adjustment</option>
105 |       <option value="3">Universal compositional score matrix adjustment</option>
106 |     </select>
107 |   </p>
108 | 
109 | </fieldset>
110 | 
111 | <button type="submit" class="blastbutton" name="runBlast" value="BLAST">
112 |   <p>BLAST</p>
113 | </button>
114 | 


--------------------------------------------------------------------------------
/fastx-fetch.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  7 | 
  8 | my $idFileName = "";
  9 | my $quiet = 0;
 10 | my $minLength = 1;
 11 | my $maxLength = 10 ** 12; # 1 Tbp
 12 | my $count = -1;
 13 | my $invert = 0; # invert logic
 14 | my $trim = 0;
 15 | my $trimString = "";
 16 | 
 17 | GetOptions("idfile=s" => \$idFileName, "quiet!" => \$quiet,
 18 |            "reverse|v!" => \$invert, "trim=i" => \$trim,
 19 |            "minLength=i" => \$minLength, "maxLength=i" => \$maxLength,
 20 |            "count=i" => \$count, "nametrim=s" => \$trimString, ) or
 21 |   die("Error in command line arguments");
 22 | 
 23 | my %idsToGet = ();
 24 | if($trim){
 25 |   $minLength = $minLength + $trim * 2;
 26 |   $maxLength = $maxLength + $trim * 2;
 27 | }
 28 | 
 29 | if($trimString){
 30 |   printf(STDERR "Will remove text matching '(%s)' from the sequence IDs",
 31 |          $trimString);
 32 |   $trimString =~ s/^\|//;
 33 |   $trimString = "($trimString)";
 34 | }
 35 | 
 36 | # unknown commands are treated as identifiers
 37 | my @files = ();
 38 | while(@ARGV){
 39 |   my $arg = shift(@ARGV);
 40 |   if(-e $arg){
 41 |     push(@files, $arg);
 42 |   } else {
 43 |     $idsToGet{$arg} = 1;
 44 |   }
 45 | }
 46 | @ARGV = @files;
 47 | 
 48 | # use stdin if no files supplied
 49 | if(!@ARGV){
 50 |   @ARGV = '-' unless (-t STDIN);
 51 | }
 52 | 
 53 | if($idFileName){
 54 |   # read sequence IDs from input file
 55 |   if(!$quiet){
 56 |     printf(STDERR "Attempting to read from input file ($idFileName)\n");
 57 |   }
 58 |   my $idFile = new IO::Uncompress::Gunzip "$idFileName" or
 59 |     die "Unable to open $idFileName\n";
 60 |   while(<$idFile>){
 61 |     chomp;
 62 |     s/^[>@]//;
 63 |     s/\s.*$//;
 64 |     $idsToGet{$_} = 1;
 65 |   }
 66 |   close($idFile);
 67 | }
 68 | 
 69 | if(!$quiet){
 70 |   printf(STDERR "Read %d identifiers\n", scalar(keys(%idsToGet)));
 71 | }
 72 | 
 73 | if(!$quiet && $invert){
 74 |   printf(STDERR "Excluding IDs, rather than selecting\n");
 75 | } else {
 76 |   ## Stop when all IDs have been seen
 77 |   if(($count == -1) && (keys(%idsToGet))){
 78 |     $count = scalar(keys(%idsToGet));
 79 |   }
 80 | }
 81 | 
 82 | my $inQual = 0; # false
 83 | my $seqID = "";
 84 | my $qualID = "";
 85 | my $seq = "";
 86 | my $qual = "";
 87 | foreach my $file (@ARGV) {
 88 |   # This little gunzip dance makes sure the script can handle both
 89 |   # gzip-compressed and uncompressed input, regardless of whether
 90 |   # or not it is piped
 91 |   my $z = new IO::Uncompress::Gunzip($file, "transparent", 1)
 92 |     or die "gunzip failed: $GunzipError\n";
 93 |   while(<$z>){
 94 |     chomp;
 95 |     chomp;
 96 |     if (!$inQual) {
 97 |       if (/^(>|@)((.+?)( .*?\s*)?)$/) {
 98 |         my $newSeqID = $2;
 99 |         my $newShortID = $3;
100 |         my $testSeqID = $newSeqID;
101 |         my $testShortID = $newShortID;
102 |         if($trimString){
103 |           $testShortID =~ s/$trimString//;
104 |           $testSeqID =~ s/$trimString//;
105 |         }
106 |         if ($seqID && (length($seq) >= $minLength) && (length($seq) <= $maxLength)) {
107 |           if ($trim > 0) {
108 |             $seq = substr($seq, $trim, length($seq)-($trim * 2));
109 |             if ($qual) {
110 |               $qual = substr($qual, $trim, length($qual)-($trim * 2));
111 |             }
112 |           }
113 |           if ($qual) {
114 |             printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
115 |           } else {
116 |             $seq =~ s/(.{100})/$1\n/g;
117 |             $seq =~ s/\n$//;
118 |             printf(">%s\n%s\n", $seqID, $seq);
119 |           }
120 |           if (--$count == 0) {
121 |             $seqID = "";
122 |             last;
123 |           }
124 |         }
125 |         $seq = "";
126 |         $qual = "";
127 |         if ((!(keys(%idsToGet)) || exists($idsToGet{$testSeqID}) || exists($idsToGet{$testShortID})) xor $invert) {
128 |           delete $idsToGet{$testSeqID};
129 |           delete $idsToGet{$testShortID};
130 |           $seqID = $newSeqID;
131 |         } else {
132 |           $seqID = "";
133 |         }
134 |       } elsif (/^\+(.*)$/) {
135 |         $inQual = 1;            # true
136 |         $qualID = $1;
137 |       } else {
138 |         $seq .= $_;
139 |       }
140 |     } else {
141 |       $qual .= $_;
142 |       if (length($qual) >= length($seq)) {
143 |         $inQual = 0;            # false
144 |       }
145 |     }
146 |   }
147 | }
148 | 
149 | if($seqID && (length($seq) >= $minLength) && (length($seq) <= $maxLength)){
150 |   if($trim > 0){
151 |     $seq = substr($seq, $trim, length($seq)-($trim * 2));
152 |     if($qual){
153 |       $qual = substr($qual, $trim, length($qual)-($trim * 2));
154 |     }
155 |   }
156 |   if($qual){
157 |     printf("@%s\n%s\n+\n%s\n", $seqID, $seq, $qual);
158 |   } else {
159 |     $seq =~ s/(.{100})/$1\n/g;
160 |     $seq =~ s/\n$//;
161 |     printf(">%s\n%s\n", $seqID, $seq);
162 |   }
163 | }
164 | 


--------------------------------------------------------------------------------
/readthrough.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import HTSeq
  3 | import string
  4 | from Bio.Seq import Seq
  5 | from Bio.Alphabet import generic_dna
  6 | import re
  7 | import sys
  8 | 
  9 | def writeSeqTrans(name, exons, seq, interval):
 10 |     if(interval.strand == "-"):
 11 |         #print(">%s (rc)\n%s" % (name, seq))
 12 |         seq = string.replace(seq,"|","")
 13 |         translation = str(Seq(seq, generic_dna).reverse_complement().translate())
 14 |         endText = " from RC strand"
 15 |     else:
 16 |         #print(">%s\n%s" % (name, seq))
 17 |         seq = string.replace(seq,"|","")
 18 |         translation = str(Seq(seq, generic_dna).translate())
 19 |         endText = ""
 20 |     translation = string.replace(translation, "*", "X", 1)
 21 |     translation = re.sub("\\*.*$","*",translation)
 22 |     if(translation.endswith("X")):
 23 |         endText += ", no sequence beyond stop codon"
 24 |         translation = string.replace(translation, "X", "*", 1)
 25 |         throughDist = 0
 26 |     elif(not translation.endswith("*")):
 27 |         endText += ", no additional stop codons found"
 28 |         throughDist = 0
 29 |     else:
 30 |         throughDist = translation.find("*") - translation.find("X")
 31 |         endText += ", stop codon distance: %d" % throughDist
 32 |     name = "%05d_%s" % (throughDist, name)
 33 |     if(numExons == 1):
 34 |         print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation))
 35 |     else:
 36 |         print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation))
 37 | 
 38 | def writeSeqMrna(name, exons, seq, interval):
 39 |     if(interval.strand == "-"):
 40 |         #print(">%s (rc)\n%s" % (name, seq))
 41 |         seq = string.replace(seq,"|","")
 42 |         translation = str(Seq(seq, generic_dna).reverse_complement().translate())
 43 |         endText = " from RC strand"
 44 |     else:
 45 |         #print(">%s\n%s" % (name, seq))
 46 |         seq = string.replace(seq,"|","")
 47 |         translation = str(Seq(seq, generic_dna).translate())
 48 |         endText = ""
 49 |     translation = string.replace(translation, "*", "X", 1)
 50 |     translation = re.sub("\\*.*$","*",translation)
 51 |     if(translation.endswith("X")):
 52 |         endText += ", no sequence beyond stop codon"
 53 |         translation = string.replace(translation, "X", "*", 1)
 54 |         throughDist = 0
 55 |     elif(not translation.endswith("*")):
 56 |         endText += ", no additional stop codons found"
 57 |         throughDist = 0
 58 |     else:
 59 |         throughDist = translation.find("*") - translation.find("X")
 60 |         endText += ", stop codon distance: %d" % throughDist
 61 |     name = "%05d_%s" % (throughDist, name)
 62 |     if(numExons == 1):
 63 |         print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation))
 64 |     else:
 65 |         print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation))
 66 | 
 67 | 
 68 | # load Scer reference genome
 69 | scerFile = open('saccharomyces_cerevisiae.gff', 'r')
 70 | gffLines = ()
 71 | fastaLines = ()
 72 | hitFasta = False
 73 | for(line in scerFile):
 74 |     if(line.startswith('##FASTA')):
 75 |         hitFasta = True
 76 |     if(not hitFasta):
 77 |         gfflines.append(line)
 78 |     else if(not line.startswith('#')):
 79 |         fastaLines.append(line)
 80 | 
 81 | 
 82 | gtfFile = HTSeq.GFF_Reader(gffLines)
 83 | # load all sequences into memory
 84 | sequences = dict()
 85 | for s in HTSeq.FastaReader(fastaLines):
 86 |     s.seq = string.replace(s.seq,"\r","")
 87 |     sequences[s.name] = s
 88 | 
 89 | lastSequence = ""
 90 | lastName = ""
 91 | lastInterval = None
 92 | extend = 50 # number of base pairs to extend
 93 | readDistance = 5000
 94 | numExons = 0
 95 | numFeatures = 0
 96 | sys.stderr.write("\n")
 97 | for feature in gtfFile:
 98 |     sys.stderr.write("\rFeatures read: %d" % numFeatures)
 99 |     numFeatures += 1
100 |     if(feature.type != "CDS"):
101 |         continue
102 |     if(feature.name != lastName):
103 |         if((lastInterval != None) and (lastInterval.strand != "-")):
104 |             # add readDistance bases to end of sequence
105 |             lastSequence += sequences[lastInterval.chrom].seq[lastInterval.end:lastInterval.end+readDistance]
106 |         # write out sequence / translation
107 |         if(lastName != ""):
108 |             writeSeqTrans(lastName, numExons, lastSequence, lastInterval)
109 |         lastSequence = ""
110 |         numExons = 0
111 |         if(feature.iv.strand == "-"):
112 |             # add readDistance bases to start of sequence
113 |             startPos = lastInterval.start - readDistance
114 |             if(startPos < 0):
115 |                 startPos = 0
116 |             lastSequence += sequences[feature.iv.chrom].seq[startPos:lastInterval.end]
117 |         lastName = feature.name
118 |         lastInterval = feature.iv
119 |     # add current interval to sequence
120 |     lastSequence += sequences[feature.iv.chrom].seq[feature.iv.start:feature.iv.end]
121 |     lastSequence += "|"
122 |     numExons += 1
123 | 
124 | # write out sequence / translation
125 | writeSeqTrans(lastName, numExons, lastSequence, lastInterval)
126 | 
127 | sys.stderr.write("\n")
128 | 


--------------------------------------------------------------------------------
/spiralign.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/Rscript
  2 | 
  3 | setwd("~/bioinf/presentations/2017-Sep-03");
  4 | 
  5 | type <- "aa";
  6 | 
  7 | if(type == "nucl"){
  8 |     ## Nucleotides
  9 |     library(msa);
 10 |     input.seqs <- readDNAStringSet("notUSCO_EOG091H04CB.tran.fa");
 11 |     names(input.seqs) <- sub(" .*$","",names(input.seqs));
 12 |     msa.df <- data.frame(t(as.matrix(msa(input.seqs))), stringsAsFactors=FALSE);
 13 |     colnames(msa.df) <- 1:ncol(msa.df);
 14 | 
 15 |     efg.cols <- c("G" = "gold", "C" = "blue", "A" = "darkgreen",
 16 |                   "T" = "red", "-" = "grey20");
 17 | 
 18 |     png("msa.png", width=1024, height=1024, pointsize=24);
 19 |     par(mar=c(0.5,0.5,0.5,0.5), bg="black");
 20 |     loops <- 5;
 21 |     lstt <- 3;
 22 |     lend <- loops+lstt;
 23 |     ## integrate(2*pi*r,r=lstt..x)
 24 |     ## => pi(x²-(lstt)²)
 25 |     dTot <- pi*((lstt + loops)^2 - (lstt)^2); ## total "distance" travelled
 26 |     ## s = pi(x²-(lstt)²)
 27 |     ## => s/pi = x² - (lstt)²
 28 |     ## => x = sqrt((lstt)² + s/pi)
 29 |     msa.df$s <- seq(0,dTot, length.out=nrow(msa.df)); ## distance at each pos
 30 |     msa.df$r <- sqrt(lstt^2 + msa.df$s/pi); ## path radius at each pos
 31 |     msa.df$theta <- msa.df$r * 2*pi; ## traversed angle at each pos
 32 |     msa.df$deg <- (msa.df$theta / (2*pi)) * 360;
 33 |     msa.df$x <- msa.df$r * cos(msa.df$theta);
 34 |     msa.df$y <- msa.df$r * sin(msa.df$theta);
 35 |     plot(NA,xlim=c(-lend,lend), ylim=c(-lend,lend), ann=FALSE, axes=FALSE);
 36 |     pcex <- 0.45;
 37 |     for(p in seq(1,nrow(msa.df))){
 38 |         pr <- msa.df$r[p];
 39 |         pt <- msa.df$theta[p];
 40 |         text(x=-(pr-0.25)*cos(pt), y=(pr-0.25)*sin(pt), labels="▅",
 41 |              srt=-msa.df$deg[p],
 42 |              cex=pcex*0.9, col=efg.cols[msa.df[p,1]]);
 43 |         text(x=-pr*cos(pt), y=pr*sin(pt), labels="▅",
 44 |              srt=-msa.df$deg[p],
 45 |              cex=pcex, col=efg.cols[msa.df[p,2]]);
 46 |         text(x=-(pr+0.25)*cos(pt), y=(pr+0.25)*sin(pt), labels="▅",
 47 |              srt=-msa.df$deg[p],
 48 |              cex=pcex*1.05, col=efg.cols[msa.df[p,3]]);
 49 |     }
 50 |     text(0,0.5, expression(italic(Nippostrongylus)), col="white");
 51 |     text(0,0, expression(italic(brasiliensis)), col="white");
 52 |     text(0,-0.5, "Fructose-1,6-bisphosphatase", col="white", cex=0.75);
 53 |     invisible(dev.off());
 54 | }
 55 | 
 56 | ## Amino Acids
 57 | library(msa);
 58 | input.seqs <- readAAStringSet("notUSCO_EOG091H04CB.prot.fa");
 59 | names(input.seqs) <- sub(" .*$","",names(input.seqs));
 60 | msa.df <- data.frame(t(as.matrix(msa(input.seqs, order="input"))),
 61 |                      stringsAsFactors=FALSE);
 62 | msa.df <- msa.df[nrow(msa.df):1,]
 63 | 
 64 | rasmol.cols <- c("D" = "#E60A0A", "E" = "#E60A0A",
 65 |                  "C" = "#E6E600", "M" = "#E6E600",
 66 |                  "K" = "#145AFF", "R" = "#145AFF",
 67 |                  "S" = "#FA9600", "T" = "#FA9600",
 68 |                  "F" = "#3232AA", "Y" = "#3232AA",
 69 |                  "N" = "#00DCDC", "Q" = "#00DCDC",
 70 |                  "G" = "#EBEBEB",
 71 |                  "L" = "#0F820F", "V" = "#0F820F", "I" = "#0F820F",
 72 |                  "A" = "#C8C8C8",
 73 |                  "W" = "#B45AB4",
 74 |                  "H" = "#8282D2",
 75 |                  "P" = "#DC9682",
 76 |                  "-" = "grey20", "X" = "grey20");
 77 | rasmol.cats <- tapply(names(rasmol.cols),rasmol.cols,paste,collapse=",");
 78 | 
 79 | png("msa_aa.png", width=1024, height=1024, pointsize=24);
 80 | par(mar=c(0.5,0.5,0.5,0.5), bg="black");
 81 | loops <- 2.75;
 82 | lstt <- 3;
 83 | lend <- loops+lstt;
 84 | ## integrate(2*pi*r,r=lstt..x)
 85 | ## => pi(x²-(lstt)²)
 86 | dTot <- pi*((lstt + loops)^2 - (lstt)^2); ## total "distance" travelled
 87 | ## s = pi(x²-(lstt)²)
 88 | ## => s/pi = x² - (lstt)²
 89 | ## => x = sqrt((lstt)² + s/pi)
 90 | msa.df$s <- seq(0,dTot, length.out=nrow(msa.df)); ## distance at each pos
 91 | msa.df$r <- sqrt(lstt^2 + msa.df$s/pi); ## path radius at each pos
 92 | msa.df$theta <- msa.df$r * 2*pi; ## traversed angle at each pos
 93 | msa.df$deg <- (msa.df$theta / (2*pi)) * 360;
 94 | msa.df$x <- msa.df$r * cos(msa.df$theta);
 95 | msa.df$y <- msa.df$r * sin(msa.df$theta);
 96 | plot(NA,xlim=c(-lend,lend), ylim=c(-lend,lend), ann=FALSE, axes=FALSE);
 97 | pcex <- 0.8;
 98 | for(p in seq(1,nrow(msa.df))){
 99 |     pr <- msa.df$r[p];
100 |     pt <- msa.df$theta[p];
101 |     pym <- length(input.seqs);
102 |     pyr <- seq(-(pym-2)/(pym-1),(pym-2)/(pym-1), length.out=pym)/2;
103 |     for(py in 1:length(input.seqs)){
104 |         text(x=-(pr+pyr[py])*cos(pt), y=(pr+pyr[py])*sin(pt), labels="▅",
105 |              srt=-msa.df$deg[p],
106 |              cex=pcex, col=rasmol.cols[msa.df[p,py]]);
107 |     }
108 | }
109 | for(py in 1:length(input.seqs)){
110 |     text(x=-(max(msa.df$r)+pyr[py])*cos(max(msa.df$theta)),
111 |          y=(max(msa.df$r)+pyr[py])*sin(max(msa.df$theta))-0.02,
112 |          labels=names(input.seqs)[py],
113 |          srt=0, pos=2,
114 |          cex=0.6, col="white");
115 | }
116 | text(0,0.5, expression(italic(Nippostrongylus)), col="white");
117 | text(0,0, expression(italic(brasiliensis)), col="white");
118 | text(0,-0.5, "Fructose-1,6-bisphosphatase", col="white", cex=0.75);
119 | legend("topleft", fill=names(rasmol.cats), text.col="white",
120 |     legend=rasmol.cats, ncol=2, cex=0.8);
121 | invisible(dev.off());
122 | 


--------------------------------------------------------------------------------
/fastx-repeatFilter.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use warnings;
  4 | use strict;
  5 | 
  6 | use Pod::Usage; ## uses pod documentation in usage code
  7 | use Getopt::Long qw(:config auto_version auto_help);
  8 | use POSIX; ## for ceil
  9 | 
 10 | our $VERSION = "0.1";
 11 | our $DEBUG = 0;
 12 | 
 13 | =head1 NAME
 14 | 
 15 | fastx-repeatFilter.pl -- Repeat Match Algorithm for Long-read Evaluation
 16 | 
 17 | =head1 SYNOPSIS
 18 | 
 19 | ./fastx-repeatFilter.pl <reads.fq> [options]
 20 | 
 21 | =head2 Options
 22 | 
 23 | =over 2
 24 | 
 25 | =item B<-help>
 26 | 
 27 | Only display this help message
 28 | 
 29 | =item B<-lmin>
 30 | 
 31 | Minimum repeat length
 32 | 
 33 | =item B<-lmax>
 34 | 
 35 | Maximum repeat length
 36 | 
 37 | =item B<-length>
 38 | 
 39 | Repeat length; sets I<lmin> and I<lmax> to the same value
 40 | 
 41 | =item B<-skew>
 42 | 
 43 | Maximum skew between repeats for matching lines
 44 | 
 45 | =item B<-threshold>
 46 | 
 47 | Threshold score for identifying repeat reads
 48 | 
 49 | =item B<-fraction>
 50 | 
 51 | Fraction of read that must pass threshold score
 52 | 
 53 | =item B<-reverse>
 54 | 
 55 | Remove repeat-containing reads instead of extracting them
 56 | 
 57 | =back
 58 | 
 59 | =head1 DESCRIPTION
 60 | 
 61 | Identifies (filters) repeat-containing reads in a long-read dataset.
 62 | 
 63 | =head1 METHODS
 64 | 
 65 | something
 66 | 
 67 | =cut
 68 | 
 69 | =head2 processSeq(id, seq, qual, lmin, lmax, skew, threshold, fraction, reverse)
 70 | 
 71 | Analyses the sequence I<seq> to work out if it is likely to contain a
 72 | substantial proportion of repeats of length in the range I<lmin> to
 73 | I<lmax>. Depending on the value of I<reverse> and whether a repeat
 74 | read was detected, either return an empty string or the sequence.
 75 | 
 76 | =cut
 77 | 
 78 | sub processSeq{
 79 |   my ($id, $seq, $qual, $lmin, $lmax, $skew,
 80 |       $threshold, $fraction, $reverse) = @_;
 81 |   if(!$id || (length($seq) < $lmin)){
 82 |     if($reverse){
 83 |       if($qual){
 84 |         print("@${id}\n${seq}\n+\n${qual}\n");
 85 |       } else {
 86 |         print(">${id}\n${seq}\n");
 87 |       }
 88 |       return;
 89 |     }
 90 |   }
 91 |   foreach my $len ($lmin..$lmax){
 92 |     my @scores = ();
 93 |     for(my $spos = $skew; ($spos+2*$len+$skew+1) < length($seq); $spos += $len){
 94 |       my $maxScore = 0;
 95 |       foreach my $ofs (-$skew..$skew){
 96 |         my $score=0;
 97 |         foreach my $c (0..($len-1)){
 98 |           if(substr($seq,$c+$spos,1) eq substr($seq,$spos+$c+$len+$ofs,1)){
 99 |             $score++;
100 |           }
101 |         }
102 |         if($score > $maxScore){
103 |           $maxScore = $score;
104 |         }
105 |         # printf("---\n%s\n%s\n--- [%d+%d, %0.2f]\n",
106 |         #        substr($seq,$spos,$len),
107 |         #        substr($seq,$spos+$len+$ofs,$len),
108 |         #        $spos, $ofs, $score);
109 |       }
110 |       push(@scores, $maxScore);
111 |       #    printf("%3d %0.2f\n", $spos, $maxScore / $len);
112 |     }
113 |     @scores = (sort {$b <=> $a} (@scores))[0..($#scores * $fraction)];
114 |     print("${len},$scores[0],$scores[$#scores]\n");
115 |   }
116 |   #printf("Length %d, min score with fraction %0.2f: %0.2f\n",
117 |   #      $len, $fraction, $scores[$#scores]/$len);
118 | }
119 | 
120 | 
121 | ####################################################
122 | # Command line parsing and verification starts here
123 | ####################################################
124 | 
125 | my $argLine = join(" ",@ARGV);
126 | 
127 | my $options =
128 |   {
129 |    "length" => 100,
130 |    "lmin" => -1,
131 |    "lmax" => -1,
132 |    "skew" => 3,
133 |    "threshold" => 0.8,
134 |    "fraction" => 0.5,
135 |    "reverse" => 0
136 | };
137 | 
138 | GetOptions($options,
139 |            'length|l=i',
140 |            'lmin=i',
141 |            'lmax=i',
142 |            'skew=i',
143 |            'threshold=f',
144 |            'fraction=i',
145 |            'reverse|v!',
146 |            'debug!' => \$DEBUG,
147 | ) or pod2usage(1);
148 | 
149 | if($options->{"skew"} >= $options->{"length"}){
150 |   die("Skew must be less than repeat length");
151 | }
152 | 
153 | if(($options->{lmin} == -1) || ($options->{lmax} == -1)){
154 |   $options->{lmin} = $options->{length};
155 |   $options->{lmax} = $options->{length};
156 | }
157 | 
158 | my $inQual = 0; # false
159 | my $seqID = "";
160 | my $qualID = "";
161 | my $seq = "";
162 | my $qual = "";
163 | while(<>){
164 |   chomp;
165 |   chomp;
166 |   if(!$inQual){
167 |     if(/^(>|@)((.+?)( .*?\s*)?)$/){
168 |       my $newSeqID = $2;
169 |       my $newShortID = $3;
170 |       if($seqID){
171 |         processSeq($seqID, $seq, $qual,
172 |                    $options->{"lmin"}, $options->{"lmax"},
173 |                    $options->{"skew"},
174 |                    $options->{"threshold"}, $options->{"fraction"},
175 |                    $options->{"reverse"});
176 |       }
177 |       $seq = "";
178 |       $qual = "";
179 |       $seqID = $newSeqID;
180 |     } elsif(/^\+(.*)$/) {
181 |       $inQual = 1; # true
182 |       $qualID = $1;
183 |     } else {
184 |       $seq .= $_;
185 |     }
186 |   } else {
187 |     $qual .= $_;
188 |     if(length($qual) >= length($seq)){
189 |       $inQual = 0; # false
190 |     }
191 |   }
192 | }
193 | 
194 | if($seqID){
195 |   processSeq($seqID, $seq, $qual,
196 |              $options->{"lmin"}, $options->{"lmax"}, $options->{"skew"},
197 |              $options->{"threshold"}, $options->{"fraction"},
198 |              $options->{"reverse"});
199 | }
200 | 


--------------------------------------------------------------------------------
/fastx-interleave.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use warnings;
  3 | use strict;
  4 | 
  5 | use Getopt::Long qw(:config auto_help pass_through);
  6 | use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
  7 | 
  8 | my $singleLine = 0;
  9 | my $minLength = 0;
 10 | my $maxCount = 0; # use reservoir sampling to randomise reads
 11 | 
 12 | my %seqFiles = ();
 13 | my @seqFileOrder = ();
 14 | 
 15 | GetOptions("singleLine!" => \$singleLine, "minLength=i" => \$minLength,
 16 |            "count=i" => \$maxCount) or
 17 |   die("Error in command line arguments");
 18 | 
 19 | while(@ARGV){
 20 |   my $argument = shift(@ARGV);
 21 |   if(-e $argument){ # file existence check
 22 |     $seqFiles{$argument} = 1;
 23 |     push(@seqFileOrder, $argument);
 24 |   } else {
 25 |     die("Error in command line arguments: $argument");
 26 |   }
 27 | }
 28 | 
 29 | foreach my $seqFileName (keys(%seqFiles)){
 30 |   my $tmpFile = new IO::Uncompress::Gunzip "$seqFileName" or
 31 |     die "Unable to open $seqFileName\n";
 32 |   $seqFiles{$seqFileName} = $tmpFile;
 33 | }
 34 | 
 35 | my $numFiles = scalar(@seqFileOrder);
 36 | 
 37 | if($numFiles < 2){
 38 |   die("Too few input files");
 39 | } else {
 40 |   printf(STDERR "Interleaving %d input files\n", $numFiles);
 41 | }
 42 | 
 43 | if($minLength){
 44 |   printf(STDERR "Only outputting read groups for reads of at least %d bases\n", $minLength);
 45 | }
 46 | 
 47 | if($maxCount){
 48 |   printf(STDERR "Reservoir sampling to output at most %d read groups:",
 49 |         $maxCount);
 50 | }
 51 | 
 52 | my @inQual = (0) x $numFiles; # false
 53 | my @seqID = ("") x $numFiles;
 54 | my @qualID = ("") x $numFiles;
 55 | my @seq = ("") x $numFiles;
 56 | my @qual = ("") x $numFiles;
 57 | my @printable = ("") x $numFiles;
 58 | my $lineCount = 0;
 59 | my $inFile = 0;
 60 | my $recordsRead = 0;
 61 | my $dotsPrinted = 0;
 62 | 
 63 | my @printReservoir = ();
 64 | 
 65 | for(my @lines = map {
 66 |   $inFile = $seqFiles{$_}; my $res = <$inFile>; $res} @seqFileOrder;
 67 |     grep {$_} @lines; # stop if all input is invalid
 68 |     @lines = map {
 69 |       $inFile = $seqFiles{$_}; my $res = <$inFile>; $res} @seqFileOrder){
 70 |   $lineCount++;
 71 |   for(my $i = 0; $i < $numFiles; $i++){
 72 |     if(!$lines[$i]){
 73 |       next;
 74 |     }
 75 |     my $line = $lines[$i];
 76 |     chomp $line; chomp $line;
 77 |     #printf(STDERR "Line $lineCount,$i: $line\n");
 78 |     if ($line =~ /^\s+$/) {
 79 |       next;
 80 |     }
 81 |     if (!$inQual[$i]) {
 82 |       if($line =~ /^(>|@)(.*)$/){
 83 |         my $newSeqID = $2;
 84 |         if($seqID[$i]){
 85 |           if($printable[$i]){
 86 |             print(STDERR "Warning: double print for file $i\n");
 87 |           }
 88 |           if($qual[$i]){
 89 |             $printable[$i] .=
 90 |               sprintf("@%s\n%s\n+\n%s\n", $seqID[$i], $seq[$i], $qual[$i]);
 91 |           } else {
 92 |             $printable[$i] .=
 93 |               sprintf(">%s\n%s\n", $seqID[$i], $seq[$i]);
 94 |           }
 95 |           if(length($seq[$i]) < $minLength){
 96 |             $printable[$i] = "#";
 97 |           }
 98 |         }
 99 |         $qual[$i] = "";
100 |         $seq[$i] = "";
101 | #        printf(STDERR "Line $lineCount,$i: ".
102 | #               "setting SeqID for $i to $newSeqID\n");
103 |         $seqID[$i] = $newSeqID;
104 |       } elsif ($line =~ /^\+(.*)$/) {
105 |         if(!$seqID[$i]){
106 |           die("[QID ] no sequence ID for $i on line $lineCount\nline: ".$line);
107 |         }
108 |         $inQual[$i] = 1;            # true
109 |         $qualID[$i] = $1;
110 |         $qual[$i] = "";
111 |       } else {
112 |         if(!$seqID[$i]){
113 |           die("[SEQ ] no sequence ID for $i on line $lineCount\nline: ".$line);
114 |         }
115 |         $seq[$i] .= $line;
116 |       }
117 |     } else {
118 |       if(!$seqID[$i]){
119 |         die("[QUAL] no sequence ID for $i on line $lineCount\nline: ".$line);
120 |       }
121 |       $qual[$i] .= $line;
122 |       if (length($qual[$i]) >= length($seq[$i])) {
123 |         $inQual[$i] = 0;            # false
124 |       }
125 |     }
126 |   } # end loop over lines from files
127 |   if(scalar(grep {$_} @printable) == $numFiles){ # print if all can be printed
128 |     if(grep {/^#/} @printable){
129 |       ## reset / clear all if any are non-printable
130 |       for(my $i = 0; $i < $numFiles; $i++){
131 |         $printable[$i] = "";
132 |       }
133 |     } else {
134 |       if($maxCount && ($recordsRead % 10000 == 0)){
135 |         if($dotsPrinted % 50 == 0){
136 |           if($recordsRead > 1000){
137 |             printf(STDERR " (%d read groups processed)", $recordsRead);
138 |           }
139 |           printf(STDERR "\n  ");
140 |         }
141 |         print(STDERR ".");
142 |         $dotsPrinted++;
143 |       }
144 |       $recordsRead++;
145 |       my $linesToAdd = "";
146 |       for(my $i = 0; $i < $numFiles; $i++){
147 |         $linesToAdd .= $printable[$i];
148 |         $printable[$i] = "";
149 |       }
150 |       if(!$maxCount){
151 |         print($linesToAdd);
152 |       } elsif($maxCount >= $recordsRead){
153 |         push(@printReservoir, $linesToAdd);
154 |       } else {
155 |         my $swapPos = rand($recordsRead);
156 |         if($swapPos < $maxCount){
157 |           $printReservoir[$swapPos] = $linesToAdd;
158 |         }
159 |       }
160 |     }
161 |   }
162 | } # end loop over files
163 | 
164 | foreach my $seqFileName (keys(%seqFiles)){
165 |   close($seqFiles{$seqFileName});
166 | }
167 | 
168 | if($maxCount){
169 |   printf(STDERR "\ndone (%d read groups processed)\n", $recordsRead);
170 |   print(join("",@printReservoir));
171 | }
172 | 


--------------------------------------------------------------------------------