├── .github └── workflows │ └── testing.yml ├── .gitignore ├── LICENSE ├── README.md ├── lib ├── Statistics │ ├── Descriptive.pm │ └── Descriptive │ │ ├── Smoother.pm │ │ ├── Smoother │ │ ├── Exponential.pm │ │ └── Weightedexponential.pm │ │ └── Weighted.pm └── Vcf.pm ├── misc ├── EmpMiSeq250R1.txt └── EmpMiSeq250R2.txt ├── qsub ├── array │ ├── launch_FastAniAllVsAll.sh │ ├── launch_TTR.sh │ ├── launch_art.sh │ ├── launch_downsampling.sh │ ├── launch_etoki_mlst_array.sh │ ├── launch_fastq-dump_split.sh │ ├── launch_genotyphi.sh │ ├── launch_gzip.sh │ ├── launch_ksnp.sh │ ├── launch_mash_fasta.sh │ ├── launch_pymlst.sh │ ├── launch_realphy.sh │ ├── launch_set_alreadyShuffled.sh │ ├── launch_set_qsubarray.sh │ ├── launch_shovill_array.sh │ ├── launch_shuffleReads.pl │ ├── launch_skesa.sh │ ├── launch_skesa_sra.sh │ ├── launch_snp-pipeline.sh │ ├── launch_spades_split.sh │ ├── launch_wgmlst.sh │ ├── lyvesetVsLyveset.pl │ ├── lyvesetVsSimulations.pl │ ├── makeConfigs.pl │ └── snppipelineVsSimulations.pl ├── launch_SRST2.sh ├── launch_annotation.sh ├── launch_baym.sh ├── launch_chewbbaca.simple.sh ├── launch_circlator.sh ├── launch_colorid_mlst.sh ├── launch_downloadSrr.sh ├── launch_etoki_mlst.sh ├── launch_fastqToFasta.sh ├── launch_freyja.sh ├── launch_kraken.sh ├── launch_kraken2.sh ├── launch_kraken_contigs.sh ├── launch_mergeFastaReads.sh ├── launch_minion_guppy_wtdbg2_nanopolish.sh ├── launch_minion_wtdbg2.sh ├── launch_parsnp.sh ├── launch_polish.pl ├── launch_predict.sh ├── launch_prokka.sh ├── launch_shovill.sh ├── launch_skesa.sh ├── launch_spades.sh ├── launch_spades_SE.sh ├── launch_spades_iontorrent.sh ├── launch_spades_split.sh ├── launch_trimClean.sh ├── launch_velvet.sh ├── modules.csh ├── modules.sh └── sub_unicycler.sh ├── scripts ├── Kendall.R ├── Kendall.pl ├── Kuhner-Felsenstein.sh ├── MCM.sh ├── addCutSites.pl ├── alignmentToPhyloviz.pl ├── alignment_stats.pl ├── allelesDifference.pl ├── anagramChecker.pl ├── art_profile.pl ├── avgstdev.pl ├── 
bamStats.pl ├── blastAndExtract.pl ├── bp_jackknifeTrees.pl ├── clusterDensityFromFastq.pl ├── colorid.mlst.pl ├── comparePredictions.pl ├── constraintTree.pl ├── convertAlignment.pl ├── countATCG.pl ├── detectAdapters.pl ├── directoryDuration.pl ├── distance.bn.pl ├── distance.chewbbaca.pl ├── distance.coloridmlst.pl ├── distance.etoki.pl ├── distance.wgmlst.pl ├── downloadReadsFromBioproject.pl ├── downloadSra.pl ├── downloadSrrRemotely.sh ├── exportBioNumericsFastaWithCoverage.pl ├── extractSequence.pl ├── fastacmd.pl ├── fastqDump-SE.sh ├── fastqDump.sh ├── fastqMaxCompression.sh ├── fastqToFastaQual.pl ├── filterContigs.pl ├── filterKrakenOutput.pl ├── findpids.sh ├── fixKsnpVcf.pl ├── fixProkkaHeader.pl ├── flattenTree.pl ├── formatFastaForKraken.pl ├── genomeDist.pl ├── getGenesFromGb.pl ├── kaptivate_wrapper.pl ├── kraken-report-contamination.pl ├── kraken2-translate.pl ├── ksnpsToVcf.pl ├── lasergeneToFna.pl ├── lyve_splitgbk.pl ├── mashesToAlignment.pl ├── matrix.etoki.pl ├── md5sumDir.pl ├── md5sumDir.sh ├── mlstToTree.pl ├── mummerToVcf.pl ├── mvSymlink.pl ├── normalizeDepth.pl ├── pairwiseDistances.mlst.pl ├── parseMultiblast.partialAnswer.pl ├── pfgeOnGenome.pl ├── phylipDistToTallSkinny.pl ├── phylogeneticOrder.pl ├── pruneSafely.pl ├── pwdLinux.sh ├── pwdWindows.sh ├── qsubStats.sh ├── randFastq.pl ├── randTrees.pl ├── readLingual.pl ├── remoteMlst.pl ├── renameTreeNodes.pl ├── replaceReadsWithReference.pl ├── representativeTaxa.pl ├── rerootTree.pl ├── rootTheSameWay.pl ├── rowMath.pl ├── sangerPrimers.pl ├── sortFastq.pl ├── sortFastqByCommonKmers.pl ├── sortFastq_lowDisk.pl ├── splitBionumericsFasta.pl ├── splitPolytomies.pl ├── srr_to_tsv.pl ├── subtractContigs.pl ├── tanglegram.pl ├── tanglegram_ape.R ├── tanglegram_code.R ├── tbl2gff3.pl ├── translate-kraken-contigs.pl ├── treeDistance.pl ├── treeDistanceMatrix.pl ├── treeInfo.pl ├── treedist_wrapper.pl ├── ttrToMiSeq.sh ├── validateFastq.pl └── validateTaxonomy.pl ├── tests ├── all.sh └── 
unittests │ ├── Kendall.pl.bats │ ├── avgstdev.pl.bats │ ├── input │ ├── 2011C-3609.fasta │ ├── CFSAN023463.fasta │ ├── NC001416.fasta │ ├── NC_045512.fasta │ ├── SRR27366697.10x.fastq.gz │ ├── kendall-colijn1.dnd │ ├── kendall-colijn2.dnd │ └── kraken │ │ ├── FA1090 │ │ ├── kraken.report │ │ └── kraken.taxonomy │ │ └── contaminated │ │ ├── kraken.filtered.report │ │ ├── kraken.report │ │ └── kraken.taxonomy │ └── randTrees.pl.bats └── unfinishedScripts ├── gibbs.bak.pl └── gibbs.pl /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | dev 7 | 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | os: ['ubuntu-20.04' ] 15 | perl: [ '5.36.0' ] 16 | defaults: 17 | run: 18 | shell: bash -el {0} 19 | name: ${{ matrix.os }} perl ${{ matrix.perl }} 20 | steps: 21 | - name: Get Date 22 | id: get-date 23 | run: | 24 | today=$(/bin/date -u '+%Y%m%d') 25 | echo $today 26 | echo "today=$today" >> $GITHUB_OUTPUT 27 | - name: set up conda 28 | uses: conda-incubator/setup-miniconda@v2 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | use-mamba: true 32 | miniforge-variant: Mambaforge 33 | miniforge-version: latest 34 | channel-priority: strict 35 | channels: conda-forge,bioconda,defaults 36 | mamba-version: "*" 37 | auto-activate-base: true 38 | activate-environment: "~/conda_pkgs_dir/my-env" 39 | use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
40 | - name: conda info 41 | run: | 42 | conda info 43 | echo 44 | conda list 45 | echo 46 | conda config --show 47 | - name: Cache Conda env 48 | id: cache-conda 49 | uses: actions/cache@v3 50 | with: 51 | path: | 52 | ~/conda_pkgs_dir 53 | ~/.conda 54 | ~/.condarc 55 | #/usr/share/miniconda 56 | key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-perl_v${{ matrix.perl }}--${{env.CACHE_NUMBER}} 57 | env: 58 | CACHE_NUMBER: 2 59 | - name: conda installations 60 | shell: bash -el {0} 61 | if: steps.cache-conda.outputs.cache-hit != 'true' 62 | run: | 63 | mamba install -y perl-app-cpanminus perl-bioperl perl-statistics-descriptive bats-core 64 | - name: check installation 65 | shell: bash -el {0} 66 | run: | 67 | which perl 68 | perl -v 69 | which cpanm 70 | echo 71 | which python 72 | python -V 73 | echo 74 | - name: checkout my repo 75 | uses: actions/checkout@v3 76 | - name: apt-get install 77 | run: | 78 | sudo apt-get update 79 | sudo apt-get -y install ca-certificates 80 | cpanm --verbose 'Math::Gauss' 81 | - name: Run tests 82 | run: | 83 | bats tests/unittests/*.bats 84 | 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | blib/ 2 | .build/ 3 | _build/ 4 | cover_db/ 5 | inc/ 6 | Build 7 | !Build/ 8 | Build.bat 9 | .last_cover_stats 10 | Makefile 11 | Makefile.old 12 | MANIFEST.bak 13 | META.yml 14 | MYMETA.yml 15 | nytprof.out 16 | pm_to_blib 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Lee Katz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including 
without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /qsub/array/launch_FastAniAllVsAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/profile.d/modules.sh 4 | if [ $? 
-gt 0 ]; then 5 | echo "ERROR: cannot load the modules system"; 6 | exit 1; 7 | fi 8 | 9 | module purge 10 | 11 | if [ "$3" == "" ]; then 12 | echo "Usage: $0 out.tsv in1.fasta in2.fasta [in3.fasta...]" 13 | echo " Runs ANI on all vs all and places that output in out.tsv" 14 | exit 15 | fi 16 | 17 | OUT=$1 18 | shift 19 | 20 | if [ -e "$OUT" ]; then 21 | echo "ERROR: $OUT already exists" 22 | exit 1 23 | fi 24 | 25 | LOGDIR=$(mktemp --directory $(basename $0 .sh).XXXXXX) 26 | CTRL_FILE="$LOGDIR/array.txt" 27 | mkdir $LOGDIR/log 28 | mkdir $LOGDIR/out 29 | echo "log directory is $LOGDIR/log" 30 | 31 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 32 | 33 | qsub -q edlb.q -q all.q -N FastANIarray -o $LOGDIR/out -e $LOGDIR/log -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 34 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 35 | #!/bin/bash 36 | set -e 37 | 38 | QUERY=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 39 | 40 | echo "ANI for query $QUERY" >&2 41 | hostname >&2 42 | 43 | fastANI -q $QUERY --rl $CTRL_FILE -o /dev/stdout 44 | 45 | END_OF_SCRIPT 46 | 47 | qsub -q all.q -N combine_FastANI -o $LOGDIR -j y -pe smp 1 -V -cwd -hold_jid FastANIarray \ 48 | -v "LOGDIR=$LOGDIR" -v "OUT=$OUT" <<- "END_OF_SCRIPT" 49 | #!/bin/bash 50 | set -e 51 | 52 | sort -k1,2r $LOGDIR/out/*.o* | uniq > $OUT 53 | END_OF_SCRIPT 54 | -------------------------------------------------------------------------------- /qsub/array/launch_TTR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any TreeToReads projects in a cluster-friendly method 4 | # Author: Lee Katz 5 | # Usage: bash launch_TTR.sh project1 project2 [... projectX] 6 | # where each project has its own TTR.cfg file and associated TTR files. 7 | 8 | TMP=$(mktemp --tmpdir='.' 
--directory qsubTTR.XXXXXXXX) 9 | echo "tmp dir is $TMP " 10 | 11 | CTRL_FILE="$TMP/array.txt" 12 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 13 | 14 | mkdir -p $TMP/log 15 | qsub -q all.q -N TTR -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 16 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 17 | export PATH=$PATH:~/bin/TreeToReads:~/bin/ART 18 | module unload perl/5.16.1-MT 19 | export PERL5LIB="" 20 | 21 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 22 | echo "Working on $base_dir" 23 | scratch_out="/scratch/gzu2/TTR/$base_dir" 24 | rm -rfv $scratch_out $base_dir/out 25 | mkdir -p $(dirname $scratch_out) 26 | cd $base_dir 27 | sed -i.bak "s|output_dir.*|output_dir = $scratch_out|" TTR.cfg 28 | treetoreads.py TTR.cfg 29 | cd - 30 | mv -v $scratch_out $base_dir/out 31 | END_OF_SCRIPT 32 | 33 | -------------------------------------------------------------------------------- /qsub/array/launch_art.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Launches ART whole genome sequence simulator on a set 4 | # of fasta files 5 | set -e 6 | 7 | if [ "$4" == "" ]; then 8 | echo "Simulates reads from a set of fasta files." 9 | echo "Usage: $(basename $0) profileR1.txt profileR2.txt outdir/ file1.fasta file2.fasta..." 10 | exit 1 11 | fi 12 | 13 | profile1=$1; 14 | profile2=$2 15 | outdir=$3 16 | shift;shift;shift; 17 | 18 | # Check params 19 | if [ ! -e $profile1 ]; then 20 | echo "ERROR: could not find $profile1" 21 | exit 1; 22 | fi 23 | if [ ! -e $profile2 ]; then 24 | echo "ERROR: could not find $profile2" 25 | exit 1; 26 | fi 27 | if [ -e $outdir ]; then 28 | echo "ERROR: outdir already exists: $outdir" 29 | exit 1; 30 | fi 31 | mkdir $outdir 32 | 33 | export PATH=$PATH:~/bin/ART-MountRainier 34 | 35 | TMP=$(mktemp --tmpdir='.' 
--directory ART.XXXXXX) 36 | CTRL_FILE="$TMP/fasta.txt" 37 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 38 | 39 | mkdir -p $TMP/log 40 | qsub -q all.q -N ART -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 41 | -v "CTRL_FILE=$CTRL_FILE" -v "outdir=$outdir" -v "profile1=$profile1" -v "profile2=$profile2" <<- "END_OF_SCRIPT" 42 | #!/bin/bash 43 | 44 | set -e 45 | 46 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 47 | b=$(basename $fasta | sed 's/\.[^.]*$//') 48 | 49 | if [ -e "$outdir/$b.1.fq.gz" ]; then 50 | echo "Found $outdir/$b.1.fq.gz. Exiting."; 51 | exit 0; 52 | fi 53 | 54 | mkdir -p /dev/shm/$USER 55 | tmpdir=$(mktemp --tmpdir=/dev/shm/$USER --directory ART.XXXXXX); 56 | trap "{ rm -rf $tmpdir; }" EXIT 57 | 58 | prefix="$tmpdir/$b." # will generate fastq with correct basename followed by '.1.fq.gz' or '.2.fq.gz' 59 | 60 | art_illumina -1 $profile1 -2 $profile2 -na -p -i $fasta -l 250 -f 40 -m 480 -s 120 -o $prefix 61 | gzip -v9 $prefix* 62 | 63 | mv -v $prefix*.fq.gz $outdir 64 | 65 | END_OF_SCRIPT 66 | 67 | -------------------------------------------------------------------------------- /qsub/array/launch_downsampling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # downsamples a set of reads 4 | 5 | if [ "$4" == "" ]; then 6 | echo "Usage: $0 oneX minCov maxCov 1.fastq.gz [2.fastq.gz ...]" 7 | echo "Reads will be deposited in cov\$i to cov\$j directories" 8 | echo "oneX must equal to the size of the reference assembly"; 9 | echo "Can be fastq.gz or fastq files" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' 
--directory qsubDownsample.XXXXXXXX) 14 | mkdir -p $TMP/log 15 | echo "tmp dir is $TMP " 16 | 17 | # Read ARGV 18 | oneX=$1; 19 | shift; 20 | MIN=$1 21 | MAX=$2 22 | shift; shift; 23 | 24 | # CTRL file will have per line: 25 | # filename coverageLevel 26 | CTRL_FILE="$TMP/array.txt" 27 | (for cov in `seq $MIN $MAX`; do 28 | for j in "$@"; do 29 | echo "$j $cov"; 30 | done; 31 | done;) | grep . > $CTRL_FILE 32 | 33 | qsub -q all.q -N downsample -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 34 | -v oneX=$oneX -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 35 | #!/bin/bash 36 | 37 | set -e 38 | 39 | # The global temporary directory 40 | tmpdir=/scratch 41 | 42 | echo "Downsampling script will be $(which run_assembly_removeDuplicateReads.pl)" 43 | 44 | # What coverage? Directories? 45 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 46 | b=$(basename $fastq); 47 | cov=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $2}') 48 | localDir=cov$cov/reads 49 | scratchDir=$tmpdir/$USER/cov$cov 50 | mkdir -p $scratchDir $localDir 51 | 52 | if [ ! -d $scratchDir ]; then 53 | echo "ERROR: could not make $scratchDir"; 54 | exit 1; 55 | fi; 56 | 57 | # Final number of bp in the fastq file 58 | bp=$(($oneX * $cov)); 59 | 60 | echo "Reading $fastq to a coverage of $cov ($bp bp)"; 61 | 62 | tmpFastq=$scratchDir/$b; 63 | outFastq=$localDir/$b; 64 | run_assembly_removeDuplicateReads.pl $fastq --sizeto $bp --nobin | gzip -c > $tmpFastq 65 | if [ $? 
-gt 0 ]; then 66 | echo "ERROR: $b" >> $localDir/ERROR 67 | fi 68 | mv -v $tmpFastq $outFastq 69 | 70 | # Clean up temporary directory if it's empty 71 | rmdir $scratchDir 72 | 73 | END_OF_SCRIPT 74 | 75 | -------------------------------------------------------------------------------- /qsub/array/launch_etoki_mlst_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # Runs EToKi MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o EToKi.log 9 | #$ -j y 10 | #$ -N EToKi 11 | 12 | outdir=$1 13 | refs=$2 14 | db=$3 15 | shift;shift;shift 16 | asm=$@ 17 | 18 | #NSLOTS=${NSLOTS:=1} 19 | 20 | source /etc/profile.d/modules.sh 21 | scriptname=$(basename $0); 22 | 23 | 24 | if [ "$asm" == "" ]; then 25 | echo "Usage: $scriptname outdir refs.fasta etoki.csv *.fasta" 26 | exit 0; 27 | fi; 28 | 29 | set -e 30 | set -u 31 | 32 | which EToKi.py 33 | #EToKi.py configure 34 | 35 | tmpdir=$(mktemp --tmpdir=. 
--directory --suffix=.$(basename $0)); 36 | #trap ' { rm -rf $tmpdir; } ' EXIT 37 | mkdir -p $tmpdir/log 38 | mkdir -p $tmpdir/scratch 39 | echo "tmp dir is $tmpdir" 40 | 41 | CTRL_FILE="$tmpdir/array.txt" 42 | for i in $asm; do 43 | if [ -e "$outdir/$(basename $i)" ]; then 44 | continue; 45 | fi 46 | echo $i 47 | done > $CTRL_FILE 48 | 49 | echo "CTRL_FILE is $CTRL_FILE" 50 | 51 | if [ -d "$outdir" ]; then 52 | echo "WARNING: outdir already exists: $outdir" 53 | echo " pausing 2 seconds in case you want to cancel."; 54 | sleep 2; 55 | fi 56 | mkdir -pv $outdir 57 | 58 | qsub -N EToKi -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 59 | -v "outdir=$outdir" -v "refs=$refs" -v "db=$db" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 60 | #!/bin/bash -l 61 | 62 | set -eu 63 | hostname 64 | 65 | which EToKi.py 66 | EToKi.py configure || true 67 | echo 68 | 69 | tmpdir=$(mktemp --directory $(basename $0).TASK_ID$SGE_TASK_ID.XXXXXX --tmpdir=$TMPDIR) 70 | trap ' { rm -rf $tmpdir; } ' EXIT 71 | 72 | asm=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 73 | samplename=$(basename $asm .fasta) 74 | b=$(basename $asm) 75 | echo "Sample name will be $samplename" 76 | 77 | # Speed this up by working on scratch 78 | cp -v $asm $tmpdir/asm.fasta 79 | cp -v $db $tmpdir/db.csv 80 | cp -v $refs $tmpdir/refs.fasta 81 | 82 | set -x 83 | EToKi.py MLSType -i $tmpdir/asm.fasta -r $tmpdir/refs.fasta -k $samplename -d $tmpdir/db.csv -o $tmpdir/out.fasta 84 | mv -v $tmpdir/out.fasta $outdir/$b 85 | 86 | END_OF_SCRIPT 87 | 88 | -------------------------------------------------------------------------------- /qsub/array/launch_fastq-dump_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Modified from a script by Taylor Griswold 4 | 5 | # Read ARGV 6 | OUTDIR=$1 7 | DOWNLOAD_LIST=$2 8 | SLOTS_PER_JOB=1 # manually change this as needed 9 | 10 | if [ "$DOWNLOAD_LIST" == "" ]; then 11 | scriptName=$(basename $0) 12 | 
echo "Executes the sratoolkit module into an array batch job." 13 | echo "Text file has white-space delimited SRA run IDs" 14 | echo "Usage: $scriptName outdir run_ids.txt" 15 | exit 1; 16 | fi 17 | 18 | if [ -f $OUTDIR ]; then 19 | echo "ERROR: $OUTDIR is not a directory"; 20 | exit 1; 21 | fi 22 | mkdir -pv $OUTDIR 23 | 24 | if [ ! -e "$DOWNLOAD_LIST" ]; then 25 | echo "ERROR: $DOWNLOAD_LIST could not be found"; 26 | exit 1; 27 | fi 28 | 29 | TMP=$(mktemp --tmpdir='.' --directory qsubFastqDump.XXXXXXXX) 30 | mkdir -p $TMP/log 31 | echo "tmp dir is $TMP " 32 | 33 | # CTRL file will have one SRA run ID per line 34 | CTRL_FILE="$TMP/array.txt" 35 | cat $DOWNLOAD_LIST | perl -lane ' 36 | for my $sra(@F){ 37 | print $sra; 38 | } 39 | ' > $CTRL_FILE 40 | echo "CTRL_FILE is $CTRL_FILE" 41 | 42 | 43 | qsub -N FastqDump -q edlb.q -o $TMP/log -j y -pe smp $SLOTS_PER_JOB -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 44 | -v "OUTDIR=$OUTDIR" -v "CTRL_FILE=$CTRL_FILE","DOWNLOAD_LIST=$DOWNLOAD_LIST" <<- "END_OF_SCRIPT" 45 | #!/bin/bash -l 46 | set -e 47 | source /etc/profile.d/modules.sh 48 | module purge 49 | module load sratoolkit/2.9.1 50 | 51 | which fastq-dump 52 | 53 | # Set up filenames 54 | SRRID=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 55 | READ1=$(echo "$SRRID"_1.fastq.gz) 56 | READ2=$(echo "$SRRID"_2.fastq.gz) 57 | tmpdir=/scratch/$USER 58 | mkdir -p $tmpdir 59 | outdir=$(mktemp --tmpdir=$tmpdir --directory FastqDump.XXXXXX); 60 | trap "rm -rf $outdir" EXIT 61 | newdir="$OUTDIR/$SRRID.fastq-dump" 62 | 63 | if [ -e "$newdir" ]; then 64 | echo "ERROR: found pre-existing dir $newdir. 
Will not continue."; 65 | exit 1 66 | fi 67 | 68 | echo "fastq-dump will be run on $SRRID under $(hostname)" 69 | echo "Working directory is $outdir" 70 | echo -e "Final directory will be $newdir.\n"; 71 | 72 | fastq-dump --accession $SRRID --outdir $outdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip --gzip 73 | 74 | mv -v $outdir $newdir 75 | rm /scicomp/home/ycj5/ncbi/public/sra/"$SRRID".sra 76 | 77 | END_OF_SCRIPT 78 | 79 | -------------------------------------------------------------------------------- /qsub/array/launch_genotyphi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e; 4 | 5 | FASTQLIST="$1" 6 | ref="$2" 7 | 8 | ref=$(realpath $ref) 9 | echo "Testing the path to the reference genome" 10 | ls $ref 11 | 12 | TMP=$(mktemp --tmpdir='.' --directory $(basename $0 .sh).tmp.XXXXXX) 13 | echo "tmp dir is $TMP" 14 | mkdir $TMP/samples 15 | mkdir $TMP/log 16 | 17 | CTRL_FILE="$TMP/array.txt" 18 | grep '_1.*fastq.gz' $FASTQLIST > $CTRL_FILE 19 | 20 | qsub -N genotyphi -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 21 | -v "CTRL_FILE=$CTRL_FILE" -v "TMP=$TMP" -v "ref=$ref" <<- "END_OF_SCRIPT" 22 | #!/bin/bash -l 23 | set -e 24 | set -x 25 | 26 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 27 | R2=${R1/_1/_2}; 28 | out="out/$(basename $R1 .fastq.gz).tsv"; 29 | out=$(realpath $out) 30 | if [ -e "$out" ]; then 31 | #echo "FOUND $out"; 32 | exit 0 33 | fi 34 | 35 | 36 | set +x 37 | module load samtools/1.10 bcftools/1.10.2 bowtie2/2.3.5.1 38 | set -x 39 | 40 | export PATH=$PATH:$HOME/bin/genotyphi 41 | 42 | 43 | # I think that this script pollutes the working directory 44 | # but I might be wrong. Still though, just change 45 | # the working directory to something in TMP to avoid problems. 
46 | workingDir="$TMP/samples/$(basename $R1)" 47 | mkdir -pv $workingDir 48 | cd $workingDir 49 | bowtie2 -p $NSLOTS -x $ref -1 $R1 -2 $R2 | samtools view -bh - > unsorted.bam 50 | samtools sort -o sorted.bam unsorted.bam 51 | samtools index sorted.bam 52 | genotyphi.py --mode bam --bam sorted.bam --ref $ref --ref_id AL513382.1 --output $(basename $out) 53 | 54 | # just in case I needed to see files before something broke 55 | ls -lht 56 | 57 | mv *.tsv $out 58 | 59 | END_OF_SCRIPT 60 | 61 | exit 62 | 63 | #### original script 64 | 65 | grep '_1.*fastq.gz' ../fofn.txt |\ 66 | while read R1; do 67 | R2=${R1/_1/_2}; 68 | bam="bam/$(basename $R1 .fastq.gz).bam"; 69 | out="out/$(basename $R1 .fastq.gz).tsv"; 70 | if [ -e "$out" ]; then 71 | echo "FOUND $out"; 72 | continue; 73 | fi; 74 | 75 | bowtie2 -p 24 -x CT18.fasta -1 $R1 $R2 2> "$out.log" |\ 76 | samtools view -bh - > unsorted.bam; 77 | samtools sort -o $bam unsorted.bam; 78 | python genotyphi/genotyphi.py --mode bam --bam $bam --ref CT18.fasta --ref_id AL513382.1 --output $(basename $out) >> "$out.log" 2>&1; 79 | localout=$(\ls *$(basename "$out")); 80 | mv -v "$localout" "$out"; 81 | done 82 | 83 | -------------------------------------------------------------------------------- /qsub/array/launch_gzip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # taken from Seth's example: http://git.biotech.cdc.gov/snippets/3 3 | # This is meant as an example to help me learn job arrays 4 | 5 | set -e 6 | 7 | if [[ -z $1 ]]; then 8 | echo "Gzips fastq files with -9" 9 | echo "Usage: $0 *.fastq *.fastq.gz" 10 | exit 0 11 | fi 12 | 13 | set -u 14 | 15 | # Make temp folder that can hold log files and temporary files 16 | TMP=$(mktemp --tmpdir='.' --directory qsubtmp.XXXXXXXX) 17 | mkdir $TMP/log 18 | echo "tmp dir is $TMP " 19 | # Make a temp file that can hold an array of input files 20 | CTRL_FILE="$TMP/array.txt" 21 | echo $@ | tr ' ' '\n' | grep . 
> $CTRL_FILE 22 | 23 | # Start off the job array 24 | # -N is the job name 25 | # -o $TMP -j y puts all log files into the temporary directory 26 | # -V -cwd is to use the current environment in the current working directory 27 | # -t indicates an array, but it needs a min to max in the next parameter 28 | # 1-$(cat $CTRL_FILE | wc -l) translates to "1 to the number of gzip files" 29 | # -v "CTRL_FILE=$CTRL_FILE" creates a variable to use inside of the here document. 30 | qsub -N gzip-job -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 31 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 32 | # This is a "here document." It gets submitted as though it were a 33 | # separate file. The here document ends right before END_OF_SCRIPT 34 | file=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 35 | 36 | TMP=$(mktemp --directory --suffix _qsub) 37 | echo "TMP DIR is $HOSTNAME:$TMP" 38 | trap " { set -x; rm -rf $TMP; } " EXIT 39 | tmpIn=$TMP/$(basename $file) 40 | uncompressed=$TMP/$(basename $file .gz) 41 | cp -v $file $tmpIn 42 | 43 | if [[ "$tmpIn" =~ gz$ ]]; then 44 | gunzip -v $tmpIn && \ 45 | gzip -9v $uncompressed && \ 46 | mv -v $tmpIn $file 47 | else 48 | gzip -v9 $tmpIn && \ 49 | mv -v $tmpIn.gz $file.gz && \ 50 | rm -v $file 51 | fi 52 | END_OF_SCRIPT 53 | 54 | -------------------------------------------------------------------------------- /qsub/array/launch_ksnp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through KSNP in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 readsdir1 [readsdir2 ... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta cov1 [cov2...]" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' --directory qsubKsnp3.XXXXXXXX) 14 | echo "tmp dir is $TMP " 15 | 16 | REF=$1; shift; 17 | 18 | CTRL_FILE="$TMP/array.txt" 19 | echo "$@" | tr ' ' '\n' | grep . 
> $CTRL_FILE 20 | 21 | mkdir -p $TMP/log 22 | qsub -q all.q -N KSNP3 -o $TMP/log -j y -pe smp 1-2 -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 23 | -v "REF=$REF" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 24 | #!/bin/bash 25 | 26 | module load kSNP/3.0.0 27 | module load fastx-toolkit/0.0.13 28 | 29 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 30 | echo "Working on $base_dir, host " $(hostname) 31 | mkdir -p /scratch/$USER 32 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory ksnp3.XXXXXX) 33 | export scratch_out 34 | echo "Temporary directory is $scratch_out" 35 | if [ ! -e "$scratch_out" ]; then echo "ERROR: could make temporary directory $scratch_out"; exit 1; fi; 36 | 37 | # Find what reads we're using 38 | READS=$(find $base_dir $base_dir/reads -maxdepth 1 -name '*.fastq.gz'); 39 | echo -e "Found reads\n" $READS 40 | 41 | # Convert to fasta 42 | echo $READS | xargs -P $NSLOTS -n 1 sh -c ' 43 | sample=$(basename $0 .fastq.gz); 44 | sampleDir=$scratch_out/samples/$sample; 45 | mkdir -pv $sampleDir; 46 | 47 | run_assembly_trimClean.pl -i $0 -o $sampleDir/cleaned.fastq --bases_to_trim 50 --auto --nosingletons 48 | if [ $? -gt 0 ]; then echo "ERROR on trimClean on $0"; exit 1; fi; 49 | 50 | fastq_to_fasta -Q33 < $sampleDir/cleaned.fastq > $sampleDir/$sample.fasta 51 | if [ $? -gt 0 ]; then echo "ERROR converting $0 to fasta"; exit 1; fi; 52 | merge_fasta_reads3 $sampleDir/$sample.fasta > $sampleDir/merged.fasta 53 | if [ $? -gt 0 ]; then echo "ERROR merging $0"; exit 1; fi; 54 | 55 | # cleanup 56 | mv $sampleDir/merged.fasta $scratch_out/samples/$sample.fasta 57 | rm -rvf $sampleDir 58 | '; 59 | cp -v $REF $scratch_out/samples/reference.fasta 60 | 61 | # Switch to the tmp dir 62 | pushd $scratch_out 63 | 64 | # Find optimal kmer value 65 | MakeKSNP3infile samples in.txt A 66 | cat samples/*.fasta > kchooser.fasta 67 | KMERLENGTH=$(Kchooser kchooser.fasta | grep "The optimum value of K is" | grep -o "[0-9]\+") 68 | if [ $? 
-gt 0 ]; then echo "ERROR with Kchooser"; exit 1; fi; 69 | echo "The optimal kmer length is $KMERLENGTH"; 70 | rm -f kchooser.fasta 71 | 72 | grep reference in.txt > reference_in.txt 73 | 74 | kSNP3 -k $KMERLENGTH -annotate reference_in.txt -in in.txt -c 4 -core -ML -min_frac 0.75 -CPU $NSLOTS -NJ -vcf -outdir out 75 | if [ $? -gt 0 ]; then echo "ERROR with kSNP3"; exit 1; fi; 76 | 77 | # pop out of the tmp dir 78 | popd 79 | 80 | mv -v $scratch_out/out $base_dir/kSNP3 81 | rm -rvf $scratch_out 82 | END_OF_SCRIPT 83 | 84 | -------------------------------------------------------------------------------- /qsub/array/launch_mash_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs mash sketch on a set of fasta files 4 | 5 | 6 | # Read ARGV 7 | FASTA=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$FASTA" == "" ]; then 13 | echo "Mash sketch files in place" 14 | echo "Usage: $0 *.fasta" 15 | exit 1; 16 | fi 17 | 18 | TMP=$(mktemp --tmpdir='.' 
--directory qsubMash.XXXXXXXX) 19 | mkdir -p $TMP/log 20 | echo "tmp dir is $TMP " 21 | 22 | CTRL_FILE="$TMP/array.txt" 23 | echo "$FASTA" | tr ' ' '\n' > $CTRL_FILE 24 | echo "CTRL_FILE is $CTRL_FILE" 25 | 26 | source /etc/profile.d/modules.sh 27 | module purge 28 | module load Mash 29 | module load gcc/5.5 30 | mash --version # ensure it loaded 31 | 32 | qsub -q edlb.q -N mashSketch -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 33 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 34 | #!/bin/bash -l 35 | 36 | set -e 37 | set -u 38 | set -x 39 | 40 | hostname 41 | # Reload modules to ensure things like LD_LIBRARY_PATH are re-added 42 | source /etc/profile.d/modules.sh || true 43 | module purge 44 | module load Mash 45 | module load gcc/5.5 46 | 47 | mash --version # ensure it loaded 48 | 49 | # Set up filenames 50 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 51 | mkdir /scratch/$USER || true 52 | tmpdir=$(mktemp --tmpdir=/scratch/$USER --directory qsubMash.XXXXXXXX) 53 | trap ' { rm -rf $tmpdir; } ' EXIT 54 | 55 | tmpsketch="$tmpdir/out.msh" 56 | outsketch="$(dirname $fasta)/$(basename $fasta).msh" 57 | 58 | echo "mash will be run on $fasta => $tmpsketch => $outsketch" 59 | 60 | if [ -e "$outsketch" ]; then 61 | echo "$outsketch already exists. Exiting."; 62 | exit 1; 63 | fi 64 | 65 | mash sketch -o $tmpsketch $fasta 66 | 67 | mv -v $tmpsketch $outsketch 68 | END_OF_SCRIPT 69 | 70 | -------------------------------------------------------------------------------- /qsub/array/launch_pymlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs NCBI's wgmlst in an array for MLST calling 4 | 5 | OUTDIR=$1; shift; 6 | DB=$1; shift 7 | ASM=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$ASM" == "" ]; then 13 | echo "Run NCBI's wgmlst for wgMLST allele calling" 14 | echo "Usage: $0 outdir something.scheme *.fasta" 15 | exit 1; 16 | fi 17 | 18 | if [ ! 
-d "$OUTDIR" ]; then 19 | mkdir "$OUTDIR" 20 | fi; 21 | if [ ! -e "$DB" ]; then 22 | echo "ERROR: not found: $DB" 23 | exit 2; 24 | fi 25 | 26 | tmpdir=$(mktemp --tmpdir='.' --directory wgmlst.XXXXXXXX) 27 | mkdir -pv $tmpdir/log 28 | #trap " rm -rf $tmpdir " EXIT 29 | echo "tmp dir is $tmpdir " 30 | 31 | # CTRL file will have per line: 32 | # filename coverageLevel 33 | CTRL_FILE="$tmpdir/array.txt" 34 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 35 | echo "$ASM" | tr ' ' '\n' > $CTRL_FILE 36 | 37 | head $CTRL_FILE 38 | 39 | module purge 40 | 41 | # Check executables 42 | which wgmlst 43 | 44 | qsub -N ncbi_wgmlst -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "tmpdir=$tmpdir" -v "DB=$DB" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | set -u 49 | 50 | # Set up filenames 51 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 52 | name=$(basename $fasta .fasta) 53 | 54 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.wgmlst.XXXXXX); 55 | trap "rm -rf $sampledir" EXIT 56 | finalout="$OUTDIR/$name" 57 | 58 | if [ -e "$finalout/.done" ]; then 59 | echo "Found $finalout/.done; not repeating." 
60 | exit 0; 61 | fi 62 | 63 | mappings=$sampledir/mappings 64 | alleles=$sampledir/alleles 65 | stdout=$sampledir/wgmlst.out 66 | log=$sampledir/wgmlst.log 67 | 68 | ( 69 | date # mark how long this takes 70 | set -x 71 | wgmlst --genome $fasta --alleles $DB --cores $NSLOTS --kmer 15 --output_mappings $mappings --output_loci $alleles 72 | set +x 73 | date 74 | ) 1>$stdout 2>$log 75 | 76 | ls -lh $sampledir 77 | mv -v $sampledir $finalout 78 | touch $finalout/.done 79 | 80 | END_OF_SCRIPT 81 | 82 | -------------------------------------------------------------------------------- /qsub/array/launch_realphy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through KSNP in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 readsdir1 [readsdir2 ... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta cov1 [cov2...]" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' --directory qsubRealPhy.XXXXXXXX) 14 | echo "tmp dir is $TMP " 15 | 16 | export REF=$1; shift; 17 | CTRL_FILE="$TMP/array.txt" 18 | echo "$@" | tr ' ' '\n' | grep . 
> $CTRL_FILE
19 | 
20 | mkdir -p $TMP/log
21 | #qsub -q all.q -N RealPhy -o $TMP/log -j y -pe smp 12 -hard -l exclusive=true -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
22 | qsub -q all.q -N RealPhy -o $TMP/log -j y -pe smp 2 -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
23 | -v "REF=$REF" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT"
24 | #!/bin/bash
25 | 
26 | module load RealPhy/v112
27 | module load bowtie2/2.2.4
28 | module load phylip/3.69
29 | module load phyml/3.0
30 | module load tree-puzzle/5.3.rc16
31 | 
32 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE)
33 | echo "Working on $base_dir, host " $(hostname)
34 | mkdir -p /scratch/$USER
35 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory realphy.XXXXXX)
36 | export scratch_out
37 | echo "Temporary directory is $scratch_out"
38 | if [ ! -e "$scratch_out" ]; then echo "ERROR: could not make temporary directory $scratch_out"; exit 1; fi; # typo fix: "could make" -> "could not make"
39 | if [ -e $base_dir/RealPhy ]; then echo "Found $base_dir/RealPhy. Skipping."; exit 0; fi
40 | 
41 | # Find what reads we're using. Do not symlink
42 | # because realphy has a stupid problem with symlinks.
43 | # Instead, copy.
44 | READS=$(ls $base_dir/reads/*.fastq.gz);
45 | mkdir -p $scratch_out/samples
46 | cp -v $READS $scratch_out/samples/
47 | cp -v $REF $scratch_out/samples/reference.fasta
48 | 
49 | mkdir -p $scratch_out/out
50 | 
51 | # Make a config file. RealPhy is intolerant of
52 | # beginning indentation for each line and of a
53 | # beginning empty line.
54 | echo -e "BOWTIE2\t/apps/x86_64/bowtie2/bowtie2-2.2.4/bowtie2 55 | BOWTIE2BUILDER\t/apps/x86_64/bowtie2/bowtie2-2.2.4/bowtie2-build-l 56 | TREEPUZZLE\t/apps/x86_64/tree-puzzle/5.3.rc16/bin/puzzle 57 | RAXML\t/scicomp/groups/OID/NCEZID/DFWED/EDLB/share/bin/lyve-SET-v1.1.4/lib/standard-RAxML-8.1.16/raxmlHPC-PTHREADS 58 | Rscript\t/apps/x86_64/R/3.2.3/bin/Rscript 59 | MaxPars\t/apps/x86_64/phylip/phylip-3.69/exe/dnapars 60 | PhyML\t/apps/x86_64/phyml/PhyML_3.0/phyml 61 | " > $scratch_out/out/config.txt 62 | 63 | # Sanity check: look at the files and their permissions in 64 | # the config file. 65 | cut -f 2 $scratch_out/out/config.txt | xargs -I {} ls -lh {} 66 | 67 | echo "Config file contents:" 68 | cat $scratch_out/out/config.txt 69 | 70 | REALPHY_v112 $scratch_out/samples $scratch_out/out -readLength 250 -ref reference 71 | if [ $? -gt 0 ]; then exit 1; fi; 72 | 73 | ls -lh $scratch_out/reference/alignOut_NoGenes > $scratch_out/reference/alignOut_NoGenes.txt 74 | rm -rvf $scratch_out/reference/alignOut_NoGenes 75 | mv -v $scratch_out/out $base_dir/RealPhy 76 | rm -rvf $scratch_out 77 | END_OF_SCRIPT 78 | 79 | -------------------------------------------------------------------------------- /qsub/array/launch_set_alreadyShuffled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Lyve-SET in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref/ dir [dir2 ... ]" 10 | echo " The reference directory can just contain reference.fasta or can have other Lyve-SET reference directory files." 11 | echo " Each directory will be searched for shuffled reads files matching *.f*q.gz" 12 | exit 1; 13 | fi 14 | 15 | TMP=$(mktemp --tmpdir='.' 
--directory qsubLyveSET.XXXXXXXX) 16 | echo "tmp dir is $TMP " 17 | 18 | export PATH=/scicomp/groups/OID/NCEZID/DFWED/EDLB/share/bin/lyve-SET-v1.1.4e/scripts:$PATH 19 | echo -n "Lyve-SET is being launched from "; 20 | \which launch_set.pl 21 | 22 | REF=$1; shift; # get the reference genome and remove it from ARGV 23 | CTRL_FILE="$TMP/array.txt" 24 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE 25 | 26 | mkdir -p $TMP/log 27 | qsub -q all.q -N LyveSetShuffled -o $TMP/log -j y -pe smp 3-4 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 28 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT" 29 | #!/bin/bash 30 | 31 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 32 | echo "Working on $base_dir with reference file $REF" 33 | mkdir -p /scratch/$USER 34 | if [ -e "$base_dir/Lyve-SET" ]; then 35 | echo "Found $base_dir/Lyve-SET! Will not continue."; 36 | exit 0; 37 | fi; 38 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory Lyve-SET.XXXXXX) 39 | rm -rfv $scratch_out 40 | 41 | # Run Lyve-SET on the scratch drive without qsub inception. 42 | set_manage.pl --create $scratch_out 43 | rmdir $scratch_out/reference; 44 | cp -r $REF $scratch_out/reference 45 | ln -sv $(find $(realpath $base_dir)/reads -name '*.f*q.gz') $scratch_out/reads/ 46 | if [ $? -gt 0 ]; then exit 1; fi; 47 | launch_set.pl --noqsub --numcpus $NSLOTS --read_cleaner CGP --mask-phages --mask-cliffs $scratch_out 48 | if [ $? -gt 0 ]; then exit 1; fi; 49 | 50 | rm -rvf $scratch_out/{reads,bam,tmp}/* # no need to take up all this space 51 | mv -v $scratch_out $base_dir/Lyve-SET 52 | END_OF_SCRIPT 53 | 54 | -------------------------------------------------------------------------------- /qsub/array/launch_set_qsubarray.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Lyve-SET in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 
5 | # Author: Lee Katz 6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta dir [dir2 ... ]" 10 | echo " Each directory will be searched for split reads files matching *.f*q.gz" 11 | exit 1; 12 | fi 13 | 14 | TMP=$(mktemp --tmpdir='.' --directory qsubLyveSET.XXXXXXXX) 15 | echo "tmp dir is $TMP " 16 | 17 | REF=$1; shift; # get the reference genome and remove it from ARGV 18 | CTRL_FILE="$TMP/array.txt" 19 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE 20 | 21 | mkdir -p $TMP/log 22 | qsub -q all.q -N LyveSET -o $TMP/log -j y -pe smp 1-2 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 23 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT" 24 | #!/bin/bash 25 | 26 | set -e 27 | 28 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 29 | echo "Working on $base_dir" 30 | mkdir -p /scratch/$USER 31 | if [ -e "$base_dir/Lyve-SET" ]; then 32 | echo "Found $base_dir/Lyve-SET! Will not continue."; 33 | exit 0; 34 | fi; 35 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory Lyve-SET.XXXXXX) 36 | rm -rfv $scratch_out 37 | 38 | # Run Lyve-SET on the scratch drive without qsub inception. 39 | set_manage.pl --create $scratch_out 40 | cp $REF $scratch_out/reference/reference.fasta 41 | shuffleSplitReads.pl --numcpus $NSLOTS --outdir $scratch_out/reads $(find $base_dir -name '*.f*q.gz') --regex '(.+)(_[12]\.f.*)' 42 | if [ $? 
-gt 0 ]; then exit 1; fi; 43 | launch_set.pl --noqsub --numcpus $NSLOTS --read_cleaner CGP --mask-phages --mask-cliffs $scratch_out 44 | 45 | rm -rvf $scratch_out/{bam,reads,vcf,tmp}/* # no need to take up all this space 46 | mv -v $scratch_out $base_dir/Lyve-SET 47 | END_OF_SCRIPT 48 | 49 | -------------------------------------------------------------------------------- /qsub/array/launch_shovill_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs shovill on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all reads in a directory from Illumina runs" 14 | echo "All files must be in order for paired-end to work" 15 | echo "Usage: $0 outdir *.fastq.gz" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | tmpdir=$(mktemp --tmpdir='.' --directory qsubShovillSpades.XXXXXXXX) 24 | mkdir -pv $tmpdir/log 25 | #trap " rm -rf $tmpdir " EXIT 26 | echo "tmp dir is $tmpdir " 27 | 28 | # CTRL file will have per line: 29 | # filename coverageLevel 30 | CTRL_FILE="$tmpdir/array.txt" 31 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 32 | echo "$READS" | tr ' ' '\n' | paste - - > $CTRL_FILE 33 | #echo "DEBUG"; head $CTRL_FILE > $CTRL_FILE.tmp && mv $CTRL_FILE.tmp $CTRL_FILE 34 | #echo "CTRL_FILE is $CTRL_FILE" 35 | 36 | head -n 5 $CTRL_FILE 37 | echo "This is what the top of the CTRL file looks like. " 38 | echo " Waiting 1 second in case you want to ctrl-c..." 
39 | 40 | # Might as well start loading the environment before sleeping 41 | module purge 42 | export PATH=$HOME/bin/shovill-v1.1.0/bin:$PATH 43 | module load SPAdes/3.13.0 Skesa/2.3.0 megahit/1.1.2 velvet/1.2.10 lighter/1.1.1 flash/1.2.11 samtools/1.9 bwa/0.7.17 seqtk/1.3 pilon/1.22 trimmomatic/0.35 perl/5.16.1-MT kmc/3.0 java/jdk1.8.0_301 44 | sleep 1 45 | 46 | # See if we have all the right components 47 | shovill --check 48 | 49 | qsub -q all.q -q edlb.q -N ShovillSpades -o $tmpdir/log -j y -pe smp 4-6 -l h_vmem=72G -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 50 | -v "tmpdir=$tmpdir" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 51 | #!/bin/bash -l 52 | set -e 53 | set -u 54 | 55 | # bring back in modules that load LD_LIBRARY_PATH 56 | # because that variable is stripped away for security purposes 57 | module load gcc/4.9.3 pilon/1.22 58 | 59 | # Set up filenames 60 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 61 | R1=$(echo "$fastq" | cut -f 1); 62 | R2=$(echo "$fastq" | cut -f 2); 63 | name=$(basename $R1 .fastq.gz) 64 | 65 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.shovillSpades.XXXXXX); 66 | trap "rm -rf $sampledir" EXIT 67 | fastaOut="$OUTDIR/$name.shovillSpades.fasta" 68 | 69 | ls -lh $R1 $R2 70 | echo "Shovill with SPAdes will be run on $R1 $R2, on $(hostname)" 71 | echo "Fasta will be written to $fastaOut"; 72 | 73 | if [ -e "$fastaOut" ]; then 74 | echo "$fastaOut already exists. 
Exiting."; 75 | exit 1; 76 | fi 77 | 78 | # Make sure the skesa lib is there before the full shovill check 79 | skesa --version 2>&1 80 | 81 | shovill --check 82 | 83 | shovill --R1 $R1 --R2 $R2 --outdir $sampledir --assembler spades --cpus $NSLOTS --force --tmpdir /scratch --ram 64 84 | cp -v $sampledir/contigs.fa $fastaOut 85 | # trap command will remove $sampledir 86 | 87 | END_OF_SCRIPT 88 | 89 | -------------------------------------------------------------------------------- /qsub/array/launch_shuffleReads.pl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ "$2" == "" ]; then 6 | echo "Only give _R1_ files to this script" 7 | echo "Usage:"; 8 | exit 1 9 | fi 10 | 11 | outdir=$1 12 | shift 13 | 14 | TMP=$(mktemp --tmpdir='.' --directory SHUFFLE.XXXXXX) 15 | CTRL_FILE="$TMP/fasta.txt" 16 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 17 | mkdir -p $TMP/log 18 | 19 | export PATH=$PATH:$HOME/bin/cg_pipeline/scripts 20 | 21 | which run_assembly_shuffleReads.pl 22 | 23 | qsub -q all.q -N shuffleReads -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 24 | -v "CTRL_FILE=$CTRL_FILE" -v "outdir=$outdir" <<- "END_OF_SCRIPT" 25 | #!/bin/bash 26 | 27 | set -e 28 | 29 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 30 | b=$(basename $R1); 31 | d=$(dirname $R1); 32 | 33 | R2=${R1/_R1_/_R2_} 34 | 35 | echo "Shuffling on $(hostname) $R1 and $R2" 36 | 37 | ls $R1 $R2 38 | 39 | if [ -e "$outdir/$b.fastq.gz" ]; then 40 | echo "Found $outdir/$b.fastq.gz. 
Exiting."; 41 | exit 0; 42 | fi 43 | 44 | mkdir -p /dev/shm/$USER 45 | tmpdir=$(mktemp --tmpdir=/dev/shm/$USER --directory shuffleReads.XXXXXX); 46 | trap "{ rm -rf $tmpdir; }" EXIT 47 | 48 | shuffled="$tmpdir/$b.fastq.gz"; 49 | 50 | run_assembly_shuffleReads.pl $R1 $R2 | gzip -9c > $shuffled 51 | 52 | mv -v $shuffled $outdir/ 53 | 54 | END_OF_SCRIPT 55 | 56 | -------------------------------------------------------------------------------- /qsub/array/launch_skesa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all reads in a directory from Illumina runs" 14 | echo "All files must be in interleaved format"; 15 | echo "Usage: $0 outdir *.fastq.gz" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | TMP=$(mktemp --tmpdir='.' --directory qsubSkesa.XXXXXXXX) 24 | mkdir -p $TMP/log 25 | echo "tmp dir is $TMP " 26 | 27 | # CTRL file will have per line: 28 | # filename coverageLevel 29 | CTRL_FILE="$TMP/array.txt" 30 | echo "$READS" | tr ' ' '\n' > $CTRL_FILE 31 | echo "CTRL_FILE is $CTRL_FILE" 32 | 33 | module purge 34 | module load Skesa/2.0_2 35 | #module unload gcc 36 | #module load gcc/4.9.3 37 | skesa --version 38 | 39 | qsub -q all.q -N skesa -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 40 | -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 41 | #!/bin/bash -l 42 | set -e 43 | 44 | # LD_LIBRARY_PATH is stripped by qsub and needs to be readded 45 | # The following is how it appeared in my own environment. 
46 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/apps/x86_64/gcc/5.4/lib64:/apps/x86_64/gmp/6.1.0/lib:/apps/x86_64/mpfr/3.1.3/lib:/apps/x86_64/mpc/1.0.3/lib:/apps/x86_64/isl/0.18/lib 47 | echo "WARNING: setting LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 48 | 49 | module list 50 | skesa --version 51 | 52 | # Set up filenames 53 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 54 | tmpdir=/scratch/$USER/skesa 55 | mkdir -p $tmpdir 56 | sampledir=$(mktemp --tmpdir=$tmpdir --directory skesa.XXXXXX); 57 | trap "rm -rf $sampledir" EXIT 58 | fastaOut="$sampledir/$(basename $fastq .fastq.gz).fasta" 59 | 60 | echo "skesa will be run on $fastq, on $(hostname)" 61 | echo "Temporary fasta will be written to $fastaOut"; 62 | 63 | if [ -e "$fastaOut" ]; then 64 | echo "$fastaOut already exists. Exiting."; 65 | exit 1; 66 | fi 67 | 68 | /usr/bin/time -o $OUTDIR/time.$SGE_TASK_ID.tsv -f "$fastq\t%e" \ 69 | skesa --cores $NSLOTS --fastq $fastq --gz --use_paired_ends > $fastaOut 70 | 71 | mv -v $fastaOut $OUTDIR 72 | END_OF_SCRIPT 73 | 74 | -------------------------------------------------------------------------------- /qsub/array/launch_skesa_sra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all sra runs in a file" 14 | echo "Usage: $0 outdir allsra_acc.txt" 15 | echo " where the sra accessions are separated by any whitespace" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | TMP=$(mktemp --tmpdir='.' 
--directory qsubSkesa.XXXXXXXX) 24 | mkdir -p $TMP/log 25 | echo "tmp dir is $TMP " 26 | 27 | # CTRL file will have per line: 28 | # filename coverageLevel 29 | CTRL_FILE="$TMP/array.txt" 30 | cat $READS | perl -lane ' 31 | chomp; 32 | for my $f (split(/\s+/, $_)) { 33 | print "$f"; 34 | } 35 | ' > $CTRL_FILE 36 | echo "CTRL_FILE is $CTRL_FILE" 37 | 38 | # Check to make sure skesa will work in the heredoc before getting there 39 | module purge 40 | module load Skesa 41 | skesa --version 42 | module purge 43 | 44 | qsub -q all.q -N skesa -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | 49 | module purge 50 | module load Skesa 51 | module list 52 | skesa --version 53 | 54 | # Set up filenames 55 | run_acc=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 56 | tmpdir=/scratch/$USER/skesa 57 | mkdir -pv $tmpdir 58 | sampledir=$(mktemp --tmpdir=$tmpdir --directory skesa.XXXXXX); 59 | trap "rm -rf $sampledir" EXIT 60 | fastaOut="$sampledir/$run_acc.skesa.fasta" 61 | finalOut="$OUTDIR/$run_acc.skesa.fasta" 62 | 63 | echo "skesa will be run on $run_acc, on $(hostname)" 64 | echo "Temporary fasta will be written to $fastaOut"; 65 | 66 | if [ -e "$fastaOut" ]; then 67 | echo "$fastaOut already exists. Exiting."; 68 | exit 2; 69 | fi 70 | 71 | if [ -e "$finalOut" ]; then 72 | echo "$finalOut already exists. Exiting."; 73 | exit 3; 74 | fi 75 | 76 | skesa --cores $NSLOTS --sra_run $run_acc > $fastaOut 77 | 78 | mv -v $fastaOut $finalOut 79 | END_OF_SCRIPT 80 | 81 | -------------------------------------------------------------------------------- /qsub/array/launch_snp-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Snp-Pipeline in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 
5 | # Author: Lee Katz
6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX]
7 | 
8 | TMP=$(mktemp --tmpdir='.' --directory qsubSnp-Pipeline.XXXXXXXX)
9 | echo "tmp dir is $TMP "
10 | 
11 | REF=$1; shift; # get the reference genome and remove it from ARGV
12 | CTRL_FILE="$TMP/array.txt"
13 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE
14 | 
15 | mkdir -p $TMP/log
16 | # Has to have an exclusive node because it is a greedy script
17 | qsub -q all.q -N snp-pipeline -o $TMP/log -j y -pe smp 12,16 -hard -l exclusive=true -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
18 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT"
19 | #!/bin/bash
20 | 
21 | module load bowtie2/2.1.0
22 | module load varscan/2.3.7
23 | export PATH=~/.local/bin:$PATH # make sure snp-pipeline is prioritized
24 | export CLASSPATH=/apps/x86_64/varscan/bin/VarScan.v2.3.7.jar # varscan
25 | 
26 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE)
27 | echo "Working on $base_dir"
28 | mkdir -p /scratch/$USER
29 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory snp-pipeline.XXXXXX)
30 | export scratch_out
31 | mkdir -p $scratch_out/samples
32 | 
33 | # Make the config file and put it into scratch_out (snppipeline.conf)
34 | copy_snppipeline_data.py configurationFile $scratch_out
35 | 
36 | # Find what reads we're using. Assume all reads have been shuffled.
37 | READS=$(find $base_dir $base_dir/reads -maxdepth 1 -name '*.fastq.gz' 2>/dev/null);
38 | 
39 | # Deshuffle reads into the correct directories
40 | echo $READS | xargs -P $NSLOTS -n 1 sh -c '
41 | sample=$(basename $0 .fastq.gz);
42 | sampleDir=$scratch_out/samples/$sample;
43 | mkdir -p $sampleDir;
44 | echo "Deshuffling into $sampleDir";
45 | run_assembly_shuffleReads.pl $0 -d -gz 1>$sampleDir/1.fastq.gz 2>$sampleDir/2.fastq.gz
46 | if [ $? -gt 0 ]; then echo "ERROR deshuffling $0"; exit 1; fi;
47 | ';
48 | 
49 | # Run snp-pipeline on the scratch drive without qsub inception.
50 | run_snp_pipeline.sh -c $scratch_out/snppipeline.conf -s $scratch_out/samples -m copy -o $scratch_out $REF 51 | if [ $? -gt 0 ]; then echo "ERROR with run_snp_pipeline.sh"; exit 1; fi; 52 | rm -rvf $scratch_out/samples/* 53 | 54 | mv -v $scratch_out $base_dir/snp-pipeline 55 | END_OF_SCRIPT 56 | 57 | -------------------------------------------------------------------------------- /qsub/array/launch_spades_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | # Read ARGV 6 | READS=$@ 7 | SLOTS_PER_JOB=1 # manually change this as needed 8 | 9 | if [ "$READS" == "" ]; then 10 | echo "Assemble all reads in a directory from Illumina runs" 11 | echo "All files must be in the format *_R1_*.fastq.gz to ensure pairs stay together" 12 | echo "Usage: $0 *.fastq.gz" 13 | exit 1; 14 | fi 15 | 16 | TMP=$(mktemp --tmpdir='.' --directory qsubSPAdes.XXXXXXXX) 17 | mkdir -p $TMP/log 18 | echo "tmp dir is $TMP " 19 | 20 | # CTRL file will have per line: 21 | # filename coverageLevel 22 | CTRL_FILE="$TMP/array.txt" 23 | echo "$READS" | tr ' ' '\n' | grep "_R1_" > $CTRL_FILE 24 | echo "CTRL_FILE is $CTRL_FILE" 25 | 26 | qsub -q all.q -N spades -o $TMP/log -j y -pe smp $SLOTS_PER_JOB -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 27 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 28 | #!/bin/bash -l 29 | set -e 30 | module load SPAdes 31 | 32 | which spades.py 33 | 34 | # Set up filenames 35 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 36 | R2=${R1/_R1_/_R2_} 37 | tmpdir=/scratch/$USER 38 | mkdir -p $tmpdir 39 | outdir=$(mktemp --tmpdir=$tmpdir --directory SPAdes.XXXXXX); 40 | trap "rm -rf $outdir" EXIT 41 | newdir=$(dirname $R1)/$(basename $R1 .fastq.gz).spades 42 | 43 | if [ -e "$newdir" ]; then 44 | echo "ERROR: found pre-existing dir $newdir. 
Will not continue."; 45 | exit 1 46 | fi 47 | 48 | echo "spades will be run on $(hostname)" 49 | echo "R1/R2:" 50 | echo " $R1" 51 | echo " $R2" 52 | echo "Working directory is $outdir" 53 | echo "Final directory will be $newdir"; 54 | 55 | spades.py -t $NSLOTS -1 $R1 -2 $R2 -o $outdir --careful 56 | 57 | mv -v $outdir $newdir 58 | END_OF_SCRIPT 59 | 60 | -------------------------------------------------------------------------------- /qsub/array/launch_wgmlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs NCBI's wgmlst in an array for MLST calling 4 | 5 | OUTDIR=$1; shift; 6 | DB=$1; shift 7 | ASM=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$ASM" == "" ]; then 13 | echo "Run NCBI's wgmlst for wgMLST allele calling" 14 | echo "Usage: $0 outdir something.scheme *.fasta" 15 | exit 1; 16 | fi 17 | 18 | if [ ! -d "$OUTDIR" ]; then 19 | mkdir "$OUTDIR" 20 | fi; 21 | if [ ! -e "$DB" ]; then 22 | echo "ERROR: not found: $DB" 23 | exit 2; 24 | fi 25 | 26 | tmpdir=$(mktemp --tmpdir='.' 
--directory wgmlst.XXXXXXXX) 27 | mkdir -pv $tmpdir/log 28 | #trap " rm -rf $tmpdir " EXIT 29 | echo "tmp dir is $tmpdir " 30 | 31 | # CTRL file will have per line: 32 | # filename coverageLevel 33 | CTRL_FILE="$tmpdir/array.txt" 34 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 35 | echo "$ASM" | tr ' ' '\n' > $CTRL_FILE 36 | 37 | head $CTRL_FILE 38 | 39 | module purge 40 | 41 | # Check executables 42 | which wgmlst 43 | 44 | qsub -N ncbi_wgmlst -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "tmpdir=$tmpdir" -v "DB=$DB" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | set -u 49 | 50 | # Set up filenames 51 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 52 | name=$(basename $fasta .fasta) 53 | 54 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.wgmlst.XXXXXX); 55 | trap "rm -rf $sampledir" EXIT 56 | finalout="$OUTDIR/$name" 57 | 58 | if [ -e "$finalout/.done" ]; then 59 | echo "Found $finalout/.done; not repeating." 
60 | exit 0; 61 | fi 62 | 63 | mappings=$sampledir/mappings 64 | alleles=$sampledir/alleles 65 | stdout=$sampledir/wgmlst.out 66 | log=$sampledir/wgmlst.log 67 | 68 | ( 69 | date # mark how long this takes 70 | set -x 71 | wgmlst --genome $fasta --alleles $DB --cores $NSLOTS --kmer 15 --output_mappings $mappings --output_loci $alleles 72 | set +x 73 | date 74 | ) 1>$stdout 2>$log 75 | 76 | ls -lh $sampledir 77 | mv -v $sampledir $finalout 78 | touch $finalout/.done 79 | 80 | END_OF_SCRIPT 81 | 82 | -------------------------------------------------------------------------------- /qsub/array/lyvesetVsLyveset.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Lyve-SET run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help lyveset=s ref|reference=s)) or die $!; 16 | die "ERROR: need Lyve-SET project name\n".usage() if(!$$settings{lyveset}); 17 | die "ERROR: need reference project name\n".usage() if(!$$settings{ref}); 18 | 19 | # where are the Lyve-SET SNPs? 20 | my $lyveSetSnps=lyveSetSnps($$settings{lyveset},$settings); 21 | 22 | # where are the real SNPs? 
23 | my $realSnps=lyveSetSnps($$settings{ref},$settings);
24 | 
25 | # compare
26 | my($TP,$TN,$FP,$FN)=compareSnps($lyveSetSnps,$realSnps,$settings);
27 | 
28 | # Report
29 | print join("\t",qw(L-S TTR TP TN FP FN))."\n";
30 | print join("\t",$$settings{lyveset},$$settings{ref},$TP, $TN, $FP, $FN)."\n";
31 | }
32 | 
33 | sub lyveSetSnps{
34 | my($proj,$settings)=@_;
35 | my %pos;
36 | 
37 | my $matrix="$proj/msa/out.filteredMatrix.tsv";
38 | #my $matrix="$proj/msa/out.snpmatrix.tsv";
39 | open(SNPMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
40 | while(<SNPMATRIX>){ # BUGFIX: readline operator <SNPMATRIX> was stripped in extraction; bare while(){ is a syntax error
41 | next if(/^#/);
42 | chomp;
43 | my($chr,$pos,$ref,@alt)=split /\t/;
44 | $pos{$pos}=$ref;
45 | }
46 | close SNPMATRIX;
47 | 
48 | return \%pos;
49 | }
50 | 
51 | sub compareSnps{
52 | my ($lyveSetSnps,$realSnps,$settings)=@_;
53 | 
54 | # Initialize counts to zero
55 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4);
56 | 
57 | # How many of the real SNPs were found?
58 | while(my($truePos,$trueRef)=each(%$realSnps)){
59 | if($$lyveSetSnps{$truePos}){
60 | $TP++; # True positive: correct SNP was found
61 | } else {
62 | $FN++; # False negative: a real SNP was not found
63 | }
64 | }
65 | 
66 | # How many SNPs were found that were not real?
67 | while(my($pos,$ref)=each(%$lyveSetSnps)){
68 | if($$realSnps{$pos}){
69 | # This is a true positive and was already counted in the previous loop.
70 | } else {
71 | $FP++; # A SNP was found but is not in the real set of SNPs.
72 | } 73 | } 74 | 75 | return ($TP,$TN,$FP,$FN); 76 | } 77 | 78 | sub usage{ 79 | "Compares a Lyve-SET run to a specific Lyve-SET run 80 | Usage: $0 --lyveset projdirectory --ref projdirectory 81 | " 82 | } 83 | -------------------------------------------------------------------------------- /qsub/array/lyvesetVsSimulations.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Lyve-SET run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help lyveset=s ttr=s)); 16 | die usage() if($$settings{help}); 17 | die "ERROR: need Lyve-SET project name\n".usage() if(!$$settings{lyveset}); 18 | die "ERROR: need TreeToReads project name\n".usage() if(!$$settings{ttr}); 19 | 20 | # where are the Lyve-SET SNPs? 21 | my $lyveSetSnps=lyveSetSnps($$settings{lyveset},$settings); 22 | 23 | # where are the real SNPs? 
24 | my $realSnps=ttrSites($$settings{ttr},$settings);
25 | 
26 | # compare
27 | my($TP,$TN,$FP,$FN)=compareSnps($lyveSetSnps,$realSnps,$settings);
28 | 
29 | # Report
30 | print join("\t",qw(L-S TTR TP TN FP FN))."\n";
31 | print join("\t",$$settings{lyveset},$$settings{ttr},$TP, $TN, $FP, $FN)."\n";
32 | }
33 | 
34 | sub lyveSetSnps{
35 | my($proj,$settings)=@_;
36 | my %pos;
37 | 
38 | my $matrix="$proj/msa/out.snpmatrix.tsv";
39 | open(SNPMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
40 | while(<SNPMATRIX>){ # BUGFIX: readline operator <SNPMATRIX> was stripped in extraction
41 | next if(/^#/);
42 | chomp;
43 | my($chr,$pos,$ref,@alt)=split /\t/;
44 | $pos{$pos}=$ref;
45 | }
46 | close SNPMATRIX;
47 | 
48 | return \%pos;
49 | }
50 | 
51 | sub ttrSites{
52 | my($proj,$settings)=@_;
53 | my %pos;
54 | 
55 | my $matrix="$proj/mutsites.txt";
56 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
57 | while(<TTRMATRIX>){ # BUGFIX: readline operator <TTRMATRIX> was stripped in extraction
58 | chomp;
59 | my($pos)=split /\s+/;
60 | $pos{$pos}=1
61 | }
62 | close TTRMATRIX;
63 | return \%pos;
64 | }
65 | 
66 | sub ttrSnps{
67 | my($proj,$settings)=@_;
68 | my %pos;
69 | 
70 | my $matrix="$proj/var_site_matrix";
71 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
72 | while(<TTRMATRIX>){ # BUGFIX: readline operator <TTRMATRIX> was stripped in extraction
73 | chomp;
74 | my($chr,$ref,$pos)=split /\s+/;
75 | $pos{$pos}=$ref;
76 | }
77 | close TTRMATRIX;
78 | 
79 | return \%pos;
80 | }
81 | 
82 | sub compareSnps{
83 | my ($lyveSetSnps,$realSnps,$settings)=@_;
84 | 
85 | # Initialize counts to zero
86 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4);
87 | 
88 | # How many of the real SNPs were found?
89 | while(my($truePos,$trueRef)=each(%$realSnps)){
90 | if($$lyveSetSnps{$truePos}){
91 | $TP++; # True positive: correct SNP was found
92 | } else {
93 | $FN++; # False negative: a real SNP was not found
94 | }
95 | }
96 | 
97 | # How many SNPs were found that were not real?
98 | while(my($pos,$ref)=each(%$lyveSetSnps)){
99 | if($$realSnps{$pos}){
100 | # This is a true positive and was already counted in the previous loop.
101 | } else { 102 | $FP++; # A SNP was found but is not in the real set of SNPs. 103 | } 104 | } 105 | 106 | return ($TP,$TN,$FP,$FN); 107 | } 108 | 109 | sub usage{ 110 | "Compares a Lyve-SET run to a simulated dataset 111 | Usage: $0 --lyveset projdirectory --ttr treetoreadsdirectory 112 | " 113 | } 114 | -------------------------------------------------------------------------------- /qsub/array/makeConfigs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Config::Simple; 7 | use Getopt::Long; 8 | use File::Spec::Functions qw/rel2abs/; 9 | 10 | #my $baseDir="/scicomp/home/gzu2/projects/wgsStandards/accuracyVsCoverage/manyCoverages"; 11 | my $baseDir="."; 12 | 13 | exit(main()); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help reps=i min_coverage=i max_coverage=i)); 18 | $$settings{reps}||=1; 19 | $$settings{min_coverage}||=1; 20 | $$settings{max_coverage}||=$$settings{min_coverage}; 21 | die usage() if($$settings{help}); 22 | $$settings{config}=shift(@ARGV); 23 | die "ERROR: need config file" if(!$$settings{config}); 24 | 25 | # Get configuration from cfg file 26 | my %config; 27 | Config::Simple->import_from($$settings{config},\%config); 28 | my $fixed=fixConfigValue(\%config); 29 | %config=%$fixed; 30 | 31 | 32 | for(my $i=$$settings{min_coverage};$i<=$$settings{max_coverage};$i++){ 33 | for(my $rep=0; $rep<$$settings{reps}; $rep++){ 34 | my $coverage=($i*1); 35 | 36 | # make a simulation directory 37 | my $simDir="cov$i/rep$rep"; 38 | $simDir=rel2abs($simDir); 39 | system("mkdir -pv $simDir"); 40 | # copy all necessary files except config 41 | system("cp -rv $baseDir/IlluminaErrorProfiles $baseDir/reference $baseDir/lyve-set.dnd $simDir/"); 42 | 43 | # generate a custom config file 44 | my %newConfig=%config; 45 | # update some paths 46 | $newConfig{'default.output_dir'}="$simDir/out"; 47 | 
$newConfig{'default.coverage'}=$coverage; 48 | $newConfig{'default.treefile_path'}="$simDir/$newConfig{'default.treefile_path'}"; 49 | $newConfig{'default.base_genome_path'}="$simDir/".$newConfig{'default.base_genome_path'}; 50 | for(qw(error_model1 error_model2)){ 51 | $newConfig{"default.$_"}="$simDir/".$newConfig{"default.$_"}; 52 | } 53 | 54 | open(CFG,">","$simDir/TTR.cfg") or die "ERROR: could not open $simDir/TTR.cfg for writing: $!"; 55 | while(my($key,$value)=each(%newConfig)){ 56 | $key=~s/^default\.//; 57 | if(ref($value) eq 'ARRAY'){ 58 | $value=join(",",@$value); 59 | } 60 | print CFG "$key = $value\n"; 61 | } 62 | close CFG; 63 | } 64 | } 65 | } 66 | 67 | sub fixConfigValue{ 68 | my($value)=@_; 69 | if(ref($value) eq 'HASH'){ 70 | while(my($k,$v)=each(%$value)){ 71 | $$value{$k}=fixConfigValue($v); 72 | } 73 | } elsif(ref($value) eq 'ARRAY'){ 74 | for(my $i=0;$i<@$value;$i++){ 75 | $$value[$i]=fixConfigValue($$value[$i]); 76 | } 77 | } else { 78 | $value=~s/#.*$//; 79 | $value=~s/^\s+|\s+$//g; 80 | } 81 | return $value; 82 | } 83 | 84 | sub usage{ 85 | "Create a bunch of different treetoreads projects 86 | Usage: $0 treetoreads.cfg 87 | --reps 1 Number of repetitions 88 | --min_coverage 1 89 | --max_coverage 1 90 | " 91 | } 92 | -------------------------------------------------------------------------------- /qsub/array/snppipelineVsSimulations.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Snp-Pipeline run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help snppipeline=s ttr=s)); 16 | die usage() if($$settings{help}); 17 | die "ERROR: need Snp-Pipeline project name\n".usage() if(!$$settings{snppipeline}); 18 | die "ERROR: need TreeToReads project 
name\n".usage() if(!$$settings{ttr}); 19 | 20 | # where are the Snp-Pipeline SNPs? 21 | my $snppipelineSnps=snppipelineSnps($$settings{snppipeline},$settings); 22 | 23 | # where are the real SNPs? 24 | my $realSnps=ttrSnps($$settings{ttr},$settings); 25 | 26 | # compare 27 | my($TP,$TN,$FP,$FN)=compareSnps($snppipelineSnps,$realSnps,$settings); 28 | 29 | # Report 30 | print join("\t",qw(S-P TTR TP TN FP FN))."\n"; 31 | print join("\t",$$settings{snppipeline},$$settings{ttr},$TP, $TN, $FP, $FN)."\n"; 32 | } 33 | 34 | sub snppipelineSnps{ 35 | my($proj,$settings)=@_; 36 | my %pos; 37 | 38 | my $list="$proj/snplist.txt"; 39 | open(SNPMATRIX,"<",$list) or die "ERROR: could not open $list for reading: $!"; 40 | while(<SNPMATRIX>){ 41 | next if(/^#/); 42 | chomp; 43 | my($chr,$pos,$count,@genomes)=split /\t/; 44 | $pos{$pos}=1; 45 | } 46 | close SNPMATRIX; 47 | 48 | return \%pos; 49 | } 50 | 51 | sub ttrSnps{ 52 | my($proj,$settings)=@_; 53 | my %pos; 54 | 55 | my $matrix="$proj/var_site_matrix"; 56 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!"; 57 | while(<TTRMATRIX>){ 58 | chomp; 59 | my($chr,$ref,$pos)=split /\s+/; 60 | $pos{$pos}=$ref; 61 | } 62 | close TTRMATRIX; 63 | 64 | return \%pos; 65 | } 66 | 67 | sub compareSnps{ 68 | my ($snppipelineSnps,$realSnps,$settings)=@_; 69 | 70 | # Initialize counts to zero 71 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4); 72 | 73 | # How many of the real SNPs were found? 74 | while(my($truePos,$trueRef)=each(%$realSnps)){ 75 | if($$snppipelineSnps{$truePos}){ 76 | $TP++; # True positive: correct SNP was found 77 | } else { 78 | $FN++; # False negative: a real SNP was not found 79 | } 80 | } 81 | 82 | # How many SNPs were found that were not real? 83 | while(my($pos,$ref)=each(%$snppipelineSnps)){ 84 | if($$realSnps{$pos}){ 85 | # This is a true positive and was already counted in the previous loop. 86 | } else { 87 | $FP++; # A SNP was found but is not in the real set of SNPs.
88 | } 89 | } 90 | 91 | return ($TP,$TN,$FP,$FN); 92 | } 93 | 94 | sub usage{ 95 | "Compares a Snp-Pipeline run to a simulated dataset 96 | Usage: $0 --snpPipeline projdirectory --ttr treetoreadsdirectory 97 | " 98 | } 99 | -------------------------------------------------------------------------------- /qsub/launch_SRST2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -S /bin/bash 3 | #$ -pe smp 1 4 | #$ -cwd -V 5 | #$ -o SRST.log -j y 6 | #$ -N SRST2 7 | 8 | source /etc/profile.d/modules.sh 9 | module load samtools/0.1.18; 10 | module load bowtie2/2.1.0; 11 | 12 | DB=$1 13 | MLST_DEFS=$2 14 | PREFIX=$3 15 | OUTDIR=$4 16 | INTERLEVED=$5 17 | 18 | thisScript=`basename $0`; 19 | if [ "$INTERLEVED" == "" ]; then 20 | echo "Usage: $thisScript MLSTDB.fasta MLST_DEFS.txt OUTPREFIX OUTDIR/ INTERLEVED.fastq[.gz]" 21 | echo "OUTPREFIX cannot include a prefix directory which is why you should specify OUTDIR" 22 | exit 1; 23 | fi 24 | 25 | NSLOTS=${NSLOTS-1} 26 | SCRIPT=/scicomp/home/gzu2/bin/srst2/scripts/srst2.py 27 | 28 | b=$(basename $INTERLEVED); 29 | READ1="TMP/$b.read_1.fastq" 30 | READ2="TMP/$b.read_2.fastq" 31 | run_assembly_shuffleReads.pl -d $INTERLEVED > $READ1 2> $READ2 32 | if [ $? -gt 0 ]; then exit 1; fi; 33 | 34 | # Low compression to make it compatible with srst2. 35 | # These files will be deleted later anyway 36 | gzip -f -v -1 $READ1 $READ2 37 | if [ $? -gt 0 ]; then exit 1; fi; 38 | 39 | READ1="$READ1.gz" 40 | READ2="$READ2.gz" 41 | 42 | 43 | $SCRIPT --input_pe $READ1 $READ2 --mlst_delimiter "_" --output $PREFIX --log --mlst_db $DB --mlst_definitions $MLST_DEFS 44 | if [ $? -gt 0 ]; then exit 1; fi; 45 | mv -v ${PREFIX}*__results.txt "$OUTDIR/" 46 | if [ $? 
-gt 0 ]; then exit 1; fi; 47 | 48 | # remove temp files 49 | rm -v $READ1 $READ2 50 | rm -vf ${PREFIX}__*.bam* ${PREFIX}__*.sam* ${PREFIX}__*results.txt ${PREFIX}__*.pileup ${PREFIX}*.log 51 | -------------------------------------------------------------------------------- /qsub/launch_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o annotation.log 7 | #$ -j y 8 | #$ -N cgpAnnotate 9 | #$ -q all.q 10 | 11 | project=$1 12 | 13 | if [ "$project" == "" ]; then 14 | echo "Usage: " $(basename $0) " CGP-project/" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_pipeline annotate -p "$project" --numcpus $NSLOTS --skip INTERPRO 21 | if [ $? -gt 0 ]; then echo "ERROR with run_pipeline annotatE"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_baym.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs a shovill assembly. Run with no options for usage. 
3 | # Author: Lee Katz 4 | # Workflow taken from Dorian Feistel 5 | 6 | #$ -S /bin/bash 7 | #$ -pe smp 1 8 | #$ -cwd -V 9 | #$ -o baym.log 10 | #$ -j y 11 | #$ -N baym 12 | #$ -q all.q 13 | 14 | outdir=$1; shift 15 | reads=$@ 16 | NSLOTS=${NSLOTS:=1} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$reads" == "" ]; then 22 | echo "Usage: $scriptname outdir *_1.fastq[.gz]" 23 | echo " R2 reads will be detected automatically based on matchiing filenames" 24 | exit 0; 25 | fi; 26 | 27 | set -e 28 | set -u 29 | 30 | module purge 31 | module load seqtk/1.3 trimmomatic/0.39 bwa/0.7.17 java/latest 32 | 33 | # die if some crucial program is not present 34 | # And side bonus of knowing where each executable is loading from 35 | which seqtk 36 | which bwa 37 | which java 38 | which kallisto 39 | which 1output_abundances.py 40 | which trim_reads_nwss.sh 41 | 42 | reference_db=/scicomp/groups-pure/Projects/NWSS_SequencingData/apps/wastewater_analysis/NWSS_PIPELINE/03.reference_set_29SEP2021-2/sequences.kallisto_idx 43 | scripts=/scicomp/groups-pure/Projects/NWSS_SequencingData/apps/wastewater_analysis/NWSS_PIPELINE/scripts 44 | 45 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 46 | trap ' { rm -rf $tmpdir; } ' EXIT 47 | 48 | mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 49 | 50 | for R1 in $reads; do 51 | name=$(basename $R1 .fastq.gz | perl -plane 's/_\d|\.fastq|\.gz//g'); 52 | sampledir="$tmpdir/$name" 53 | mkdir -v $sampledir 54 | 55 | # Get the file extension, if it's .gz 56 | ext=${R1##*.} 57 | 58 | # This file has to be local 59 | ln -sv $(which 1output_abundances.py) $sampledir/ 60 | 61 | # Get fastq files into our local tmp folder 62 | R2=${R1/_1.f/_2.f} 63 | # decompress or simply cat R1/R2 with zcat 64 | local_R1="$sampledir/$(basename $R1 .gz)" 65 | local_R2="$sampledir/$(basename $R2 .gz)" 66 | 67 | if [[ "$ext" == "gz" ]]; then 68 | zcat $R1 > $local_R1 69 | zcat $R2 > $local_R2 70 | else 71 | 
cp -vL $R1 $R2 $sampledir/ 72 | fi 73 | 74 | # WARNING 75 | # Some commands require that you are in the sample directory 76 | # And so I will indent to indicate CWD 77 | cd $sampledir 78 | 79 | # Should just yield one sample name 80 | # into sample_ID 81 | local_sample_ID="sample_IDs.txt" 82 | \ls -f1 *.fastq | sed 's/_[12].fastq.*//' | sort | uniq > $local_sample_ID 83 | 84 | trim_dir="trimmed.out" 85 | kallisto="kallisto.out" 86 | 87 | echo "SAMPLE(S): $(cat $local_sample_ID | tr '\n' ',')" 88 | 89 | bash trim_reads_nwss.sh . $trim_dir $local_sample_ID 90 | 91 | echo 92 | 93 | bash run_kallisto_WG.sh $local_sample_ID $trim_dir/ivar $kallisto 94 | 95 | # Moving back out of the sample directory 96 | cd - 97 | 98 | cp -rv $sampledir/*kallisto.out $outdir/ 99 | 100 | #run_kallisto_SPIKE.sh 101 | 102 | # Directly remove the directory since we're done with it now 103 | rm -rvf $sampledir 104 | done 105 | 106 | -------------------------------------------------------------------------------- /qsub/launch_chewbbaca.simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Lee Katz 3 | 4 | #$ -S /bin/bash 5 | #$ -pe smp 4-24 6 | #$ -cwd -V 7 | #$ -o chewbbaca.log 8 | #$ -j y 9 | #$ -N Chewbbaca 10 | 11 | set -eu 12 | 13 | scriptname=$(basename $0); 14 | dirname=$(dirname $0); 15 | if [ "$#" -lt 3 ]; then 16 | echo "Runs cgMLST on a genome assembly" 17 | echo "Usage: $scriptname outdir/ cgMLST-database/ indir" 18 | echo " where indir is a directory of assembly fasta files" 19 | exit 0; 20 | fi; 21 | 22 | outdir=$1 23 | DB=$2 24 | indir=$3 25 | 26 | # Some defaults 27 | NSLOTS=${NSLOTS:=24} 28 | TMPDIR=${TMPDIR:=/scratch} 29 | 30 | function logmsg() { 31 | script=$(basename $0) 32 | HR="-------------" 33 | echo $HR >&2 34 | echo "$script: $@" >&2 35 | echo $HR >&2 36 | } 37 | 38 | if [ ! 
-d "$DB" ]; then 39 | logmsg "ERROR: could not find a cgMLST database folder at $DB" 40 | exit 1 41 | fi 42 | 43 | # Count how many fasta files are in the directory and if 44 | # they aren't there then make an error 45 | num_schema_loci=$(\ls -f1 "$DB" | grep -c 'fasta$' || true) 46 | if [ "$num_schema_loci" -lt 1 ]; then 47 | logmsg "ERROR: no fasta files found in $DB" 48 | exit 1 49 | fi 50 | 51 | logmsg "Found $num_schema_loci loci in the schema at $DB" 52 | 53 | tempdir=$(mktemp --tmpdir=$TMPDIR --directory chewbbaca.XXXXXX) 54 | trap ' { cd; rm -rf $tempdir; } ' EXIT 55 | 56 | # Runs chewbbaca in a container 57 | # Arguments: 58 | # database path 59 | # file of fasta filenames or fasta file 60 | function chewbbaca() { 61 | DB=$1 62 | indir=$2 63 | # TMPDIR is a global for basically /scratch 64 | # $tempdir is a global for a directory under /scratch 65 | # NSLOTS 66 | 67 | ls -dl $DB 68 | ls -dl $indir 69 | ls -dl $TMPDIR 70 | ls -dl $tempdir 71 | echo "NSLOTS: $NSLOTS" 72 | 73 | rm -rfv $DB/temp 74 | set -x 75 | singularity exec -B $TMPDIR:$TMPDIR -B $PWD:$PWD -B $indir:/input -B $DB:/schema -B $tempdir:/out bin/chewbbaca.cif chewBBACA.py AlleleCall -i /input --schema-directory /schema -o /out --cpu $NSLOTS 76 | set +x 77 | mv -v $tempdir/results* $outdir/ || true 78 | 79 | logmsg $tempdir 80 | ls -lhA $tempdir 81 | } 82 | 83 | mkdir -pv "$tempdir/input" 84 | cp -vL $indir/*.f*a $tempdir/input/ || true 85 | #cp -vrL $indir "$tempdir/input" || true 86 | 87 | mkdir -p $outdir 88 | chewbbaca $DB "$tempdir/input" 89 | 90 | -------------------------------------------------------------------------------- /qsub/launch_circlator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -e assembly.circlator.err 3 | #$ -o assembly.circlator.out 4 | #$ -N circ.assembly 5 | #$ -pe smp 4-16 6 | source /etc/profile.d/modules.sh 7 | module load Python/3.4 8 | module load circlator/1.2.1 9 | if [ -d circ_dir ]; then rm -r 
circ_dir; fi 10 | circlator all --threads $NSLOTS assembly.fasta reads.fasta circ_dir 11 | -------------------------------------------------------------------------------- /qsub/launch_colorid_mlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs ColorID MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o colorid 9 | #$ -j y 10 | #$ -N colorid 11 | 12 | outdir=$1 13 | dbdir=$2 14 | asm=$3 15 | 16 | NSLOTS=${NSLOTS:=1} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | 22 | if [ "$asm" == "" ]; then 23 | echo "Usage: $scriptname outdir dbdir asm.fasta" 24 | echo " where dbdir is a directory of locus fasta files like chewbbaca" 25 | exit 0; 26 | fi; 27 | 28 | set -e 29 | set -u 30 | 31 | module purge 32 | 33 | colorid=$(which colorid_bv || which colorid) 34 | process_MLST=$(which process_MLST.py) 35 | 36 | tmpdir=$(mktemp --directory --suffix=$(basename $0) --tmpdir=./); 37 | trap ' { rm -rf $tmpdir; } ' EXIT 38 | 39 | if [ -e $outdir ]; then 40 | echo "WARNING: outdir already exists: $outdir" 41 | exit 1 42 | fi 43 | 44 | bxi="$tmpdir/asm.bxi" 45 | 46 | # Estimate assembly size by file size and then multiply by 10x 47 | asm_size=$(cat $asm | wc -c) 48 | bxi_size=$(( $asm_size * 10 )) 49 | 50 | # Some colorid vars 51 | fofn="$tmpdir/asm.fofn" 52 | alleles="$tmpdir/alleles.tsv" 53 | 54 | # Get a file of filenames 55 | echo -e "$(basename $asm .fasta)\t$asm" > $fofn 56 | 57 | $colorid build -b $bxi -s $bxi_size -n 2 -k 39 -t $NSLOTS -r $fofn 58 | 59 | $colorid search -ms -b $bxi -q $dbdir/*.fasta > $alleles 60 | sed -i.bak '/\*/d' $alleles 61 | $process_MLST $alleles $tmpdir/mlst 62 | 63 | cp -v $alleles $tmpdir/mlst* $outdir/ 64 | 65 | -------------------------------------------------------------------------------- /qsub/launch_downloadSrr.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 1 4 | #$ -cwd -V 5 | #$ -o fastq-dump.log 6 | #$ -j y 7 | #$ -N fastq-dump 8 | #$ -q all.q 9 | 10 | name=$1 11 | OUT=$2 12 | 13 | if [ "$OUT" == "" ]; then 14 | echo "Usage: $0 NAME" 15 | echo " Downloads a set of fastq files using fastq-dump"; 16 | exit 1; 17 | fi; 18 | 19 | module load sratoolkit/2.4.5-2 20 | if [ $? -gt 0 ]; then echo "unable to load sratoolkit/2.4.5-2"; exit 1; fi; 21 | 22 | mkdir -p /scratch/$USER/fastq-dump/$name 23 | 24 | downloadSra.pl -t /scratch/$USER/fastq-dump/$name $name | gzip -c > $OUT 25 | if [ $? -gt 0 ]; then echo "ERROR with fastq-dump $name"; exit 1; fi; 26 | 27 | rm -rf /scratch/$USER/fastq-dump/$name 28 | 29 | -------------------------------------------------------------------------------- /qsub/launch_etoki_mlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs EToKi MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o EToKi.log 9 | #$ -j y 10 | #$ -N EToKi 11 | 12 | out=$1 13 | refs=$2 14 | db=$3 15 | asm=$4 16 | 17 | #NSLOTS=${NSLOTS:=1} 18 | 19 | source /etc/profile.d/modules.sh 20 | scriptname=$(basename $0); 21 | 22 | 23 | if [ "$asm" == "" ]; then 24 | echo "Usage: $scriptname out.etoki.fasta refs.fasta etoki.csv assembly.fasta" 25 | exit 0; 26 | fi; 27 | 28 | set -e 29 | set -u 30 | 31 | module purge 32 | 33 | source ~/.bash_conda || echo "Could not find bash file for loading conda" 34 | conda activate etoki || echo "could not activate etoki env" 35 | 36 | EToKi.py configure 37 | 38 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 39 | trap ' { rm -rf $tmpdir; } ' EXIT 40 | 41 | #mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 42 | 43 | if [ -e $out ]; then 44 | echo "ERROR: output file already exists: $out" 45 | exit 1 46 | fi 47 | 48 | samplename=$(basename $asm .fasta) 49 | echo "Sample name will be $samplename" 50 | 51 | EToKi.py MLSType 
-i $asm -r $refs -k $samplename -d $db -o $tmpdir/$(basename $out) 52 | mv -v $tmpdir/$(basename $out) $out 53 | -------------------------------------------------------------------------------- /qsub/launch_fastqToFasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 1 5 | #$ -cwd -V 6 | #$ -o fastq_to_fasta.log -j y 7 | #$ -N fastq_to_fasta 8 | #$ -q all.q 9 | 10 | in=$1 11 | out=$2 12 | 13 | if [ "$out" == "" ]; then 14 | echo "Usage: $0 in.fastq[.gz] out.fasta" 15 | exit 1; 16 | fi; 17 | 18 | extension="${in##*.}" 19 | 20 | if [ "$extension" = "gz" ]; then 21 | gunzip -c "$in" | fastq_to_fasta -Q33 > "$out" 22 | else 23 | fastq_to_fasta -Q33 < "$in" > "$out" 24 | fi 25 | if [ $? -gt 0 ]; then echo "ERROR with fastq_to_fasta"; exit 1; fi; 26 | -------------------------------------------------------------------------------- /qsub/launch_freyja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs Frejya wastewater pipeline for SARS-CoV-2 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o freyja.log 9 | #$ -j y 10 | #$ -N freyja 11 | 12 | ref=$1; shift 13 | outdir=$1; shift 14 | reads=$@ 15 | NSLOTS=${NSLOTS:=1} 16 | 17 | source /etc/profile.d/modules.sh 18 | scriptname=$(basename $0); 19 | 20 | if [ "$reads" == "" ]; then 21 | echo "Usage: $scriptname ref.fasta outdir *_1.fastq[.gz]" 22 | echo " R2 reads will be detected automatically based on matchiing filenames" 23 | exit 0; 24 | fi; 25 | 26 | set -e 27 | set -u 28 | 29 | module purge 30 | 31 | source ~/.bash_conda > /dev/null 2>&1 || echo "Could not find bash file for loading conda" 32 | conda activate freyja || echo "could not activate freyja env" 33 | 34 | which ivar 35 | which freyja 36 | which samtools 37 | which bowtie2 38 | 39 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 40 | trap ' { rm -rf $tmpdir; } ' 
EXIT 41 | 42 | mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 43 | 44 | for R1 in $reads; do 45 | name=$(basename $R1 .fastq.gz | perl -plane 's/_\d|\.fastq|\.gz//g'); 46 | sampledir="$tmpdir/$name.freyja" 47 | echo "START $name in $sampledir" 48 | mkdir -v $sampledir 49 | 50 | # Get the file extension, if it's .gz 51 | ext=${R1##*.} 52 | 53 | # Get fastq files into our local tmp folder 54 | R2=${R1/_1.f/_2.f} 55 | 56 | echo "$R1 $R2"; 57 | 58 | cp -v $R1 $R2 $sampledir/ 59 | R1=$sampledir/$(basename $R1) 60 | R2=$sampledir/$(basename $R2) 61 | 62 | bowtie2 -x $ref -1 $R1 -2 $R2 | samtools view -bhS | samtools sort > $sampledir/sorted.bam 63 | 64 | # Trim primers 65 | ivar trim -i $sampledir/sorted.bam -p $sampledir/ivar.unsorted 66 | samtools sort $sampledir/ivar.unsorted.bam > $sampledir/ivar.bam 67 | 68 | # call variants 69 | samtools mpileup -aa -A -d 600000 -B -Q 0 $sampledir/ivar.bam | ivar variants -p $sampledir/ivar -q 20 -t 0.03 -r $ref 70 | 71 | # abundances 72 | freyja variants $sampledir/ivar.bam --variants $sampledir/freyja.variants --depths $sampledir/freyja.depths --ref $ref 73 | freyja demix $sampledir/freyja.variants.tsv $sampledir/freyja.depths --output $sampledir/freyja.demix 74 | 75 | # Clear results we don't need 76 | (cd $sampledir && rm -v *.bam* *.fastq.gz) 77 | # Save results 78 | rsync -av $sampledir $outdir/ 79 | done 80 | 81 | -------------------------------------------------------------------------------- /qsub/launch_kraken.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken.log -j y 6 | #$ -N kraken 7 | 8 | function logmsg () { 9 | script=$(basename $0); 10 | echo -e "$script: $@" >&2; 11 | } 12 | 13 | function run () { 14 | script=$(basename $0); 15 | logmsg "Running $@" 16 | eval $@ 17 | if [ $? 
-gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 18 | } 19 | 20 | NSLOTS=${NSLOTS-8} 21 | KRAKEN_DEFAULT_DB=${KRAKEN_DEFAULT_DB-/scicomp/reference/kraken/0.10.4/mini-20140330} 22 | 23 | if [ $# -eq 0 ]; then 24 | logmsg "Usage: $0 out.kraken/ reads_1.fastq.gz reads_2.fastq.gz [more_1.fastq.gz more_2.fastq.gz ...]"; 25 | logmsg "NOTE: KRAKEN_DEFAULT_DB is currently set to $KRAKEN_DEFAULT_DB" 26 | exit 1; 27 | fi; 28 | 29 | source /etc/profile.d/modules.sh 30 | module load kraken/1.0.0 31 | module load krona/2.5 32 | 33 | # Output is the first arg. 34 | # The rest of the args are reads. 35 | OUTDIR=$1; shift; 36 | READS=$@; 37 | 38 | if [ -e $OUTDIR ]; then 39 | echo "ERROR: dir $OUTDIR already exists! I will not overwrite it." 40 | #echo "DEBUG"; rm -rvf $OUTDIR; 41 | exit 1; 42 | fi 43 | 44 | # Where are my executables? 45 | KRAKENDIR=$(dirname $(which kraken)); 46 | KRONADIR=$(dirname $(which ktImportText)); 47 | 48 | # Set up the work space 49 | TEMPDIR=$(mktemp --directory --suffix=$(basename $0)); 50 | KRAKENOUT="$TEMPDIR/kraken.out" 51 | KRAKENTAXONOMY="$TEMPDIR/kraken.taxonomy"; 52 | KRAKENREPORT="$TEMPDIR/kraken.report" 53 | HTML="$TEMPDIR/out.html" 54 | 55 | function cleanup () { 56 | rm -rvf $TEMPDIR 57 | } 58 | trap cleanup EXIT 59 | 60 | hostname 61 | logmsg "tempdir is $TEMPDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken --fastq-input --paired --db=$KRAKEN_DEFAULT_DB --preload --gzip-compressed --quick --threads $NSLOTS --output $KRAKENOUT $READS 64 | 65 | run kraken-translate --db $KRAKEN_DEFAULT_DB $KRAKENOUT | cut -f 2- | sort | uniq -c |\ 66 | perl -lane ' 67 | s/^ +//; # remove leading spaces 68 | s/ +/\t/; # change first set of spaces from uniq -c to a tab 69 | s/;/\t/g; # change the semicolon-delimited taxonomy to tab-delimited 70 | print; 71 | ' |\ 72 | sort -k1,1nr > $KRAKENTAXONOMY 73 | 74 | # Grab the unclassified reads 75 | head -n 1 $KRAKENOUT | cut -f 3 >> $KRAKENTAXONOMY 76 | cat 
$KRAKENTAXONOMY 77 | 78 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 79 | run kraken-report $KRAKENOUT > $KRAKENREPORT 80 | perl -lane ' print if($F[0] > 0.00); ' < $KRAKENREPORT > $KRAKENREPORT.filtered 81 | 82 | rm -v $KRAKENOUT 83 | cp -rv $TEMPDIR $OUTDIR 84 | 85 | logmsg "DONE!" 86 | 87 | -------------------------------------------------------------------------------- /qsub/launch_kraken2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken2.log -j y 6 | #$ -N kraken2 7 | 8 | function logmsg () { 9 | script=$(basename $0); 10 | echo -e "$script: $@" >&2; 11 | } 12 | 13 | function run () { 14 | script=$(basename $0); 15 | logmsg "Running $@" 16 | eval $@ 17 | if [ $? -gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 18 | } 19 | 20 | NSLOTS=${NSLOTS-16} 21 | KRAKEN2_DEFAULT_DB=${KRAKEN2_DEFAULT_DB-/scicomp/reference/kraken/2.0.0/} 22 | 23 | if [ $# -eq 0 ]; then 24 | logmsg "Usage: $0 out.kraken/ reads_1.fastq.gz reads_2.fastq.gz [more_1.fastq.gz more_2.fastq.gz ...]"; 25 | logmsg "NOTE: KRAKEN2_DEFAULT_DB is currently set to $KRAKEN2_DEFAULT_DB" 26 | exit 1; 27 | fi; 28 | 29 | source /etc/profile.d/modules.sh 30 | module load kraken/2.0.8 31 | module load krona/2.5 32 | 33 | # Output is the first arg. 34 | # The rest of the args are reads. 35 | OUTDIR=$1; shift; 36 | READS=$@; 37 | 38 | if [ -e $OUTDIR ]; then 39 | echo "ERROR: dir $OUTDIR already exists! I will not overwrite it." 40 | #echo "DEBUG"; rm -rvf $OUTDIR; 41 | exit 1; 42 | fi 43 | 44 | # Where are my executables? 
45 | KRAKENDIR=$(dirname $(which kraken2)); 46 | KRONADIR=$(dirname $(which ktImportText)); 47 | 48 | # Set up the work space 49 | TEMPDIR=$(mktemp --directory --suffix=$(basename $0)); 50 | KRAKENOUT="$TEMPDIR/kraken.out" 51 | KRAKENTAXONOMY="$TEMPDIR/kraken.taxonomy"; 52 | KRAKENREPORT="$TEMPDIR/kraken.report" 53 | HTML="$TEMPDIR/out.html" 54 | 55 | function cleanup () { 56 | rm -rvf $TEMPDIR 57 | } 58 | trap cleanup EXIT 59 | 60 | hostname 61 | logmsg "tempdir is $TEMPDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken2 --paired --db=$KRAKEN2_DEFAULT_DB --gzip-compressed --quick --threads $NSLOTS --report $KRAKENREPORT --output $KRAKENOUT $READS 64 | 65 | run kraken2-translate --db $KRAKEN2_DEFAULT_DB $KRAKENOUT | cut -f 2- | sort | uniq -c |\ 66 | perl -lane ' 67 | s/^ +//; # remove leading spaces 68 | s/ +/\t/; # change first set of spaces from uniq -c to a tab 69 | s/;/\t/g; # change the semicolon-delimited taxonomy to tab-delimited 70 | print; 71 | ' |\ 72 | sort -k1,1nr > $KRAKENTAXONOMY 73 | 74 | # Grab the unclassified reads 75 | head -n 1 $KRAKENOUT | cut -f 3 >> $KRAKENTAXONOMY 76 | cat $KRAKENTAXONOMY 77 | 78 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 79 | perl -lane ' print if($F[0] > 0.00); ' < $KRAKENREPORT > $KRAKENREPORT.filtered 80 | 81 | rm -v $KRAKENOUT 82 | cp -rv $TEMPDIR $OUTDIR 83 | 84 | logmsg "DONE!" 85 | 86 | -------------------------------------------------------------------------------- /qsub/launch_kraken_contigs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken.log -j y 6 | #$ -N kraken 7 | 8 | # The aspen module for kraken is broken, and so I just have 9 | # to assume Kraken is in the path. 
10 | 11 | source /etc/profile.d/modules.sh 12 | #module load kraken/0.10.4 13 | export PATH=$PATH:~/src/lskScripts/scripts 14 | 15 | function logmsg () { 16 | script=$(basename $0); 17 | echo -e "$script: $@" >&2; 18 | } 19 | 20 | function run () { 21 | script=$(basename $0); 22 | logmsg "Running $@" 23 | eval $@ 24 | if [ $? -gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 25 | } 26 | 27 | NSLOTS=${NSLOTS-8} 28 | KRAKEN_DEFAULT_DB=${KRAKEN_DEFAULT_DB-/scicomp/reference/kraken/0.10.4/mini-20140330} 29 | 30 | if [ $# -eq 0 ]; then 31 | logmsg "Usage: $0 outdir in.fasta"; 32 | logmsg "NOTE: KRAKEN_DEFAULT_DB is currently set to $KRAKEN_DEFAULT_DB" 33 | exit 1; 34 | fi; 35 | 36 | # load modules after help menu 37 | module load kraken/1.0.0 38 | module load krona/2.5 39 | 40 | OUTDIR=$1 41 | ASM=$2 42 | 43 | # Where are my executables? 44 | KRAKENDIR=$(dirname $(which kraken)); 45 | KRONADIR=$(dirname $(which ktImportText)); 46 | 47 | # Set up the work space 48 | if [ -d "$OUTDIR" ]; then 49 | echo "$OUTDIR already exists"; 50 | exit 1; 51 | fi; 52 | mkdir $OUTDIR 53 | if [ $? 
-gt 0 ]; then 54 | echo "ERROR: I could not create $OUTDIR"; 55 | exit 1 56 | fi 57 | KRAKENOUT="$OUTDIR/kraken.out" 58 | KRAKENTAXONOMY="$OUTDIR/kraken.taxonomy"; 59 | HTML="$OUTDIR/kraken.html" 60 | 61 | logmsg "Outdir is $OUTDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken --fasta-input --db=$KRAKEN_DEFAULT_DB --preload --threads $NSLOTS --output $KRAKENOUT $ASM 64 | 65 | translate-kraken-contigs.pl $KRAKENOUT | sort -k1,1nr > $KRAKENTAXONOMY 66 | 67 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /qsub/launch_mergeFastaReads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 1 5 | #$ -cwd -V 6 | #$ -o mergeFastaReads.log 7 | #$ -j y 8 | #$ -N mergeFastaReads 9 | #$ -q all.q 10 | 11 | reads=$1 12 | out=$2 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.fasta merged.fasta" 16 | exit 1; 17 | fi; 18 | 19 | merge_fasta_reads "$reads" > "$out.tmp" && mv "$out.tmp" "$out" 20 | if [ $? -gt 0 ]; then echo "ERROR with merge_fasta_reads"; exit 1; fi; 21 | -------------------------------------------------------------------------------- /qsub/launch_parsnp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 4-16 5 | #$ -cwd -V 6 | #$ -o parsnp.log 7 | #$ -j y 8 | #$ -N parsnp 9 | #$ -q all.q 10 | 11 | module load harvest/1.0.1 12 | if [ $? -gt 0 ]; then echo "WARNING: couldn't load harvest module"; fi; 13 | 14 | refGbk=$1 15 | asmDir=$2 16 | out=$3 17 | script=$(basename $0) 18 | 19 | if [ "$out" == "" ]; then 20 | echo "Usage: $script reference.gbk asmDir outDir" 21 | exit 1; 22 | fi; 23 | 24 | NSLOTS=${NSLOTS:=12} 25 | 26 | c="parsnp -a 13 -c -R 1 -g $refGbk -d $asmDir -p $NSLOTS -o $out" 27 | $c # run the command 28 | if [ $? 
-gt 0 ]; then echo -e "ERROR with parsnp\n $c"; exit 1; fi; 29 | -------------------------------------------------------------------------------- /qsub/launch_polish.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o polish.log -j y 6 | #$ -N polishAssembly 7 | 8 | use 5.12.0; 9 | use strict; 10 | use warnings; 11 | use Data::Dumper; 12 | use Getopt::Long qw/GetOptions/; 13 | use File::Basename qw/basename/; 14 | use Env::Modulecmd; 15 | use File::Basename qw/basename/; 16 | use File::Temp qw/tempdir/; 17 | use File::Copy qw/cp/; 18 | 19 | Env::Modulecmd::purge(); 20 | Env::Modulecmd::load(qw(nanopolish/0.8.3 minimap2/2.7 Python/2.7.13 21 | samtools/1.4.1 minimap2/2.7 bowtie2/2.3.3.1 pilon/1.22)); 22 | 23 | my $numcpus=$ENV{NSLOTS} || 24; 24 | 25 | exit(main()); 26 | 27 | sub main{ 28 | 29 | my $settings={}; 30 | GetOptions($settings,qw(help nanopore=s illumina1=s illumina2=s outfile|output|outfasta=s)) or die $!; 31 | $$settings{nanopore}//=""; 32 | $$settings{illumina1}//=""; 33 | $$settings{illumina2}//=""; 34 | $$settings{outfile}//="polished.fasta"; 35 | $$settings{tempdir}//=tempdir("asm_polish_XXXXXX", TMPDIR=>1, CLEANUP=>1); 36 | 37 | my $ref=shift @ARGV; 38 | die usage() if(!$ref || $$settings{help}); 39 | 40 | if($$settings{nanopore}){ 41 | $ref=nanopolish($ref,$$settings{nanopore},$settings); 42 | } 43 | if($$settings{illumina1}){ 44 | $ref=pilon ($ref,$$settings{illumina1},$$settings{illumina2},$settings); 45 | } 46 | 47 | # Easy to print results with cat! 
48 | #system("cat $ref"); 49 | cp($ref, $$settings{outfile}); 50 | die if $?; 51 | 52 | return 0; 53 | } 54 | 55 | sub nanopolish{ 56 | ...; 57 | } 58 | 59 | sub pilon{ 60 | my($ref, $R1, $R2, $settings)=@_; 61 | 62 | my $symlinkRef="$$settings{tempdir}/unpolished.fasta"; 63 | cp($ref, $symlinkRef); 64 | $ref=$symlinkRef; 65 | 66 | # People say to run Pilon four times in a row, single threaded 67 | my $maxRuns=4; 68 | for(my $i=1;$i<=4;$i++){ 69 | my $bam="$$settings{tempdir}/pilon$i.sorted.bam"; 70 | 71 | system("bowtie2-build $ref $ref"); die if $?; 72 | system("bowtie2 -x $ref -1 $R1 -2 $R2 -p $numcpus | samtools sort -T $$settings{tempdir}/samtoolssort -o $bam"); 73 | die if $?; 74 | system("samtools index $bam"); die if $?; 75 | 76 | system("pilon --genome '$ref' --frags $bam --output $$settings{tempdir}/pilon$i --changes --threads $numcpus --fix snps,indels"); 77 | die if $?; 78 | 79 | # update for the next iteration 80 | $ref="$$settings{tempdir}/pilon$i.fasta"; 81 | } 82 | 83 | return $ref; 84 | } 85 | 86 | sub usage{ 87 | "Polish an assembly with nanopore or illumina reads.\n". 88 | basename($0)." [--nanopore=nanopore.fastq] --illumina1=R1.fastq --illumina2=R2.fastq --outfile=out.fasta contigs.fasta 89 | " 90 | } 91 | 92 | -------------------------------------------------------------------------------- /qsub/launch_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o prediction.log 7 | #$ -j y 8 | #$ -N cgpPredict 9 | #$ -q all.q 10 | 11 | project=$1 12 | 13 | if [ "$project" == "" ]; then 14 | echo "Usage: " $(basename $0) " CGP-project/" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_pipeline predict -p "$project" --numcpus $NSLOTS 21 | if [ $? 
-gt 0 ]; then echo "ERROR with run_pipeline predict"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_prokka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 8-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o prokka.log -j y 6 | #$ -N prokka 7 | 8 | contigs=$1 9 | genome=$2 10 | 11 | source /etc/profile.d/modules.sh 12 | 13 | genus=${3-genus} 14 | species=${4-species} 15 | NSLOTS=${NSLOTS-1} 16 | 17 | if [ "$genome" == "" ]; then 18 | script=$(basename $0); 19 | echo "Usage: $script contigs.fasta genomename [genus species]" 20 | exit 1; 21 | fi 22 | module load prokka/1.13.3 23 | module load rnammer/1.2 24 | 25 | command="prokka --prefix $genome --compliant --locustag $genome --genus $genus --species $species --strain $genome --force --cpus $NSLOTS $contigs" 26 | eval $command 27 | -------------------------------------------------------------------------------- /qsub/launch_shovill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs a shovill assembly. Run with no options for usage. 
3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 4-16 7 | #$ -cwd -V 8 | #$ -o shovill.log 9 | #$ -j y 10 | #$ -N Shovill 11 | #$ -q all.q 12 | 13 | R1=$1 14 | R2=$2 15 | outdir=$3 16 | NSLOTS=${NSLOTS:=12} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$outdir" == "" ]; then 22 | echo "Usage: $scriptname reads.1.fastq.gz reads.2.fastq.gz outdir" 23 | exit 0; 24 | fi; 25 | 26 | set -e 27 | set -u 28 | 29 | module load shovill 30 | 31 | shovill --check 32 | 33 | shovill --outdir $outdir --R1 $R1 --R2 $R2 --assembler skesa 34 | 35 | -------------------------------------------------------------------------------- /qsub/launch_skesa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 1-16 4 | #$ -cwd -V 5 | #$ -o skesa.log -j y 6 | #$ -N Skesa_2.0_2 7 | #$ -q all.q 8 | 9 | reads=$1 10 | fastaOut=$2 11 | 12 | if [ "$fastaOut" == "" ]; then 13 | echo "Usage: $0 shuffled.fastq.gz out.fasta" 14 | exit 1; 15 | fi; 16 | 17 | module load Skesa/2.0_2 18 | if [ $? -gt 0 ]; then echo "unable to load Skesa v2"; exit 1; fi; 19 | 20 | if [ -e "$fastaOut" ]; then 21 | echo "$fastaOut already exists. 
Exiting."; 22 | exit 1; 23 | fi 24 | 25 | NSLOTS=${NSLOTS:=1} 26 | skesa --cores $NSLOTS --fastq $reads --gz --use_paired_ends > $fastaOut 27 | 28 | -------------------------------------------------------------------------------- /qsub/launch_spades.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log 6 | #$ -j y 7 | #$ -N SPAdes_3.15.3 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | fastaOut=$3 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.fastq.gz output/ [out.fasta]" 16 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 17 | exit 1; 18 | fi; 19 | 20 | module load SPAdes/3.15.3 21 | if [ $? -gt 0 ]; then echo "unable to load SPAdes/3.15.3"; exit 1; fi; 22 | 23 | NSLOTS=${NSLOTS:=1} 24 | spades.py --12 $reads --careful -o $out -t $NSLOTS 25 | if [ $? -gt 0 ]; then echo "problem with SPAdes/3.15.3"; exit 1; fi; 26 | 27 | # Assembly metrics. Don't die if this script dies. It's not worth it. 28 | echo "# CG-Pipeline metrics" > $out/run_assembly_metrics.txt 29 | run_assembly_metrics.pl $out/scaffolds.fasta >> $out/run_assembly_metrics.txt 30 | 31 | if [ "$fastaOut" != "" ]; then 32 | cp -v "$out/scaffolds.fasta" $fastaOut 33 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 34 | rm -rf "$out"; 35 | if [ $? 
-gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 36 | fi 37 | 38 | -------------------------------------------------------------------------------- /qsub/launch_spades_SE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log -j y 6 | #$ -N SPAdes3.1.0 7 | 8 | reads=$1 9 | out=$2 10 | fasta=$3 11 | 12 | if [ "$out" == "" ]; then 13 | echo "Usage: $0 reads.fastq.gz output/ out.fasta" 14 | exit 1; 15 | fi; 16 | 17 | module load SPAdes/3.1.0 18 | if [ $? -gt 0 ]; then echo "unable to load spades 3.1.0"; exit 1; fi; 19 | 20 | NSLOTS=${NSLOTS:=1} 21 | 22 | spades.py -s $reads -o $out -t $NSLOTS 23 | if [ $? -gt 0 ]; then echo "problem with spades 3.1.0"; exit 1; fi; 24 | 25 | if [ "$fastaOut" != "" ]; then 26 | cp -v "$out/scaffolds.fasta" $fastaOut 27 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 28 | rm -rf "$out"; 29 | if [ $? -gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 30 | fi 31 | 32 | -------------------------------------------------------------------------------- /qsub/launch_spades_iontorrent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log 6 | #$ -j y 7 | #$ -N SPAdesIonTorrent3.6.2 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | fastaOut=$3 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.sff output/ [out.fasta]" 16 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 17 | exit 1; 18 | fi; 19 | 20 | module load SPAdes/3.6.2 21 | if [ $? -gt 0 ]; then echo "unable to load spades 3.6.2"; exit 1; fi; 22 | 23 | NSLOTS=${NSLOTS:=1} 24 | spades.py -s $reads --iontorrent -o $out -t $NSLOTS 25 | if [ $? 
-gt 0 ]; then echo "problem with spades 3.6.2"; exit 1; fi; 26 | 27 | if [ "$fastaOut" != "" ]; then 28 | cp -v "$out/scaffolds.fasta" $fastaOut 29 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 30 | rm -rf "$out"; 31 | if [ $? -gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 32 | fi 33 | -------------------------------------------------------------------------------- /qsub/launch_spades_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # Runs a spades assembly. Run with no options for usage. 3 | # Author: Lee Katz 4 | #Example: (for f in *R1_001.fastq.gz; do b=`basename $f _R1.fastq.gz`; r=`sed 's/R1/R2/' <<< $f`; qsub -N spades$b -o ./assemblies/log/b.spades.log ~/bin/launch_SPAdes_v3.11.0.sh $f $r ./assemblies/$b.spades3.11 ./assemblies/$b; done;) 5 | 6 | #$ -S /bin/bash 7 | #$ -pe smp 4-16 8 | #$ -cwd -V 9 | #$ -o spades.log 10 | #$ -j y 11 | #$ -N SPAdes3.15.3 12 | 13 | forward=$1 14 | reverse=$2 15 | out=$3 16 | fastaOut=$4 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$out" == "" ]; then 22 | echo "Usage: $scriptname reads.1.fastq.gz reads.2.fastq.gz output/ [out.fasta]" 23 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 24 | exit 1; 25 | fi; 26 | 27 | module load SPAdes/3.15.3 28 | if [ $? -gt 0 ]; then echo "unable to load spades 3.15.3"; exit 1; fi; 29 | 30 | NSLOTS=${NSLOTS:=1} 31 | 32 | COMMAND="spades.py -1 $forward -2 $reverse --careful -o $out -t $NSLOTS" 33 | echo "$scriptname: $COMMAND" 34 | $COMMAND 35 | if [ $? -gt 0 ]; then echo "problem with spades 3.11.0"; exit 1; fi; 36 | 37 | if [ "$fastaOut" != "" ]; then 38 | cp -v "$out/scaffolds.fasta" $fastaOut 39 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 40 | rm -rf "$out"; 41 | if [ $? 
-gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 42 | fi 43 | -------------------------------------------------------------------------------- /qsub/launch_trimClean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o trimClean.log -j y 7 | #$ -N cgpTrimClean 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | 13 | if [ "$out" == "" ]; then 14 | echo "Usage: $0 reads.fastq.gz cleaned.fastq.gz" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_assembly_trimClean.pl -i $reads -o $out --auto --nosingletons --numcpus $NSLOTS 21 | if [ $? -gt 0 ]; then echo "ERROR with run_assembly_trimClean.pl"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_velvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 16 3 | #$ -cwd -V 4 | #$ -o velvet.log -j y 5 | #$ -N velvet 6 | #$ -q all.q 7 | 8 | module () 9 | { 10 | eval `/usr/bin/modulecmd bash $*` 11 | } 12 | 13 | module load velvet/1.2.10; 14 | if [ $? -gt 0 ]; then echo "unable to load velvet/1.2.10"; exit 1; fi; 15 | 16 | reads=$1 17 | out=$2 18 | 19 | # number of cpus is either set by SGE, or is just 1 20 | #NSLOTS=${NSLOTS:=1} 21 | NSLOTS=${NSLOTS:=1} 22 | echo $NSLOTS 23 | 24 | if [ "$out" == "" ]; then 25 | echo "Usage: $0 reads.fastq.gz output/" 26 | exit 1; 27 | fi; 28 | 29 | command="$(which perl) $(which VelvetOptimiser.pl) -s 55 -e 99 -d $out -p $out -t $NSLOTS -f '-fastq.gz -shortPaired $reads'" 30 | eval $command 31 | if [ $? 
-gt 0 ]; then echo "problem with VelvetOptimiser"; echo $command; exit 1; fi; 32 | -------------------------------------------------------------------------------- /qsub/modules.csh: -------------------------------------------------------------------------------- 1 | if ($?tcsh) then 2 | set modules_shell="tcsh" 3 | else 4 | set modules_shell="csh" 5 | endif 6 | set exec_prefix='/usr/bin' 7 | 8 | set prefix="" 9 | set postfix="" 10 | 11 | if ( $?histchars ) then 12 | set histchar = `echo $histchars | cut -c1` 13 | set _histchars = $histchars 14 | 15 | set prefix = 'unset histchars;' 16 | set postfix = 'set histchars = $_histchars;' 17 | else 18 | set histchar = \! 19 | endif 20 | 21 | if ($?prompt) then 22 | set prefix = "$prefix"'set _prompt="$prompt";set prompt="";' 23 | set postfix = "$postfix"'set prompt="$_prompt";unset _prompt;' 24 | endif 25 | 26 | if ($?noglob) then 27 | set prefix = "$prefix""set noglob;" 28 | set postfix = "$postfix""unset noglob;" 29 | endif 30 | set postfix = "set _exit="'$status'"; $postfix; /usr/bin/test 0 = "'$_exit;' 31 | 32 | alias module $prefix'eval `'$exec_prefix'/modulecmd '$modules_shell' '$histchar'*`; '$postfix 33 | unset exec_prefix 34 | unset prefix 35 | unset postfix 36 | 37 | setenv MODULESHOME /usr/share/Modules 38 | 39 | if (! $?MODULEPATH ) then 40 | setenv MODULEPATH `sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 41 | endif 42 | 43 | if (! 
$?LOADEDMODULES ) then 44 | setenv LOADEDMODULES "" 45 | endif 46 | -------------------------------------------------------------------------------- /qsub/modules.sh: -------------------------------------------------------------------------------- 1 | 2 | module() { eval `/usr/bin/modulecmd bash $*`; } 3 | #export -f module 4 | 5 | MODULESHOME=/usr/share/Modules 6 | export MODULESHOME 7 | 8 | if [ "${LOADEDMODULES:-}" = "" ]; then 9 | LOADEDMODULES= 10 | export LOADEDMODULES 11 | fi 12 | 13 | ## july 2013 wgx9 chris dagdigian 14 | ## master node had incorrect modulepath for centos nodes and the correct 15 | ## path was not being set because the env was already present and defined 16 | ## commenting out the if-then test so we force the recreation of the 17 | ## proper centos-specific module file paths ... 18 | 19 | #if [ "${MODULEPATH:-}" = "" ]; then 20 | # MODULEPATH=`sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 21 | # export MODULEPATH 22 | #fi 23 | 24 | MODULEPATH=`sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 25 | export MODULEPATH 26 | 27 | 28 | if [ ${BASH_VERSINFO:-0} -ge 3 ] && [ -r ${MODULESHOME}/init/bash_completion ]; then 29 | . 
${MODULESHOME}/init/bash_completion 30 | fi 31 | -------------------------------------------------------------------------------- /qsub/sub_unicycler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -e err.err 3 | #$ -o out.out 4 | #$ -N racon 5 | #$ -pe smp 8,16 6 | #$ -q short.q,all.q 7 | #$ -cwd 8 | 9 | # Author: Dhwani Batra 10 | 11 | source /etc/profile.d/modules.sh 12 | module purge 13 | unset PYTHONPATH 14 | module load Unicycler/0.4.4 15 | 16 | 17 | 18 | usage(){ 19 | 20 | echo -e "\nUSAGE: $(basename $0)\n"\ 21 | " -1 Full path to Illumina Read 1 \n"\ 22 | " -2 Full path to Illumina Read 2 \n"\ 23 | " -r Full path to Pacbio Reference \n" 24 | exit 1 25 | 26 | } 27 | 28 | SHORT1="" 29 | SHORT2="" 30 | REFERENCE="" 31 | 32 | while getopts 1:2:r: opt; do 33 | case $opt in 34 | 1) 35 | SHORT1=$OPTARG 36 | ;; 37 | 2) 38 | SHORT2=$OPTARG 39 | ;; 40 | r) 41 | REFERENCE=$OPTARG 42 | ;; 43 | 44 | esac 45 | done 46 | 47 | if [ -z "$SHORT1" ]; then 48 | echo -e "\nERROR: -1 is a required parameter." 49 | usage 50 | exit 1 51 | fi 52 | 53 | if [ -z "$SHORT2" ]; then 54 | echo -e "\nERROR: -2 is a required parameter." 55 | usage 56 | exit 1 57 | fi 58 | 59 | if [ -z "$REFERENCE" ]; then 60 | echo $REFERENCE 61 | echo -e "\nERROR: -r is a required parameter." 62 | usage 63 | exit 1 64 | fi 65 | 66 | if [ -z "$NSLOTS" ]; then 67 | NSLOTS=8 68 | fi 69 | 70 | LNAME=$(basename $REFERENCE) 71 | LABEL=${LNAME%%.fsa} 72 | mkdir $LABEL 73 | cd $LABEL 74 | 75 | 76 | unicycler_polish -1 ${SHORT1} -2 ${SHORT2} -a ${REFERENCE} --pilon /apps/x86_64/pilon/1.22/lib/pilon/pilon-1.22.jar --ale /apps/x86_64/ALE/ALE-20130717/src/ALE --threads $NSLOTS 77 | cd ..
78 | -------------------------------------------------------------------------------- /scripts/Kuhner-Felsenstein.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ref_tree=$1 4 | export query_tree=$2 5 | 6 | script=$(basename $0); 7 | if [ "$ref_tree" == "" ]; then 8 | echo "Usage: $script ref.dnd query.dnd" 9 | exit 1; 10 | fi; 11 | 12 | logmsg () { 13 | echo "$script: $@" >&2 14 | } 15 | 16 | # Check executables 17 | for exe in treedist randTrees.pl; do 18 | which $exe >/dev/null 2>&1; 19 | if [ $? -gt 0 ]; then 20 | logmsg "ERROR: I could not find $exe in your path!"; 21 | exit 1; 22 | fi; 23 | done; 24 | 25 | mkdir -p /tmp/$USER 26 | tmpdir=$(mktemp --directory --tmpdir=/tmp/$USER Kuhner-Felsenstein.XXXXXX) 27 | if [ $? -gt 0 ]; then logmsg "ERROR making temporary directory under /tmp/$USER"; exit 1; fi; 28 | logmsg "Temporary dir is $tmpdir"; 29 | 30 | cp $ref_tree $tmpdir/ &&\ 31 | cp $query_tree $tmpdir/ 32 | 33 | tmpRefTree="$tmpdir/$(basename $ref_tree)" 34 | tmpQueryTree="$tmpdir/$(basename $query_tree)" 35 | 36 | # Create a list of trees to compare against in $comparisonTrees: 37 | # The first tree is the reference tree 38 | # The next trees are randomly made trees from 39 | # the query tree. 40 | # Therefore, the question being answered is, is the query 41 | # closer to the ref than random trees? 42 | comparisonTrees="$tmpdir/compareAgainst.dnd"; 43 | cp $tmpRefTree $comparisonTrees 44 | randTrees.pl $tmpQueryTree --numTrees 1000 >> $comparisonTrees 45 | # We need the files to be named intree and intree2 because of inflexible treedist 46 | ln -s $tmpQueryTree "$tmpdir/intree" 47 | ln -s $comparisonTrees "$tmpdir/intree2" 48 | 49 | # Must do treedist in the temp directory because it pollutes the 50 | # local directory 51 | pushd $tmpdir > /dev/null 2>&1 52 | 53 | # Run treedist with the Kuhner and Felsenstein distance metric. 
54 | # If the option "D" were given, then it would be run with Robinson-Foulds 55 | #echo -ne 'D\n2\nL\nS\nY\n' | treedist > "$tmpdir/treedist.log" 2> "$tmpdir/treedist.out" 56 | echo -ne '2\nL\nS\nY\n' | treedist > "$tmpdir/treedist.log" 2> "$tmpdir/treedist.out" 57 | if [ $? -gt 0 ]; then logmsg "ERROR with treedist program: $(cat $tmpdir/treedist.log)"; exit 1; fi; 58 | 59 | # Find average and stdev 60 | cat outfile | perl -MStatistics::Descriptive -MMath::Gauss=cdf,pdf -MList::Util=sum -lane ' 61 | BEGIN{ 62 | my @F=split(/\s+/,<>); 63 | $obs=$F[2]; 64 | } 65 | next if($F[0] != 1); 66 | push(@difference,$F[2]); 67 | END{ 68 | $stat=Statistics::Descriptive::Full->new(); 69 | $stat->add_data(@difference); 70 | $num=@difference; 71 | $avg=$stat->mean; 72 | $stdev=$stat->standard_deviation; 73 | $var=$stdev**2; 74 | 75 | $Z=($obs - $avg)/$stdev; 76 | $p=cdf($obs,$avg,$stdev); 77 | 78 | # scientific and floating point formatting 79 | $_=sprintf("%0.2e",$_) for($p); 80 | $_=sprintf("%0.2f",$_) for($obs,$avg,$stdev,$Z); 81 | 82 | # Print results 83 | print join("\t",qw(ref_tree query_tree obs num avg stdev Z p)); 84 | print join("\t",$ENV{ref_tree},$ENV{query_tree},$obs, $num, $avg,$stdev,$Z,$p); 85 | } 86 | ' 87 | 88 | rm -rf $tmpdir 89 | 90 | -------------------------------------------------------------------------------- /scripts/MCM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # runs mauve contig mover 4 | # Author: Lee Katz 5 | 6 | set -e 7 | 8 | ref=$1 9 | draft=$2 10 | output=$3 11 | MAUVE=$(which Mauve) 12 | jar=`dirname $MAUVE`/Mauve.jar 13 | echo "JAR: $jar" 14 | 15 | if [ "$output" = "" ]; then 16 | echo "Usage: $0 ref.fasta draft.fasta outputDir"; 17 | exit 1; 18 | fi 19 | 20 | java -Xmx500m -cp $jar org.gel.mauve.contigs.ContigOrderer -output $output -ref $ref -draft $draft 21 | 22 | cd $output 23 | lastAlignment=$(\ls -t alignment*/alignment* | grep -P '\d$' | head -n 1) 24 | ln -s 
$lastAlignment ./alignment.xmfa 25 | cd .. 26 | echo "Best alignment can be found at $output/alignment.xmfa" 27 | -------------------------------------------------------------------------------- /scripts/alignmentToPhyloviz.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Bio::Perl; 7 | use Bio::AlignIO; 8 | use Getopt::Long; 9 | use File::Basename; 10 | use File::Slurp qw/read_file/; 11 | 12 | exit(main()); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(prefix=s defline-format=s help)); 17 | die usage() if($$settings{help}); 18 | 19 | my $prefix=$$settings{prefix}||"$0.out"; 20 | $$settings{'defline-format'}||="incremental"; 21 | my $alnFile=shift(@ARGV) || die "ERROR: need an alignment file\n".usage(); 22 | die "ERROR: you gave more than one alignment file: ".join(", ",@ARGV)."\n".usage() if(@ARGV>1); 23 | 24 | my($strainSeq)=seqToIdHash($alnFile,$settings); 25 | printResults($strainSeq,$prefix,$settings); 26 | 27 | return 0; 28 | } 29 | 30 | # Make a hash of sequence => [id1,id2,...] 
31 | sub seqToIdHash{ 32 | my($alnFile,$settings)=@_; 33 | my $aln=Bio::AlignIO->new(-file=>$alnFile)->next_aln; 34 | 35 | my %strainSeq; 36 | for my $seq($aln->each_seq){ 37 | push(@{$strainSeq{$seq->seq}},$seq->id); 38 | } 39 | return \%strainSeq; 40 | } 41 | 42 | sub printResults{ 43 | my($strainSeq,$prefix,$settings)=@_; 44 | open(ALN,">","$prefix.aln.fas") or die "ERROR: Could not write to alignment file $prefix.aln.fas:$!"; 45 | open(STS,">","$prefix.STs.txt") or die "ERROR: Could not write to alignment file $prefix.STs.txt:$!"; 46 | print STS join("\t",qw(ST IDs))."\n"; 47 | my $STcounter=0; 48 | while(my($sequence,$idArr)=each(%$strainSeq)){ 49 | $STcounter++; 50 | 51 | my $idStr=join("__",@$idArr); 52 | 53 | my $defline; 54 | if($$settings{'defline-format'} eq 'incremental'){ 55 | $defline=$STcounter; 56 | }elsif($$settings{'defline-format'} eq 'join'){ 57 | $defline=$idStr." ST:$STcounter"; 58 | } else{ 59 | die "ERROR: Could not understand defline-format parameter\n".usage(); 60 | } 61 | 62 | print ALN ">$defline\n$sequence\n"; 63 | print STS join("\t",$defline,$idStr)."\n"; 64 | } 65 | close ALN; close STS; 66 | print "Wrote $prefix.aln.fas and $prefix.STs.txt\n"; 67 | } 68 | sub usage{ 69 | local $0=fileparse($0); 70 | "Make an alignment suitable for PhyloViz. 71 | Converts an alignment into sequence types and the boiled down alignment, removing redundant ST entries. 
72 | Usage: $0 file.aln -p prefix 73 | -p prefix for output files: \$p.STs.aln and \$p.STs 74 | --defline-format incremental Options: incremental (default), or join 75 | " 76 | } 77 | -------------------------------------------------------------------------------- /scripts/anagramChecker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long qw/GetOptions/; 6 | use File::Basename qw/basename/; 7 | use Data::Dumper; 8 | 9 | local $0 = basename $0; 10 | 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | if($$settings{help} || @ARGV < 2){ 17 | die "Usage: $0 referenceWord queryWord [queryWord2...] 18 | 19 | This script checks for anagrams as compared to the reference word. 20 | However, it will report false positives if there differences in 21 | letter counts."; 22 | } 23 | 24 | my $bitwiseLetters = bitwiseLetters($settings); 25 | 26 | # Simplify to uppercase for all 27 | @ARGV = map{uc($_)} @ARGV; 28 | 29 | my $refWord=shift(@ARGV); 30 | my $refBitwise = wordToBitwise($refWord,$bitwiseLetters,$settings); 31 | for my $queryWord(@ARGV){ 32 | my $queryBitwise = wordToBitwise($queryWord,$bitwiseLetters,$settings); 33 | if($refBitwise == $queryBitwise){ 34 | print "$queryWord is an anagram of $refWord\n"; 35 | } else { 36 | print "$queryWord is not an anagram of $refWord\n"; 37 | } 38 | } 39 | 40 | return 0; 41 | } 42 | 43 | sub bitwiseLetters{ 44 | my($settings)=@_; 45 | 46 | # Make the resolution as high as 10 letters per word. 47 | my $numLetters=26 * 10; 48 | 49 | my %bitwiseLetter; 50 | my $ordOffset=ord("A"); 51 | for(my $i=0;$i<$numLetters;$i++){ 52 | # Mod to find the letter of the alphabet 53 | my $mod = $i % 26; 54 | # String multiplier, e.g., A x 3 = AAA 55 | my $multiplier = int($i / 26)+1; 56 | # The chr that corresponds to the letter extended by $multiplier. 
57 | my $key=chr($mod + $ordOffset) x $multiplier; 58 | 59 | # Power of 2 to take advantage of binary 60 | $bitwiseLetter{$key} = 2 ** $i; 61 | } 62 | 63 | return \%bitwiseLetter; 64 | } 65 | 66 | sub wordToBitwise{ 67 | my($word,$bitwiseLetters,$settings)=@_; 68 | 69 | my $bitwise=0; 70 | my $sortedLetters = join("",sort{$a cmp $b} split(//,$word)); 71 | 72 | while($sortedLetters=~/((.)\2*)/g){ 73 | $bitwise = $bitwise | $$bitwiseLetters{$1}; 74 | } 75 | 76 | return $bitwise; 77 | } 78 | -------------------------------------------------------------------------------- /scripts/art_profile.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Data::Dumper; 5 | use File::Basename qw/basename/; 6 | use Getopt::Long; 7 | use threads; 8 | use Thread::Queue; 9 | 10 | local $0=basename $0; 11 | sub logmsg {print STDERR "$0: @_\n";} 12 | exit(main()); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help numcpus=i)) or die $!; 17 | $$settings{numcpus}||=1; 18 | 19 | die usage() if(!@ARGV || $$settings{help}); 20 | 21 | # Find the counts for each fastq file 22 | my %counts; 23 | 24 | my $Q=Thread::Queue->new(@ARGV); 25 | my @thr; 26 | for(0..$$settings{numcpus}-1){ 27 | $Q->enqueue(undef); 28 | $thr[$_]=threads->new(\&getQualityCounts,$Q,$settings); 29 | } 30 | 31 | logmsg "All fastq files enqueued!"; 32 | for my $t(@thr){ 33 | logmsg "Waiting to join thread ".$t->tid; 34 | my $c = $t->join(); 35 | while(my($nt, $positionalQualityCounts)=each(%$c)){ 36 | while(my($pos, $qualityCounts)=each(%$positionalQualityCounts)){ 37 | while(my($quality, $count)=each(%$qualityCounts)){ 38 | $counts{$nt}{$pos}{$quality}+=$count; 39 | } 40 | } 41 | } 42 | } 43 | #print Dumper \%counts;die; 44 | 45 | # Print an ART format 46 | my @nt=sort(keys(%counts)); 47 | for my $nt (@nt){ 48 | my $positionalQualityCounts=$counts{$nt}; 49 | my @posArr=sort {$a<=>$b} 
keys(%$positionalQualityCounts); 50 | 51 | for my $pos (@posArr){ 52 | my @qualArr=sort {$a cmp $b} keys(%{$$positionalQualityCounts{$pos}}); 53 | 54 | # Print the first row for this position, the quality value 55 | print "$nt\t$pos"; 56 | for my $quality (@qualArr){ 57 | $quality //= chr(33); 58 | print "\t"; 59 | print ord($quality)-33; 60 | } 61 | print "\n"; 62 | 63 | # Print the second row for this position, the counts at the quality value 64 | print "$nt\t$pos"; 65 | for my $quality (@qualArr){ 66 | print "\t".$$positionalQualityCounts{$pos}{$quality}; 67 | } 68 | print "\n"; 69 | } 70 | 71 | } 72 | 73 | return 0; 74 | } 75 | 76 | sub getQualityCounts{ 77 | my($Q,$settings)=@_; 78 | 79 | my %counts; 80 | while(defined(my $fastq=$Q->dequeue)){ 81 | 82 | logmsg "Processing $fastq"; 83 | 84 | my $lineCounter=0; 85 | open(my $fh, "zcat $fastq | ") or die "ERROR: could not read $fastq: $!"; 86 | while(<$fh>){ 87 | my $seq=<$fh>; 88 | chomp($seq); 89 | my @seq = split(//, $seq); 90 | <$fh>; # burn the plus line 91 | my @qual = split(//, scalar(<$fh>)); 92 | # I _could_ chomp @qual but the last item won't be 93 | # reached if I do the for loop right. 
94 | 95 | my $readLength=@seq; 96 | for(my $i=0;$i<$readLength;$i++){ 97 | $counts{$seq[$i]}{$i}{$qual[$i]}++; 98 | } 99 | } 100 | close $fh; 101 | } 102 | 103 | return \%counts; 104 | } 105 | 106 | sub usage{ 107 | "Usage: $0 *_R1.fastq.gz > profile_R1.txt 108 | $0 *_R2.fastq.gz > profile_R2.txt 109 | --numcpus 1 Number of cpus 110 | " 111 | } 112 | -------------------------------------------------------------------------------- /scripts/bamStats.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long; 6 | use Data::Dumper; 7 | use File::Basename qw/basename/; 8 | 9 | local $0=basename $0; 10 | sub logmsg{print STDERR "$0: @_\n";} 11 | 12 | exit main(); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help percent|percentage label=s)) or die $!; 17 | die usage() if($$settings{help}); 18 | die usage() if(-t STDIN); 19 | 20 | my $QC=bamMetrics($settings); 21 | 22 | my @header=sort {$a cmp $b} keys(%$QC); 23 | 24 | if($$settings{label}){ 25 | unshift(@header, 'label'); 26 | $$QC{label}=$$settings{label}; 27 | } 28 | 29 | print join("\t",@header)."\n"; 30 | for my $header(@header){ 31 | print $$QC{$header}."\t"; 32 | } 33 | print "\n"; 34 | 35 | return 0; 36 | } 37 | 38 | sub bamMetrics{ 39 | my($settings)=@_; 40 | 41 | my $numReads=0; 42 | my %QC; 43 | for(qw(simple-unmapped simple-improperPair combination-singletonMap combination-bothUnmapped combination-bothProperPair combination-wrongOrientationGoodInsertSize combination-mappedButWrongInsertSize)){ 44 | $QC{$_}=0; 45 | } 46 | 47 | while(<>){ 48 | $numReads++; 49 | chomp; 50 | my($seqid, $flag, $rname, $pos, $mapQ, $cigar, $rnext, $pnext, $tlen, $seq, $qual) = split(/\t/, $_); 51 | 52 | # Individual tags 53 | if($flag & 4){ 54 | $QC{'simple-unmapped'}++; 55 | } 56 | if(! 
($flag & 2)){ 57 | $QC{'simple-improperPair'}++; 58 | } 59 | 60 | # Combination tags 61 | if($flag =~ /^(?:73|133|89|121|165|181|101|117|153|185|69|137)$/){ 62 | $QC{'combination-singletonMap'}++; 63 | } elsif ($flag =~ /^(?:77|141)$/){ 64 | $QC{'combination-bothUnmapped'}++; 65 | } elsif($flag =~ /^(?:99|147|83|163)$/){ 66 | $QC{'combination-bothProperPair'}++; 67 | } elsif($flag =~ /^(?:67|131|115|179)$/){ 68 | $QC{'combination-wrongOrientationGoodInsertSize'}++; 69 | } elsif($flag =~ /^(?:81|161|97|145|65|129|113|177)$/){ 70 | $QC{'combination-mappedButWrongInsertSize'}++; 71 | } 72 | } 73 | 74 | if($$settings{percent}){ 75 | for my $metric(keys(%QC)){ 76 | $QC{$metric} = sprintf("%0.2f", $QC{$metric}/$numReads*100); 77 | } 78 | } 79 | 80 | return \%QC; 81 | } 82 | 83 | sub usage{ 84 | "$0: get QC information on a sam file 85 | Usage: samtools view file.bam | $0 86 | --percent View results in percentages 87 | " 88 | } 89 | -------------------------------------------------------------------------------- /scripts/blastAndExtract.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | 4 | use strict; 5 | use warnings; 6 | use Data::Dumper; 7 | use Bio::Perl; 8 | use Getopt::Long; 9 | 10 | exit(main()); 11 | 12 | sub main{ 13 | my $settings={}; 14 | die usage() if(!@ARGV); 15 | GetOptions($settings,qw(flanking=i name=s)); 16 | $$settings{flanking}||=0; 17 | $$settings{revcom} ||=0; 18 | my ($db,$query)=@ARGV[0,1]; 19 | die "ERROR: need db and query\n".usage() if(!$query || !$db); 20 | 21 | my($contig,$start,$stop)=blastAgainstDb($db,$query,$settings); 22 | my $seq=extractSeq($contig,$start,$stop,$db,$settings); 23 | print "$seq\n"; 24 | return 0; 25 | } 26 | 27 | sub extractSeq{ 28 | my($contig,$start,$stop,$db,$settings)=@_; 29 | my $fastaStr; 30 | if($start>$stop){ 31 | my $tmp=$start; 32 | $start=$stop; 33 | $stop=$tmp; 34 | $$settings{revcom}=1; 35 | } 36 | my
$in=Bio::SeqIO->new(-file=>$db); 37 | while(my $seq=$in->next_seq){ 38 | next if($seq->id ne $contig); 39 | $start=$start-$$settings{flanking}; 40 | $stop=$stop+$$settings{flanking}; 41 | $start=1 if($start<1); 42 | $stop=$seq->length if($stop>$seq->length); 43 | my $sequence=$seq->subseq($start,$stop); 44 | $sequence=~tr/ATCGatcg/TAGCtagc/ if($$settings{revcom}); 45 | $sequence=reverse($sequence) if($$settings{revcom}); 46 | $sequence=~s/(.{60})/$1\n/g; 47 | my $id=join("_",">$$settings{name}".$seq->id,$start,$stop); 48 | $fastaStr="$id\n$sequence"; 49 | } 50 | die "Could not create a fasta in contig $contig with start/stop=$start/$stop\n" if(!$fastaStr); 51 | return $fastaStr; 52 | } 53 | 54 | sub blastAgainstDb{ 55 | my($db,$query,$settings)=@_; 56 | my $command="legacy_blast.pl blastall -i '$query' -d '$db' -a 12 -e 0.05 -m 8 -p blastn -v 5 -b 5"; 57 | my @result=split(/\n/,`$command`); 58 | die if(!@result); 59 | @result=sort{ 60 | (split(/\t/,$b))[2]<=>(split(/\t/,$a))[2] 61 | } @result; 62 | 63 | my($contig,$start,$stop)=(split(/\t/,$result[0]))[1,8,9]; 64 | return ($contig,$start,$stop); 65 | } 66 | 67 | sub usage{ 68 | "Blasts a nucleotide sequence against a database and extracts the hit 69 | Usage: $0 database.fasta query.fasta > hit.fasta 70 | -f 100 to extract 100bp upstream/downstream 71 | -n custom genome name to put into the defline 72 | " 73 | } 74 | -------------------------------------------------------------------------------- /scripts/bp_jackknifeTrees.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Bio::TreeIO; 7 | use Bio::Tree::Statistics; 8 | use Getopt::Long; 9 | 10 | exit main(); 11 | 12 | sub main{ 13 | my $settings={}; 14 | GetOptions($settings,qw(help)) or die $!; 15 | 16 | if(!@ARGV){ 17 | die usage(); 18 | } 19 | 20 | print STDERR "Reading in files\n"; 21 | my $guideTree; # will be the first tree 22 | my 
@bs_tree=(); 23 | my $i; 24 | for my $file(@ARGV){ 25 | if(!-e $file|| !-s $file){ 26 | print STDERR "Not found or empty: $file "; 27 | next; 28 | } 29 | my $in=Bio::TreeIO->new(-file=>$file,-format=>"newick"); 30 | #while(my $tree=next_tree_fast($in)){ 31 | while(my $tree=$in->next_tree){ 32 | if(!$guideTree){ 33 | $guideTree = $tree; 34 | next; 35 | } 36 | push(@bs_tree, $tree); 37 | print STDERR "."; 38 | } 39 | } 40 | 41 | if(!$guideTree){ 42 | die "ERROR: no guide tree found"; 43 | } 44 | if(!@bs_tree){ 45 | die "ERROR: no jack knife trees found"; 46 | } 47 | 48 | print STDERR "\n"; 49 | print STDERR "Combining jack knife files\n"; 50 | my $biostat = Bio::Tree::Statistics->new(); 51 | #my $bsTree=$biostat->assess_bootstrap(\@bs_tree,$guideTree); 52 | my $bsTree = assess_bootstrap($biostat, \@bs_tree, $guideTree); 53 | print STDERR "Reading internal nodes\n"; 54 | for my $node($bsTree->get_nodes){ 55 | print STDERR "."; 56 | next if($node->is_Leaf); 57 | 58 | if(!$node->id){ 59 | $node->id($node->bootstrap); 60 | } 61 | } 62 | print STDERR "\n"; 63 | print $bsTree->as_text("newick")."\n"; 64 | 65 | return 0; 66 | } 67 | 68 | sub assess_bootstrap{ 69 | my ($self,$bs_trees,$guide_tree) = @_; 70 | my @consensus; 71 | 72 | # internal nodes are defined by their children 73 | 74 | my (%lookup,%internal); 75 | my $i = 0; 76 | for my $tree ( $guide_tree, @$bs_trees ) { 77 | # Do this as a top down approach, can probably be 78 | # improved by caching internal node states, but not going 79 | # to worry about it right now. 80 | 81 | my @allnodes = $tree->get_nodes; 82 | my @internalnodes = grep { ! 
$_->is_Leaf } @allnodes; 83 | for my $node ( @internalnodes ) { 84 | my @tips = sort map { $_->id } 85 | grep { $_->is_Leaf() } $node->get_all_Descendents; 86 | my $id = "(".join(",", @tips).")"; 87 | if( $i == 0 ) { 88 | $internal{$id} = $node->internal_id; 89 | } else { 90 | $lookup{$id}++; 91 | } 92 | } 93 | $i++; 94 | } 95 | $i--; # do not count the guide tree in the denominator 96 | 97 | my @save; 98 | for my $l ( keys %lookup ) { 99 | if( defined $internal{$l} ) {#&& $lookup{$l} > $min_seen ) { 100 | my $intnode = $guide_tree->find_node(-internal_id => $internal{$l}); 101 | $intnode->bootstrap(sprintf("%d",100 * $lookup{$l} / $i)); 102 | } 103 | } 104 | return $guide_tree; 105 | } 106 | 107 | sub usage{ 108 | "Usage: $0 guidetree.dnd jackknife.dnd [jackknife2.dnd...] > tree_with_confidence.nwk 109 | Where each jack knife tree can have multiple entries and the output tree 110 | will be a single entry with confidence values." 111 | } 112 | -------------------------------------------------------------------------------- /scripts/clusterDensityFromFastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use File::Basename qw/basename/; 7 | use Getopt::Long qw/GetOptions/; 8 | use POSIX qw/ceil/; 9 | use threads; 10 | use Thread::Queue; 11 | 12 | local $0 = basename $0; 13 | sub logmsg{print STDERR "$0: @_\n";} 14 | 15 | exit main(); 16 | sub main{ 17 | my $settings={}; 18 | GetOptions($settings,qw(help numcpus=i tile-size|size-of-tile=s)) or die $!; 19 | $$settings{'tile-size'} ||= 1; 20 | $$settings{numcpus}||=1; 21 | 22 | my $fastqPerThread = ceil(scalar(@ARGV) / $$settings{numcpus}); 23 | 24 | my $printQ = Thread::Queue->new(); 25 | $printQ->enqueue("File\tspots-per-mm2"); 26 | 27 | my @thr; 28 | for(my $i=0;$i<$$settings{numcpus};$i++){ 29 | my @fastq = splice(@ARGV,0,$fastqPerThread); 30 | $thr[$i] = threads->new(sub{ 31 | my($fastqArr, 
$printQ)=@_; 32 | for my $fastq(@$fastqArr){ 33 | logmsg $fastq; 34 | my $density = clusterDensity($fastq,$settings); 35 | $printQ->enqueue([$fastq, $density]); 36 | } 37 | return scalar(@fastq); 38 | }, \@fastq, $printQ); 39 | } 40 | 41 | # start the printer 42 | my $printerThread = threads->new(\&printer, $printQ); 43 | 44 | # join the threads 45 | for(@thr){ 46 | $_->join; 47 | } 48 | 49 | # Terminate multithreaded printing 50 | $printQ->enqueue(undef); 51 | $printerThread->join(); 52 | 53 | return 0; 54 | } 55 | 56 | sub printer{ 57 | my($Q)=@_; 58 | while(defined(my $toPrint = $Q->dequeue)){ 59 | if(ref($toPrint) eq 'ARRAY'){ 60 | print join("\t",@$toPrint)."\n"; 61 | } else { 62 | print "$toPrint\n"; 63 | } 64 | } 65 | } 66 | 67 | sub clusterDensity{ 68 | my($fastq,$settings)=@_; 69 | 70 | # Try to get some compile-time speedup 71 | my $colonRegex = qr/:/; 72 | my $whitespaceRegex = qr/\s+/; 73 | 74 | my %tileCount; 75 | open(my $fh, "-|", "zcat $fastq") or die "ERROR: could not zcat $fastq: $!"; 76 | while(my $header=<$fh>){ 77 | # Burn three lines. We're only looking at the header. 78 | <$fh>; 79 | <$fh>; 80 | <$fh>; 81 | 82 | chomp($header); 83 | my($firstPart,undef) = split($whitespaceRegex, $header); 84 | my($instrument, $run, $flowcell, $lane, $tile, $x, $y) = split($colonRegex, $firstPart); 85 | #my(undef, undef, undef, undef, $tile, $x, $y) = split($colonRegex, $firstPart); 86 | $tileCount{$tile}++; 87 | } 88 | close $fh; 89 | 90 | my $total=0; 91 | while(my($tile,$count)=each(%tileCount)){ 92 | $total+=$count; 93 | } 94 | my $averagePerTile = sprintf("%0.2f", $total/scalar(keys(%tileCount))/$$settings{'tile-size'}); 95 | 96 | return $averagePerTile; 97 | } 98 | 99 | sub usage{ 100 | "$0: Calculates cluster density of a fastq file with casava style headers 101 | Usage: $0 [options] *.fastq.gz 102 | 103 | --tile-size 1 Size of the tile in square mm. 
104 | --numcpus 1 105 | " 106 | } 107 | -------------------------------------------------------------------------------- /scripts/countATCG.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Count the number of nucleotides in a sequence file 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Getopt::Long; 9 | 10 | exit(main()); 11 | 12 | sub main{ 13 | 14 | my $settings={}; 15 | GetOptions($settings,qw(help discoverNts)) or die $!; 16 | 17 | die usage() if($$settings{help}); 18 | 19 | my @seq=@ARGV; 20 | die "ERROR: no sequence files given!\n".usage() if(!@seq); 21 | # Make an array of nts so that the ordering doesn't change 22 | my @nt=ntArray(\@seq,$settings); 23 | my @ntWithN=(@nt,"N"); 24 | 25 | # print the header 26 | print $_."\t" for("file",@ntWithN); 27 | print "\n"; 28 | 29 | # Count the ATCG for each parameter 30 | for my $seq(@seq){ 31 | count($seq,\@nt,\@ntWithN); 32 | } 33 | return 0; 34 | } 35 | 36 | sub ntArray{ 37 | my($seq,$settings)=@_; 38 | my @nt=qw(A T C G); 39 | if($$settings{discoverNts}){ 40 | # make a long string of nucleotides 41 | my $concatSeq=""; 42 | for my $seqfile(@$seq){ 43 | my $seqin=Bio::SeqIO->new(-file=>$seqfile); 44 | while(my $seqObj=$seqin->next_seq){ 45 | $concatSeq.=uc($seqObj->seq); 46 | } 47 | } 48 | # Figure out which nts are present 49 | my %nt; 50 | my $seqLength=length($concatSeq); 51 | for(my $i=0;$i<$seqLength;$i++){ 52 | $nt{substr($concatSeq,$i,1)}=1; 53 | } 54 | @nt=keys(%nt); 55 | } 56 | return @nt; 57 | } 58 | 59 | sub count{ 60 | my($arg,$ntArr,$ntWithN)=@_; 61 | # Read the sequence into an object 62 | my $in=Bio::SeqIO->new(-file=>$arg); 63 | 64 | my %num; 65 | while(my $seq=$in->next_seq){ 66 | $seq=$seq->seq; 67 | # get the counts of nts 68 | for my $nt(@$ntArr){ 69 | $num{$nt}=($seq=~s/$nt//gi); 70 | } 71 | $num{"N"}=length($seq); 72 | } 73 | print $arg."\t"; 74 | print $num{$_}."\t" for(@$ntWithN); # 
print values 75 | print "\n"; # newline to make it pretty 76 | $in->close; 77 | } 78 | 79 | sub usage{ 80 | "Counts the number of each nucleotide in a sequence file 81 | Usage: $0 file.fasta [file2.fasta ...] 82 | --discoverNts Look at the sequencing file first to figure out which Nts are present in the first place (slower) 83 | " 84 | } 85 | -------------------------------------------------------------------------------- /scripts/directoryDuration.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Give the approximate lenght of time it took to make all files in a directory 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | use File::Find qw/find/; 10 | 11 | local $0=basename $0; 12 | 13 | my $settings={}; 14 | GetOptions($settings,qw(help exclude=s include=s verbose)) or die $!; 15 | die usage() if(!@ARGV || $$settings{help}); 16 | my $exclude=$$settings{exclude}||0; 17 | my $include=$$settings{include}||0; 18 | 19 | for my $dir(@ARGV){ 20 | die "ERROR: $dir is not a directory" if(!-d $dir); 21 | 22 | my $oldest= ~0; 23 | my $newest=0; 24 | find({no_chdir=>1, wanted=>sub{ 25 | return if(-d $File::Find::name); 26 | if($exclude && $File::Find::name =~ /$exclude/){ 27 | print "Excluding: $File::Find::name\n" if($$settings{verbose}); 28 | return; 29 | } 30 | if($include && $File::Find::name !~ /$include/){ 31 | print "Not including: $File::Find::name\n" if($$settings{verbose}); 32 | return; 33 | } 34 | my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 35 | $atime,$mtime,$ctime,$blksize,$blocks) 36 | =stat($File::Find::name); 37 | 38 | if(!$mtime){ 39 | print "WARNING: no timestamp $File::Find::name\n" if($$settings{verbose}); 40 | return; 41 | } 42 | if($mtime < $oldest){ 43 | print "oldest $File::Find::name\n" if($$settings{verbose}); 44 | $oldest=$mtime; 45 | } 46 | if($mtime > $newest){ 47 | print "newest $File::Find::name\n" 
if($$settings{verbose}); 48 | $newest=$mtime; 49 | } 50 | }},$dir); 51 | 52 | my $duration=$newest-$oldest; 53 | print "$dir\t$duration\n"; 54 | } 55 | 56 | 57 | exit 0; 58 | 59 | sub usage{ 60 | "Usage: $0 dir 61 | Gives the number of seconds between the oldest and 62 | newest files 63 | 64 | --verbose 65 | --exclude PATTERN Supply a regex pattern to ignore 66 | certain filenames. 67 | " 68 | } 69 | -------------------------------------------------------------------------------- /scripts/downloadSrrRemotely.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Lee Katz 3 | 4 | #$ -cwd -V 5 | #$ -S /bin/sh 6 | #$ -q all.q -pe smp 1 7 | #$ -N downloadSrrViaRGN 8 | #$ -o download.log -j y 9 | 10 | ## CONFIGURATION 11 | RGN_ASCP_PATH="/opt/aspera/bin/ascp" 12 | ASCP_XOPTS="-v -QT -l640M -i /opt/aspera/etc/asperaweb_id_dsa.putty" 13 | RGN_URI="gzu2-rgntds@rgntds.cdc.gov" 14 | ## END CONFIGURATION 15 | ##################### 16 | 17 | SRR=$1 18 | NAME=$2 19 | OUTDIR=$3 20 | THISSCRIPT=$(basename $0); 21 | 22 | # Usage statement 23 | if [ "$NAME" == "" ]; then 24 | echo "Downloads a genome using the RGN, then sends it back to you, then decompresses it into split reads" 25 | echo "Usage: $THISSCRIPT SRR0123456 nameOfGenome outdir" 26 | echo "Example: $THISSCRIPT SRR1041486 FL_FLDACS-00090 ." 27 | exit 1; 28 | fi 29 | 30 | 31 | echo `date +'%H:%M:%S'`" Transferring the file to the remote computer from NCBI" 32 | THREE=${SRR:0:3} 33 | SIX=${SRR:0:6} 34 | ssh $RGN_URI "mkdir -p /tmp/$USER; $RGN_ASCP_PATH $ASCP_XOPTS anonftp@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/$THREE/$SIX/$SRR/$SRR.sra /tmp/$USER/$SRR.sra" 35 | if [ $? -gt 0 ]; then echo "ERROR with ascp on the remote computer!"; exit 1; fi; 36 | 37 | echo `date +'%H:%M:%S'`" Transferring the file back to this computer" 38 | rsync -e "ssh -x" --progress -a $RGN_URI:/tmp/$USER/$SRR.sra $OUTDIR/$NAME.sra 39 | if [ $? 
-gt 0 ]; then echo "ERROR with transferring the file to here using rsync"; exit 1; fi; 40 | 41 | echo `date +'%H:%M:%S'` "Decompressing the file into fastq.gz - this might take a while"; 42 | fastq-dump -v --defline-seq '@$ac_$sn[_$rn]/$ri' --defline-qual '+' --split-files -O . --gzip $OUTDIR/$NAME.sra 43 | if [ $? -gt 0 ]; then echo "ERROR with fastq-dump"; exit 1; fi; 44 | 45 | 46 | echo `date +'%H:%M:%S'`" Finished. Files will be found in $OUTDIR"; 47 | 48 | exit 0; 49 | 50 | -------------------------------------------------------------------------------- /scripts/exportBioNumericsFastaWithCoverage.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Splits up a bionumerics fasta file into contigs 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Bio::Tools::GuessSeqFormat; 9 | use File::Basename; 10 | use autodie; 11 | use Getopt::Long; 12 | use File::Basename qw/basename/; 13 | 14 | local $0 = basename($0); 15 | 16 | sub logmsg{ print STDERR "$0: @_\n"; } 17 | 18 | exit main(); 19 | 20 | sub main{ 21 | my $settings={}; 22 | GetOptions($settings,qw(help outdir=s)) or die $!; 23 | 24 | die usage() if(!@ARGV || $$settings{help}); 25 | 26 | if($$settings{outdir}){ 27 | if(! -d $$settings{outdir}){ 28 | mkdir $$settings{outdir}; 29 | } 30 | } 31 | 32 | for my $f(@ARGV){ 33 | printFasta($f, $settings); 34 | } 35 | 36 | return 0; 37 | } 38 | 39 | sub printFasta{ 40 | my($infile, $settings) = @_; 41 | 42 | # Because we are reading bionumerics files and do not 43 | # trust their extensions, make a better guess at their 44 | # format using the format guesser. 
45 | #my $formatGuesser=Bio::Tools::GuessSeqFormat->new(-file=>$infile); 46 | #my $format =$formatGuesser->guess; 47 | 48 | my $format = "fasta"; 49 | 50 | logmsg $infile; 51 | my $in=Bio::SeqIO->new(-file=>$infile,-format=>$format); 52 | while(my $seq=$in->next_seq){ 53 | my @seq=split(/\|/,$seq->seq); 54 | my $id=$seq->id; 55 | $id=~s/^denovo\|//; # remove 'denovo' since most exports seem to have that 56 | 57 | my $out=Bio::SeqIO->new(-format=>"fasta"); 58 | if($$settings{outdir}){ 59 | # For potential filenames, get a safe name 60 | my($SRR, $orig, $cov, $rep, $asm) = split(/_/, $id); 61 | 62 | my $outfile="$$settings{outdir}/${SRR}_${orig}_${cov}_${rep}_${asm}.fa"; 63 | if(-e $outfile){ 64 | #print "Collision: $infile => $outfile\n"; 65 | next; 66 | } 67 | $out=Bio::SeqIO->new(-format=>"fasta",-file=>">$outfile"); 68 | } 69 | for(my $i=1;$i<=@seq;$i++){ 70 | my $subseq=Bio::Seq->new(-seq=>$seq[$i-1],-id=>$id."_".$i); 71 | $out->write_seq($subseq); 72 | } 73 | $out->close; 74 | } 75 | } 76 | 77 | sub usage{ 78 | local $0=fileparse $0; 79 | "Usage: $0 bionumerics.fasta > out.fasta 80 | --outdir '' If given, all genomes will be written here. 81 | If blank, output will be sent to stdout. 82 | " 83 | } 84 | -------------------------------------------------------------------------------- /scripts/fastqDump-SE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SRR=$1 4 | 5 | R1="${SRR}.fastq.gz"; 6 | R1uncompressed="${SRR}.fastq" 7 | 8 | script=$(basename $0); 9 | if [ "$SRR" == "" ]; then 10 | echo "Downloads a fastq properly using fastq-dump" 11 | echo " Sorts the reads for maximum compression using fasten_sort." 12 | echo "Usage: $script SRR_accession" 13 | exit 1; 14 | fi 15 | 16 | set -e 17 | set -u 18 | 19 | if [ -e "${SRR}.fastq.gz" ]; then 20 | echo "${SRR}.fastq.gz is already present." 21 | exit 1 22 | fi 23 | if [ -e "$R1" ]; then 24 | echo "$R1 is already present." 
25 | exit 1 26 | fi 27 | 28 | module purge 29 | #module load sratoolkit/2.9.1 30 | module load sratoolkit/2.11.3 31 | 32 | # Check if fasten_sort is in the path and if not, quit 33 | echo "Checking dependency paths" 34 | which fasten_sort 35 | which fasterq-dump || which fastq-dump 36 | 37 | tempdir=$(mktemp --directory --tmpdir=$TMPDIR $(basename $0).XXXXXX) 38 | trap "{ rm -rf $tempdir; }" EXIT SIGINT SIGTERM 39 | echo "Files will temporarily be stored in $tempdir" 40 | 41 | # Decide whether to run fastq-dump or fasterq-dump 42 | fasterqDump="$(which fasterq-dump 2>/dev/null)"; 43 | if [ "$fasterqDump" == "" ]; then 44 | fastq-dump --accession $SRR --outdir $tempdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip 45 | if [ $? -gt 0 ]; then 46 | echo "ERROR with fastq-dump and $SRR" 47 | exit 1 48 | fi 49 | else 50 | cd $tempdir 51 | fasterq-dump $SRR --print-read-nr --threads 1 --outdir $tempdir --split-files --skip-technical 52 | if [ $? -gt 0 ]; then 53 | echo "ERROR with fasterq-dump and $SRR" 54 | exit 1 55 | fi 56 | if [ ! -e "$R1uncompressed" ]; then 57 | echo "ERROR: R1uncompressed not present in filename $R1uncompressed"; 58 | ls -lhA $tempdir 59 | exit 1; 60 | fi 61 | cd - 62 | 63 | # Compress fastq files 64 | for fastq in $tempdir/*.fastq; do 65 | # Remove quality defline with perl before compressing 66 | perl -lane ' 67 | if($. % 4 == 3){ 68 | $_="+"; 69 | } 70 | print; 71 | ' < $fastq > $fastq.tmp; 72 | mv $fastq.tmp $fastq 73 | done; 74 | fi 75 | 76 | # Intense compression 77 | mv -v $tempdir/$R1uncompressed $tempdir/unsorted.fastq 78 | cat $tempdir/unsorted.fastq | \ 79 | fasten_sort --sort-by SEQ | \ 80 | fasten_progress --print --id sort-reads --update-every 100000 | \ 81 | gzip -vc9 > $tempdir/$R1uncompressed.gz 82 | 83 | rm -v $tempdir/unsorted.fastq 84 | 85 | ls -lhd $tempdir 86 | ls -lh $tempdir/* 87 | mv -v $tempdir/$R1 . 
88 | 89 | -------------------------------------------------------------------------------- /scripts/fastqDump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SRR=$1 4 | 5 | R1="${SRR}_1.fastq.gz"; 6 | R2="${SRR}_2.fastq.gz"; 7 | R1uncompressed="${SRR}_1.fastq" 8 | R2uncompressed="${SRR}_2.fastq" 9 | 10 | script=$(basename $0); 11 | if [ "$SRR" == "" ]; then 12 | echo "Downloads a fastq properly using fastq-dump" 13 | echo " Sorts the reads for maximum compression using fasten_sort." 14 | echo "Usage: $script SRR_accession" 15 | exit 1; 16 | fi 17 | 18 | set -e 19 | set -u 20 | 21 | if [ -e "${SRR}.fastq.gz" ]; then 22 | echo "${SRR}.fastq.gz is already present." 23 | exit 1 24 | fi 25 | if [ -e "$R1" ]; then 26 | echo "$R1 is already present." 27 | exit 1 28 | fi 29 | 30 | module purge 31 | #module load sratoolkit/2.9.1 32 | module load sratoolkit/2.11.3 33 | 34 | # Check if fasten_sort is in the path and if not, quit 35 | echo "Checking dependency paths" 36 | which fasten_sort 37 | which fasterq-dump || which fastq-dump 38 | 39 | tempdir=$(mktemp --directory --tmpdir=$TMPDIR $(basename $0).XXXXXX) 40 | trap "{ rm -rf $tempdir; }" EXIT SIGINT SIGTERM 41 | echo "Files will temporarily be stored in $tempdir" 42 | 43 | # Decide whether to run fastq-dump or fasterq-dump 44 | fasterqDump="$(which fasterq-dump 2>/dev/null)"; 45 | if [ "$fasterqDump" == "" ]; then 46 | fastq-dump --accession $SRR --outdir $tempdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip 47 | if [ $? -gt 0 ]; then 48 | echo "ERROR with fastq-dump and $SRR" 49 | exit 1 50 | fi 51 | else 52 | cd $tempdir 53 | fasterq-dump $SRR --print-read-nr --threads 1 --outdir $tempdir --split-files --skip-technical 54 | if [ $? -gt 0 ]; then 55 | echo "ERROR with fasterq-dump and $SRR" 56 | exit 1 57 | fi 58 | if [ ! 
-e "$R1uncompressed" ]; then 59 | echo "ERROR: R1uncompressed not present in filename $R1uncompressed"; 60 | ls -lhA $tempdir 61 | exit 1; 62 | fi 63 | cd - 64 | 65 | # Compress fastq files 66 | for fastq in $tempdir/*.fastq; do 67 | # Remove quality defline with perl before compressing 68 | perl -lane ' 69 | if($. % 4 == 3){ 70 | $_="+"; 71 | } 72 | print; 73 | ' < $fastq > $fastq.tmp; 74 | mv $fastq.tmp $fastq 75 | done; 76 | fi 77 | 78 | # Intense compression 79 | #echo "DEBUG"; head -n 888 $tempdir/$R1uncompressed > $tempdir/unsorted_1.fastq 80 | #echo "DEBUG"; head -n 888 $tempdir/$R2uncompressed > $tempdir/unsorted_2.fastq 81 | mv -v $tempdir/$R1uncompressed $tempdir/unsorted_1.fastq 82 | mv -v $tempdir/$R2uncompressed $tempdir/unsorted_2.fastq 83 | cat $tempdir/unsorted_1.fastq $tempdir/unsorted_2.fastq | \ 84 | fasten_shuffle | \ 85 | fasten_sort --sort-by SEQ --paired-end | \ 86 | fasten_progress --print --id sort-reads --update-every 100000 | \ 87 | fasten_shuffle -d -1 $tempdir/$R1uncompressed -2 $tempdir/$R2uncompressed 88 | gzip -v9 $tempdir/$R1uncompressed $tempdir/$R2uncompressed 89 | rm -v $tempdir/unsorted_1.fastq $tempdir/unsorted_2.fastq 90 | 91 | ls -lhd $tempdir 92 | ls -lh $tempdir/* 93 | mv -v $tempdir/{$R1,$R2} . 
94 | 
95 | 
-------------------------------------------------------------------------------- /scripts/fastqMaxCompression.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Recursively find fastq files under a directory and recompress them in place
# at gzip's maximum compression level (-9).

set -e

DIR=$1

if [ "$DIR" == "" ]; then
  echo "Finds fastq.gz files and runs max compression on them";
  echo "Usage: $0 dir"
  exit 1
fi

TMP=$(mktemp --directory FASTQMAXCOMPRESSION.XXXXXX --tmpdir=$TMPDIR)
trap "{ rm -rf $TMP; }" EXIT
export TMP

# Recompress already-gzipped fastq files, unless `file` reports that a file is
# already at max compression. Parentheses make -print0 apply to both -iname
# alternatives; -print0/-0 keeps filenames with whitespace intact.
find "$DIR" \( -iname '*.fastq.gz' -o -iname '*.fq.gz' \) -print0 | xargs -0 -P 1 -n 1 bash -c '
  if [ "$(file "$0" | grep -m 1 -o "max compression" | head -n 1)" != "" ]; then
    echo "Skipping $0 bc it is already max compressed"
    exit 0
  fi

  originalSize=$(du "$0" | cut -f 1)

  tmpfile=$(mktemp --tmpdir=$TMP MAX.XXXXXX --suffix=.fastq.gz)
  trap "{ rm -f $tmpfile; }" EXIT

  echo "$0 => $tmpfile"
  gzip -dc "$0" | gzip -9c > $tmpfile && \
    mv -v $tmpfile "$0"

  newsize=$(du "$0" | cut -f 1);
  savings=$(printf "%0.2f" $(echo "$newsize/$originalSize" | bc -l));

  echo "New file is $savings of original"
'

# Compress any remaining uncompressed fastq files.
# Fixed: the original expression was `-name '*.fastq' -or '*.fq'`, which is
# invalid find syntax (the second pattern was missing its -name primary and
# find rejects a bare string there), so this second pass always errored out.
find "$DIR" \( -name '*.fastq' -o -name '*.fq' \) -print0 | xargs -0 -P 1 -n 1 bash -c '
  tmpfile=$(mktemp --tmpdir=$TMP MAX.XXXXXX --suffix=.fastq.gz)
  trap "{ rm -f $tmpfile; }" EXIT

  gzip -c9 "$0" > $tmpfile && \
    mv $tmpfile "$0.gz" && \
    rm "$0"
'

-------------------------------------------------------------------------------- /scripts/fastqToFastaQual.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

# Convert a fastq to a fasta/qual combo using BioPerl, with some Linux commands

use Bio::Perl;
use Data::Dumper;
use strict;
use warnings;
use threads;
use Thread::Queue;
use Getopt::Long;

my $settings={};

$|=1;
my %numSequences; # static for a
subroutine 17 | 18 | exit(main()); 19 | 20 | sub main{ 21 | my $usage="Usage: $0 -i inputFastqFile [-n numCpus -q outputQualfile -f outputFastaFile]"; 22 | die($usage) if(@ARGV<1); 23 | 24 | GetOptions($settings,('numCpus=s','input=s','qualOut=s','fastaOut=s','help')); 25 | die $usage if($$settings{help}); 26 | 27 | my $file=$$settings{input}||die("input parameter missing"); 28 | my $outfasta=$$settings{fastaOut}||"$file.fasta"; 29 | my $outqual=$$settings{qualOut}||"$file.qual"; 30 | my $numCpus=$$settings{numCpus}||1; 31 | 32 | my @subfile=splitFastq($file,$numCpus); 33 | for my $f(@subfile){ 34 | threads->create(\&convert,$f,"$f.fasta","$f.qual"); 35 | } 36 | $_->join for (threads->list); 37 | 38 | # join the sub files together 39 | joinFastqFiles(\@subfile,$file); 40 | 41 | return 1; 42 | } 43 | 44 | sub convert{ 45 | my($file,$outfasta,$outqual)=@_; 46 | 47 | my $numSequences=numSequences($file); 48 | my $reportEvery=int($numSequences/100) || 1; 49 | print "$numSequences sequences to convert in $file\n"; 50 | 51 | my $in=Bio::SeqIO->new(-file=>$file,-format=>"fastq-illumina"); 52 | my $seqOut=Bio::SeqIO->new(-file=>">$outfasta",-format=>"fasta"); 53 | my $qualOut=Bio::SeqIO->new(-file=>">$outqual",-format=>"qual"); 54 | my $seqCount=0; 55 | my $percentDone=0; 56 | while(my $seq=$in->next_seq){ 57 | $seqOut->write_seq($seq); 58 | $qualOut->write_seq($seq); 59 | $seqCount++; 60 | if($seqCount%$reportEvery == 0){ 61 | $percentDone++; 62 | print "$percentDone%.."; 63 | } 64 | } 65 | print "Done with subfile $file.\n"; 66 | return 1; 67 | } 68 | 69 | sub joinFastqFiles{ 70 | my($subfile,$outfileBasename)=@_; 71 | my($command,$subfasta,$subqual); 72 | 73 | # fasta 74 | $subfasta.="$_.fasta " for(@$subfile); 75 | $command="cat $subfasta > $outfileBasename.fasta"; 76 | system($command); 77 | 78 | # qual 79 | $subqual.="$_.qual " for (@$subfile); 80 | $command="cat $subqual > $outfileBasename.qual"; 81 | system($command); 82 | 83 | return 1; 84 | } 85 | 86 | sub 
splitFastq{ 87 | my($file,$numCpus)=@_; 88 | my $prefix="FQ"; # for fastq 89 | my $numSequences=numSequences($file); 90 | my $numSequencesPerFile=int($numSequences/$numCpus); 91 | my $numSequencesPerFileRemainder=$numSequences % $numCpus; 92 | my $numLinesPerFile=$numSequencesPerFile*4; # four lines per read 93 | system("rm -r tmp;mkdir tmp;"); 94 | system("split -l $numLinesPerFile $file 'tmp/FQ'"); 95 | 96 | return glob("tmp/FQ*"); 97 | } 98 | 99 | 100 | sub numSequences{ 101 | my $file=shift; 102 | return $numSequences{$file} if($numSequences{$file}); 103 | my $num=`grep -c '^\@' $file`; 104 | chomp($num); 105 | $numSequences{$file}=$num; 106 | return $num; 107 | } 108 | -------------------------------------------------------------------------------- /scripts/filterKrakenOutput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long qw/GetOptions/; 6 | use File::Basename qw/basename/; 7 | use File::Temp qw/tempdir/; 8 | use Data::Dumper; 9 | 10 | local $0=basename $0; 11 | 12 | sub logmsg{print STDERR "$0: @_\n"} 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help tempdir=s taxid=s)) or die $!; 18 | die usage() if(!@ARGV); 19 | die "ERROR: need taxid" if(!defined $$settings{taxid}); 20 | $$settings{tempdir}||=tempdir("$0.XXXXXX", TMPDIR=>1, CLEANUP=>1); 21 | 22 | my @taxid = split(/,/, $$settings{taxid}); 23 | my @fastq = @ARGV; 24 | 25 | my $regex = "^".join('$|^', @taxid)."\$"; 26 | $regex = qr/$regex/; 27 | 28 | my %readid = (); 29 | while(){ 30 | my(undef, $readid, $taxid) = split(/\t/, $_); 31 | if($taxid =~ $regex){ 32 | $readid{$readid} = 1; 33 | } 34 | } 35 | 36 | for my $f(@fastq){ 37 | logmsg "Reading $f and filtering for $$settings{taxid}"; 38 | open(my $fh, "zcat $f | ") or die "ERROR: could not gunzip $f: $!"; 39 | while(my $id = <$fh>){ 40 | my $entry = $id; 41 | $entry.=<$fh> for(1..3); 42 | 43 | 
$id =~ s/^@|\s+.*$//g; # remove @ and anything after whitespace 44 | 45 | if($readid{$id}){ 46 | print $entry; 47 | } 48 | } 49 | close $fh; 50 | } 51 | 52 | return 0; 53 | } 54 | 55 | # cat out.kraken | perl -MData::Dumper -lane 'BEGIN{open($fh, "zcat SE-le_S12_L001_R1_001.fastq.gz | ") or die $!; while(my $id=<$fh>){$entry=$id; for(1..3){$entry.=<$fh>;} $id=~s/\s.*//; $id=~s/^\@//; chomp($id); $entry{$id}=$entry; } } my(undef,$readid,$taxid)=@F; next if($taxid !~ /^561$|^562$|^83334$/); print $entry{$readid};' | grep . > R1.subset.fastq 56 | 57 | sub usage{ 58 | "$0: Filter for reads matching a given taxon using kraken raw results 59 | usage: $0 --taxid=taxid in.fastq.gz < kraken.out > out.fastq 60 | 61 | --taxid The taxon ID from NCBI (required) 62 | Can be comma-separated 63 | " 64 | } 65 | 66 | -------------------------------------------------------------------------------- /scripts/findpids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Borrowed from 4 | # https://unix.stackexchange.com/questions/294299/how-to-renice-all-threads-and-children-of-one-process-on-linux 5 | 6 | if [ "$#" -eq 0 ]; then 7 | echo "Finds all children pids of a process" 8 | echo " Usage: $(basename $0) 012345" 9 | exit 1 10 | fi 11 | 12 | PID_LIST= 13 | findpids() { 14 | for pid in /proc/$1/task/* ; do 15 | pid="$(basename "$pid")" 16 | PID_LIST="$PID_LIST$pid " 17 | if [ ! -e "/proc/$1/task/$pid/children" ]; then 18 | continue; 19 | fi 20 | for cpid in $(cat /proc/$1/task/$pid/children 2>/dev/null) ; do 21 | findpids $cpid 22 | done 23 | done 24 | } 25 | 26 | for pid in $@; do 27 | 28 | if [ ! 
-e "/proc/$pid/task" ]; then
    # Fixed: the loop variable is $pid; the original used $1 here and in the
    # findpids call below, which silently ignored every argument after the
    # first and always reported on/descended from the first pid only.
    echo "ERROR: could not find pid $pid in the process list";
    exit 1
  fi

  findpids $pid
done

echo $PID_LIST
-------------------------------------------------------------------------------- /scripts/fixKsnpVcf.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

use strict;
use warnings;
use Bio::Perl;
use Getopt::Long;
use Data::Dumper;
use constant reportEvery=>1000;

sub logmsg{print STDERR "@_\n";}
exit main();

sub main{
  my $settings={};
  GetOptions($settings,qw(ref|reference=s help)) or die $!;
  die usage() if($$settings{help} || !$$settings{ref});

  my %seq;
  my $in=Bio::SeqIO->new(-file=>$$settings{ref});
  while(my $seq=$in->next_seq){
    $seq{$seq->id}=uc($seq->seq);
  }

  my $lineCount=0;
  my $numFixed=0;
  while(<>){
    # Print headers
    if(/^#/){
      print;
      next;
    }

    # Fix VCF lines
    $lineCount++;
    chomp;
    my @F=split /\t/;
    $F[1]||=0;
    $F[2]=uc($F[2]);

    # Fix up the kmer line and print it
    $numFixed += !! fixPosition(\@F,\%seq,$settings);

    if($lineCount % reportEvery == 0){
      my $percent=int($numFixed/$lineCount * 100);
      logmsg "Have fixed $numFixed out of $lineCount ($percent%)";
    }
  }

  my $percent=int($numFixed/$lineCount * 100);
  logmsg "Fixed $numFixed out of $lineCount ($percent%)";
}

sub fixPosition{
  my($F,$seqHash,$settings)=@_;

  # Remove the old Chrom/pos information from the VCF line.
  # It's not what we want anyway.
  splice(@$F,0,2);

  # Figure out where the snp is within the kmer to help
  # with the genomic position later on.
  my $dotIndex=index($$F[0],'.');
  # Also need to search with the reverse complement.
my $revcom=revcom($$F[0])->seq;

  # Keep track of whether or not there are matches
  my $numMatches=0;
  for my $id(keys(%$seqHash)){
    while($$seqHash{$id}=~/($$F[0]|$revcom)/g){
      my $pos=length($`)+$dotIndex;
      print join("\t",$id,$pos,@$F)."\n";
      $numMatches++;
    }
  }

  logmsg "WARNING: I could not find kmer $$F[0] in $$settings{ref}" if($numMatches < 1);
  return $numMatches;
}

sub usage{
  "Usage: $0 -ref reference.fasta < ksnp.vcf > fixed.vcf
"
}
-------------------------------------------------------------------------------- /scripts/fixProkkaHeader.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl
# Renumber the sequence IDs in an assembly file so that every record kept is
# named contigNNNNNN, optionally dropping records below a minimum length.
# Output is written as genbank to stdout.

use strict;
use warnings;
use Bio::Perl;
use File::Basename qw/fileparse basename dirname/;
use Getopt::Long;

exit main();

sub main{
  my $settings={};
  GetOptions($settings,qw(help min_length|min-length=i)) or die $!;
  die usage() if(!@ARGV || $$settings{help});
  $$settings{min_length}||=1;

  my($infile)=@ARGV;

  # -verbose=>-1 silences bioperl warnings while it guesses the input format
  my $reader=Bio::SeqIO->new(-file=>$infile,-verbose=>-1);
  # No -file argument: records are written to stdout
  my $writer=Bio::SeqIO->new(-format=>"genbank");

  my $contigCounter=0;
  while(my $record=$reader->next_seq){
    # Records shorter than the cutoff are dropped and do not use up a number
    next if($record->length < $$settings{min_length});
    $record->id(sprintf("contig%06d",++$contigCounter));
    $writer->write_seq($record);
  }
  return 0;
}

sub usage{
  local $0=basename $0;
  "$0: Fixes headers in genbank files
  Usage: $0 in.gbk > out.gbk
  --min_length 1 Minimum length of a contig
"
}
-------------------------------------------------------------------------------- /scripts/flattenTree.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

use warnings;
use strict;
use Data::Dumper;
use Bio::TreeIO;
use Getopt::Long;
use File::Basename qw/basename/;
use Scalar::Util qw/looks_like_number/;

# Print a message to STDERR, prefixed with the script's basename.
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}
exit(main());

# Parse options and flatten every tree in every input file.
# NOTE(review): --help is parsed by GetOptions but never acted on;
# usage() is only reachable if callers invoke it elsewhere — confirm.
sub main{
  my $settings={};
  GetOptions($settings,qw(help debug confidence|bootstrap|min-confidence=f)) or die $!;
  $$settings{confidence}||=0;

  for my $file(@ARGV){
    my $in = Bio::TreeIO->new(-file=>$file);
    # A file may contain multiple trees; each is flattened and printed.
    while(my $tree = $in->next_tree){
      flattenTree($tree, $$settings{confidence}, $settings);
    }
  }

  return 0;
}

# Collapse (flatten) internal nodes whose confidence (stored as the
# node *id* on internal nodes, as many newick writers do) is below
# $minConfidence: the child is re-attached to its grandparent and its
# branch length absorbs the bypassed ancestor's branch length.
# Afterwards, linear paths are contracted and any leftover former
# internal nodes (leaves whose id is purely numeric, i.e. a bootstrap
# value) are deleted. Prints the resulting tree as newick to STDOUT.
sub flattenTree{
  my($tree, $minConfidence, $settings)=@_;

  for my $leaf($tree->get_nodes()){
    next if(!$leaf->is_Leaf());

    # Lineage ordered leaf-first: ($leaf, parent, grandparent, ... root).
    my @lineage = ($leaf,reverse($tree->get_lineage_nodes($leaf)));
    my $numLineage = @lineage;

    # For a flattening to work, a node must have a
    # grandparent so that there is a "root" node.
    # TODO: add a pseudo root node and remove it later.
    for(my $i=0;$i<$numLineage-2;$i++){
      #next if(!defined($lineage[$i]->ancestor));

      # The parent's id holds the support value for the split.
      my $confidence = $lineage[$i+1]->id;
      $confidence //= 0;
      # Non-numeric ids (named internal nodes) are never collapsed.
      if(!looks_like_number($confidence)){
        next;
      }

      if($confidence < $minConfidence){
        if($$settings{debug}){
          logmsg $lineage[$i]->id." ".$confidence." < ".$minConfidence;
        }
        # Branch length increases by the ancestor's branch
        # length, which we will now bypass.
        my $branch_length = $lineage[$i]->branch_length;
        my $anc_branch_length = $lineage[$i+1]->branch_length || 0;
        $lineage[$i]->branch_length(
          $branch_length + $anc_branch_length
        );
        # Bump this node up to being a descendent of the
        # ancestor's ancestor.
        # NOTE(review): the bypassed parent is left in place here and is
        # cleaned up by the contract/remove loop below — confirm that
        # deeper lineage entries are not stale after this reattachment.
        $lineage[$i]->ancestor(
          $lineage[$i+2]
        );
      }
    }
  }

  # Repeat until stable: contracting paths can expose new numeric-id
  # leaves (former internal nodes), and removing those can create new
  # linear paths.
  my $numRemoved = 1;
  while($numRemoved > 0){
    $numRemoved = 0;
    # Remove singleton paths
    $tree->contract_linear_paths;
    # Remove dead ancestor nodes
    for my $leaf($tree->get_nodes){
      next if(!$leaf->is_Leaf);

      # A leaf with a purely numeric id is assumed to be a leftover
      # internal node carrying a bootstrap value, not a real taxon.
      if(looks_like_number($leaf->id)){
        $tree->remove_Node($leaf);
        $numRemoved++;
      }
    }
  }

  print $tree->as_text('newick')."\n";
}

sub usage{
  # NOTE(review): no `local` here, so $0 is clobbered globally once
  # usage() is called (harmless if the script exits right after).
  $0=basename $0;
  "$0: flattens a tree using node confidence scores
  NOTE: leaves with number-only identifiers will be removed.

  Usage: $0 tree.dnd [tree2.dnd...] > out.dnd
  --confidence 0  Minimum confidence for flattening a tree
  --debug         Print debugging information to stderr
  "
}
#!/usr/bin/env perl
# kaptivate_wrapper.pl: run Kaptive K- and O-locus typing on a set of
# fasta assemblies and print one tab-delimited line per assembly:
#   filename <tab> K call <tab> O call

use warnings;
use strict;
use Data::Dumper;
use Getopt::Long;
use File::Basename qw/basename/;
use File::Copy qw/cp/;
use File::Temp qw/tempdir/;

use version 0.77;
our $VERSION = '0.1.1';

local $0 = basename $0;
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}
exit(main());

sub main{
  my $settings={};
  GetOptions($settings,qw(help db=s numcpus=i)) or die $!;
  usage() if($$settings{help});

  $$settings{tempdir} //= tempdir("kaptive.XXXXXX", TMPDIR=>1, CLEANUP=>1);
  $$settings{db} or die "ERROR: need --db set to the database with the kaptive database";
  $$settings{numcpus} ||= 1;

  for my $fasta(@ARGV){
    my $outdir = runKaptive($fasta, $$settings{db}, $settings);

    # kaptive.py's stdout was captured into k.log/o.log; the call we
    # want is on the first line of each.
    my $k = firstLine("$outdir/k.log");
    my $o = firstLine("$outdir/o.log");

    # Strip everything before the locus call itself (K.../O...)
    $k =~ s/.*?K/K/;
    $o =~ s/.*?O/O/;
    print join("\t", basename($fasta), $k, $o)."\n";
  }

  return 0;
}

# Return the first line of a file, chomped; dies if unreadable.
sub firstLine{
  my($file) = @_;
  open(my $fh, "<", $file) or die "ERROR: could not read $file: $!";
  my $line = <$fh>;
  close $fh;
  $line //= "";
  chomp($line);
  return $line;
}

# Stage one assembly into its own in/out directories under the temp
# dir, run kaptive.py against the O and K databases, and return the
# output directory containing o.log and k.log.
sub runKaptive{
  my($fasta, $db, $settings) = @_;
  logmsg "Running Kaptive on $fasta";

  # Staging area named after the assembly
  my $tmpdir = "$$settings{tempdir}/".basename($fasta, qw(.fasta .fa));
  # ROBUSTNESS: mkdir used to be unchecked; a collision or permission
  # error would have surfaced much later as a confusing cp/system error.
  mkdir $tmpdir or die "ERROR: could not mkdir $tmpdir: $!";

  # Input and output directories
  my $tmpdirIn = "$tmpdir/in";
  my $tmpdirOut = "$tmpdir/out";
  mkdir $tmpdirIn or die "ERROR: could not mkdir $tmpdirIn: $!";
  mkdir $tmpdirOut or die "ERROR: could not mkdir $tmpdirOut: $!";

  my $tmpfasta = "$tmpdirIn/in.fasta";
  cp($fasta, $tmpfasta) or die "ERROR: could not copy $fasta to $tmpfasta: $!";

  my $oGbk = "$db/VibrioPara_Kaptivedb_O.gbk";
  my $kGbk = "$db/VibrioPara_Kaptivedb_K.gbk";

  # BUGFIX: `die if $?` reported no context at all; say which
  # invocation failed and with what exit code.
  system("kaptive.py --threads $$settings{numcpus} -k $oGbk -a $tmpfasta -o $tmpdirOut/o > $tmpdirOut/o.log");
  die "ERROR: kaptive.py failed on the O database for $fasta (exit ".($?>>8).")" if $?;
  system("kaptive.py --threads $$settings{numcpus} -k $kGbk -a $tmpfasta -o $tmpdirOut/k > $tmpdirOut/k.log");
  die "ERROR: kaptive.py failed on the K database for $fasta (exit ".($?>>8).")" if $?;

  return $tmpdirOut;
}

sub usage{
  print "$0: runs Kaptive on a set of fasta files
Usage: $0 [options] *.fasta > out.tsv
  --db      Database directory for Kaptive containing *.gbk
  --numcpus Number of threads to use (default: 1)
  --help    This useful help menu
\n";
  exit 0;
}
# Emulate the kraken-translate format with tab-delimited columns:
#   count, root-most parent node, ..., last child node.
# Reads a kraken2 report (see the kraken manual's sample-report format)
# and prints one line per taxon to STDOUT.
sub translateKraken2{
  my($infile) = @_;

  # $parent[$level] holds the most recently seen taxon name at that
  # taxonomic depth; element 0 is almost always "root"/"unclassified".
  my @parent = ();

  open(my $fh, "<", $infile) or die "ERROR: could not open $infile: $!";
  while(my $line = <$fh>){
    # Trim the trailing newline/whitespace. The name field keeps its
    # own indentation because it sits after a tab.
    $line =~ s/^\s+|\s+$//g;

    # Report fields (see kraken manual):
    # percent, clade read count, direct read count, rank code,
    # NCBI taxid, indented scientific name.
    my @F = split(/\t/, $line);
    my($percent, $readsUmbrella, $readsSpecific, $rank, $taxid, $nameWithIndentation) = @F;

    # Kraken reports indent the name by TWO spaces per level.
    # BUGFIX: each single space used to count as one level, which left
    # every odd index of @parent unset and emitted empty/undef columns.
    my $childLevel = 0;
    if($nameWithIndentation =~ /^( +)/){
      $childLevel = int(length($1) / 2);
    }

    # Strip the indentation and normalize internal whitespace to '_'
    my $name = $nameWithIndentation;
    $name =~ s/^\s+//;
    $name =~ s/\s+/_/g;

    # Record this node at its level; shallower ancestors stay in place.
    $parent[$childLevel] = $name;

    # Lineage for this row: every ancestor from root down to this node.
    my @taxaField = @parent[0..$childLevel];

    print join("\t", $readsSpecific, @taxaField)."\n";
  }
  close $fh;
}

sub usage{
  print "$0: changes kraken report to a format for ktImportText in Krona
Usage: $0 [options] kraken.report
  --help This useful help menu

Output is tab delimited:
  * count of reads
  * parent node 1
  * ...
  * last child node (usually genus/species)
";
  exit 0;
}
# Transform a kSNP3 SNPs_all file into VCF 4.1 printed on STDOUT.
# $infile: path to SNPs_all (tab-delimited: ID kmer variant x genome;
#          loci are separated by blank lines).
# $genomeList: arrayref of genome names, defining the sample columns.
# Loci are only emitted when they have >=2 ALT alleles seen in >=2 genomes.
sub SNPs_allToVcf{
  my($infile,$genomeList,$settings)=@_;

  my @nt=qw(A C G T);
  my $numGenomes=scalar(@$genomeList);

  local $0=basename $0;
  print "##fileformat=VCFv4.1\n##source=kSNP3, $0\n";
  # BUGFIX: the column header line must begin with '#CHROM' (single #)
  # per the VCF spec; '##CHROM' is treated as a meta line by parsers.
  print join("\t",'#CHROM',qw(POS ID REF ALT QUAL FILTER INFO FORMAT),@$genomeList)."\n";

  # A row of zero genotype calls, used to (re)initialize each locus
  my @zeroVariantTags=(0) x $numGenomes;

  # Map genome name -> its column index
  my %genomeIndex;
  @genomeIndex{@$genomeList}=(0..$#$genomeList);

  my @GT=@zeroVariantTags;
  my %altIndex=();
  my @ALT=('.');  # index 0 is a placeholder so real ALTs start at GT 1
  my($id,$kmer,$variant,$x,$genome)=('.') x 5;

  # Emit one VCF data line for the locus currently buffered in
  # @GT/@ALT/$kmer (if it passes the filters), then reset the buffers.
  # BUGFIX: the old code `next`ed past the reset when a locus failed
  # the filters, leaking its genotypes into the next locus.
  my $emit = sub {
    if(scalar(@ALT) >= 2 && (grep{$_ > 0} @GT) >= 2){
      my @alt = @ALT[1..$#ALT]; # drop the placeholder '.'
      print join("\t",'.','.',$kmer,'.',join(",",@alt), '.', 'PASS', "NS=$numGenomes", 'GT', @GT)."\n";
    }
    @GT=@zeroVariantTags;
    %altIndex=();
    @ALT=('.');
  };

  # BUGFIX: the read loop was `while(){` (the <IN> readline was lost),
  # which never read the file; use a lexical filehandle instead.
  open(my $fh, "<", $infile) or die "ERROR: could not open $infile for reading: $!";
  while(my $line = <$fh>){
    $line =~ s/^\s+|\s+$//g; # whitespace trim

    # A blank line separates loci: flush the buffered locus.
    if($line =~ /^$/){
      $emit->();
      next;
    }

    ($id,$kmer,$variant,$x,$genome)=split(/\t/,$line);
    $id=uc($id);
    $variant=uc($variant);

    # Assign each distinct allele the next GT index for this locus
    if(!defined($altIndex{$variant})){
      $altIndex{$variant}=scalar(@ALT);
      push(@ALT,$variant);
    }
    $GT[$genomeIndex{$genome}]=$altIndex{$variant};
  }
  close $fh;

  # BUGFIX: flush the final locus even when the file does not end in a
  # blank line; it used to be silently dropped.
  $emit->();
}

sub usage{
  local $0=basename $0;
  "$0: transform a kSNP3 output into a vcf file
  Usage: $0 kSNP3.out/SNPs_all > kSNP3.vcf
    SNPs_all file is formatted with the tab-delimited fields
    ID kmer variant x genomeName
  "
}
# Print a message to STDERR prefixed with the script's basename.
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}

# Entry point: read mash sketches, build a presence/absence
# pseudo-alignment, and print it as fasta to STDOUT.
sub main{
  my $settings={};
  GetOptions($settings,qw(help presence=s absence=s)) or die $!;

  $$settings{presence} //= "1";
  $$settings{absence} //= "0";

  die usage($settings) if($$settings{help} || !@ARGV);

  # Sort the inputs so every downstream step sees the same
  # deterministic order for the pseudo alignment, regardless of
  # shell glob order.
  my @sketchFile = sort(@ARGV);

  # Find presence/absence of hashes
  logmsg "Finding presence/absence of ".scalar(@sketchFile)." files";
  my $presence = readSketches(\@sketchFile, $settings);

  # Make sequences out of the hashes
  logmsg "Determining the pseudosequence for each input file";
  logmsg "Present nucleotides will be $$settings{presence} and absent nucleotides will be $$settings{absence}";
  my $seqs = determinePseudoSequences(\@sketchFile, $presence, $settings);

  # make an actual alignment string
  logmsg "Making the alignment from sequence";
  my $aln = makeAlignment(\@sketchFile, $seqs, $settings);

  print "$aln";

  return 0;
}

# Build %presence: $presence{hashInt}{file} = 1 for every sketch hash
# found in each file. Prints one progress dot per file to STDERR.
sub readSketches{
  my($sketches, $settings) = @_;

  my %presence;
  for my $file(@$sketches){
    print STDERR ".";
    my $msh = Bio::Sketch::Mash->new($file);
    $presence{$_}{$file} = 1 for(@{ $$msh{sketches}[0]{hashes} });
  }
  print STDERR "\n";

  return \%presence;
}

# For each input file, build a pseudo-sequence with one character per
# hash integer: the "presence" character if the file contains that
# hash, else the "absence" character.
sub determinePseudoSequences{
  my($infiles, $p, $settings) = @_;

  my($yes, $no) = ($$settings{presence}, $$settings{absence});
  my %pseudoSeq;

  # Numeric sort keeps the column order of the alignment stable.
  for my $hashInt(sort{$a<=>$b} keys(%$p)){
    for my $file(@$infiles){
      $pseudoSeq{$file} .= $$p{$hashInt}{$file} ? $yes : $no;
    }
  }

  return \%pseudoSeq;
}

# Render the pseudo-sequences as a fasta-formatted string, one record
# per input file, in input order.
sub makeAlignment{
  my($infiles, $seqs, $settings) = @_;

  return join("", map{ ">$_\n".$$seqs{$_}."\n" } @$infiles);
}
#!/usr/bin/env perl
# md5sumDir.pl: compute a single MD5 over the contents of every file
# under the given directories (recursively), independent of file order.
# Per-file digests are computed in parallel, sorted, and hashed again.
use strict;
use warnings;
use Digest::MD5 qw/md5_hex/;
use threads;

# Number of worker threads; override with the NUMCPUS environment
# variable (generalizes the previously hard-coded 24).
my $numcpus = $ENV{NUMCPUS} || 24;

my @file;
for my $dir(@ARGV){
  # Quote the directory so paths with spaces survive the shell.
  push(@file,
    `find "$dir" -type f`
  );
}
chomp(@file);

# Distribute files round-robin-ish: each thread takes a contiguous slice.
my $num_files_per_thread = int(scalar(@file)/$numcpus) + 1;
my @thr;
for my $i(0..$numcpus - 1){
  my @subfile = splice(@file, 0, $num_files_per_thread);
  $thr[$i] = threads->new(sub{
    my @hex;
    for my $file(@subfile){
      # BUGFIX: read bytes, not text. File::Slurp's read_file defaulted
      # to text mode, which can corrupt digests of binary files on some
      # platforms; :raw also drops the third-party dependency.
      open(my $fh, '<:raw', $file) or die "ERROR: could not read $file: $!";
      my $content = do { local $/; <$fh> };
      close $fh;
      push(@hex, md5_hex($content));
    }
    return \@hex;
  });
}

# Collect per-file digests from all workers
my @hex;
for my $thr(@thr){
  my $hexSubArr = $thr->join;
  push(@hex, @$hexSubArr);
}
# Sorting makes the final digest independent of find/thread ordering.
my $finalHex = md5_hex(join("\n", sort @hex)."\n");
print $finalHex."\n";
#!/usr/bin/env perl
# Lee Katz
# Moves one or more symbolic links into a target directory, recreating
# each link there as a relative path to the same real file, and only
# then removing the original link.

use strict;
use warnings FATAL=>'all';
use File::Copy qw(mv);
use Getopt::Long qw(GetOptions);
use File::Spec;
use Cwd qw/realpath getcwd/;
use File::Basename;

my $target;
GetOptions('target-directory=s' => \$target);
die "$0 -t target_dir symlink1 symlink2 symlink3\n" unless $target && -d $target;

my $origDir=getcwd;
for (@ARGV) {
  unless (-l $_) {
    warn "$_ is not a symlink\n";
    next;
  }
  my $filename=fileparse $_;
  # Resolve the link to the real file before moving anything
  my $absPath=realpath($_);
  # ROBUSTNESS: chdir used to be unchecked; a failure would have made
  # the relative path (and the unlink below) operate in the wrong place.
  chdir $target or die "ERROR: could not chdir to $target: $!";
  my $relPath=File::Spec->abs2rel($absPath);
  # BUGFIX: verify the new link was actually created before deleting
  # the old one; previously a failed symlink() still led to unlink(),
  # losing the only reference to the target.
  my $created = symlink($relPath, $filename);
  chdir $origDir or die "ERROR: could not chdir back to $origDir: $!";
  if(!$created){
    warn "ERROR: could not create symlink $filename in $target; leaving $_ in place\n";
    next;
  }
  unlink $_ or warn "WARNING: created the new link but could not remove $_: $!\n";
}
# Read a bionumerics MLST export into a hash-of-hashes:
#   $allele{sampleName}{locusName} = allele call
# The first column of each row is the sample name; the remaining
# columns are matched to the locus names from the header row.
sub readAlleles{
  my($file, $settings) = @_;

  my %allele;

  open(my $fh, $file) or die "ERROR reading $file: $!";
  my $header = <$fh>;
  # BUGFIX: without this chomp the last locus name kept its trailing
  # newline, so the last column was stored under a "name\n" key.
  chomp($header);
  my @header = split(/\t/, $header);
  shift(@header); # assume the first column is the key and disregard it otherwise
  while(<$fh>){
    chomp;
    my @F = split /\t/;
    my $name = shift(@F);
    my %F;
    @F{@header} = @F;

    $allele{$name} = \%F;
  }
  close $fh;

  return \%allele;
}
#!/usr/bin/env perl
# parseMultiblast.partialAnswer.pl: parse multiblast text output.
# Collects the query "Table of genes" into %query and hit details into
# %hit, then dumps %query. (NOTE: %hit is gathered but not printed.)
use strict;
use warnings;
use Data::Dumper;

my %query=();
my %hit=();
my $section="";
while(<>){ # read line by line
  if(/Table of genes/){
    $section="Table of genes";
    while(<>){
      # The section ends on a blank line
      if(/^\s*$/){
        last;
      }

      chomp; # remove whitespace
      my($locus, $start, $stop, $strand, $annotation)=split(/\s+/);
      $query{$locus}={
        start =>$start,
        stop =>$stop,
        strand =>$strand,
        annotation=>$annotation,
      };

    }
  }

  # Redundant with the details section
  elsif(/Significant hits/){
    while(<>){
      if(/Details/){
        $section="Details";
        parseDetailsSection(\%hit, \%query);
        last;
      }
    }
  }
}

print Dumper \%query;

# Parse the per-hit "Details" section: source, hit counts, scores, and
# each hit's own table of genes, accumulating into $hit.
sub parseDetailsSection{
  my($hit,$query)=@_;

  my $currentHit="";
  while(<>){
    if(/^\s*$/){
      next;
    }

    chomp;
    # A hit header looks like "1. <name>"
    if(/\d+\.\s+(\S+)/){
      $currentHit=$1;
      # Source is on the next line.
      my $source = scalar(<>);
      chomp $source;
      $source=~s/^Source: //;
      $$hit{$currentHit}{source}=$source;
    }

    # A regex double meaning with elipses: a commentary
    # that this is too wordy but it also simply matches
    # on at least three characters.
    if(/^Number of proteins with BLAST hits...+(\d+)/){
      $$hit{$currentHit}{numhits}=$1;
      $$hit{$currentHit}{multiblastscore}=scalar(<>);
      $$hit{$currentHit}{multiblastscore}=~s/\s+|\D+//g; # trim and remove non-digits
      # BUGFIX: this statement ended with `scalar(<>)i` — a stray 'i'
      # instead of a semicolon, which is a compile error.
      $$hit{$currentHit}{blastscore}=scalar(<>);
      $$hit{$currentHit}{blastscore}=~s/\s+|\D+//g; # trim and remove non-digits
    }
    elsif(/Table of genes.../){
      while(<>){
        if(/^\s*$/){
          last;
        }
        chomp;
        my($locus, $start, $stop, $strand, $annotation)=split(/\s+/);
        $$hit{$currentHit}{query}{$locus}={
          start => $start,
          stop => $stop,
          strand => $strand,
          annotation=> $annotation,
        };
      }

    }
  }
}
31 | if(!$re){ 32 | my $EnzName=ucfirst($enzName); 33 | $re=$all_collection->get_enzyme($EnzName); 34 | logmsg "Tried transforming $enzName to $EnzName"; 35 | } 36 | if(!$re){ 37 | my $enzN4me=$enzName; 38 | $enzN4me=~s/1(I*)$/I$1/g; # replace tail ones with Is 39 | $re=$all_collection->get_enzyme($enzN4me); 40 | logmsg "Tried transforming $enzName to $enzN4me"; 41 | } 42 | if(!$re){ 43 | die "ERROR: I do not understand enzyme $enzName"; 44 | } 45 | push(@enz,$re); 46 | } 47 | 48 | my $counter=0; 49 | for my $gbk(@seq){ 50 | my $in=Bio::SeqIO->new(-file=>$gbk); 51 | while(my $seq=$in->next_seq){ 52 | my $seqLength=$seq->length; 53 | my $ra=Bio::Restriction::Analysis->new(-seq=>$seq); 54 | 55 | if($$settings{outtype} eq 'sizes'){ 56 | my @fragments=map{length($$_{seq})} $ra->fragment_maps(@enz); 57 | for(@fragments){ 58 | print join("\t",$seq->id,$_)."\n"; 59 | } 60 | } elsif($$settings{outtype} eq 'bed'){ 61 | for my $re(@enz){ 62 | my @pos=$ra->positions($re->name); 63 | for my $pos(@pos){ 64 | # I'm not 100% sure why this position math works, but 65 | # it matches up with what Apollo genome browser does. 66 | my $start=$pos-$re->cut+1; 67 | my $end=$start+$re->recognition_length-1; 68 | print join("\t",$seq->id,$start,$end,$re->name.++$counter)."\n"; 69 | } 70 | } 71 | } else { 72 | die "ERROR: I don't understand outtype $$settings{outtype}"; 73 | } 74 | } 75 | } 76 | 77 | return 0; 78 | } 79 | 80 | sub usage{ 81 | local $0=basename $0; 82 | "Usage: $0 *.fasta > restrictionAnalysis.bed 83 | --enzyme AscI The enzyme to digest with. Can suppy 84 | multiple --enzyme arguments. 85 | --outtype bed Outputs a bed file of cut size coordinates. 86 | If 'sizes' is supplied instead, then 87 | fragment sizes will be output. 
88 | " 89 | } 90 | 91 | -------------------------------------------------------------------------------- /scripts/phylipDistToTallSkinny.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | 9 | local $0 = basename $0; 10 | sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";} 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | die usage() if($$settings{help} || !@ARGV); 17 | 18 | my @file = @ARGV; 19 | my $numFiles = @file; 20 | my %dist; 21 | for my $file(@file){ 22 | $dist{$file} = readPhylip($file, $settings); 23 | } 24 | 25 | my @taxon = keys(%{ $dist{$file[0]} }); 26 | my $numTaxa = @taxon; 27 | 28 | print join("\t", "taxon1", "taxon2", @file)."\n"; 29 | for(my $i=0;$i<$numTaxa;$i++){ 30 | for(my $j=0;$j<$numTaxa;$j++){ 31 | print $taxon[$i]."\t".$taxon[$j]; 32 | for(my $k=0;$k<$numFiles;$k++){ 33 | my $singleDist = $dist{$file[$k]}{$taxon[$i]}{$taxon[$j]}; 34 | if(!defined($singleDist)){ 35 | $singleDist = $dist{$file[$k]}{$taxon[$j]}{$taxon[$i]}; 36 | } 37 | if(!defined($singleDist)){ 38 | die "ERROR: distance not found:\n".Dumper [$k,$file[$k]], [$i,$taxon[$i]], [$j, $taxon[$j]]; 39 | } 40 | print "\t".$singleDist; 41 | } 42 | print "\n"; 43 | } 44 | } 45 | 46 | return 0; 47 | } 48 | 49 | sub readPhylip{ 50 | my($file, $settings)=@_; 51 | 52 | my %dist; 53 | my %distArr; 54 | my @taxon; 55 | open(my $fh, $file) or die "ERROR: could not read $file: $!"; 56 | my $numTaxa = <$fh>; 57 | $numTaxa =~ s/^\s+|\s+$//g; 58 | while(<$fh>){ 59 | chomp; 60 | my($taxon, @dist) = split(/\s+/, $_); 61 | $distArr{$taxon} = \@dist; 62 | push(@taxon, $taxon); 63 | } 64 | 65 | my $actualNumTaxa = @taxon; 66 | if($actualNumTaxa != $numTaxa){ 67 | die "ERROR: in $file, reported number of taxa does not match number of taxa found"; 68 | } 
69 | 70 | # Now that we know all the taxa in the list, go back 71 | # and fill in the 2d hash 72 | while(my($refTaxon, $distances) = each(%distArr)){ 73 | for(my $i=0;$i<$numTaxa;$i++){ 74 | my $queryTaxon = $taxon[$i]; 75 | $dist{$refTaxon}{$queryTaxon} = $$distances[$i]; 76 | } 77 | } 78 | 79 | return \%dist; 80 | } 81 | 82 | sub usage{ 83 | " 84 | $0: changes phylip distance files to a single tall/skinny format 85 | Usage: $0 [options] file1.phylip [file2.phylip...] 86 | --help This useful help menu 87 | 88 | " 89 | } 90 | -------------------------------------------------------------------------------- /scripts/phylogeneticOrder.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Bio::Perl; 6 | use Bio::TreeIO; 7 | use Getopt::Long; 8 | use Data::Dumper; 9 | 10 | exit main(); 11 | 12 | sub main{ 13 | my $settings={}; 14 | GetOptions($settings,qw(help root=s)); 15 | $$settings{root}||=""; 16 | die usage() if($$settings{help}); 17 | 18 | my(@tree)=@ARGV; 19 | die "ERROR: need a tree file!\n".usage() if(!@tree); 20 | 21 | for my $tree(@tree){ 22 | printPhylogeneticOrder($tree,$settings); 23 | } 24 | return 0; 25 | } 26 | 27 | sub printPhylogeneticOrder{ 28 | my($tree,$settings)=@_; 29 | 30 | my $numtaxa=0; 31 | my $in=Bio::TreeIO->new(-file=>$tree); 32 | while(my $tree=$in->next_tree){ 33 | reroot($tree,$settings) if($$settings{root}); 34 | for my $node($tree->get_nodes(-order=>"depth")){ # other choice: "breadth" 35 | next if(!$node->is_Leaf); 36 | my $id=$node->id; 37 | $id=~s/^'|'$//g; # remove single quotes at beginning/end that bioperl adds 38 | print "$id\n"; 39 | } 40 | } 41 | $in->close; 42 | } 43 | 44 | sub reroot{ 45 | my($tree,$settings)=@_; 46 | # TODO validate that {root} eq 'midpoint' or a node name 47 | if($$settings{root} =~/^longest$/i){ 48 | # converge on a longest branch 49 | _rerootLongestBranch($tree,$settings) for(1..10); 50 | } else { 51 | # 
# Parse options, remove the requested taxa from the tree, and print the
# pruned tree as newick on STDOUT.
sub main{
  my $settings={};
  GetOptions($settings,qw(help tree=s)) or die $!;

  die usage() if($$settings{help});
  $$settings{tree} || die "ERROR: no tree was given:\n".usage();

  my @remove=@ARGV;
  die "ERROR: need to remove at least taxon\n".usage() if(@remove < 1);

  my $treeObj=safeRemove($$settings{tree},\@remove,$settings);

  # No -file/-fh argument: the writer prints to STDOUT.
  my $out=Bio::TreeIO->new(-format=>"newick");
  $out->write_tree($treeObj);
  print "\n"; # just because newick files don't have newlines for some reason

  return 0;
}

# Load the first tree from the given file and delegate removal.
# Param $tree:   path to a tree file readable by Bio::TreeIO
# Param $remove: arrayref of leaf ids to remove
# Returns the pruned Bio::Tree::Tree object.
sub safeRemove{
  my($tree,$remove,$settings)=@_;

  my $treeObj=Bio::TreeIO->new(-file=>$tree)->next_tree;

  $treeObj=removeTaxa($treeObj,$remove,$settings);

  return $treeObj;
}

# Remove each requested taxon from the tree, then contract any linear
# (singleton) paths that the removals created. Dies if a requested
# taxon is not a leaf of the tree or if removal fails.
sub removeTaxa{
  my($tree,$remove,$settings)=@_;

  # Index the tree up front: leaf ids for validation, ancestor nodes
  # kept around for the (currently disabled) cleanup pass below.
  my %leaf_node_id=();
  my %ancestor_node=();
  my @ancestor_node=();
  my @node = $tree->get_nodes;
  for my $node(@node){
    if($node->is_Leaf()){
      $leaf_node_id{$node->id}=1;
    } else {
      push(@ancestor_node, $node);
      $ancestor_node{$node}=1;
    }
  }

  for my $taxon(@$remove){
    die "ERROR: taxon $taxon does not exist in the tree!" if(!$leaf_node_id{$taxon});
    # NOTE(review): remove_Node() is given the leaf *id* string, not a
    # Bio::Tree::Node object — confirm the installed BioPerl accepts
    # ids here; otherwise the "could not remove" branch always fires.
    my $safely_removed=$tree->remove_Node($taxon);
    if(!$safely_removed){
      die "ERROR: could not remove $taxon safely";
    }
  }

  # Collapse single-child internal nodes left behind by the removals
  $tree->contract_linear_paths(1);

  # Now remove all nodes that were ancestors but are now leaves
  # (disabled; see removeUselessNewLeafNodes below)
  #my $nodes_were_removed=1;
  #while($nodes_were_removed){
  #  $nodes_were_removed = removeUselessNewLeafNodes($tree,\@ancestor_node);
  #  last;
  #}

  return $tree;
}

# Remove any former internal node that became a leaf after pruning.
# Returns the number of nodes removed so callers can loop to a fixed
# point. (Currently unused — only referenced from commented-out code.)
sub removeUselessNewLeafNodes{
  my($tree,$ancestor_node)=@_;

  my $nodesRemovedCounter=0;
  for my $node(@$ancestor_node){
    # If an ancestor node is now a leaf, prune it too
    if($node->is_Leaf()){
      my $safely_removed=$tree->remove_Node($node);
      if(!$safely_removed){
        die "ERROR: could not remove a useless ancestor node safely";
      }
      $nodesRemovedCounter++;
    }
  }
  return $nodesRemovedCounter;
}
102 | " 103 | } 104 | -------------------------------------------------------------------------------- /scripts/pwdLinux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the Linux path from a windows path 4 | WINDOWS=$1 5 | 6 | # Change backslashes to forward slashes 7 | LINUX=$(sed 's|\\|/|g' <<< $WINDOWS) 8 | # Remove extra leading slashes 9 | LINUX=$(sed 's|^/\+|/|' <<< $LINUX) 10 | 11 | # Change the domain name 12 | LINUX=$(sed 's|^/data.biotech.cdc.gov/|/scicomp/|' <<< $LINUX); 13 | 14 | # Print the final linux path 15 | echo $LINUX; 16 | -------------------------------------------------------------------------------- /scripts/pwdWindows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the Windows network path 4 | 5 | # Get the actual path with resolved symlinks 6 | pwd=$(pwd -P); 7 | if [ "$1" != "" ]; then 8 | pwd=$(realpath $1) 9 | fi 10 | # remove scicomp and leading slash 11 | pwd=$(sed 's|^/scicomp||' <<< $pwd); 12 | # is this in the home directory? 13 | pwd=$(sed "s|^/home/$USER|/home|" <<< $pwd); 14 | # / to \ 15 | pwd=$(sed 's|/|\\|g' <<< $pwd); 16 | # tack on the domain name 17 | pwd="\\\\data.biotech.cdc.gov$pwd" 18 | 19 | 20 | echo $pwd; 21 | -------------------------------------------------------------------------------- /scripts/qsubStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Author: Lee Katz 4 | # Figures out some quick metrics from SGE 5 | 6 | # Take a snapshot of qstat. 7 | QSTAT=$(qstat -u '*') 8 | if [ $? 
-gt 0 ]; then 9 | echo "ERROR with qstat" >&2 10 | exit 1; 11 | fi 12 | 13 | QSTAT=$(echo "$QSTAT"|tail -n +3) # qstat, minus the header 14 | 15 | # How many of the cluster's slots I'm taking 16 | # echo "$QSTAT" |tail -n +3| perl -MData::Dumper -e 'while(<>){s/^\s+|\s+$//g; @F=split /\s+/; next if($F[4] ne 'r'); $slots=$F[8]; if($F[3] eq $ENV{USER}){$mine+=$slots;} $total+=$slots; } print "$mine out of $total\n";' 17 | 18 | # who is the current hog 19 | echo "$QSTAT" | perl -lane ' 20 | BEGIN{print "USER\tSLOTS";} 21 | next if(!$F[3] || !$F[8]); 22 | next if($F[4] !~ /^R?r$/); 23 | $slot{$F[3]}+=$F[8]; END{@user=sort{$slot{$b}<=>$slot{$a} || $a cmp $b} keys(%slot); print "$_\t$slot{$_}" for @user;}' | column -t 24 | -------------------------------------------------------------------------------- /scripts/randFastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Randomizes the order of fastq reads 3 | # 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Long qw/GetOptions/; 8 | use Data::Dumper qw/Dumper/; 9 | use File::Basename qw/fileparse/; 10 | use List::Util qw/shuffle/; 11 | 12 | local $0=fileparse $0; 13 | sub logmsg { print "$0: @_\n";} 14 | 15 | exit main(); 16 | 17 | sub main{ 18 | my $settings={}; 19 | GetOptions($settings,qw(help pe|paired-end freq|frequency=f)) or die $!; 20 | $$settings{freq}||=1; 21 | die usage() if($$settings{help}); 22 | 23 | my @fastq=@ARGV; 24 | die usage() if(!@fastq); 25 | 26 | my $reads=readFastqs(\@fastq,$settings); 27 | 28 | printRandomReads($reads,$settings); 29 | 30 | return 0; 31 | } 32 | 33 | sub readFastqs{ 34 | my($fastq,$settings)=@_; 35 | 36 | my $linesPerEntry=4; 37 | if($$settings{pe}){ 38 | $linesPerEntry=8; 39 | } 40 | 41 | # Get this out of the hash in case it helps with speed 42 | my $freq=$$settings{freq}; 43 | 44 | my @reads; 45 | for my $f(@$fastq){ 46 | my($name,$dir,$ext)=fileparse($f,qw(.gz)); 47 | my $fastqFh; 48 | if($ext eq '.gz'){ 
49 | open($fastqFh,"zcat $f |") or die "ERROR: could not zcat $f for reading: $!"; 50 | } else { 51 | open($fastqFh,$f) or die "ERROR: could not open $f: $!"; 52 | } 53 | while(my $entry=<$fastqFh>){ 54 | for(2..$linesPerEntry){ 55 | $entry.=<$fastqFh>; 56 | } 57 | 58 | # Randomly skip reads if a random number is greater 59 | # than the user-defined threshold. 60 | next if(rand() > $freq); 61 | 62 | push(@reads,$entry); 63 | } 64 | close $fastqFh; 65 | } 66 | 67 | return \@reads; 68 | } 69 | 70 | sub printRandomReads{ 71 | my($reads,$settings)=@_; 72 | 73 | for my $entry(shuffle(@$reads)){ 74 | print $entry; 75 | } 76 | } 77 | 78 | sub usage{ 79 | "$0: randomize the order of reads in a fastq file 80 | Usage: $0 file.fastq[.gz] [file2.fastq...] > rand.fastq 81 | 82 | --paired-end If the file is interleaved 83 | --frequency 1 Frequency of reads to keep (values: 0-1) 84 | " 85 | } 86 | 87 | -------------------------------------------------------------------------------- /scripts/replaceReadsWithReference.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | 9 | local $0 = basename $0; 10 | sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";} 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | usage() if($$settings{help} || @ARGV < 2); 17 | 18 | my $refFasta = shift(@ARGV); 19 | 20 | for my $bam (@ARGV){ 21 | printFastq($bam, $refFasta, $settings); 22 | } 23 | 24 | return 0; 25 | } 26 | 27 | # Print the fastq file from the bam 28 | sub printFastq{ 29 | my($bam, $refFasta, $settings) = @_; 30 | 31 | open(my $fh, "samtools sort -n '$bam' | samtools view -f 1 |") or die "ERROR using samtools view on $bam: $!"; 32 | while(my $line = <$fh>){ 33 | chomp($line); 34 | my($qname, $flag, $rname, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, 
$seq, $qual) = 35 | split(/\t/, $line); 36 | 37 | # add /1 or /2 38 | if($flag & 0x40){ 39 | $qname .= "/1"; 40 | } 41 | if($flag & 0x80){ 42 | $qname .= "/2"; 43 | } 44 | 45 | # if the read is not unmapped then replace with reference 46 | if(! ($flag & 0x4)){ 47 | my $refHit = referenceHit($rname, $pos, $cigar, $refFasta, $settings); 48 | $seq = $refHit; 49 | $qname .= " replaced"; 50 | } 51 | 52 | # Sanity check to match the lengths of seq and qual 53 | if(length($seq) != length($qual)){ 54 | my$bp = length($seq) - length($qual); 55 | logmsg "WARNING: seq is not the same length as qual for $qname($bp bp)"; 56 | # Adjust to match shortest length 57 | if(length($seq) > length($qual)){ 58 | $seq = substr($seq, 0, length($qual)); 59 | }else{ 60 | $qual = substr($qual, 0, length($seq)); 61 | } 62 | 63 | } 64 | 65 | print "\@$qname\n$seq\n+\n$qual\n"; 66 | } 67 | } 68 | 69 | # Get the sequence of the reference genome at the mapped position 70 | sub referenceHit{ 71 | my($rname, $pos, $cigar, $refFasta, $settings) = @_; 72 | 73 | # Determine length from cigar 74 | # TODO other operation codes like [NSHP=X] 75 | my $length = 0; 76 | while($cigar =~ /(\d+)(\w)/g){ 77 | my $code = $2; 78 | my $int = $1; 79 | if($code eq 'M'){ 80 | $length+=$int; 81 | } elsif($code eq 'D') { 82 | $length+=$int; 83 | } elsif($code eq 'I') { 84 | $length+=0; 85 | } else { 86 | die "ERROR: cigar string has a $code which I do not know how to interpret. Here is the full cigar string: $cigar"; 87 | } 88 | } 89 | 90 | if($length < 1){ 91 | die "INTERNAL ERROR: length of reference hit for this mapped read is <1" . 
Dumper \@_; 92 | } 93 | 94 | # Grab the reference hit 95 | my $stopPos = $pos + $length - 1; 96 | my $refHit = `samtools faidx $refFasta '$rname:$pos-$stopPos' | tail -n +2`; 97 | die "ERROR running samtools faidx on $refFasta" if $?; 98 | chomp($refHit); 99 | $refHit =~ s/\n//g; 100 | $refHit =~ tr/[a-z]/[A-Z]/; # uppercase 101 | 102 | return $refHit; 103 | } 104 | 105 | sub usage{ 106 | print "$0: print a bam as a fastq file, replacing reads with the reference genome 107 | Usage: $0 [options] ref.fasta *.bam > out.fastq 108 | --help This useful help menu 109 | "; 110 | exit 0; 111 | } 112 | -------------------------------------------------------------------------------- /scripts/representativeTaxa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | use Bio::TreeIO; 9 | 10 | local $0 = basename $0; 11 | sub logmsg { print STDERR "$0: @_\n";} 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | 17 | my $settings={}; 18 | GetOptions($settings,qw(help cluster-distance|distance|max-distance=f)) or die $!; 19 | $$settings{'cluster-distance'}||=0.001; 20 | 21 | die usage() if(!@ARGV); 22 | 23 | for my $file(@ARGV){ 24 | findRepresentatives($file,$settings); 25 | } 26 | 27 | return 0; 28 | } 29 | 30 | sub findRepresentatives{ 31 | my($file,$settings)=@_; 32 | 33 | my $tree=Bio::TreeIO->new(-file=>$file)->next_tree; 34 | 35 | my @taxon = grep {$_->is_Leaf} $tree->get_nodes(); 36 | my $numnodes=@taxon; 37 | 38 | # Find distances between all genomes 39 | logmsg "Finding distances between all taxa"; 40 | my %distance; 41 | for(my $i=0;$i<$numnodes;$i++){ 42 | print STDERR "."; 43 | my $taxonName1=$taxon[$i]->id; 44 | for(my $j=$i+1; $j<$numnodes; $j++){ 45 | my $taxonName2=$taxon[$j]->id; 46 | my $distance=distanceBetweenTwoNodes($tree,$taxon[$i],$taxon[$j]); 47 | $distance{$taxonName1}{$taxonName2} = 
$distance; 48 | $distance{$taxonName2}{$taxonName1} = $distance; 49 | } 50 | } 51 | print STDERR "\n"; 52 | 53 | 54 | # Cluster the taxa by distance 55 | # The index of %cluster is the representative genome, 56 | # and all other genomes have to be within X distance 57 | # of it. 58 | my %cluster; 59 | my $cluster_counter=0; 60 | for(my $i=0;$i<$numnodes;$i++){ 61 | my $taxonName=$taxon[$i]->id; 62 | my $is_representative_taxon=1; 63 | for my $representative (keys(%cluster)) { 64 | if($distance{$taxonName}{$representative} < $$settings{'cluster-distance'}){ 65 | push(@{ $cluster{$representative} }, $taxonName); 66 | $is_representative_taxon=0; 67 | last; 68 | } 69 | } 70 | 71 | if($is_representative_taxon){ 72 | $cluster{$taxonName}=[$taxonName]; 73 | } 74 | } 75 | 76 | for my $members(values(%cluster)){ 77 | print join("\t",@$members)."\n"; 78 | } 79 | 80 | logmsg "Found ".scalar(keys(%cluster))." clusters"; 81 | 82 | } 83 | 84 | # http://cpansearch.perl.org/src/CJFIELDS/BioPerl-1.007002/Bio/Tree/TreeFunctionsI.pm 85 | # -> sub distance 86 | # without error checking to speed it up 87 | sub distanceBetweenTwoNodes{ 88 | my($tree,$node1,$node2)=@_; 89 | 90 | my $lca = $tree->get_lca($node1,$node2); 91 | my $cumul_dist = 0; 92 | foreach my $current_node ($node1,$node2){ 93 | do { 94 | $cumul_dist += $current_node->branch_length; 95 | 96 | $current_node = $current_node->ancestor || last; 97 | 98 | } while($current_node ne $lca); 99 | } 100 | 101 | return $cumul_dist; 102 | } 103 | 104 | sub usage{ 105 | "$0: Find representative taxa in each tree. Assumes one 106 | tree per tree file. 107 | 108 | Usage: $0 [options] tree.dnd [tree2.dnd...] 
109 | 110 | --cluster-distance 0.001 The max distance between every 111 | taxon in a cluster 112 | " 113 | } 114 | -------------------------------------------------------------------------------- /scripts/rerootTree.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Bio::TreeIO; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | use List::Util qw/sum/; 10 | use Statistics::Descriptive; 11 | use File::Temp qw/tempdir tempfile/; 12 | 13 | local $0=basename $0; 14 | sub logmsg{print STDERR "$0: @_\n";} 15 | exit(main()); 16 | 17 | sub main{ 18 | my $settings={}; 19 | GetOptions($settings,qw(help root-on|root-with|root=s)) or die $!; 20 | 21 | die usage() if(!@ARGV || $$settings{help}); 22 | 23 | $$settings{'root-on'} || die "ERROR: parameter --root-on is required. --help for more information."; 24 | 25 | my(@query)=@ARGV; 26 | my $treeout=Bio::TreeIO->new(-format=>"newick"); 27 | 28 | # For each query, reroot and print 29 | for my $q(@query){ 30 | my $treein = Bio::TreeIO->new(-file=>$q,-format=>"newick"); 31 | my $treeCounter=0; 32 | while(my $treeObject = $treein->next_tree){ 33 | $treeCounter++; 34 | my @leaves = sort {$a->id cmp $b->id} grep {$_->is_Leaf} $treeObject->get_nodes; 35 | if(scalar(@leaves) < 2){ 36 | if(scalar($treeObject->get_nodes) < 3){ 37 | logmsg "Skipping: only ".scalar($treeObject->get_nodes)." nodes found in tree $treeCounter in $q"; 38 | logmsg " Possible reason: empty line in tree file"; 39 | next; 40 | } 41 | die "ERROR: there are fewer than 2 leaves on tree $treeCounter in $q:\n".Dumper [map{$_->id} @leaves]; 42 | } 43 | my @node = grep {$_->id eq $$settings{'root-on'}} @leaves; 44 | if(@node > 1){ 45 | die "ERROR: found multiple nodes named ".$$settings{'root-on'}." 
in $q"; 46 | } 47 | logmsg "Rerooting tree $treeCounter in $q"; 48 | 49 | my $was_rerooted=$treeObject->reroot($node[0]); 50 | if(!$was_rerooted){ 51 | die "ERROR: could not reroot tree $treeCounter in $q"; 52 | } 53 | 54 | $treeout->write_tree($treeObject); 55 | 56 | } 57 | } 58 | 59 | return 0; 60 | } 61 | 62 | sub usage{ 63 | "$0: Roots a set of trees on the same leaf. 64 | Output trees will be in the same order as tree parameters 65 | Usage: $0 --root-on LEAF tree.dnd [tree2.dnd...] > trees.dnd 66 | --root-on '' The name of the leaf 67 | " 68 | } 69 | 70 | -------------------------------------------------------------------------------- /scripts/rowMath.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | #Author: Lee Katz 3 | 4 | use strict; 5 | use warnings; 6 | use Data::Dumper; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | 10 | local $0=basename $0; 11 | sub logmsg{print STDERR "$0: @_\n";} 12 | 13 | exit main(); 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help test operation=s)); 17 | die usage() if($$settings{help}); 18 | $$settings{operation}||="\$next - \$cur"; 19 | #$$settings{operation}=quotemeta($$settings{operation}); 20 | 21 | if($$settings{test}){ 22 | test($settings); 23 | } else { 24 | printDifferences($settings); 25 | } 26 | return 0; 27 | } 28 | 29 | sub printDifferences{ 30 | my($settings)=@_; 31 | 32 | my $cur=<>; chomp($cur); 33 | while(my $next=<>){ 34 | chomp($next); 35 | 36 | # do the math 37 | my $answer=eval($$settings{operation}); 38 | if($@){ 39 | die "ERROR: $$settings{operation} resulted in a failure: $@"; 40 | } 41 | print "$answer\n"; 42 | $cur=$next; 43 | } 44 | } 45 | 46 | sub test{ 47 | my($settings)=@_; 48 | my $cmd=" echo -e '1\n5\n7\n33\n33\n33\n37' | $0"; 49 | logmsg "COMMAND:\n====\n$cmd\n===="; 50 | system($cmd); 51 | } 52 | 53 | sub usage{ 54 | "Calculates the difference between rows or a custom arithmetic 55 | Usage: sort 
-n numbers.txt | $0 > difference.txt 56 | -o 'custom arithmetic' 57 | Variables: \$cur is the first row in the iteration 58 | \$next is the second row 59 | Example: \$next - \$cur 60 | " 61 | } 62 | 63 | -------------------------------------------------------------------------------- /scripts/splitBionumericsFasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Splits up a bionumerics fasta file into contigs 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Bio::Tools::GuessSeqFormat; 9 | use File::Basename; 10 | use autodie; 11 | use Getopt::Long; 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help outdir=s)) or die $!; 18 | 19 | die usage() if(!@ARGV || $$settings{help}); 20 | 21 | if($$settings{outdir}){ 22 | mkdir $$settings{outdir}; 23 | } 24 | 25 | # Because we are reading bionumerics files and do not 26 | # trust their extensions, make a better guess at their 27 | # format using the format guesser. 
28 | my $formatGuesser=Bio::Tools::GuessSeqFormat->new(-file=>$ARGV[0]); 29 | my $format =$formatGuesser->guess; 30 | 31 | my $in=Bio::SeqIO->new(-file=>$ARGV[0],-format=>$format); 32 | while(my $seq=$in->next_seq){ 33 | my @seq=split(/\|/,$seq->seq); 34 | my $id=$seq->id; 35 | $id=~s/^denovo\|//; # remove 'denovo' since most exports seem to have that 36 | print STDERR "$id\n"; 37 | 38 | my $out=Bio::SeqIO->new(-format=>"fasta"); 39 | if($$settings{outdir}){ 40 | # For potential filenames, get a safe name 41 | my $id_safe=$id; 42 | $id_safe=~s/[^\w\d]//g; # remove non words, non letters 43 | 44 | my $outfile="$$settings{outdir}/$id_safe.fasta"; 45 | $out=Bio::SeqIO->new(-format=>"fasta",-file=>">$outfile"); 46 | } 47 | for(my $i=1;$i<=@seq;$i++){ 48 | my $subseq=Bio::Seq->new(-seq=>$seq[$i-1],-id=>$id."_".$i); 49 | $out->write_seq($subseq); 50 | } 51 | $out->close; 52 | } 53 | 54 | return 0; 55 | } 56 | 57 | sub usage{ 58 | local $0=fileparse $0; 59 | "Usage: $0 bionumerics.fasta > out.fasta 60 | --outdir '' If given, all genomes will be written here. 61 | If blank, output will be sent to stdout. 
62 | " 63 | } 64 | -------------------------------------------------------------------------------- /scripts/splitPolytomies.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Bio::Perl; 5 | use Bio::TreeIO; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | use Math::Round qw/ceil/; 9 | 10 | local $0=basename($0); 11 | sub logmsg{print STDERR "$0: @_\n"} 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help)); 18 | 19 | my $$settings{bootstrap}//=70; 20 | die usage() if($$settings{help}); 21 | 22 | my @tree=@ARGV; 23 | for my $tree(@tree){ 24 | my $in=Bio::TreeIO->new(-file=>$tree); 25 | while(my $treeObj=$in->next_tree){ 26 | $treeObj=splitPolytomies($treeObj,undef,$settings); 27 | } 28 | } 29 | 30 | return 0; 31 | } 32 | 33 | sub splitPolytomies{ 34 | my($treeObj,$node,$settings)=@_; 35 | 36 | my $node ||= $treeObj->get_root_node; 37 | 38 | my @descs = $node->each_Descendent; 39 | if (@descs > 2) { 40 | # Many nodes have no identifying names, a simple warning is probably 41 | # enough. 
42 | 43 | $treeObj->warn("Node has more than two descendants\nWill do an arbitrary balanced split"); 44 | my @working = @descs; 45 | # create an even set of artifical nodes on which to later hang the descs 46 | my $half = ceil(@working / 2); 47 | my @artificials; 48 | while ($half > 1) { 49 | my @this_level; 50 | foreach my $top_node (@artificials || $node) { 51 | for (1..2) { 52 | my $art = $top_node->new(-id => "artificial_".++$treeObj->{_art_num}); 53 | $top_node->add_Descendent($art); 54 | push(@this_level, $art); 55 | } 56 | } 57 | @artificials = @this_level; 58 | $half--; 59 | } 60 | # attach two descs to each artifical leaf 61 | foreach my $art (@artificials) { 62 | for (1..2) { 63 | my $desc = shift(@working) || $node->new(-id => "artificial_".++$treeObj->{_art_num}); 64 | $desc->ancestor($art); 65 | } 66 | } 67 | } 68 | elsif (@descs == 1) { 69 | # ensure that all nodes have 2 descs 70 | $node->add_Descendent($node->new(-id => "artificial_".++$treeObj->{_art_num})); 71 | } 72 | # recurse 73 | foreach my $desc (@descs) { 74 | splitPolytomies($treeObj,$desc,$settings); 75 | } 76 | 77 | return $treeObj; 78 | } 79 | 80 | sub usage{ 81 | "$0: split polytomies in a predictable way 82 | Usage: $0 tree.dnd [tree2.dnd...] 
> out.dnd 83 | --bootstrap 70 The minimum bootstrap value where a 84 | clade will be considered a polytomy 85 | " 86 | } 87 | -------------------------------------------------------------------------------- /scripts/tanglegram_ape.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Authors: Beau Bruce and Weidong Gu 3 | # Modified by Lee Katz 4 | 5 | library(argparse, quietly=TRUE) 6 | parser <- ArgumentParser() 7 | parser$add_argument("-t1", "--tree1", default=FALSE, 8 | help="First tree") 9 | parser$add_argument("-t2", "--tree2", default=FALSE, 10 | help="Second tree") 11 | parser$add_argument("-o", "--outfile", default=FALSE, 12 | help="output png file") 13 | args <- parser$parse_args() 14 | 15 | treefile1 <- args$tree1 16 | treefile2 <- args$tree2 17 | outfile <- args$outfile 18 | 19 | myReturn <- suppressPackageStartupMessages(c( 20 | library(phytools, quietly=TRUE), 21 | library(ape, quietly=TRUE) 22 | )); 23 | 24 | outbreak <- read.delim('/scicomp/home/gzu2/projects/mashtree/data/katzEtAl/Lyve-SET/outbreakStatus.tsv', 25 | sep="\t", header=T, stringsAsFactors=F) 26 | tree1 <- ladderize(midpoint.root(read.tree(treefile1))) 27 | tree2 <- ladderize(midpoint.root(read.tree(treefile2))) 28 | 29 | #tree1 <- reorder(tree1, "postorder") 30 | #tree2 <- reorder(tree2, "postorder") 31 | 32 | # Default minimum length 33 | min_length <- 0.000000000000000000001 34 | tree1$edge.length[ tree1$edge.length < min_length ] <- min_length 35 | tree2$edge.length[ tree2$edge.length < min_length ] <- min_length 36 | 37 | outbreakIndex <- match(outbreak$sample[outbreak$outbreak== 1],tree1$tip.label) 38 | nonoutbreakIndex <- match(outbreak$sample[outbreak$outbreak== 0],tree1$tip.label) 39 | maybeoutbreakIndex <- match(outbreak$sample[outbreak$outbreak==-1],tree1$tip.label) 40 | 41 | myColors <- c() 42 | myColors[outbreakIndex] <- 'red' 43 | myColors[nonoutbreakIndex] <- 'blue' 44 | myColors[maybeoutbreakIndex] <- 'gray' 
45 | 46 | association <- cbind(tree1$tip.label, tree1$tip.label) 47 | png(outfile); 48 | cophyloplot(tree1, tree2, assoc = association, space = 100, length.line=0, gap=1, show.tip.label=F, col = myColors); 49 | myReturn <- dev.off(); 50 | 51 | -------------------------------------------------------------------------------- /scripts/tanglegram_code.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Authors: Beau Bruce and Weidong Gu 3 | # Modified by Lee Katz 4 | 5 | library(argparse, quietly=TRUE) 6 | parser <- ArgumentParser() 7 | parser$add_argument("-t1", "--tree1", default=FALSE, 8 | help="First tree") 9 | parser$add_argument("-t2", "--tree2", default=FALSE, 10 | help="Second tree") 11 | parser$add_argument("-o", "--outfile", default=FALSE, 12 | help="output png file") 13 | args <- parser$parse_args() 14 | 15 | treefile1 <- args$tree1 16 | treefile2 <- args$tree2 17 | outfile <- args$outfile 18 | 19 | print("Loading libraries") 20 | #library(phytools, quietly=TRUE) 21 | #library(dendextend,quietly=TRUE) 22 | #library(ape, quietly=TRUE) 23 | myReturn <- suppressPackageStartupMessages(c( 24 | library(phytools, quietly=TRUE), 25 | library(dendextend,quietly=TRUE), 26 | library(ape, quietly=TRUE) 27 | )); 28 | 29 | #eid_iso=read.delim('\\\\cdc.gov\\project\\CCID_NCZVED_DFBMD_EDEB\\Analytics\\Weidong\\LM model\\ncbi_access_dat.csv',sep=',',header=T,stringsAsFactors = F) 30 | outbreak <- read.delim('/scicomp/home/gzu2/projects/mashtree/data/katzEtAl/Lyve-SET/outbreakStatus.tsv', 31 | sep="\t", header=T, stringsAsFactors=F) 32 | tree1 <- reorder(midpoint.root(read.tree(treefile1)), order = "cladewise") 33 | tree2 <- reorder(midpoint.root(read.tree(treefile2)), order = "cladewise") 34 | 35 | dend_tree1 <- force.ultrametric(tree1) 36 | dend_tree2 <- force.ultrametric(tree2) 37 | 38 | min_length <- 0.000000000000000000001 39 | dend_tree1$edge.length[ dend_tree1$edge.length < min_length ] <- min_length 40 | 
dend_tree2$edge.length[ dend_tree2$edge.length < min_length ] <- min_length 41 | 42 | dend_tree1=(midpoint.root(dend_tree1)) 43 | dend_tree2=(midpoint.root(dend_tree2)) 44 | 45 | my.col=c('blue','brown','green','pink','red') 46 | 47 | outbreakIndex <- match(outbreak$sample[outbreak$outbreak== 1],dend_tree1$tip.label) 48 | nonoutbreakIndex <- match(outbreak$sample[outbreak$outbreak== 0],dend_tree1$tip.label) 49 | maybeoutbreakIndex <- match(outbreak$sample[outbreak$outbreak==-1],dend_tree1$tip.label) 50 | myColors <- c() 51 | myColors[outbreakIndex] <- 'red' 52 | myColors[nonoutbreakIndex] <- 'blue' 53 | myColors[maybeoutbreakIndex] <- 'green' 54 | #outbreak$color[ outbreakIndex ] <- 'red' 55 | #outbreak$color[nonoutbreakIndex]<- 'blue' 56 | #col.s=my.col[as.factor(conn_l_col)] 57 | 58 | print("untangle") 59 | dendl <- dendextend::untangle(as.dendrogram(dend_tree1), 60 | as.dendrogram(dend_tree2), 61 | method = "step2side") 62 | #method = "labels") 63 | #method = "ladderize") 64 | #method = "random") 65 | #method = "step1side") 66 | #method = "DendSer") 67 | 68 | # Make the branches look nice 69 | dendl %>% set("branches_lwd", 1) %>% 70 | set("labels_col", "white") -> dendl 71 | 72 | print("entanglement..."); 73 | myEntanglement <- entanglement(dendl) 74 | cophenetic <- cor.dendlist(dendl, method = "cophenetic") 75 | baker <- cor.dendlist(dendl, method = "baker") 76 | 77 | # Start off the viz 78 | png(outfile) 79 | tanglegram(dendl, 80 | main_left='Lyve-SET', 81 | main_right='Mashtree', 82 | lab.cex=0.3, 83 | highlight_distinct_edges = FALSE, 84 | color_lines=myColors 85 | ) 86 | #myReturn <- text("SOMETHING", x=1, y=1) 87 | myReturn <- dev.off(); 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /scripts/translate-kraken-contigs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | 7 | my %length; 
8 | my %percentage; 9 | open(my $fh, $ARGV[0]) or die "ERROR: could not read $ARGV[0]: $!"; 10 | while(<$fh>){ 11 | chomp; 12 | my($classified,$seqname,$taxid,$length,$kmerTaxid)=split(/\t/,$_); 13 | if($classified eq 'U'){ 14 | $percentage{'unclassified'}+=$length; 15 | } else { 16 | $length{$seqname}=$length; 17 | } 18 | } 19 | close $fh; 20 | 21 | 22 | # kraken-translate but tally all the sequence lengths 23 | open(my $translateFh, "kraken-translate $ARGV[0] | ") or die "ERROR: could not run kraken-translate on $ARGV[0]:$!"; 24 | while(<$translateFh>){ 25 | chomp; 26 | my($seqname,$taxonomyString)=split(/\t/,$_); 27 | $taxonomyString=~s/\s+/_/g; 28 | $taxonomyString=~s/;/\t/g; 29 | $percentage{$taxonomyString}+=$length{$seqname}; 30 | } 31 | close $translateFh; 32 | 33 | # Make the file 34 | while(my($taxonomyString,$sliceOfPie)=each(%percentage)){ 35 | print join("\t",$sliceOfPie,$taxonomyString)."\n"; 36 | } 37 | -------------------------------------------------------------------------------- /scripts/treeDistanceMatrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://www.biostars.org/p/6661/#142113 3 | use strict; 4 | use warnings; 5 | use Bio::TreeIO; 6 | use Data::Dumper; 7 | 8 | sub logmsg{print STDERR "$0: @_\n";} 9 | 10 | die "Usage: $0 tree.dnd" if(!@ARGV || $ARGV[0]=~/\-+h/); 11 | my $treeObj = Bio::TreeIO->new(-file=>$ARGV[0])->next_tree; 12 | $treeObj->force_binary; 13 | my $tree = $treeObj->as_text("newick")."\n"; 14 | #my $tree = $treeObj->simplify_to_leaves_string(); 15 | chomp($tree); 16 | 17 | die "Usage: $0 tree.dnd" if(!$tree); 18 | 19 | ##record the distance of parentheses 20 | my %dis; 21 | my $par = -1; 22 | my @current; 23 | while($tree =~ /./g) 24 | {if ($& eq '(') 25 | {$par ++; 26 | next if $par == 0; 27 | $current[$#current+1] = $par; 28 | } 29 | elsif($& eq ')') 30 | {(my $tem) = $' =~ /:(\d+\.\d+|\d+)/; 31 | next if $#current == -1; 32 | 
$dis{'node_'.$current[$#current]} = $tem; 33 | pop @current; 34 | } 35 | } 36 | 37 | ##record the distance of leaves 38 | my @order; 39 | while ($tree =~ /([^\(\):,]+):(\d+\.\d+|\d+)/g) 40 | {$dis{$1} = $2; 41 | $order[$#order+1] = $1; 42 | } 43 | 44 | ##record parents of leaves 45 | my %pare; 46 | @current = (); 47 | $par = -1; 48 | while($tree =~ /(\(|\)|([^\(\):,]+):)/g) 49 | {if ($& eq '(') 50 | {$par ++; 51 | next if $par == 0; 52 | $current[$#current+1] = $par; 53 | } 54 | elsif($& eq ')') 55 | {pop @current; 56 | } 57 | else{map {$pare{$2}{$_} = 1} @current; 58 | $pare{$2} = [@current]; 59 | } 60 | } 61 | 62 | ##Distance matrix 63 | my %dis2; 64 | foreach my $i (0..$#order) 65 | {foreach my $j ($i..$#order) 66 | {if ($i == $j) 67 | {$dis2{$order[$i]}{$order[$j]} = 0; 68 | } 69 | else{my $tem = $dis{$order[$i]} + $dis{$order[$j]}; 70 | my $tem2 = -1; 71 | foreach my $k (0..$#{$pare{$order[$i]}}) 72 | {last if ($k > $#{$pare{$order[$j]}}); 73 | if ($pare{$order[$i]}[$k] eq $pare{$order[$j]}[$k]) 74 | {$tem2 = $k; 75 | } 76 | } 77 | if ($#{$pare{$order[$i]}} != -1) 78 | {map {$tem += $dis{'node_'.$_}} map {$pare{$order[$i]}[$_]} ($tem2+1)..$#{$pare{$order[$i]}}; 79 | } 80 | if ($#{$pare{$order[$j]}} != -1) 81 | {map {$tem += $dis{'node_'.$_}} map {$pare{$order[$j]}[$_]} ($tem2+1)..$#{$pare{$order[$j]}}; 82 | } 83 | $dis2{$order[$i]}{$order[$j]} = $dis2{$order[$j]}{$order[$i]} = $tem; 84 | } 85 | } 86 | } 87 | 88 | ##output 89 | print join("\t",'',@order),"\n"; 90 | foreach my $i (@order) 91 | {print join("\t",$i,map {$dis2{$i}{$_}} @order),"\n"; 92 | } 93 | -------------------------------------------------------------------------------- /scripts/ttrToMiSeq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Transform a TTR simulation to a MiSeq run 4 | 5 | if [ "$2" == "" ]; then 6 | echo "Transform a TTR directory to a MiSeq run" 7 | echo "Usage: $0 TTR out.miseq" 8 | exit 1; 9 | fi 10 | 11 | IN=$1 12 
| OUT=$2 13 | 14 | mkdir -p /tmp/$USER 15 | tmpdir=$(mktemp --directory --tmpdir=/tmp/$USER ttrToMiSeq.XXXXXX) 16 | 17 | 18 | # Sample sheet 19 | RUNNAME=$(basename IN) 20 | READLENGTH=$(grep read_length $IN/TTR.cfg | grep -o [0-9]*) 21 | if [ $? -gt 0 ]; then echo "ERROR reading $IN/TTR.cfg"; exit 1; fi; 22 | DATE=$(date +'%m/%d/%Y') 23 | 24 | CSV="$tmpdir/SampleSheet.csv" 25 | echo -ne "[Header] 26 | IEMFileVersion,4 27 | Investigator Name,TreeToReads 28 | Experiment Name,$RUNNAME 29 | Date,$DATE 30 | Workflow,GenerateFASTQ 31 | Application,FASTQ Only 32 | Assay,Nextera XT 33 | Description,$RUNNAME 34 | Chemistry,Amplicon 35 | 36 | [Reads] 37 | $READLENGTH 38 | $READLENGTH 39 | 40 | [Settings] 41 | ReverseComplement,0 42 | Adapter ATCGATCGATCG 43 | 44 | [Data] 45 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description 46 | " > $CSV 47 | 48 | SAMPLE=$(ls $IN/fastq | xargs -n 1 basename) 49 | for i in $SAMPLE; do 50 | echo -e "$i,,$RUNNAME,A01,N801,ATCGAAA,S801,ATCGAAA,1," >> $CSV 51 | done 52 | 53 | # Fastq files 54 | FASTQDIR="$tmpdir/Data/Intensities/BaseCalls" 55 | mkdir -p $FASTQDIR 56 | 57 | for i in $SAMPLE; do 58 | set -e 59 | cp -v $IN/fastq/$i/*_1.fq.gz $FASTQDIR/${i}_S1_L001_R1_001.fastq.gz 60 | cp -v $IN/fastq/$i/*_2.fq.gz $FASTQDIR/${i}_S1_L001_R2_001.fastq.gz 61 | set +e 62 | done; 63 | 64 | echo "$tmpdir -> $OUT" 65 | mv $tmpdir $OUT 66 | 67 | -------------------------------------------------------------------------------- /tests/all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | scriptDir=$(dirname $0); 6 | export PATH=$PATH:$scriptDir/../scripts 7 | 8 | echo PATH 9 | echo $PATH 10 | echo PATH 11 | 12 | which randTrees.pl 13 | 14 | # Find all unit tests under this directory 15 | # and simply run them with -e and -x 16 | executables=$(find $(dirname $0)/unittests -maxdepth 1 -type f -name '*.sh') 17 | for exe in 
#!/bin/bash

# Run every unit-test script under tests/unittests/ and exit nonzero
# on the first failure.

set -e

scriptDir=$(dirname "$0");
export PATH=$PATH:$scriptDir/../scripts

# Debug aid for CI logs: confirm ../scripts made it onto the PATH.
echo PATH
echo "$PATH"
echo PATH

# Fail fast (via set -e) if the scripts dir is not actually on the PATH.
which randTrees.pl

# Find all unit tests under this directory and simply run them.
executables=$(find "$scriptDir/unittests" -maxdepth 1 -type f -name '*.sh')
for exe in $executables; do
  # Bugfix: under `set -e` a failing $exe aborted the script before the
  # old `if [ $? -gt 0 ]` could run (dead code). Testing the command
  # directly keeps the explicit exit path live.
  if ! $exe; then
    exit 1
  fi
done;

exit 0
-------------------------------------------------------------------------------- /tests/unittests/input/kendall-colijn1.dnd: -------------------------------------------------------------------------------- 1 | ((A:1.2,B:0.8):0.5,(C:0.8,D:1):1.1); 2 | -------------------------------------------------------------------------------- /tests/unittests/input/kendall-colijn2.dnd: -------------------------------------------------------------------------------- 1 | (((A:0.8,B:1.4):0.3,C:0.7):0.9,D:1); 2 | 3 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/FA1090/kraken.report: -------------------------------------------------------------------------------- 1 | 0.00 0 0 U 0 unclassified 2 | 100.00 43078 6 - 1 root 3 | 99.99 43072 7 - 131567 cellular organisms 4 | 99.97 43065 122 D 2 Bacteria 5 | 99.69 42943 406 P 1224 Proteobacteria 6 | 98.74 42537 107 C 28216 Betaproteobacteria 7 | 98.50 42430 38 O 206351 Neisseriales 8 | 98.41 42392 365 F 481 Neisseriaceae 9 | 97.56 42027 22725 G 482 Neisseria 10 | 44.81 19302 18696 S 485 Neisseria gonorrhoeae 11 | 1.41 606 606 - 242231 Neisseria gonorrhoeae FA 1090 12 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/FA1090/kraken.taxonomy: -------------------------------------------------------------------------------- 1 | 22725 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria 2 | 18696 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria Neisseria gonorrhoeae 3 | 606 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria Neisseria gonorrhoeae Neisseria gonorrhoeae FA 1090 4 | 406 root cellular organisms Bacteria Proteobacteria 5 | 365 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae 6 | 122 root cellular organisms 
Bacteria 7 | 107 root cellular organisms Bacteria Proteobacteria Betaproteobacteria 8 | 38 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales 9 | 7 root cellular organisms 10 | 6 root 11 | 0 12 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/contaminated/kraken.filtered.report: -------------------------------------------------------------------------------- 1 | 100.00 111249 91 - 1 root 2 | 99.92 111158 1 - 131567 cellular organisms 3 | 99.92 111157 64 D 2 Bacteria 4 | 99.86 111093 101 P 1224 Proteobacteria 5 | 61.18 68060 22 C 1236 Gammaproteobacteria 6 | 61.16 68038 7 O 118969 Legionellales 7 | 61.15 68031 1 F 444 Legionellaceae 8 | 61.15 68030 87 G 445 Legionella 9 | 61.07 67943 67943 S 446 Legionella pneumophila 10 | 38.59 42932 47 C 28216 Betaproteobacteria 11 | 38.55 42885 0 O 206351 Neisseriales 12 | 38.55 42885 0 F 481 Neisseriaceae 13 | 38.55 42885 13056 G 482 Neisseria 14 | 26.81 29829 29829 S 485 Neisseria gonorrhoeae 15 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/contaminated/kraken.report: -------------------------------------------------------------------------------- 1 | 0.00 0 0 U 0 unclassified 2 | 100.00 111249 91 - 1 root 3 | 99.92 111158 1 - 131567 cellular organisms 4 | 99.92 111157 64 D 2 Bacteria 5 | 99.86 111093 101 P 1224 Proteobacteria 6 | 61.18 68060 22 C 1236 Gammaproteobacteria 7 | 61.16 68038 7 O 118969 Legionellales 8 | 61.15 68031 1 F 444 Legionellaceae 9 | 61.15 68030 87 G 445 Legionella 10 | 61.07 67943 67943 S 446 Legionella pneumophila 11 | 38.59 42932 47 C 28216 Betaproteobacteria 12 | 38.55 42885 0 O 206351 Neisseriales 13 | 38.55 42885 0 F 481 Neisseriaceae 14 | 38.55 42885 13056 G 482 Neisseria 15 | 26.81 29829 29829 S 485 Neisseria gonorrhoeae 16 | -------------------------------------------------------------------------------- 
#!/bin/bash
# Bats unit test for scripts/randTrees.pl: generate one random tree from a
# known 4-taxon Newick input and sanity-check the size of the output.

dir=$BATS_TEST_DIRNAME
# Put the repo's scripts dir first on the PATH for this test run.
export PATH=$dir/../../scripts:$PATH

@test "randTrees" {
  # One random tree derived from input/kendall-colijn1.dnd.
  tree=$($dir/../../scripts/randTrees.pl --numTrees 1 $dir/input/kendall-colijn1.dnd)
  # NOTE(review): $tree is deliberately(?) unquoted in the herestring, so
  # word-splitting collapses internal whitespace before the bytes are
  # counted; the 140-150 window below presumably assumes that — confirm
  # before quoting.
  bytes=$(wc -c <<< $tree)
  # Random branch lengths make the exact size nondeterministic, so assert
  # a plausible byte window rather than an exact value.
  [ "$bytes" -ge 140 ]

  [ "$bytes" -le 150 ]

}