├── .github └── workflows │ └── testing.yml ├── .gitignore ├── LICENSE ├── README.md ├── lib ├── Statistics │ ├── Descriptive.pm │ └── Descriptive │ │ ├── Smoother.pm │ │ ├── Smoother │ │ ├── Exponential.pm │ │ └── Weightedexponential.pm │ │ └── Weighted.pm └── Vcf.pm ├── misc ├── EmpMiSeq250R1.txt └── EmpMiSeq250R2.txt ├── qsub ├── array │ ├── launch_FastAniAllVsAll.sh │ ├── launch_TTR.sh │ ├── launch_art.sh │ ├── launch_downsampling.sh │ ├── launch_etoki_mlst_array.sh │ ├── launch_fastq-dump_split.sh │ ├── launch_genotyphi.sh │ ├── launch_gzip.sh │ ├── launch_ksnp.sh │ ├── launch_mash_fasta.sh │ ├── launch_pymlst.sh │ ├── launch_realphy.sh │ ├── launch_set_alreadyShuffled.sh │ ├── launch_set_qsubarray.sh │ ├── launch_shovill_array.sh │ ├── launch_shuffleReads.pl │ ├── launch_skesa.sh │ ├── launch_skesa_sra.sh │ ├── launch_snp-pipeline.sh │ ├── launch_spades_split.sh │ ├── launch_wgmlst.sh │ ├── lyvesetVsLyveset.pl │ ├── lyvesetVsSimulations.pl │ ├── makeConfigs.pl │ └── snppipelineVsSimulations.pl ├── launch_SRST2.sh ├── launch_annotation.sh ├── launch_baym.sh ├── launch_chewbbaca.simple.sh ├── launch_circlator.sh ├── launch_colorid_mlst.sh ├── launch_downloadSrr.sh ├── launch_etoki_mlst.sh ├── launch_fastqToFasta.sh ├── launch_freyja.sh ├── launch_kraken.sh ├── launch_kraken2.sh ├── launch_kraken_contigs.sh ├── launch_mergeFastaReads.sh ├── launch_minion_guppy_wtdbg2_nanopolish.sh ├── launch_minion_wtdbg2.sh ├── launch_parsnp.sh ├── launch_polish.pl ├── launch_predict.sh ├── launch_prokka.sh ├── launch_shovill.sh ├── launch_skesa.sh ├── launch_spades.sh ├── launch_spades_SE.sh ├── launch_spades_iontorrent.sh ├── launch_spades_split.sh ├── launch_trimClean.sh ├── launch_velvet.sh ├── modules.csh ├── modules.sh └── sub_unicycler.sh ├── scripts ├── Kendall.R ├── Kendall.pl ├── Kuhner-Felsenstein.sh ├── MCM.sh ├── addCutSites.pl ├── alignmentToPhyloviz.pl ├── alignment_stats.pl ├── allelesDifference.pl ├── anagramChecker.pl ├── art_profile.pl ├── avgstdev.pl ├── 
bamStats.pl ├── blastAndExtract.pl ├── bp_jackknifeTrees.pl ├── clusterDensityFromFastq.pl ├── colorid.mlst.pl ├── comparePredictions.pl ├── constraintTree.pl ├── convertAlignment.pl ├── countATCG.pl ├── detectAdapters.pl ├── directoryDuration.pl ├── distance.bn.pl ├── distance.chewbbaca.pl ├── distance.coloridmlst.pl ├── distance.etoki.pl ├── distance.wgmlst.pl ├── downloadReadsFromBioproject.pl ├── downloadSra.pl ├── downloadSrrRemotely.sh ├── exportBioNumericsFastaWithCoverage.pl ├── extractSequence.pl ├── fastacmd.pl ├── fastqDump-SE.sh ├── fastqDump.sh ├── fastqMaxCompression.sh ├── fastqToFastaQual.pl ├── filterContigs.pl ├── filterKrakenOutput.pl ├── findpids.sh ├── fixKsnpVcf.pl ├── fixProkkaHeader.pl ├── flattenTree.pl ├── formatFastaForKraken.pl ├── genomeDist.pl ├── getGenesFromGb.pl ├── kaptivate_wrapper.pl ├── kraken-report-contamination.pl ├── kraken2-translate.pl ├── ksnpsToVcf.pl ├── lasergeneToFna.pl ├── lyve_splitgbk.pl ├── mashesToAlignment.pl ├── matrix.etoki.pl ├── md5sumDir.pl ├── md5sumDir.sh ├── mlstToTree.pl ├── mummerToVcf.pl ├── mvSymlink.pl ├── normalizeDepth.pl ├── pairwiseDistances.mlst.pl ├── parseMultiblast.partialAnswer.pl ├── pfgeOnGenome.pl ├── phylipDistToTallSkinny.pl ├── phylogeneticOrder.pl ├── pruneSafely.pl ├── pwdLinux.sh ├── pwdWindows.sh ├── qsubStats.sh ├── randFastq.pl ├── randTrees.pl ├── readLingual.pl ├── remoteMlst.pl ├── renameTreeNodes.pl ├── replaceReadsWithReference.pl ├── representativeTaxa.pl ├── rerootTree.pl ├── rootTheSameWay.pl ├── rowMath.pl ├── sangerPrimers.pl ├── sortFastq.pl ├── sortFastqByCommonKmers.pl ├── sortFastq_lowDisk.pl ├── splitBionumericsFasta.pl ├── splitPolytomies.pl ├── srr_to_tsv.pl ├── subtractContigs.pl ├── tanglegram.pl ├── tanglegram_ape.R ├── tanglegram_code.R ├── tbl2gff3.pl ├── translate-kraken-contigs.pl ├── treeDistance.pl ├── treeDistanceMatrix.pl ├── treeInfo.pl ├── treedist_wrapper.pl ├── ttrToMiSeq.sh ├── validateFastq.pl └── validateTaxonomy.pl ├── tests ├── all.sh └── 
unittests │ ├── Kendall.pl.bats │ ├── avgstdev.pl.bats │ ├── input │ ├── 2011C-3609.fasta │ ├── CFSAN023463.fasta │ ├── NC001416.fasta │ ├── NC_045512.fasta │ ├── SRR27366697.10x.fastq.gz │ ├── kendall-colijn1.dnd │ ├── kendall-colijn2.dnd │ └── kraken │ │ ├── FA1090 │ │ ├── kraken.report │ │ └── kraken.taxonomy │ │ └── contaminated │ │ ├── kraken.filtered.report │ │ ├── kraken.report │ │ └── kraken.taxonomy │ └── randTrees.pl.bats └── unfinishedScripts ├── gibbs.bak.pl └── gibbs.pl /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | dev 7 | 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | os: ['ubuntu-20.04' ] 15 | perl: [ '5.36.0' ] 16 | defaults: 17 | run: 18 | shell: bash -el {0} 19 | name: ${{ matrix.os }} perl ${{ matrix.perl }} 20 | steps: 21 | - name: Get Date 22 | id: get-date 23 | run: | 24 | today=$(/bin/date -u '+%Y%m%d') 25 | echo $today 26 | echo "today=$today" >> $GITHUB_OUTPUT 27 | - name: set up conda 28 | uses: conda-incubator/setup-miniconda@v2 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | use-mamba: true 32 | miniforge-variant: Mambaforge 33 | miniforge-version: latest 34 | channel-priority: strict 35 | channels: conda-forge,bioconda,defaults 36 | mamba-version: "*" 37 | auto-activate-base: true 38 | activate-environment: "~/conda_pkgs_dir/my-env" 39 | use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
40 | - name: conda info 41 | run: | 42 | conda info 43 | echo 44 | conda list 45 | echo 46 | conda config --show 47 | - name: Cache Conda env 48 | id: cache-conda 49 | uses: actions/cache@v3 50 | with: 51 | path: | 52 | ~/conda_pkgs_dir 53 | ~/.conda 54 | ~/.condarc 55 | #/usr/share/miniconda 56 | key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-perl_v${{ matrix.perl }}--${{env.CACHE_NUMBER}} 57 | env: 58 | CACHE_NUMBER: 2 59 | - name: conda installations 60 | shell: bash -el {0} 61 | if: steps.cache-conda.outputs.cache-hit != 'true' 62 | run: | 63 | mamba install -y perl-app-cpanminus perl-bioperl perl-statistics-descriptive bats-core 64 | - name: check installation 65 | shell: bash -el {0} 66 | run: | 67 | which perl 68 | perl -v 69 | which cpanm 70 | echo 71 | which python 72 | python -V 73 | echo 74 | - name: checkout my repo 75 | uses: actions/checkout@v3 76 | - name: apt-get install 77 | run: | 78 | sudo apt-get update 79 | sudo apt-get -y install ca-certificates 80 | cpanm --verbose 'Math::Gauss' 81 | - name: Run tests 82 | run: | 83 | bats tests/unittests/*.bats 84 | 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | blib/ 2 | .build/ 3 | _build/ 4 | cover_db/ 5 | inc/ 6 | Build 7 | !Build/ 8 | Build.bat 9 | .last_cover_stats 10 | Makefile 11 | Makefile.old 12 | MANIFEST.bak 13 | META.yml 14 | MYMETA.yml 15 | nytprof.out 16 | pm_to_blib 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Lee Katz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including 
without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /qsub/array/launch_FastAniAllVsAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/profile.d/modules.sh 4 | if [ $? 
-gt 0 ]; then 5 | echo "ERROR: cannot load the modules system"; 6 | exit 1; 7 | fi 8 | 9 | module purge 10 | 11 | if [ "$3" == "" ]; then 12 | echo "Usage: $0 out.tsv in1.fasta in2.fasta [in3.fasta...]" 13 | echo " Runs ANI on all vs all and places that output in out.tsv" 14 | exit 15 | fi 16 | 17 | OUT=$1 18 | shift 19 | 20 | if [ -e "$OUT" ]; then 21 | echo "ERROR: $OUT already exists" 22 | exit 1 23 | fi 24 | 25 | LOGDIR=$(mktemp --directory $(basename $0 .sh).XXXXXX) 26 | CTRL_FILE="$LOGDIR/array.txt" 27 | mkdir $LOGDIR/log 28 | mkdir $LOGDIR/out 29 | echo "log directory is $LOGDIR/log" 30 | 31 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 32 | 33 | qsub -q edlb.q -q all.q -N FastANIarray -o $LOGDIR/out -e $LOGDIR/log -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 34 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 35 | #!/bin/bash 36 | set -e 37 | 38 | QUERY=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 39 | 40 | echo "ANI for query $QUERY" >&2 41 | hostname >&2 42 | 43 | fastANI -q $QUERY --rl $CTRL_FILE -o /dev/stdout 44 | 45 | END_OF_SCRIPT 46 | 47 | qsub -q all.q -N combine_FastANI -o $LOGDIR -j y -pe smp 1 -V -cwd -hold_jid FastANIarray \ 48 | -v "LOGDIR=$LOGDIR" -v "OUT=$OUT" <<- "END_OF_SCRIPT" 49 | #!/bin/bash 50 | set -e 51 | 52 | sort -k1,2r $LOGDIR/out/*.o* | uniq > $OUT 53 | END_OF_SCRIPT 54 | -------------------------------------------------------------------------------- /qsub/array/launch_TTR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any TreeToReads projects in a cluster-friendly method 4 | # Author: Lee Katz 5 | # Usage: bash launch_TTR.sh project1 project2 [... projectX] 6 | # where each project has its own TTR.cfg file and associated TTR files. 7 | 8 | TMP=$(mktemp --tmpdir='.' 
--directory qsubTTR.XXXXXXXX) 9 | echo "tmp dir is $TMP " 10 | 11 | CTRL_FILE="$TMP/array.txt" 12 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 13 | 14 | mkdir -p $TMP/log 15 | qsub -q all.q -N TTR -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 16 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 17 | export PATH=$PATH:~/bin/TreeToReads:~/bin/ART 18 | module unload perl/5.16.1-MT 19 | export PERL5LIB="" 20 | 21 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 22 | echo "Working on $base_dir" 23 | scratch_out="/scratch/gzu2/TTR/$base_dir" 24 | rm -rfv $scratch_out $base_dir/out 25 | mkdir -p $(dirname $scratch_out) 26 | cd $base_dir 27 | sed -i.bak "s|output_dir.*|output_dir = $scratch_out|" TTR.cfg 28 | treetoreads.py TTR.cfg 29 | cd - 30 | mv -v $scratch_out $base_dir/out 31 | END_OF_SCRIPT 32 | 33 | -------------------------------------------------------------------------------- /qsub/array/launch_art.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Launches ART whole genome sequence simulator on a set 4 | # of fasta files 5 | set -e 6 | 7 | if [ "$4" == "" ]; then 8 | echo "Simulates reads from a set of fasta files." 9 | echo "Usage: $(basename $0) profileR1.txt profileR2.txt outdir/ file1.fasta file2.fasta..." 10 | exit 1 11 | fi 12 | 13 | profile1=$1; 14 | profile2=$2 15 | outdir=$3 16 | shift;shift;shift; 17 | 18 | # Check params 19 | if [ ! -e $profile1 ]; then 20 | echo "ERROR: could not find $profile1" 21 | exit 1; 22 | fi 23 | if [ ! -e $profile2 ]; then 24 | echo "ERROR: could not find $profile2" 25 | exit 1; 26 | fi 27 | if [ -e $outdir ]; then 28 | echo "ERROR: outdir already exists: $outdir" 29 | exit 1; 30 | fi 31 | mkdir $outdir 32 | 33 | export PATH=$PATH:~/bin/ART-MountRainier 34 | 35 | TMP=$(mktemp --tmpdir='.' 
--directory ART.XXXXXX) 36 | CTRL_FILE="$TMP/fasta.txt" 37 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 38 | 39 | mkdir -p $TMP/log 40 | qsub -q all.q -N ART -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 41 | -v "CTRL_FILE=$CTRL_FILE" -v "outdir=$outdir" -v "profile1=$profile1" -v "profile2=$profile2" <<- "END_OF_SCRIPT" 42 | #!/bin/bash 43 | 44 | set -e 45 | 46 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 47 | b=$(basename $fasta | sed 's/\.[^.]*$//') 48 | 49 | if [ -e "$outdir/$b.1.fq.gz" ]; then 50 | echo "Found $outdir/$b.1.fq.gz. Exiting."; 51 | exit 0; 52 | fi 53 | 54 | mkdir -p /dev/shm/$USER 55 | tmpdir=$(mktemp --tmpdir=/dev/shm/$USER --directory ART.XXXXXX); 56 | trap "{ rm -rf $tmpdir; }" EXIT 57 | 58 | prefix="$tmpdir/$b." # will generate fastq with correct basename followed by '.1.fq.gz' or '.2.fq.gz' 59 | 60 | art_illumina -1 $profile1 -2 $profile2 -na -p -i $fasta -l 250 -f 40 -m 480 -s 120 -o $prefix 61 | gzip -v9 $prefix* 62 | 63 | mv -v $prefix*.fq.gz $outdir 64 | 65 | END_OF_SCRIPT 66 | 67 | -------------------------------------------------------------------------------- /qsub/array/launch_downsampling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # downsamples a set of reads 4 | 5 | if [ "$4" == "" ]; then 6 | echo "Usage: $0 oneX minCov maxCov 1.fastq.gz [2.fastq.gz ...]" 7 | echo "Reads will be deposited in cov\$i to cov\$j directories" 8 | echo "oneX must equal to the size of the reference assembly"; 9 | echo "Can be fastq.gz or fastq files" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' 
--directory qsubDownsample.XXXXXXXX) 14 | mkdir -p $TMP/log 15 | echo "tmp dir is $TMP " 16 | 17 | # Read ARGV 18 | oneX=$1; 19 | shift; 20 | MIN=$1 21 | MAX=$2 22 | shift; shift; 23 | 24 | # CTRL file will have per line: 25 | # filename coverageLevel 26 | CTRL_FILE="$TMP/array.txt" 27 | (for cov in `seq $MIN $MAX`; do 28 | for j in "$@"; do 29 | echo "$j $cov"; 30 | done; 31 | done;) | grep . > $CTRL_FILE 32 | 33 | qsub -q all.q -N downsample -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 34 | -v oneX=$oneX -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 35 | #!/bin/bash 36 | 37 | set -e 38 | 39 | # The global temporary directory 40 | tmpdir=/scratch 41 | 42 | echo "Downsampling script will be $(which run_assembly_removeDuplicateReads.pl)" 43 | 44 | # What coverage? Directories? 45 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 46 | b=$(basename $fastq); 47 | cov=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $2}') 48 | localDir=cov$cov/reads 49 | scratchDir=$tmpdir/$USER/cov$cov 50 | mkdir -p $scratchDir $localDir 51 | 52 | if [ ! -d $scratchDir ]; then 53 | echo "ERROR: could not make $scratchDir"; 54 | exit 1; 55 | fi; 56 | 57 | # Final number of bp in the fastq file 58 | bp=$(($oneX * $cov)); 59 | 60 | echo "Reading $fastq to a coverage of $cov ($bp bp)"; 61 | 62 | tmpFastq=$scratchDir/$b; 63 | outFastq=$localDir/$b; 64 | run_assembly_removeDuplicateReads.pl $fastq --sizeto $bp --nobin | gzip -c > $tmpFastq 65 | if [ $? 
-gt 0 ]; then 66 | echo "ERROR: $b" >> $localDir/ERROR 67 | fi 68 | mv -v $tmpFastq $outFastq 69 | 70 | # Clean up temporary directory if it's empty 71 | rmdir $scratchDir 72 | 73 | END_OF_SCRIPT 74 | 75 | -------------------------------------------------------------------------------- /qsub/array/launch_etoki_mlst_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # Runs EToKi MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o EToKi.log 9 | #$ -j y 10 | #$ -N EToKi 11 | 12 | outdir=$1 13 | refs=$2 14 | db=$3 15 | shift;shift;shift 16 | asm=$@ 17 | 18 | #NSLOTS=${NSLOTS:=1} 19 | 20 | source /etc/profile.d/modules.sh 21 | scriptname=$(basename $0); 22 | 23 | 24 | if [ "$asm" == "" ]; then 25 | echo "Usage: $scriptname outdir refs.fasta etoki.csv *.fasta" 26 | exit 0; 27 | fi; 28 | 29 | set -e 30 | set -u 31 | 32 | which EToKi.py 33 | #EToKi.py configure 34 | 35 | tmpdir=$(mktemp --tmpdir=. 
--directory --suffix=.$(basename $0)); 36 | #trap ' { rm -rf $tmpdir; } ' EXIT 37 | mkdir -p $tmpdir/log 38 | mkdir -p $tmpdir/scratch 39 | echo "tmp dir is $tmpdir" 40 | 41 | CTRL_FILE="$tmpdir/array.txt" 42 | for i in $asm; do 43 | if [ -e "$outdir/$(basename $i)" ]; then 44 | continue; 45 | fi 46 | echo $i 47 | done > $CTRL_FILE 48 | 49 | echo "CTRL_FILE is $CTRL_FILE" 50 | 51 | if [ -d "$outdir" ]; then 52 | echo "WARNING: outdir already exists: $outdir" 53 | echo " pausing 2 seconds in case you want to cancel."; 54 | sleep 2; 55 | fi 56 | mkdir -pv $outdir 57 | 58 | qsub -N EToKi -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 59 | -v "outdir=$outdir" -v "refs=$refs" -v "db=$db" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 60 | #!/bin/bash -l 61 | 62 | set -eu 63 | hostname 64 | 65 | which EToKi.py 66 | EToKi.py configure || true 67 | echo 68 | 69 | tmpdir=$(mktemp --directory $(basename $0).TASK_ID$SGE_TASK_ID.XXXXXX --tmpdir=$TMPDIR) 70 | trap ' { rm -rf $tmpdir; } ' EXIT 71 | 72 | asm=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 73 | samplename=$(basename $asm .fasta) 74 | b=$(basename $asm) 75 | echo "Sample name will be $samplename" 76 | 77 | # Speed this up by working on scratch 78 | cp -v $asm $tmpdir/asm.fasta 79 | cp -v $db $tmpdir/db.csv 80 | cp -v $refs $tmpdir/refs.fasta 81 | 82 | set -x 83 | EToKi.py MLSType -i $tmpdir/asm.fasta -r $tmpdir/refs.fasta -k $samplename -d $tmpdir/db.csv -o $tmpdir/out.fasta 84 | mv -v $tmpdir/out.fasta $outdir/$b 85 | 86 | END_OF_SCRIPT 87 | 88 | -------------------------------------------------------------------------------- /qsub/array/launch_fastq-dump_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Modified from a script by Taylor Griswold 4 | 5 | # Read ARGV 6 | OUTDIR=$1 7 | DOWNLOAD_LIST=$2 8 | SLOTS_PER_JOB=1 # manually change this as needed 9 | 10 | if [ "$DOWNLOAD_LIST" == "" ]; then 11 | scriptName=$(basename $0) 12 | 
echo "Executes the sratoolkit module into an array batch job." 13 | echo "Text file has white-space delimited SRA run IDs" 14 | echo "Usage: $scriptName outdir run_ids.txt" 15 | exit 1; 16 | fi 17 | 18 | if [ -f $OUTDIR ]; then 19 | echo "ERROR: $OUTDIR is not a directory"; 20 | exit 1; 21 | fi 22 | mkdir -pv $OUTDIR 23 | 24 | if [ ! -e "$DOWNLOAD_LIST" ]; then 25 | echo "ERROR: $DOWNLOAD_LIST could not be found"; 26 | exit 1; 27 | fi 28 | 29 | TMP=$(mktemp --tmpdir='.' --directory qsubFastqDump.XXXXXXXX) 30 | mkdir -p $TMP/log 31 | echo "tmp dir is $TMP " 32 | 33 | # CTRL file will have one SRA run ID per line 34 | CTRL_FILE="$TMP/array.txt" 35 | cat $DOWNLOAD_LIST | perl -lane ' 36 | for my $sra(@F){ 37 | print $sra; 38 | } 39 | ' > $CTRL_FILE 40 | echo "CTRL_FILE is $CTRL_FILE" 41 | 42 | 43 | qsub -N FastqDump -q edlb.q -o $TMP/log -j y -pe smp $SLOTS_PER_JOB -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 44 | -v "OUTDIR=$OUTDIR" -v "CTRL_FILE=$CTRL_FILE","DOWNLOAD_LIST=$DOWNLOAD_LIST" <<- "END_OF_SCRIPT" 45 | #!/bin/bash -l 46 | set -e 47 | source /etc/profile.d/modules.sh 48 | module purge 49 | module load sratoolkit/2.9.1 50 | 51 | which fastq-dump 52 | 53 | # Set up filenames 54 | SRRID=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 55 | READ1=$(echo "$SRRID"_1.fastq.gz) 56 | READ2=$(echo "$SRRID"_2.fastq.gz) 57 | tmpdir=/scratch/$USER 58 | mkdir -p $tmpdir 59 | outdir=$(mktemp --tmpdir=$tmpdir --directory FastqDump.XXXXXX); 60 | trap "rm -rf $outdir" EXIT 61 | newdir="$OUTDIR/$SRRID.fastq-dump" 62 | 63 | if [ -e "$newdir" ]; then 64 | echo "ERROR: found pre-existing dir $newdir. 
Will not continue."; 65 | exit 1 66 | fi 67 | 68 | echo "fastq-dump will be run on $SRRID under $(hostname)" 69 | echo "Working directory is $outdir" 70 | echo -e "Final directory will be $newdir.\n"; 71 | 72 | fastq-dump --accession $SRRID --outdir $outdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip --gzip 73 | 74 | mv -v $outdir $newdir 75 | rm /scicomp/home/ycj5/ncbi/public/sra/"$SRRID".sra 76 | 77 | END_OF_SCRIPT 78 | 79 | -------------------------------------------------------------------------------- /qsub/array/launch_genotyphi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e; 4 | 5 | FASTQLIST="$1" 6 | ref="$2" 7 | 8 | ref=$(realpath $ref) 9 | echo "Testing the path to the reference genome" 10 | ls $ref 11 | 12 | TMP=$(mktemp --tmpdir='.' --directory $(basename $0 .sh).tmp.XXXXXX) 13 | echo "tmp dir is $TMP" 14 | mkdir $TMP/samples 15 | mkdir $TMP/log 16 | 17 | CTRL_FILE="$TMP/array.txt" 18 | grep '_1.*fastq.gz' $FASTQLIST > $CTRL_FILE 19 | 20 | qsub -N genotyphi -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 21 | -v "CTRL_FILE=$CTRL_FILE" -v "TMP=$TMP" -v "ref=$ref" <<- "END_OF_SCRIPT" 22 | #!/bin/bash -l 23 | set -e 24 | set -x 25 | 26 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 27 | R2=${R1/_1/_2}; 28 | out="out/$(basename $R1 .fastq.gz).tsv"; 29 | out=$(realpath $out) 30 | if [ -e "$out" ]; then 31 | #echo "FOUND $out"; 32 | exit 0 33 | fi 34 | 35 | 36 | set +x 37 | module load samtools/1.10 bcftools/1.10.2 bowtie2/2.3.5.1 38 | set -x 39 | 40 | export PATH=$PATH:$HOME/bin/genotyphi 41 | 42 | 43 | # I think that this script pollutes the working directory 44 | # but I might be wrong. Still though, just change 45 | # the working directory to something in TMP to avoid problems. 
46 | workingDir="$TMP/samples/$(basename $R1)" 47 | mkdir -pv $workingDir 48 | cd $workingDir 49 | bowtie2 -p $NSLOTS -x $ref -1 $R1 -2 $R2 | samtools view -bh - > unsorted.bam 50 | samtools sort -o sorted.bam unsorted.bam 51 | samtools index sorted.bam 52 | genotyphi.py --mode bam --bam sorted.bam --ref $ref --ref_id AL513382.1 --output $(basename $out) 53 | 54 | # just in case I needed to see files before something broke 55 | ls -lht 56 | 57 | mv *.tsv $out 58 | 59 | END_OF_SCRIPT 60 | 61 | exit 62 | 63 | #### original script 64 | 65 | grep '_1.*fastq.gz' ../fofn.txt |\ 66 | while read R1; do 67 | R2=${R1/_1/_2}; 68 | bam="bam/$(basename $R1 .fastq.gz).bam"; 69 | out="out/$(basename $R1 .fastq.gz).tsv"; 70 | if [ -e "$out" ]; then 71 | echo "FOUND $out"; 72 | continue; 73 | fi; 74 | 75 | bowtie2 -p 24 -x CT18.fasta -1 $R1 $R2 2> "$out.log" |\ 76 | samtools view -bh - > unsorted.bam; 77 | samtools sort -o $bam unsorted.bam; 78 | python genotyphi/genotyphi.py --mode bam --bam $bam --ref CT18.fasta --ref_id AL513382.1 --output $(basename $out) >> "$out.log" 2>&1; 79 | localout=$(\ls *$(basename "$out")); 80 | mv -v "$localout" "$out"; 81 | done 82 | 83 | -------------------------------------------------------------------------------- /qsub/array/launch_gzip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # taken from Seth's example: http://git.biotech.cdc.gov/snippets/3 3 | # This is meant as an example to help me learn job arrays 4 | 5 | set -e 6 | 7 | if [[ -z $1 ]]; then 8 | echo "Gzips fastq files with -9" 9 | echo "Usage: $0 *.fastq *.fastq.gz" 10 | exit 0 11 | fi 12 | 13 | set -u 14 | 15 | # Make temp folder that can hold log files and temporary files 16 | TMP=$(mktemp --tmpdir='.' --directory qsubtmp.XXXXXXXX) 17 | mkdir $TMP/log 18 | echo "tmp dir is $TMP " 19 | # Make a temp file that can hold an array of input files 20 | CTRL_FILE="$TMP/array.txt" 21 | echo $@ | tr ' ' '\n' | grep . 
> $CTRL_FILE 22 | 23 | # Start off the job array 24 | # -N is the job name 25 | # -o $TMP -j y puts all log files into the temporary directory 26 | # -V -cwd is to use the current environment in the current working directory 27 | # -t indicates an array, but it needs a min to max in the next parameter 28 | # 1-$(cat $CTRL_FILE | wc -l) translates to "1 to the number of gzip files" 29 | # -v "CTRL_FILE=$CTRL_FILE" creates a variable to use inside of the here document. 30 | qsub -N gzip-job -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 31 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 32 | # This is a "here document." It gets submitted as though it were a 33 | # separate file. The here document ends right before END_OF_SCRIPT 34 | file=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 35 | 36 | TMP=$(mktemp --directory --suffix _qsub) 37 | echo "TMP DIR is $HOSTNAME:$TMP" 38 | trap " { set -x; rm -rf $TMP; } " EXIT 39 | tmpIn=$TMP/$(basename $file) 40 | uncompressed=$TMP/$(basename $file .gz) 41 | cp -v $file $tmpIn 42 | 43 | if [[ "$tmpIn" =~ gz$ ]]; then 44 | gunzip -v $tmpIn && \ 45 | gzip -9v $uncompressed && \ 46 | mv -v $tmpIn $file 47 | else 48 | gzip -v9 $tmpIn && \ 49 | mv -v $tmpIn.gz $file.gz && \ 50 | rm -v $file 51 | fi 52 | END_OF_SCRIPT 53 | 54 | -------------------------------------------------------------------------------- /qsub/array/launch_ksnp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through KSNP in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 readsdir1 [readsdir2 ... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta cov1 [cov2...]" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' --directory qsubKsnp3.XXXXXXXX) 14 | echo "tmp dir is $TMP " 15 | 16 | REF=$1; shift; 17 | 18 | CTRL_FILE="$TMP/array.txt" 19 | echo "$@" | tr ' ' '\n' | grep . 
> $CTRL_FILE 20 | 21 | mkdir -p $TMP/log 22 | qsub -q all.q -N KSNP3 -o $TMP/log -j y -pe smp 1-2 -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 23 | -v "REF=$REF" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 24 | #!/bin/bash 25 | 26 | module load kSNP/3.0.0 27 | module load fastx-toolkit/0.0.13 28 | 29 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 30 | echo "Working on $base_dir, host " $(hostname) 31 | mkdir -p /scratch/$USER 32 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory ksnp3.XXXXXX) 33 | export scratch_out 34 | echo "Temporary directory is $scratch_out" 35 | if [ ! -e "$scratch_out" ]; then echo "ERROR: could make temporary directory $scratch_out"; exit 1; fi; 36 | 37 | # Find what reads we're using 38 | READS=$(find $base_dir $base_dir/reads -maxdepth 1 -name '*.fastq.gz'); 39 | echo -e "Found reads\n" $READS 40 | 41 | # Convert to fasta 42 | echo $READS | xargs -P $NSLOTS -n 1 sh -c ' 43 | sample=$(basename $0 .fastq.gz); 44 | sampleDir=$scratch_out/samples/$sample; 45 | mkdir -pv $sampleDir; 46 | 47 | run_assembly_trimClean.pl -i $0 -o $sampleDir/cleaned.fastq --bases_to_trim 50 --auto --nosingletons 48 | if [ $? -gt 0 ]; then echo "ERROR on trimClean on $0"; exit 1; fi; 49 | 50 | fastq_to_fasta -Q33 < $sampleDir/cleaned.fastq > $sampleDir/$sample.fasta 51 | if [ $? -gt 0 ]; then echo "ERROR converting $0 to fasta"; exit 1; fi; 52 | merge_fasta_reads3 $sampleDir/$sample.fasta > $sampleDir/merged.fasta 53 | if [ $? -gt 0 ]; then echo "ERROR merging $0"; exit 1; fi; 54 | 55 | # cleanup 56 | mv $sampleDir/merged.fasta $scratch_out/samples/$sample.fasta 57 | rm -rvf $sampleDir 58 | '; 59 | cp -v $REF $scratch_out/samples/reference.fasta 60 | 61 | # Switch to the tmp dir 62 | pushd $scratch_out 63 | 64 | # Find optimal kmer value 65 | MakeKSNP3infile samples in.txt A 66 | cat samples/*.fasta > kchooser.fasta 67 | KMERLENGTH=$(Kchooser kchooser.fasta | grep "The optimum value of K is" | grep -o "[0-9]\+") 68 | if [ $? 
-gt 0 ]; then echo "ERROR with Kchooser"; exit 1; fi; 69 | echo "The optimal kmer length is $KMERLENGTH"; 70 | rm -f kchooser.fasta 71 | 72 | grep reference in.txt > reference_in.txt 73 | 74 | kSNP3 -k $KMERLENGTH -annotate reference_in.txt -in in.txt -c 4 -core -ML -min_frac 0.75 -CPU $NSLOTS -NJ -vcf -outdir out 75 | if [ $? -gt 0 ]; then echo "ERROR with kSNP3"; exit 1; fi; 76 | 77 | # pop out of the tmp dir 78 | popd 79 | 80 | mv -v $scratch_out/out $base_dir/kSNP3 81 | rm -rvf $scratch_out 82 | END_OF_SCRIPT 83 | 84 | -------------------------------------------------------------------------------- /qsub/array/launch_mash_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs mash sketch on a set of fasta files 4 | 5 | 6 | # Read ARGV 7 | FASTA=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$FASTA" == "" ]; then 13 | echo "Mash sketch files in place" 14 | echo "Usage: $0 *.fasta" 15 | exit 1; 16 | fi 17 | 18 | TMP=$(mktemp --tmpdir='.' 
--directory qsubMash.XXXXXXXX) 19 | mkdir -p $TMP/log 20 | echo "tmp dir is $TMP " 21 | 22 | CTRL_FILE="$TMP/array.txt" 23 | echo "$FASTA" | tr ' ' '\n' > $CTRL_FILE 24 | echo "CTRL_FILE is $CTRL_FILE" 25 | 26 | source /etc/profile.d/modules.sh 27 | module purge 28 | module load Mash 29 | module load gcc/5.5 30 | mash --version # ensure it loaded 31 | 32 | qsub -q edlb.q -N mashSketch -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 33 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 34 | #!/bin/bash -l 35 | 36 | set -e 37 | set -u 38 | set -x 39 | 40 | hostname 41 | # Reload modules to ensure things like LD_LIBRARY_PATH are re-added 42 | source /etc/profile.d/modules.sh || true 43 | module purge 44 | module load Mash 45 | module load gcc/5.5 46 | 47 | mash --version # ensure it loaded 48 | 49 | # Set up filenames 50 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 51 | mkdir /scratch/$USER || true 52 | tmpdir=$(mktemp --tmpdir=/scratch/$USER --directory qsubMash.XXXXXXXX) 53 | trap ' { rm -rf $tmpdir; } ' EXIT 54 | 55 | tmpsketch="$tmpdir/out.msh" 56 | outsketch="$(dirname $fasta)/$(basename $fasta).msh" 57 | 58 | echo "mash will be run on $fasta => $tmpsketch => $outsketch" 59 | 60 | if [ -e "$outsketch" ]; then 61 | echo "$outsketch already exists. Exiting."; 62 | exit 1; 63 | fi 64 | 65 | mash sketch -o $tmpsketch $fasta 66 | 67 | mv -v $tmpsketch $outsketch 68 | END_OF_SCRIPT 69 | 70 | -------------------------------------------------------------------------------- /qsub/array/launch_pymlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs NCBI's wgmlst in an array for MLST calling 4 | 5 | OUTDIR=$1; shift; 6 | DB=$1; shift 7 | ASM=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$ASM" == "" ]; then 13 | echo "Run NCBI's wgmlst for wgMLST allele calling" 14 | echo "Usage: $0 outdir something.scheme *.fasta" 15 | exit 1; 16 | fi 17 | 18 | if [ ! 
-d "$OUTDIR" ]; then 19 | mkdir "$OUTDIR" 20 | fi; 21 | if [ ! -e "$DB" ]; then 22 | echo "ERROR: not found: $DB" 23 | exit 2; 24 | fi 25 | 26 | tmpdir=$(mktemp --tmpdir='.' --directory wgmlst.XXXXXXXX) 27 | mkdir -pv $tmpdir/log 28 | #trap " rm -rf $tmpdir " EXIT 29 | echo "tmp dir is $tmpdir " 30 | 31 | # CTRL file will have per line: 32 | # filename coverageLevel 33 | CTRL_FILE="$tmpdir/array.txt" 34 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 35 | echo "$ASM" | tr ' ' '\n' > $CTRL_FILE 36 | 37 | head $CTRL_FILE 38 | 39 | module purge 40 | 41 | # Check executables 42 | which wgmlst 43 | 44 | qsub -N ncbi_wgmlst -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "tmpdir=$tmpdir" -v "DB=$DB" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | set -u 49 | 50 | # Set up filenames 51 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 52 | name=$(basename $fasta .fasta) 53 | 54 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.wgmlst.XXXXXX); 55 | trap "rm -rf $sampledir" EXIT 56 | finalout="$OUTDIR/$name" 57 | 58 | if [ -e "$finalout/.done" ]; then 59 | echo "Found $finalout/.done; not repeating." 
60 | exit 0; 61 | fi 62 | 63 | mappings=$sampledir/mappings 64 | alleles=$sampledir/alleles 65 | stdout=$sampledir/wgmlst.out 66 | log=$sampledir/wgmlst.log 67 | 68 | ( 69 | date # mark how long this takes 70 | set -x 71 | wgmlst --genome $fasta --alleles $DB --cores $NSLOTS --kmer 15 --output_mappings $mappings --output_loci $alleles 72 | set +x 73 | date 74 | ) 1>$stdout 2>$log 75 | 76 | ls -lh $sampledir 77 | mv -v $sampledir $finalout 78 | touch $finalout/.done 79 | 80 | END_OF_SCRIPT 81 | 82 | -------------------------------------------------------------------------------- /qsub/array/launch_realphy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through KSNP in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 readsdir1 [readsdir2 ... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta cov1 [cov2...]" 10 | exit 1; 11 | fi 12 | 13 | TMP=$(mktemp --tmpdir='.' --directory qsubRealPhy.XXXXXXXX) 14 | echo "tmp dir is $TMP " 15 | 16 | export REF=$1; shift; 17 | CTRL_FILE="$TMP/array.txt" 18 | echo "$@" | tr ' ' '\n' | grep . 
> $CTRL_FILE
19 | 
20 | mkdir -p $TMP/log
21 | #qsub -q all.q -N RealPhy -o $TMP/log -j y -pe smp 12 -hard -l exclusive=true -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
22 | qsub -q all.q -N RealPhy -o $TMP/log -j y -pe smp 2 -V -S /bin/bash -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
23 | -v "REF=$REF" -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT"
24 | #!/bin/bash
25 | 
26 | module load RealPhy/v112
27 | module load bowtie2/2.2.4
28 | module load phylip/3.69
29 | module load phyml/3.0
30 | module load tree-puzzle/5.3.rc16
31 | 
32 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE)
33 | echo "Working on $base_dir, host " $(hostname)
34 | mkdir -p /scratch/$USER
35 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory realphy.XXXXXX)
36 | export scratch_out
37 | echo "Temporary directory is $scratch_out"
38 | if [ ! -e "$scratch_out" ]; then echo "ERROR: could not make temporary directory $scratch_out"; exit 1; fi; # typo fix: "could make" -> "could not make"
39 | if [ -e $base_dir/RealPhy ]; then echo "Found $base_dir/RealPhy. Skipping."; exit 0; fi
40 | 
41 | # Find what reads we're using. Do not symlink
42 | # because realphy has a stupid problem with symlinks.
43 | # Instead, copy.
44 | READS=$(ls $base_dir/reads/*.fastq.gz);
45 | mkdir -p $scratch_out/samples
46 | cp -v $READS $scratch_out/samples/
47 | cp -v $REF $scratch_out/samples/reference.fasta
48 | 
49 | mkdir -p $scratch_out/out
50 | 
51 | # Make a config file. RealPhy is intolerant of
52 | # beginning indentation for each line and of a
53 | # beginning empty line.
54 | echo -e "BOWTIE2\t/apps/x86_64/bowtie2/bowtie2-2.2.4/bowtie2 55 | BOWTIE2BUILDER\t/apps/x86_64/bowtie2/bowtie2-2.2.4/bowtie2-build-l 56 | TREEPUZZLE\t/apps/x86_64/tree-puzzle/5.3.rc16/bin/puzzle 57 | RAXML\t/scicomp/groups/OID/NCEZID/DFWED/EDLB/share/bin/lyve-SET-v1.1.4/lib/standard-RAxML-8.1.16/raxmlHPC-PTHREADS 58 | Rscript\t/apps/x86_64/R/3.2.3/bin/Rscript 59 | MaxPars\t/apps/x86_64/phylip/phylip-3.69/exe/dnapars 60 | PhyML\t/apps/x86_64/phyml/PhyML_3.0/phyml 61 | " > $scratch_out/out/config.txt 62 | 63 | # Sanity check: look at the files and their permissions in 64 | # the config file. 65 | cut -f 2 $scratch_out/out/config.txt | xargs -I {} ls -lh {} 66 | 67 | echo "Config file contents:" 68 | cat $scratch_out/out/config.txt 69 | 70 | REALPHY_v112 $scratch_out/samples $scratch_out/out -readLength 250 -ref reference 71 | if [ $? -gt 0 ]; then exit 1; fi; 72 | 73 | ls -lh $scratch_out/reference/alignOut_NoGenes > $scratch_out/reference/alignOut_NoGenes.txt 74 | rm -rvf $scratch_out/reference/alignOut_NoGenes 75 | mv -v $scratch_out/out $base_dir/RealPhy 76 | rm -rvf $scratch_out 77 | END_OF_SCRIPT 78 | 79 | -------------------------------------------------------------------------------- /qsub/array/launch_set_alreadyShuffled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Lyve-SET in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 5 | # Author: Lee Katz 6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref/ dir [dir2 ... ]" 10 | echo " The reference directory can just contain reference.fasta or can have other Lyve-SET reference directory files." 11 | echo " Each directory will be searched for shuffled reads files matching *.f*q.gz" 12 | exit 1; 13 | fi 14 | 15 | TMP=$(mktemp --tmpdir='.' 
--directory qsubLyveSET.XXXXXXXX) 16 | echo "tmp dir is $TMP " 17 | 18 | export PATH=/scicomp/groups/OID/NCEZID/DFWED/EDLB/share/bin/lyve-SET-v1.1.4e/scripts:$PATH 19 | echo -n "Lyve-SET is being launched from "; 20 | \which launch_set.pl 21 | 22 | REF=$1; shift; # get the reference genome and remove it from ARGV 23 | CTRL_FILE="$TMP/array.txt" 24 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE 25 | 26 | mkdir -p $TMP/log 27 | qsub -q all.q -N LyveSetShuffled -o $TMP/log -j y -pe smp 3-4 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 28 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT" 29 | #!/bin/bash 30 | 31 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 32 | echo "Working on $base_dir with reference file $REF" 33 | mkdir -p /scratch/$USER 34 | if [ -e "$base_dir/Lyve-SET" ]; then 35 | echo "Found $base_dir/Lyve-SET! Will not continue."; 36 | exit 0; 37 | fi; 38 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory Lyve-SET.XXXXXX) 39 | rm -rfv $scratch_out 40 | 41 | # Run Lyve-SET on the scratch drive without qsub inception. 42 | set_manage.pl --create $scratch_out 43 | rmdir $scratch_out/reference; 44 | cp -r $REF $scratch_out/reference 45 | ln -sv $(find $(realpath $base_dir)/reads -name '*.f*q.gz') $scratch_out/reads/ 46 | if [ $? -gt 0 ]; then exit 1; fi; 47 | launch_set.pl --noqsub --numcpus $NSLOTS --read_cleaner CGP --mask-phages --mask-cliffs $scratch_out 48 | if [ $? -gt 0 ]; then exit 1; fi; 49 | 50 | rm -rvf $scratch_out/{reads,bam,tmp}/* # no need to take up all this space 51 | mv -v $scratch_out $base_dir/Lyve-SET 52 | END_OF_SCRIPT 53 | 54 | -------------------------------------------------------------------------------- /qsub/array/launch_set_qsubarray.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Lyve-SET in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 
5 | # Author: Lee Katz 6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX] 7 | 8 | if [ "$2" == "" ]; then 9 | echo "Usage: $0 ref.fasta dir [dir2 ... ]" 10 | echo " Each directory will be searched for split reads files matching *.f*q.gz" 11 | exit 1; 12 | fi 13 | 14 | TMP=$(mktemp --tmpdir='.' --directory qsubLyveSET.XXXXXXXX) 15 | echo "tmp dir is $TMP " 16 | 17 | REF=$1; shift; # get the reference genome and remove it from ARGV 18 | CTRL_FILE="$TMP/array.txt" 19 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE 20 | 21 | mkdir -p $TMP/log 22 | qsub -q all.q -N LyveSET -o $TMP/log -j y -pe smp 1-2 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 23 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT" 24 | #!/bin/bash 25 | 26 | set -e 27 | 28 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 29 | echo "Working on $base_dir" 30 | mkdir -p /scratch/$USER 31 | if [ -e "$base_dir/Lyve-SET" ]; then 32 | echo "Found $base_dir/Lyve-SET! Will not continue."; 33 | exit 0; 34 | fi; 35 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory Lyve-SET.XXXXXX) 36 | rm -rfv $scratch_out 37 | 38 | # Run Lyve-SET on the scratch drive without qsub inception. 39 | set_manage.pl --create $scratch_out 40 | cp $REF $scratch_out/reference/reference.fasta 41 | shuffleSplitReads.pl --numcpus $NSLOTS --outdir $scratch_out/reads $(find $base_dir -name '*.f*q.gz') --regex '(.+)(_[12]\.f.*)' 42 | if [ $? 
-gt 0 ]; then exit 1; fi; 43 | launch_set.pl --noqsub --numcpus $NSLOTS --read_cleaner CGP --mask-phages --mask-cliffs $scratch_out 44 | 45 | rm -rvf $scratch_out/{bam,reads,vcf,tmp}/* # no need to take up all this space 46 | mv -v $scratch_out $base_dir/Lyve-SET 47 | END_OF_SCRIPT 48 | 49 | -------------------------------------------------------------------------------- /qsub/array/launch_shovill_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs shovill on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all reads in a directory from Illumina runs" 14 | echo "All files must be in order for paired-end to work" 15 | echo "Usage: $0 outdir *.fastq.gz" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | tmpdir=$(mktemp --tmpdir='.' --directory qsubShovillSpades.XXXXXXXX) 24 | mkdir -pv $tmpdir/log 25 | #trap " rm -rf $tmpdir " EXIT 26 | echo "tmp dir is $tmpdir " 27 | 28 | # CTRL file will have per line: 29 | # filename coverageLevel 30 | CTRL_FILE="$tmpdir/array.txt" 31 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 32 | echo "$READS" | tr ' ' '\n' | paste - - > $CTRL_FILE 33 | #echo "DEBUG"; head $CTRL_FILE > $CTRL_FILE.tmp && mv $CTRL_FILE.tmp $CTRL_FILE 34 | #echo "CTRL_FILE is $CTRL_FILE" 35 | 36 | head -n 5 $CTRL_FILE 37 | echo "This is what the top of the CTRL file looks like. " 38 | echo " Waiting 1 second in case you want to ctrl-c..." 
39 | 40 | # Might as well start loading the environment before sleeping 41 | module purge 42 | export PATH=$HOME/bin/shovill-v1.1.0/bin:$PATH 43 | module load SPAdes/3.13.0 Skesa/2.3.0 megahit/1.1.2 velvet/1.2.10 lighter/1.1.1 flash/1.2.11 samtools/1.9 bwa/0.7.17 seqtk/1.3 pilon/1.22 trimmomatic/0.35 perl/5.16.1-MT kmc/3.0 java/jdk1.8.0_301 44 | sleep 1 45 | 46 | # See if we have all the right components 47 | shovill --check 48 | 49 | qsub -q all.q -q edlb.q -N ShovillSpades -o $tmpdir/log -j y -pe smp 4-6 -l h_vmem=72G -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 50 | -v "tmpdir=$tmpdir" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 51 | #!/bin/bash -l 52 | set -e 53 | set -u 54 | 55 | # bring back in modules that load LD_LIBRARY_PATH 56 | # because that variable is stripped away for security purposes 57 | module load gcc/4.9.3 pilon/1.22 58 | 59 | # Set up filenames 60 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 61 | R1=$(echo "$fastq" | cut -f 1); 62 | R2=$(echo "$fastq" | cut -f 2); 63 | name=$(basename $R1 .fastq.gz) 64 | 65 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.shovillSpades.XXXXXX); 66 | trap "rm -rf $sampledir" EXIT 67 | fastaOut="$OUTDIR/$name.shovillSpades.fasta" 68 | 69 | ls -lh $R1 $R2 70 | echo "Shovill with SPAdes will be run on $R1 $R2, on $(hostname)" 71 | echo "Fasta will be written to $fastaOut"; 72 | 73 | if [ -e "$fastaOut" ]; then 74 | echo "$fastaOut already exists. 
Exiting."; 75 | exit 1; 76 | fi 77 | 78 | # Make sure the skesa lib is there before the full shovill check 79 | skesa --version 2>&1 80 | 81 | shovill --check 82 | 83 | shovill --R1 $R1 --R2 $R2 --outdir $sampledir --assembler spades --cpus $NSLOTS --force --tmpdir /scratch --ram 64 84 | cp -v $sampledir/contigs.fa $fastaOut 85 | # trap command will remove $sampledir 86 | 87 | END_OF_SCRIPT 88 | 89 | -------------------------------------------------------------------------------- /qsub/array/launch_shuffleReads.pl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ "$2" == "" ]; then 6 | echo "Only give _R1_ files to this script" 7 | echo "Usage:"; 8 | exit 1 9 | fi 10 | 11 | outdir=$1 12 | shift 13 | 14 | TMP=$(mktemp --tmpdir='.' --directory SHUFFLE.XXXXXX) 15 | CTRL_FILE="$TMP/fasta.txt" 16 | echo "$@" | tr ' ' '\n' > $CTRL_FILE 17 | mkdir -p $TMP/log 18 | 19 | export PATH=$PATH:$HOME/bin/cg_pipeline/scripts 20 | 21 | which run_assembly_shuffleReads.pl 22 | 23 | qsub -q all.q -N shuffleReads -o $TMP/log -j y -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 24 | -v "CTRL_FILE=$CTRL_FILE" -v "outdir=$outdir" <<- "END_OF_SCRIPT" 25 | #!/bin/bash 26 | 27 | set -e 28 | 29 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE) 30 | b=$(basename $R1); 31 | d=$(dirname $R1); 32 | 33 | R2=${R1/_R1_/_R2_} 34 | 35 | echo "Shuffling on $(hostname) $R1 and $R2" 36 | 37 | ls $R1 $R2 38 | 39 | if [ -e "$outdir/$b.fastq.gz" ]; then 40 | echo "Found $outdir/$b.fastq.gz. 
Exiting."; 41 | exit 0; 42 | fi 43 | 44 | mkdir -p /dev/shm/$USER 45 | tmpdir=$(mktemp --tmpdir=/dev/shm/$USER --directory shuffleReads.XXXXXX); 46 | trap "{ rm -rf $tmpdir; }" EXIT 47 | 48 | shuffled="$tmpdir/$b.fastq.gz"; 49 | 50 | run_assembly_shuffleReads.pl $R1 $R2 | gzip -9c > $shuffled 51 | 52 | mv -v $shuffled $outdir/ 53 | 54 | END_OF_SCRIPT 55 | 56 | -------------------------------------------------------------------------------- /qsub/array/launch_skesa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all reads in a directory from Illumina runs" 14 | echo "All files must be in interleaved format"; 15 | echo "Usage: $0 outdir *.fastq.gz" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | TMP=$(mktemp --tmpdir='.' --directory qsubSkesa.XXXXXXXX) 24 | mkdir -p $TMP/log 25 | echo "tmp dir is $TMP " 26 | 27 | # CTRL file will have per line: 28 | # filename coverageLevel 29 | CTRL_FILE="$TMP/array.txt" 30 | echo "$READS" | tr ' ' '\n' > $CTRL_FILE 31 | echo "CTRL_FILE is $CTRL_FILE" 32 | 33 | module purge 34 | module load Skesa/2.0_2 35 | #module unload gcc 36 | #module load gcc/4.9.3 37 | skesa --version 38 | 39 | qsub -q all.q -N skesa -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 40 | -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 41 | #!/bin/bash -l 42 | set -e 43 | 44 | # LD_LIBRARY_PATH is stripped by qsub and needs to be readded 45 | # The following is how it appeared in my own environment. 
46 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/apps/x86_64/gcc/5.4/lib64:/apps/x86_64/gmp/6.1.0/lib:/apps/x86_64/mpfr/3.1.3/lib:/apps/x86_64/mpc/1.0.3/lib:/apps/x86_64/isl/0.18/lib 47 | echo "WARNING: setting LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 48 | 49 | module list 50 | skesa --version 51 | 52 | # Set up filenames 53 | fastq=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 54 | tmpdir=/scratch/$USER/skesa 55 | mkdir -p $tmpdir 56 | sampledir=$(mktemp --tmpdir=$tmpdir --directory skesa.XXXXXX); 57 | trap "rm -rf $sampledir" EXIT 58 | fastaOut="$sampledir/$(basename $fastq .fastq.gz).fasta" 59 | 60 | echo "skesa will be run on $fastq, on $(hostname)" 61 | echo "Temporary fasta will be written to $fastaOut"; 62 | 63 | if [ -e "$fastaOut" ]; then 64 | echo "$fastaOut already exists. Exiting."; 65 | exit 1; 66 | fi 67 | 68 | /usr/bin/time -o $OUTDIR/time.$SGE_TASK_ID.tsv -f "$fastq\t%e" \ 69 | skesa --cores $NSLOTS --fastq $fastq --gz --use_paired_ends > $fastaOut 70 | 71 | mv -v $fastaOut $OUTDIR 72 | END_OF_SCRIPT 73 | 74 | -------------------------------------------------------------------------------- /qsub/array/launch_skesa_sra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | 6 | # Read ARGV 7 | OUTDIR=$1; shift; 8 | READS=$@ 9 | 10 | set -e 11 | 12 | if [ "$READS" == "" ]; then 13 | echo "Assemble all sra runs in a file" 14 | echo "Usage: $0 outdir allsra_acc.txt" 15 | echo " where the sra accessions are separated by any whitespace" 16 | exit 1; 17 | fi 18 | 19 | if [ ! -d "$OUTDIR" ]; then 20 | mkdir "$OUTDIR" 21 | fi; 22 | 23 | TMP=$(mktemp --tmpdir='.' 
--directory qsubSkesa.XXXXXXXX) 24 | mkdir -p $TMP/log 25 | echo "tmp dir is $TMP " 26 | 27 | # CTRL file will have per line: 28 | # filename coverageLevel 29 | CTRL_FILE="$TMP/array.txt" 30 | cat $READS | perl -lane ' 31 | chomp; 32 | for my $f (split(/\s+/, $_)) { 33 | print "$f"; 34 | } 35 | ' > $CTRL_FILE 36 | echo "CTRL_FILE is $CTRL_FILE" 37 | 38 | # Check to make sure skesa will work in the heredoc before getting there 39 | module purge 40 | module load Skesa 41 | skesa --version 42 | module purge 43 | 44 | qsub -q all.q -N skesa -o $TMP/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | 49 | module purge 50 | module load Skesa 51 | module list 52 | skesa --version 53 | 54 | # Set up filenames 55 | run_acc=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 56 | tmpdir=/scratch/$USER/skesa 57 | mkdir -pv $tmpdir 58 | sampledir=$(mktemp --tmpdir=$tmpdir --directory skesa.XXXXXX); 59 | trap "rm -rf $sampledir" EXIT 60 | fastaOut="$sampledir/$run_acc.skesa.fasta" 61 | finalOut="$OUTDIR/$run_acc.skesa.fasta" 62 | 63 | echo "skesa will be run on $run_acc, on $(hostname)" 64 | echo "Temporary fasta will be written to $fastaOut"; 65 | 66 | if [ -e "$fastaOut" ]; then 67 | echo "$fastaOut already exists. Exiting."; 68 | exit 2; 69 | fi 70 | 71 | if [ -e "$finalOut" ]; then 72 | echo "$finalOut already exists. Exiting."; 73 | exit 3; 74 | fi 75 | 76 | skesa --cores $NSLOTS --sra_run $run_acc > $fastaOut 77 | 78 | mv -v $fastaOut $finalOut 79 | END_OF_SCRIPT 80 | 81 | -------------------------------------------------------------------------------- /qsub/array/launch_snp-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs any set of reads through Snp-Pipeline in a cluster-friendly way. 4 | # Each reads directory will be a distinct project. 
5 | # Author: Lee Katz
6 | # Usage: bash $0 reference.fasta readsdir1 readsdir2 [... readsdirX]
7 | 
8 | TMP=$(mktemp --tmpdir='.' --directory qsubSnp-Pipeline.XXXXXXXX)
9 | echo "tmp dir is $TMP "
10 | 
11 | REF=$1; shift; # get the reference genome and remove it from ARGV
12 | CTRL_FILE="$TMP/array.txt"
13 | echo "$@" | tr ' ' '\n' | grep . > $CTRL_FILE
14 | 
15 | mkdir -p $TMP/log
16 | # Has to have an exclusive node because it is a greedy script
17 | qsub -q all.q -N snp-pipeline -o $TMP/log -j y -pe smp 12,16 -hard -l exclusive=true -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \
18 | -v "CTRL_FILE=$CTRL_FILE" -v "REF=$REF" <<- "END_OF_SCRIPT"
19 | #!/bin/bash
20 | 
21 | module load bowtie2/2.1.0
22 | module load varscan/2.3.7
23 | export PATH=~/.local/bin:$PATH # make sure snp-pipeline is prioritized
24 | export CLASSPATH=/apps/x86_64/varscan/bin/VarScan.v2.3.7.jar # varscan
25 | 
26 | base_dir=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE)
27 | echo "Working on $base_dir"
28 | mkdir -p /scratch/$USER
29 | scratch_out=$(mktemp --tmpdir="/scratch/$USER" --directory snp-pipeline.XXXXXX)
30 | export scratch_out
31 | mkdir -p $scratch_out/samples
32 | 
33 | # Make the config file and put it into scratch_out (snppipeline.conf)
34 | copy_snppipeline_data.py configurationFile $scratch_out
35 | 
36 | # Find what reads we're using. Assume all reads have been shuffled.
37 | READS=$(find $base_dir $base_dir/reads -maxdepth 1 -name '*.fastq.gz' 2>/dev/null);
38 | 
39 | # Deshuffle reads into the correct directories
40 | echo $READS | xargs -P $NSLOTS -n 1 sh -c '
41 | sample=$(basename $0 .fastq.gz);
42 | sampleDir=$scratch_out/samples/$sample;
43 | mkdir -p $sampleDir;
44 | echo "Deshuffling into $sampleDir";
45 | run_assembly_shuffleReads.pl $0 -d -gz 1>$sampleDir/1.fastq.gz 2>$sampleDir/2.fastq.gz
46 | if [ $? -gt 0 ]; then echo "ERROR deshuffling $0"; exit 1; fi;
47 | ';
48 | 
49 | # Run snp-pipeline on the scratch drive without qsub inception.
50 | run_snp_pipeline.sh -c $scratch_out/snppipeline.conf -s $scratch_out/samples -m copy -o $scratch_out $REF 51 | if [ $? -gt 0 ]; then echo "ERROR with run_snp_pipeline.sh"; exit 1; fi; 52 | rm -rvf $scratch_out/samples/* 53 | 54 | mv -v $scratch_out $base_dir/snp-pipeline 55 | END_OF_SCRIPT 56 | 57 | -------------------------------------------------------------------------------- /qsub/array/launch_spades_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs spades on a set of Illumina reads 4 | 5 | # Read ARGV 6 | READS=$@ 7 | SLOTS_PER_JOB=1 # manually change this as needed 8 | 9 | if [ "$READS" == "" ]; then 10 | echo "Assemble all reads in a directory from Illumina runs" 11 | echo "All files must be in the format *_R1_*.fastq.gz to ensure pairs stay together" 12 | echo "Usage: $0 *.fastq.gz" 13 | exit 1; 14 | fi 15 | 16 | TMP=$(mktemp --tmpdir='.' --directory qsubSPAdes.XXXXXXXX) 17 | mkdir -p $TMP/log 18 | echo "tmp dir is $TMP " 19 | 20 | # CTRL file will have per line: 21 | # filename coverageLevel 22 | CTRL_FILE="$TMP/array.txt" 23 | echo "$READS" | tr ' ' '\n' | grep "_R1_" > $CTRL_FILE 24 | echo "CTRL_FILE is $CTRL_FILE" 25 | 26 | qsub -q all.q -N spades -o $TMP/log -j y -pe smp $SLOTS_PER_JOB -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 27 | -v "CTRL_FILE=$CTRL_FILE" <<- "END_OF_SCRIPT" 28 | #!/bin/bash -l 29 | set -e 30 | module load SPAdes 31 | 32 | which spades.py 33 | 34 | # Set up filenames 35 | R1=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE | awk '{print $1}') 36 | R2=${R1/_R1_/_R2_} 37 | tmpdir=/scratch/$USER 38 | mkdir -p $tmpdir 39 | outdir=$(mktemp --tmpdir=$tmpdir --directory SPAdes.XXXXXX); 40 | trap "rm -rf $outdir" EXIT 41 | newdir=$(dirname $R1)/$(basename $R1 .fastq.gz).spades 42 | 43 | if [ -e "$newdir" ]; then 44 | echo "ERROR: found pre-existing dir $newdir. 
Will not continue."; 45 | exit 1 46 | fi 47 | 48 | echo "spades will be run on $(hostname)" 49 | echo "R1/R2:" 50 | echo " $R1" 51 | echo " $R2" 52 | echo "Working directory is $outdir" 53 | echo "Final directory will be $newdir"; 54 | 55 | spades.py -t $NSLOTS -1 $R1 -2 $R2 -o $outdir --careful 56 | 57 | mv -v $outdir $newdir 58 | END_OF_SCRIPT 59 | 60 | -------------------------------------------------------------------------------- /qsub/array/launch_wgmlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | # Runs NCBI's wgmlst in an array for MLST calling 4 | 5 | OUTDIR=$1; shift; 6 | DB=$1; shift 7 | ASM=$@ 8 | 9 | set -e 10 | set -u 11 | 12 | if [ "$ASM" == "" ]; then 13 | echo "Run NCBI's wgmlst for wgMLST allele calling" 14 | echo "Usage: $0 outdir something.scheme *.fasta" 15 | exit 1; 16 | fi 17 | 18 | if [ ! -d "$OUTDIR" ]; then 19 | mkdir "$OUTDIR" 20 | fi; 21 | if [ ! -e "$DB" ]; then 22 | echo "ERROR: not found: $DB" 23 | exit 2; 24 | fi 25 | 26 | tmpdir=$(mktemp --tmpdir='.' 
--directory wgmlst.XXXXXXXX) 27 | mkdir -pv $tmpdir/log 28 | #trap " rm -rf $tmpdir " EXIT 29 | echo "tmp dir is $tmpdir " 30 | 31 | # CTRL file will have per line: 32 | # filename coverageLevel 33 | CTRL_FILE="$tmpdir/array.txt" 34 | # Put the reads one at a time into the CTRL_FILE but use paste to keep paired ends together 35 | echo "$ASM" | tr ' ' '\n' > $CTRL_FILE 36 | 37 | head $CTRL_FILE 38 | 39 | module purge 40 | 41 | # Check executables 42 | which wgmlst 43 | 44 | qsub -N ncbi_wgmlst -o $tmpdir/log -j y -pe smp 1 -V -cwd -t 1-$(cat $CTRL_FILE | wc -l) \ 45 | -v "tmpdir=$tmpdir" -v "DB=$DB" -v "CTRL_FILE=$CTRL_FILE" -v "OUTDIR=$OUTDIR" <<- "END_OF_SCRIPT" 46 | #!/bin/bash -l 47 | set -e 48 | set -u 49 | 50 | # Set up filenames 51 | fasta=$(sed -n ${SGE_TASK_ID}p $CTRL_FILE); 52 | name=$(basename $fasta .fasta) 53 | 54 | sampledir=$(mktemp --tmpdir=$tmpdir --directory $name.wgmlst.XXXXXX); 55 | trap "rm -rf $sampledir" EXIT 56 | finalout="$OUTDIR/$name" 57 | 58 | if [ -e "$finalout/.done" ]; then 59 | echo "Found $finalout/.done; not repeating." 
60 | exit 0; 61 | fi 62 | 63 | mappings=$sampledir/mappings 64 | alleles=$sampledir/alleles 65 | stdout=$sampledir/wgmlst.out 66 | log=$sampledir/wgmlst.log 67 | 68 | ( 69 | date # mark how long this takes 70 | set -x 71 | wgmlst --genome $fasta --alleles $DB --cores $NSLOTS --kmer 15 --output_mappings $mappings --output_loci $alleles 72 | set +x 73 | date 74 | ) 1>$stdout 2>$log 75 | 76 | ls -lh $sampledir 77 | mv -v $sampledir $finalout 78 | touch $finalout/.done 79 | 80 | END_OF_SCRIPT 81 | 82 | -------------------------------------------------------------------------------- /qsub/array/lyvesetVsLyveset.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Lyve-SET run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help lyveset=s ref|reference=s)) or die $!; 16 | die "ERROR: need Lyve-SET project name\n".usage() if(!$$settings{lyveset}); 17 | die "ERROR: need reference project name\n".usage() if(!$$settings{ref}); 18 | 19 | # where are the Lyve-SET SNPs? 20 | my $lyveSetSnps=lyveSetSnps($$settings{lyveset},$settings); 21 | 22 | # where are the real SNPs? 
23 | my $realSnps=lyveSetSnps($$settings{ref},$settings);
24 | 
25 | # compare
26 | my($TP,$TN,$FP,$FN)=compareSnps($lyveSetSnps,$realSnps,$settings);
27 | 
28 | # Report
29 | print join("\t",qw(L-S TTR TP TN FP FN))."\n";
30 | print join("\t",$$settings{lyveset},$$settings{ref},$TP, $TN, $FP, $FN)."\n";
31 | }
32 | 
33 | sub lyveSetSnps{
34 | my($proj,$settings)=@_;
35 | my %pos;
36 | 
37 | my $matrix="$proj/msa/out.filteredMatrix.tsv";
38 | #my $matrix="$proj/msa/out.snpmatrix.tsv";
39 | open(SNPMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
40 | while(<SNPMATRIX>){ # BUGFIX: readline operator <SNPMATRIX> was stripped in extraction; bare while(){ is a syntax error
41 | next if(/^#/);
42 | chomp;
43 | my($chr,$pos,$ref,@alt)=split /\t/;
44 | $pos{$pos}=$ref;
45 | }
46 | close SNPMATRIX;
47 | 
48 | return \%pos;
49 | }
50 | 
51 | sub compareSnps{
52 | my ($lyveSetSnps,$realSnps,$settings)=@_;
53 | 
54 | # Initialize counts to zero
55 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4);
56 | 
57 | # How many of the real SNPs were found?
58 | while(my($truePos,$trueRef)=each(%$realSnps)){
59 | if($$lyveSetSnps{$truePos}){
60 | $TP++; # True positive: correct SNP was found
61 | } else {
62 | $FN++; # False negative: a real SNP was not found
63 | }
64 | }
65 | 
66 | # How many SNPs were found that were not real?
67 | while(my($pos,$ref)=each(%$lyveSetSnps)){
68 | if($$realSnps{$pos}){
69 | # This is a true positive and was already counted in the previous loop.
70 | } else {
71 | $FP++; # A SNP was found but is not in the real set of SNPs.
72 | } 73 | } 74 | 75 | return ($TP,$TN,$FP,$FN); 76 | } 77 | 78 | sub usage{ 79 | "Compares a Lyve-SET run to a specific Lyve-SET run 80 | Usage: $0 --lyveset projdirectory --ref projdirectory 81 | " 82 | } 83 | -------------------------------------------------------------------------------- /qsub/array/lyvesetVsSimulations.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Lyve-SET run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help lyveset=s ttr=s)); 16 | die usage() if($$settings{help}); 17 | die "ERROR: need Lyve-SET project name\n".usage() if(!$$settings{lyveset}); 18 | die "ERROR: need TreeToReads project name\n".usage() if(!$$settings{ttr}); 19 | 20 | # where are the Lyve-SET SNPs? 21 | my $lyveSetSnps=lyveSetSnps($$settings{lyveset},$settings); 22 | 23 | # where are the real SNPs? 
24 | my $realSnps=ttrSites($$settings{ttr},$settings);
25 | 
26 | # compare
27 | my($TP,$TN,$FP,$FN)=compareSnps($lyveSetSnps,$realSnps,$settings);
28 | 
29 | # Report
30 | print join("\t",qw(L-S TTR TP TN FP FN))."\n";
31 | print join("\t",$$settings{lyveset},$$settings{ttr},$TP, $TN, $FP, $FN)."\n";
32 | }
33 | 
34 | sub lyveSetSnps{
35 | my($proj,$settings)=@_;
36 | my %pos;
37 | 
38 | my $matrix="$proj/msa/out.snpmatrix.tsv";
39 | open(SNPMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
40 | while(<SNPMATRIX>){ # BUGFIX: readline operator <SNPMATRIX> was stripped in extraction
41 | next if(/^#/);
42 | chomp;
43 | my($chr,$pos,$ref,@alt)=split /\t/;
44 | $pos{$pos}=$ref;
45 | }
46 | close SNPMATRIX;
47 | 
48 | return \%pos;
49 | }
50 | 
51 | sub ttrSites{
52 | my($proj,$settings)=@_;
53 | my %pos;
54 | 
55 | my $matrix="$proj/mutsites.txt";
56 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
57 | while(<TTRMATRIX>){ # BUGFIX: readline operator <TTRMATRIX> was stripped in extraction
58 | chomp;
59 | my($pos)=split /\s+/;
60 | $pos{$pos}=1
61 | }
62 | close TTRMATRIX;
63 | return \%pos;
64 | }
65 | 
66 | sub ttrSnps{
67 | my($proj,$settings)=@_;
68 | my %pos;
69 | 
70 | my $matrix="$proj/var_site_matrix";
71 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!";
72 | while(<TTRMATRIX>){ # BUGFIX: readline operator <TTRMATRIX> was stripped in extraction
73 | chomp;
74 | my($chr,$ref,$pos)=split /\s+/;
75 | $pos{$pos}=$ref;
76 | }
77 | close TTRMATRIX;
78 | 
79 | return \%pos;
80 | }
81 | 
82 | sub compareSnps{
83 | my ($lyveSetSnps,$realSnps,$settings)=@_;
84 | 
85 | # Initialize counts to zero
86 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4);
87 | 
88 | # How many of the real SNPs were found?
89 | while(my($truePos,$trueRef)=each(%$realSnps)){
90 | if($$lyveSetSnps{$truePos}){
91 | $TP++; # True positive: correct SNP was found
92 | } else {
93 | $FN++; # False negative: a real SNP was not found
94 | }
95 | }
96 | 
97 | # How many SNPs were found that were not real?
98 | while(my($pos,$ref)=each(%$lyveSetSnps)){
99 | if($$realSnps{$pos}){
100 | # This is a true positive and was already counted in the previous loop.
101 | } else { 102 | $FP++; # A SNP was found but is not in the real set of SNPs. 103 | } 104 | } 105 | 106 | return ($TP,$TN,$FP,$FN); 107 | } 108 | 109 | sub usage{ 110 | "Compares a Lyve-SET run to a simulated dataset 111 | Usage: $0 --lyveset projdirectory --ttr treetoreadsdirectory 112 | " 113 | } 114 | -------------------------------------------------------------------------------- /qsub/array/makeConfigs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Config::Simple; 7 | use Getopt::Long; 8 | use File::Spec::Functions qw/rel2abs/; 9 | 10 | #my $baseDir="/scicomp/home/gzu2/projects/wgsStandards/accuracyVsCoverage/manyCoverages"; 11 | my $baseDir="."; 12 | 13 | exit(main()); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help reps=i min_coverage=i max_coverage=i)); 18 | $$settings{reps}||=1; 19 | $$settings{min_coverage}||=1; 20 | $$settings{max_coverage}||=$$settings{min_coverage}; 21 | die usage() if($$settings{help}); 22 | $$settings{config}=shift(@ARGV); 23 | die "ERROR: need config file" if(!$$settings{config}); 24 | 25 | # Get configuration from cfg file 26 | my %config; 27 | Config::Simple->import_from($$settings{config},\%config); 28 | my $fixed=fixConfigValue(\%config); 29 | %config=%$fixed; 30 | 31 | 32 | for(my $i=$$settings{min_coverage};$i<=$$settings{max_coverage};$i++){ 33 | for(my $rep=0; $rep<$$settings{reps}; $rep++){ 34 | my $coverage=($i*1); 35 | 36 | # make a simulation directory 37 | my $simDir="cov$i/rep$rep"; 38 | $simDir=rel2abs($simDir); 39 | system("mkdir -pv $simDir"); 40 | # copy all necessary files except config 41 | system("cp -rv $baseDir/IlluminaErrorProfiles $baseDir/reference $baseDir/lyve-set.dnd $simDir/"); 42 | 43 | # generate a custom config file 44 | my %newConfig=%config; 45 | # update some paths 46 | $newConfig{'default.output_dir'}="$simDir/out"; 47 | 
$newConfig{'default.coverage'}=$coverage; 48 | $newConfig{'default.treefile_path'}="$simDir/$newConfig{'default.treefile_path'}"; 49 | $newConfig{'default.base_genome_path'}="$simDir/".$newConfig{'default.base_genome_path'}; 50 | for(qw(error_model1 error_model2)){ 51 | $newConfig{"default.$_"}="$simDir/".$newConfig{"default.$_"}; 52 | } 53 | 54 | open(CFG,">","$simDir/TTR.cfg") or die "ERROR: could not open $simDir/TTR.cfg for writing: $!"; 55 | while(my($key,$value)=each(%newConfig)){ 56 | $key=~s/^default\.//; 57 | if(ref($value) eq 'ARRAY'){ 58 | $value=join(",",@$value); 59 | } 60 | print CFG "$key = $value\n"; 61 | } 62 | close CFG; 63 | } 64 | } 65 | } 66 | 67 | sub fixConfigValue{ 68 | my($value)=@_; 69 | if(ref($value) eq 'HASH'){ 70 | while(my($k,$v)=each(%$value)){ 71 | $$value{$k}=fixConfigValue($v); 72 | } 73 | } elsif(ref($value) eq 'ARRAY'){ 74 | for(my $i=0;$i<@$value;$i++){ 75 | $$value[$i]=fixConfigValue($$value[$i]); 76 | } 77 | } else { 78 | $value=~s/#.*$//; 79 | $value=~s/^\s+|\s+$//g; 80 | } 81 | return $value; 82 | } 83 | 84 | sub usage{ 85 | "Create a bunch of different treetoreads projects 86 | Usage: $0 treetoreads.cfg 87 | --reps 1 Number of repetitions 88 | --min_coverage 1 89 | --max_coverage 1 90 | " 91 | } 92 | -------------------------------------------------------------------------------- /qsub/array/snppipelineVsSimulations.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Uses a tree to reads directory and a Snp-Pipeline run to determine 3 | # true positives, false negatives, false positives 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | use Data::Dumper; 9 | use Getopt::Long; 10 | 11 | exit main(); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help snppipeline=s ttr=s)); 16 | die usage() if($$settings{help}); 17 | die "ERROR: need Snp-Pipeline project name\n".usage() if(!$$settings{snppipeline}); 18 | die "ERROR: need TreeToReads project 
name\n".usage() if(!$$settings{ttr}); 19 | 20 | # where are the Snp-Pipeline SNPs? 21 | my $snppipelineSnps=snppipelineSnps($$settings{snppipeline},$settings); 22 | 23 | # where are the real SNPs? 24 | my $realSnps=ttrSnps($$settings{ttr},$settings); 25 | 26 | # compare 27 | my($TP,$TN,$FP,$FN)=compareSnps($snppipelineSnps,$realSnps,$settings); 28 | 29 | # Report 30 | print join("\t",qw(S-P TTR TP TN FP FN))."\n"; 31 | print join("\t",$$settings{snppipeline},$$settings{ttr},$TP, $TN, $FP, $FN)."\n"; 32 | } 33 | 34 | sub snppipelineSnps{ 35 | my($proj,$settings)=@_; 36 | my %pos; 37 | 38 | my $list="$proj/snplist.txt"; 39 | open(SNPMATRIX,"<",$list) or die "ERROR: could not open $list for reading: $!"; 40 | while(<SNPMATRIX>){ 41 | next if(/^#/); 42 | chomp; 43 | my($chr,$pos,$count,@genomes)=split /\t/; 44 | $pos{$pos}=1; 45 | } 46 | close SNPMATRIX; 47 | 48 | return \%pos; 49 | } 50 | 51 | sub ttrSnps{ 52 | my($proj,$settings)=@_; 53 | my %pos; 54 | 55 | my $matrix="$proj/var_site_matrix"; 56 | open(TTRMATRIX,"<",$matrix) or die "ERROR: could not open $matrix for reading: $!"; 57 | while(<TTRMATRIX>){ 58 | chomp; 59 | my($chr,$ref,$pos)=split /\s+/; 60 | $pos{$pos}=$ref; 61 | } 62 | close TTRMATRIX; 63 | 64 | return \%pos; 65 | } 66 | 67 | sub compareSnps{ 68 | my ($snppipelineSnps,$realSnps,$settings)=@_; 69 | 70 | # Initialize counts to zero 71 | my($TP,$TN,$FP,$FN)=split(//,"0" x 4); 72 | 73 | # How many of the real SNPs were found? 74 | while(my($truePos,$trueRef)=each(%$realSnps)){ 75 | if($$snppipelineSnps{$truePos}){ 76 | $TP++; # True positive: correct SNP was found 77 | } else { 78 | $FN++; # False negative: a real SNP was not found 79 | } 80 | } 81 | 82 | # How many SNPs were found that were not real? 83 | while(my($pos,$ref)=each(%$snppipelineSnps)){ 84 | if($$realSnps{$pos}){ 85 | # This is a true positive and was already counted in the previous loop. 86 | } else { 87 | $FP++; # A SNP was found but is not in the real set of SNPs.
88 | } 89 | } 90 | 91 | return ($TP,$TN,$FP,$FN); 92 | } 93 | 94 | sub usage{ 95 | "Compares a Snp-Pipeline run to a simulated dataset 96 | Usage: $0 --snpPipeline projdirectory --ttr treetoreadsdirectory 97 | " 98 | } 99 | -------------------------------------------------------------------------------- /qsub/launch_SRST2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -S /bin/bash 3 | #$ -pe smp 1 4 | #$ -cwd -V 5 | #$ -o SRST.log -j y 6 | #$ -N SRST2 7 | 8 | source /etc/profile.d/modules.sh 9 | module load samtools/0.1.18; 10 | module load bowtie2/2.1.0; 11 | 12 | DB=$1 13 | MLST_DEFS=$2 14 | PREFIX=$3 15 | OUTDIR=$4 16 | INTERLEVED=$5 17 | 18 | thisScript=`basename $0`; 19 | if [ "$INTERLEVED" == "" ]; then 20 | echo "Usage: $thisScript MLSTDB.fasta MLST_DEFS.txt OUTPREFIX OUTDIR/ INTERLEVED.fastq[.gz]" 21 | echo "OUTPREFIX cannot include a prefix directory which is why you should specify OUTDIR" 22 | exit 1; 23 | fi 24 | 25 | NSLOTS=${NSLOTS-1} 26 | SCRIPT=/scicomp/home/gzu2/bin/srst2/scripts/srst2.py 27 | 28 | b=$(basename $INTERLEVED); 29 | READ1="TMP/$b.read_1.fastq" 30 | READ2="TMP/$b.read_2.fastq" 31 | run_assembly_shuffleReads.pl -d $INTERLEVED > $READ1 2> $READ2 32 | if [ $? -gt 0 ]; then exit 1; fi; 33 | 34 | # Low compression to make it compatible with srst2. 35 | # These files will be deleted later anyway 36 | gzip -f -v -1 $READ1 $READ2 37 | if [ $? -gt 0 ]; then exit 1; fi; 38 | 39 | READ1="$READ1.gz" 40 | READ2="$READ2.gz" 41 | 42 | 43 | $SCRIPT --input_pe $READ1 $READ2 --mlst_delimiter "_" --output $PREFIX --log --mlst_db $DB --mlst_definitions $MLST_DEFS 44 | if [ $? -gt 0 ]; then exit 1; fi; 45 | mv -v ${PREFIX}*__results.txt "$OUTDIR/" 46 | if [ $? 
-gt 0 ]; then exit 1; fi; 47 | 48 | # remove temp files 49 | rm -v $READ1 $READ2 50 | rm -vf ${PREFIX}__*.bam* ${PREFIX}__*.sam* ${PREFIX}__*results.txt ${PREFIX}__*.pileup ${PREFIX}*.log 51 | -------------------------------------------------------------------------------- /qsub/launch_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o annotation.log 7 | #$ -j y 8 | #$ -N cgpAnnotate 9 | #$ -q all.q 10 | 11 | project=$1 12 | 13 | if [ "$project" == "" ]; then 14 | echo "Usage: " $(basename $0) " CGP-project/" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_pipeline annotate -p "$project" --numcpus $NSLOTS --skip INTERPRO 21 | if [ $? -gt 0 ]; then echo "ERROR with run_pipeline annotatE"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_baym.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs a shovill assembly. Run with no options for usage. 
3 | # Author: Lee Katz 4 | # Workflow taken from Dorian Feistel 5 | 6 | #$ -S /bin/bash 7 | #$ -pe smp 1 8 | #$ -cwd -V 9 | #$ -o baym.log 10 | #$ -j y 11 | #$ -N baym 12 | #$ -q all.q 13 | 14 | outdir=$1; shift 15 | reads=$@ 16 | NSLOTS=${NSLOTS:=1} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$reads" == "" ]; then 22 | echo "Usage: $scriptname outdir *_1.fastq[.gz]" 23 | echo " R2 reads will be detected automatically based on matchiing filenames" 24 | exit 0; 25 | fi; 26 | 27 | set -e 28 | set -u 29 | 30 | module purge 31 | module load seqtk/1.3 trimmomatic/0.39 bwa/0.7.17 java/latest 32 | 33 | # die if some crucial program is not present 34 | # And side bonus of knowing where each executable is loading from 35 | which seqtk 36 | which bwa 37 | which java 38 | which kallisto 39 | which 1output_abundances.py 40 | which trim_reads_nwss.sh 41 | 42 | reference_db=/scicomp/groups-pure/Projects/NWSS_SequencingData/apps/wastewater_analysis/NWSS_PIPELINE/03.reference_set_29SEP2021-2/sequences.kallisto_idx 43 | scripts=/scicomp/groups-pure/Projects/NWSS_SequencingData/apps/wastewater_analysis/NWSS_PIPELINE/scripts 44 | 45 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 46 | trap ' { rm -rf $tmpdir; } ' EXIT 47 | 48 | mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 49 | 50 | for R1 in $reads; do 51 | name=$(basename $R1 .fastq.gz | perl -plane 's/_\d|\.fastq|\.gz//g'); 52 | sampledir="$tmpdir/$name" 53 | mkdir -v $sampledir 54 | 55 | # Get the file extension, if it's .gz 56 | ext=${R1##*.} 57 | 58 | # This file has to be local 59 | ln -sv $(which 1output_abundances.py) $sampledir/ 60 | 61 | # Get fastq files into our local tmp folder 62 | R2=${R1/_1.f/_2.f} 63 | # decompress or simply cat R1/R2 with zcat 64 | local_R1="$sampledir/$(basename $R1 .gz)" 65 | local_R2="$sampledir/$(basename $R2 .gz)" 66 | 67 | if [[ "$ext" == "gz" ]]; then 68 | zcat $R1 > $local_R1 69 | zcat $R2 > $local_R2 70 | else 71 | 
cp -vL $R1 $R2 $sampledir/ 72 | fi 73 | 74 | # WARNING 75 | # Some commands require that you are in the sample directory 76 | # And so I will indent to indicate CWD 77 | cd $sampledir 78 | 79 | # Should just yield one sample name 80 | # into sample_ID 81 | local_sample_ID="sample_IDs.txt" 82 | \ls -f1 *.fastq | sed 's/_[12].fastq.*//' | sort | uniq > $local_sample_ID 83 | 84 | trim_dir="trimmed.out" 85 | kallisto="kallisto.out" 86 | 87 | echo "SAMPLE(S): $(cat $local_sample_ID | tr '\n' ',')" 88 | 89 | bash trim_reads_nwss.sh . $trim_dir $local_sample_ID 90 | 91 | echo 92 | 93 | bash run_kallisto_WG.sh $local_sample_ID $trim_dir/ivar $kallisto 94 | 95 | # Moving back out of the sample directory 96 | cd - 97 | 98 | cp -rv $sampledir/*kallisto.out $outdir/ 99 | 100 | #run_kallisto_SPIKE.sh 101 | 102 | # Directly remove the directory since we're done with it now 103 | rm -rvf $sampledir 104 | done 105 | 106 | -------------------------------------------------------------------------------- /qsub/launch_chewbbaca.simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Lee Katz 3 | 4 | #$ -S /bin/bash 5 | #$ -pe smp 4-24 6 | #$ -cwd -V 7 | #$ -o chewbbaca.log 8 | #$ -j y 9 | #$ -N Chewbbaca 10 | 11 | set -eu 12 | 13 | scriptname=$(basename $0); 14 | dirname=$(dirname $0); 15 | if [ "$#" -lt 3 ]; then 16 | echo "Runs cgMLST on a genome assembly" 17 | echo "Usage: $scriptname outdir/ cgMLST-database/ indir" 18 | echo " where indir is a directory of assembly fasta files" 19 | exit 0; 20 | fi; 21 | 22 | outdir=$1 23 | DB=$2 24 | indir=$3 25 | 26 | # Some defaults 27 | NSLOTS=${NSLOTS:=24} 28 | TMPDIR=${TMPDIR:=/scratch} 29 | 30 | function logmsg() { 31 | script=$(basename $0) 32 | HR="-------------" 33 | echo $HR >&2 34 | echo "$script: $@" >&2 35 | echo $HR >&2 36 | } 37 | 38 | if [ ! 
-d "$DB" ]; then 39 | logmsg "ERROR: could not find a cgMLST database folder at $DB" 40 | exit 1 41 | fi 42 | 43 | # Count how many fasta files are in the directory and if 44 | # they aren't there then make an error 45 | num_schema_loci=$(\ls -f1 "$DB" | grep -c 'fasta$' || true) 46 | if [ "$num_schema_loci" -lt 1 ]; then 47 | logmsg "ERROR: no fasta files found in $DB" 48 | exit 1 49 | fi 50 | 51 | logmsg "Found $num_schema_loci loci in the schema at $DB" 52 | 53 | tempdir=$(mktemp --tmpdir=$TMPDIR --directory chewbbaca.XXXXXX) 54 | trap ' { cd; rm -rf $tempdir; } ' EXIT 55 | 56 | # Runs chewbbaca in a container 57 | # Arguments: 58 | # database path 59 | # file of fasta filenames or fasta file 60 | function chewbbaca() { 61 | DB=$1 62 | indir=$2 63 | # TMPDIR is a global for basically /scratch 64 | # $tempdir is a global for a directory under /scratch 65 | # NSLOTS 66 | 67 | ls -dl $DB 68 | ls -dl $indir 69 | ls -dl $TMPDIR 70 | ls -dl $tempdir 71 | echo "NSLOTS: $NSLOTS" 72 | 73 | rm -rfv $DB/temp 74 | set -x 75 | singularity exec -B $TMPDIR:$TMPDIR -B $PWD:$PWD -B $indir:/input -B $DB:/schema -B $tempdir:/out bin/chewbbaca.cif chewBBACA.py AlleleCall -i /input --schema-directory /schema -o /out --cpu $NSLOTS 76 | set +x 77 | mv -v $tempdir/results* $outdir/ || true 78 | 79 | logmsg $tempdir 80 | ls -lhA $tempdir 81 | } 82 | 83 | mkdir -pv "$tempdir/input" 84 | cp -vL $indir/*.f*a $tempdir/input/ || true 85 | #cp -vrL $indir "$tempdir/input" || true 86 | 87 | mkdir -p $outdir 88 | chewbbaca $DB "$tempdir/input" 89 | 90 | -------------------------------------------------------------------------------- /qsub/launch_circlator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -e assembly.circlator.err 3 | #$ -o assembly.circlator.out 4 | #$ -N circ.assembly 5 | #$ -pe smp 4-16 6 | source /etc/profile.d/modules.sh 7 | module load Python/3.4 8 | module load circlator/1.2.1 9 | if [ -d circ_dir ]; then rm -r 
circ_dir; fi 10 | circlator all --threads $NSLOTS assembly.fasta reads.fasta circ_dir 11 | -------------------------------------------------------------------------------- /qsub/launch_colorid_mlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs ColorID MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o colorid 9 | #$ -j y 10 | #$ -N colorid 11 | 12 | outdir=$1 13 | dbdir=$2 14 | asm=$3 15 | 16 | NSLOTS=${NSLOTS:=1} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | 22 | if [ "$asm" == "" ]; then 23 | echo "Usage: $scriptname outdir dbdir asm.fasta" 24 | echo " where dbdir is a directory of locus fasta files like chewbbaca" 25 | exit 0; 26 | fi; 27 | 28 | set -e 29 | set -u 30 | 31 | module purge 32 | 33 | colorid=$(which colorid_bv || which colorid) 34 | process_MLST=$(which process_MLST.py) 35 | 36 | tmpdir=$(mktemp --directory --suffix=$(basename $0) --tmpdir=./); 37 | trap ' { rm -rf $tmpdir; } ' EXIT 38 | 39 | if [ -e $outdir ]; then 40 | echo "WARNING: outdir already exists: $outdir" 41 | exit 1 42 | fi 43 | 44 | bxi="$tmpdir/asm.bxi" 45 | 46 | # Estimate assembly size by file size and then multiply by 10x 47 | asm_size=$(cat $asm | wc -c) 48 | bxi_size=$(( $asm_size * 10 )) 49 | 50 | # Some colorid vars 51 | fofn="$tmpdir/asm.fofn" 52 | alleles="$tmpdir/alleles.tsv" 53 | 54 | # Get a file of filenames 55 | echo -e "$(basename $asm .fasta)\t$asm" > $fofn 56 | 57 | $colorid build -b $bxi -s $bxi_size -n 2 -k 39 -t $NSLOTS -r $fofn 58 | 59 | $colorid search -ms -b $bxi -q $dbdir/*.fasta > $alleles 60 | sed -i.bak '/\*/d' $alleles 61 | $process_MLST $alleles $tmpdir/mlst 62 | 63 | cp -v $alleles $tmpdir/mlst* $outdir/ 64 | 65 | -------------------------------------------------------------------------------- /qsub/launch_downloadSrr.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 1 4 | #$ -cwd -V 5 | #$ -o fastq-dump.log 6 | #$ -j y 7 | #$ -N fastq-dump 8 | #$ -q all.q 9 | 10 | name=$1 11 | OUT=$2 12 | 13 | if [ "$OUT" == "" ]; then 14 | echo "Usage: $0 NAME" 15 | echo " Downloads a set of fastq files using fastq-dump"; 16 | exit 1; 17 | fi; 18 | 19 | module load sratoolkit/2.4.5-2 20 | if [ $? -gt 0 ]; then echo "unable to load sratoolkit/2.4.5-2"; exit 1; fi; 21 | 22 | mkdir -p /scratch/$USER/fastq-dump/$name 23 | 24 | downloadSra.pl -t /scratch/$USER/fastq-dump/$name $name | gzip -c > $OUT 25 | if [ $? -gt 0 ]; then echo "ERROR with fastq-dump $name"; exit 1; fi; 26 | 27 | rm -rf /scratch/$USER/fastq-dump/$name 28 | 29 | -------------------------------------------------------------------------------- /qsub/launch_etoki_mlst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs EToKi MLST 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o EToKi.log 9 | #$ -j y 10 | #$ -N EToKi 11 | 12 | out=$1 13 | refs=$2 14 | db=$3 15 | asm=$4 16 | 17 | #NSLOTS=${NSLOTS:=1} 18 | 19 | source /etc/profile.d/modules.sh 20 | scriptname=$(basename $0); 21 | 22 | 23 | if [ "$asm" == "" ]; then 24 | echo "Usage: $scriptname out.etoki.fasta refs.fasta etoki.csv assembly.fasta" 25 | exit 0; 26 | fi; 27 | 28 | set -e 29 | set -u 30 | 31 | module purge 32 | 33 | source ~/.bash_conda || echo "Could not find bash file for loading conda" 34 | conda activate etoki || echo "could not activate etoki env" 35 | 36 | EToKi.py configure 37 | 38 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 39 | trap ' { rm -rf $tmpdir; } ' EXIT 40 | 41 | #mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 42 | 43 | if [ -e $out ]; then 44 | echo "ERROR: output file already exists: $out" 45 | exit 1 46 | fi 47 | 48 | samplename=$(basename $asm .fasta) 49 | echo "Sample name will be $samplename" 50 | 51 | EToKi.py MLSType 
-i $asm -r $refs -k $samplename -d $db -o $tmpdir/$(basename $out) 52 | mv -v $tmpdir/$(basename $out) $out 53 | -------------------------------------------------------------------------------- /qsub/launch_fastqToFasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 1 5 | #$ -cwd -V 6 | #$ -o fastq_to_fasta.log -j y 7 | #$ -N fastq_to_fasta 8 | #$ -q all.q 9 | 10 | in=$1 11 | out=$2 12 | 13 | if [ "$out" == "" ]; then 14 | echo "Usage: $0 in.fastq[.gz] out.fasta" 15 | exit 1; 16 | fi; 17 | 18 | extension="${in##*.}" 19 | 20 | if [ "$extension" = "gz" ]; then 21 | gunzip -c "$in" | fastq_to_fasta -Q33 > "$out" 22 | else 23 | fastq_to_fasta -Q33 < "$in" > "$out" 24 | fi 25 | if [ $? -gt 0 ]; then echo "ERROR with fastq_to_fasta"; exit 1; fi; 26 | -------------------------------------------------------------------------------- /qsub/launch_freyja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs Frejya wastewater pipeline for SARS-CoV-2 3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 1 7 | #$ -cwd -V 8 | #$ -o freyja.log 9 | #$ -j y 10 | #$ -N freyja 11 | 12 | ref=$1; shift 13 | outdir=$1; shift 14 | reads=$@ 15 | NSLOTS=${NSLOTS:=1} 16 | 17 | source /etc/profile.d/modules.sh 18 | scriptname=$(basename $0); 19 | 20 | if [ "$reads" == "" ]; then 21 | echo "Usage: $scriptname ref.fasta outdir *_1.fastq[.gz]" 22 | echo " R2 reads will be detected automatically based on matchiing filenames" 23 | exit 0; 24 | fi; 25 | 26 | set -e 27 | set -u 28 | 29 | module purge 30 | 31 | source ~/.bash_conda > /dev/null 2>&1 || echo "Could not find bash file for loading conda" 32 | conda activate freyja || echo "could not activate freyja env" 33 | 34 | which ivar 35 | which freyja 36 | which samtools 37 | which bowtie2 38 | 39 | tmpdir=$(mktemp --directory --suffix=$(basename $0)); 40 | trap ' { rm -rf $tmpdir; } ' 
EXIT 41 | 42 | mkdir -v $outdir || echo "WARNING: outdir already exists: $outdir" 43 | 44 | for R1 in $reads; do 45 | name=$(basename $R1 .fastq.gz | perl -plane 's/_\d|\.fastq|\.gz//g'); 46 | sampledir="$tmpdir/$name.freyja" 47 | echo "START $name in $sampledir" 48 | mkdir -v $sampledir 49 | 50 | # Get the file extension, if it's .gz 51 | ext=${R1##*.} 52 | 53 | # Get fastq files into our local tmp folder 54 | R2=${R1/_1.f/_2.f} 55 | 56 | echo "$R1 $R2"; 57 | 58 | cp -v $R1 $R2 $sampledir/ 59 | R1=$sampledir/$(basename $R1) 60 | R2=$sampledir/$(basename $R2) 61 | 62 | bowtie2 -x $ref -1 $R1 -2 $R2 | samtools view -bhS | samtools sort > $sampledir/sorted.bam 63 | 64 | # Trim primers 65 | ivar trim -i $sampledir/sorted.bam -p $sampledir/ivar.unsorted 66 | samtools sort $sampledir/ivar.unsorted.bam > $sampledir/ivar.bam 67 | 68 | # call variants 69 | samtools mpileup -aa -A -d 600000 -B -Q 0 $sampledir/ivar.bam | ivar variants -p $sampledir/ivar -q 20 -t 0.03 -r $ref 70 | 71 | # abundances 72 | freyja variants $sampledir/ivar.bam --variants $sampledir/freyja.variants --depths $sampledir/freyja.depths --ref $ref 73 | freyja demix $sampledir/freyja.variants.tsv $sampledir/freyja.depths --output $sampledir/freyja.demix 74 | 75 | # Clear results we don't need 76 | (cd $sampledir && rm -v *.bam* *.fastq.gz) 77 | # Save results 78 | rsync -av $sampledir $outdir/ 79 | done 80 | 81 | -------------------------------------------------------------------------------- /qsub/launch_kraken.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken.log -j y 6 | #$ -N kraken 7 | 8 | function logmsg () { 9 | script=$(basename $0); 10 | echo -e "$script: $@" >&2; 11 | } 12 | 13 | function run () { 14 | script=$(basename $0); 15 | logmsg "Running $@" 16 | eval $@ 17 | if [ $? 
-gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 18 | } 19 | 20 | NSLOTS=${NSLOTS-8} 21 | KRAKEN_DEFAULT_DB=${KRAKEN_DEFAULT_DB-/scicomp/reference/kraken/0.10.4/mini-20140330} 22 | 23 | if [ $# -eq 0 ]; then 24 | logmsg "Usage: $0 out.kraken/ reads_1.fastq.gz reads_2.fastq.gz [more_1.fastq.gz more_2.fastq.gz ...]"; 25 | logmsg "NOTE: KRAKEN_DEFAULT_DB is currently set to $KRAKEN_DEFAULT_DB" 26 | exit 1; 27 | fi; 28 | 29 | source /etc/profile.d/modules.sh 30 | module load kraken/1.0.0 31 | module load krona/2.5 32 | 33 | # Output is the first arg. 34 | # The rest of the args are reads. 35 | OUTDIR=$1; shift; 36 | READS=$@; 37 | 38 | if [ -e $OUTDIR ]; then 39 | echo "ERROR: dir $OUTDIR already exists! I will not overwrite it." 40 | #echo "DEBUG"; rm -rvf $OUTDIR; 41 | exit 1; 42 | fi 43 | 44 | # Where are my executables? 45 | KRAKENDIR=$(dirname $(which kraken)); 46 | KRONADIR=$(dirname $(which ktImportText)); 47 | 48 | # Set up the work space 49 | TEMPDIR=$(mktemp --directory --suffix=$(basename $0)); 50 | KRAKENOUT="$TEMPDIR/kraken.out" 51 | KRAKENTAXONOMY="$TEMPDIR/kraken.taxonomy"; 52 | KRAKENREPORT="$TEMPDIR/kraken.report" 53 | HTML="$TEMPDIR/out.html" 54 | 55 | function cleanup () { 56 | rm -rvf $TEMPDIR 57 | } 58 | trap cleanup EXIT 59 | 60 | hostname 61 | logmsg "tempdir is $TEMPDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken --fastq-input --paired --db=$KRAKEN_DEFAULT_DB --preload --gzip-compressed --quick --threads $NSLOTS --output $KRAKENOUT $READS 64 | 65 | run kraken-translate --db $KRAKEN_DEFAULT_DB $KRAKENOUT | cut -f 2- | sort | uniq -c |\ 66 | perl -lane ' 67 | s/^ +//; # remove leading spaces 68 | s/ +/\t/; # change first set of spaces from uniq -c to a tab 69 | s/;/\t/g; # change the semicolon-delimited taxonomy to tab-delimited 70 | print; 71 | ' |\ 72 | sort -k1,1nr > $KRAKENTAXONOMY 73 | 74 | # Grab the unclassified reads 75 | head -n 1 $KRAKENOUT | cut -f 3 >> $KRAKENTAXONOMY 76 | cat 
$KRAKENTAXONOMY 77 | 78 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 79 | run kraken-report $KRAKENOUT > $KRAKENREPORT 80 | perl -lane ' print if($F[0] > 0.00); ' < $KRAKENREPORT > $KRAKENREPORT.filtered 81 | 82 | rm -v $KRAKENOUT 83 | cp -rv $TEMPDIR $OUTDIR 84 | 85 | logmsg "DONE!" 86 | 87 | -------------------------------------------------------------------------------- /qsub/launch_kraken2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken2.log -j y 6 | #$ -N kraken2 7 | 8 | function logmsg () { 9 | script=$(basename $0); 10 | echo -e "$script: $@" >&2; 11 | } 12 | 13 | function run () { 14 | script=$(basename $0); 15 | logmsg "Running $@" 16 | eval $@ 17 | if [ $? -gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 18 | } 19 | 20 | NSLOTS=${NSLOTS-16} 21 | KRAKEN2_DEFAULT_DB=${KRAKEN2_DEFAULT_DB-/scicomp/reference/kraken/2.0.0/} 22 | 23 | if [ $# -eq 0 ]; then 24 | logmsg "Usage: $0 out.kraken/ reads_1.fastq.gz reads_2.fastq.gz [more_1.fastq.gz more_2.fastq.gz ...]"; 25 | logmsg "NOTE: KRAKEN2_DEFAULT_DB is currently set to $KRAKEN2_DEFAULT_DB" 26 | exit 1; 27 | fi; 28 | 29 | source /etc/profile.d/modules.sh 30 | module load kraken/2.0.8 31 | module load krona/2.5 32 | 33 | # Output is the first arg. 34 | # The rest of the args are reads. 35 | OUTDIR=$1; shift; 36 | READS=$@; 37 | 38 | if [ -e $OUTDIR ]; then 39 | echo "ERROR: dir $OUTDIR already exists! I will not overwrite it." 40 | #echo "DEBUG"; rm -rvf $OUTDIR; 41 | exit 1; 42 | fi 43 | 44 | # Where are my executables? 
45 | KRAKENDIR=$(dirname $(which kraken2)); 46 | KRONADIR=$(dirname $(which ktImportText)); 47 | 48 | # Set up the work space 49 | TEMPDIR=$(mktemp --directory --suffix=$(basename $0)); 50 | KRAKENOUT="$TEMPDIR/kraken.out" 51 | KRAKENTAXONOMY="$TEMPDIR/kraken.taxonomy"; 52 | KRAKENREPORT="$TEMPDIR/kraken.report" 53 | HTML="$TEMPDIR/out.html" 54 | 55 | function cleanup () { 56 | rm -rvf $TEMPDIR 57 | } 58 | trap cleanup EXIT 59 | 60 | hostname 61 | logmsg "tempdir is $TEMPDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken2 --paired --db=$KRAKEN2_DEFAULT_DB --gzip-compressed --quick --threads $NSLOTS --report $KRAKENREPORT --output $KRAKENOUT $READS 64 | 65 | run kraken2-translate --db $KRAKEN2_DEFAULT_DB $KRAKENOUT | cut -f 2- | sort | uniq -c |\ 66 | perl -lane ' 67 | s/^ +//; # remove leading spaces 68 | s/ +/\t/; # change first set of spaces from uniq -c to a tab 69 | s/;/\t/g; # change the semicolon-delimited taxonomy to tab-delimited 70 | print; 71 | ' |\ 72 | sort -k1,1nr > $KRAKENTAXONOMY 73 | 74 | # Grab the unclassified reads 75 | head -n 1 $KRAKENOUT | cut -f 3 >> $KRAKENTAXONOMY 76 | cat $KRAKENTAXONOMY 77 | 78 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 79 | perl -lane ' print if($F[0] > 0.00); ' < $KRAKENREPORT > $KRAKENREPORT.filtered 80 | 81 | rm -v $KRAKENOUT 82 | cp -rv $TEMPDIR $OUTDIR 83 | 84 | logmsg "DONE!" 85 | 86 | -------------------------------------------------------------------------------- /qsub/launch_kraken_contigs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o kraken.log -j y 6 | #$ -N kraken 7 | 8 | # The aspen module for kraken is broken, and so I just have 9 | # to assume Kraken is in the path. 
10 | 11 | source /etc/profile.d/modules.sh 12 | #module load kraken/0.10.4 13 | export PATH=$PATH:~/src/lskScripts/scripts 14 | 15 | function logmsg () { 16 | script=$(basename $0); 17 | echo -e "$script: $@" >&2; 18 | } 19 | 20 | function run () { 21 | script=$(basename $0); 22 | logmsg "Running $@" 23 | eval $@ 24 | if [ $? -gt 0 ]; then logmsg "ERROR with previous command"; exit 1; fi; 25 | } 26 | 27 | NSLOTS=${NSLOTS-8} 28 | KRAKEN_DEFAULT_DB=${KRAKEN_DEFAULT_DB-/scicomp/reference/kraken/0.10.4/mini-20140330} 29 | 30 | if [ $# -eq 0 ]; then 31 | logmsg "Usage: $0 outdir in.fasta"; 32 | logmsg "NOTE: KRAKEN_DEFAULT_DB is currently set to $KRAKEN_DEFAULT_DB" 33 | exit 1; 34 | fi; 35 | 36 | # load modules after help menu 37 | module load kraken/1.0.0 38 | module load krona/2.5 39 | 40 | OUTDIR=$1 41 | ASM=$2 42 | 43 | # Where are my executables? 44 | KRAKENDIR=$(dirname $(which kraken)); 45 | KRONADIR=$(dirname $(which ktImportText)); 46 | 47 | # Set up the work space 48 | if [ -d "$OUTDIR" ]; then 49 | echo "$OUTDIR already exists"; 50 | exit 1; 51 | fi; 52 | mkdir $OUTDIR 53 | if [ $? 
-gt 0 ]; then 54 | echo "ERROR: I could not create $OUTDIR"; 55 | exit 1 56 | fi 57 | KRAKENOUT="$OUTDIR/kraken.out" 58 | KRAKENTAXONOMY="$OUTDIR/kraken.taxonomy"; 59 | HTML="$OUTDIR/kraken.html" 60 | 61 | logmsg "Outdir is $OUTDIR\n kraken dir is $KRAKENDIR\n krona dir is $KRONADIR"; 62 | 63 | run $KRAKENDIR/kraken --fasta-input --db=$KRAKEN_DEFAULT_DB --preload --threads $NSLOTS --output $KRAKENOUT $ASM 64 | 65 | translate-kraken-contigs.pl $KRAKENOUT | sort -k1,1nr > $KRAKENTAXONOMY 66 | 67 | run $KRONADIR/ktImportText -o $HTML $KRAKENTAXONOMY 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /qsub/launch_mergeFastaReads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 1 5 | #$ -cwd -V 6 | #$ -o mergeFastaReads.log 7 | #$ -j y 8 | #$ -N mergeFastaReads 9 | #$ -q all.q 10 | 11 | reads=$1 12 | out=$2 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.fasta merged.fasta" 16 | exit 1; 17 | fi; 18 | 19 | merge_fasta_reads "$reads" > "$out.tmp" && mv "$out.tmp" "$out" 20 | if [ $? -gt 0 ]; then echo "ERROR with merge_fasta_reads"; exit 1; fi; 21 | -------------------------------------------------------------------------------- /qsub/launch_parsnp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 4-16 5 | #$ -cwd -V 6 | #$ -o parsnp.log 7 | #$ -j y 8 | #$ -N parsnp 9 | #$ -q all.q 10 | 11 | module load harvest/1.0.1 12 | if [ $? -gt 0 ]; then echo "WARNING: couldn't load harvest module"; fi; 13 | 14 | refGbk=$1 15 | asmDir=$2 16 | out=$3 17 | script=$(basename $0) 18 | 19 | if [ "$out" == "" ]; then 20 | echo "Usage: $script reference.gbk asmDir outDir" 21 | exit 1; 22 | fi; 23 | 24 | NSLOTS=${NSLOTS:=12} 25 | 26 | c="parsnp -a 13 -c -R 1 -g $refGbk -d $asmDir -p $NSLOTS -o $out" 27 | $c # run the command 28 | if [ $? 
-gt 0 ]; then echo -e "ERROR with parsnp\n $c"; exit 1; fi; 29 | -------------------------------------------------------------------------------- /qsub/launch_polish.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | #$ -pe smp 1-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o polish.log -j y 6 | #$ -N polishAssembly 7 | 8 | use 5.12.0; 9 | use strict; 10 | use warnings; 11 | use Data::Dumper; 12 | use Getopt::Long qw/GetOptions/; 13 | use File::Basename qw/basename/; 14 | use Env::Modulecmd; 15 | use File::Basename qw/basename/; 16 | use File::Temp qw/tempdir/; 17 | use File::Copy qw/cp/; 18 | 19 | Env::Modulecmd::purge(); 20 | Env::Modulecmd::load(qw(nanopolish/0.8.3 minimap2/2.7 Python/2.7.13 21 | samtools/1.4.1 minimap2/2.7 bowtie2/2.3.3.1 pilon/1.22)); 22 | 23 | my $numcpus=$ENV{NSLOTS} || 24; 24 | 25 | exit(main()); 26 | 27 | sub main{ 28 | 29 | my $settings={}; 30 | GetOptions($settings,qw(help nanopore=s illumina1=s illumina2=s outfile|output|outfasta=s)) or die $!; 31 | $$settings{nanopore}//=""; 32 | $$settings{illumina1}//=""; 33 | $$settings{illumina2}//=""; 34 | $$settings{outfile}//="polished.fasta"; 35 | $$settings{tempdir}//=tempdir("asm_polish_XXXXXX", TMPDIR=>1, CLEANUP=>1); 36 | 37 | my $ref=shift @ARGV; 38 | die usage() if(!$ref || $$settings{help}); 39 | 40 | if($$settings{nanopore}){ 41 | $ref=nanopolish($ref,$$settings{nanopore},$settings); 42 | } 43 | if($$settings{illumina1}){ 44 | $ref=pilon ($ref,$$settings{illumina1},$$settings{illumina2},$settings); 45 | } 46 | 47 | # Easy to print results with cat! 
48 | #system("cat $ref"); 49 | cp($ref, $$settings{outfile}); 50 | die if $?; 51 | 52 | return 0; 53 | } 54 | 55 | sub nanopolish{ 56 | ...; 57 | } 58 | 59 | sub pilon{ 60 | my($ref, $R1, $R2, $settings)=@_; 61 | 62 | my $symlinkRef="$$settings{tempdir}/unpolished.fasta"; 63 | cp($ref, $symlinkRef); 64 | $ref=$symlinkRef; 65 | 66 | # People say to run Pilon four times in a row, single threaded 67 | my $maxRuns=4; 68 | for(my $i=1;$i<=4;$i++){ 69 | my $bam="$$settings{tempdir}/pilon$i.sorted.bam"; 70 | 71 | system("bowtie2-build $ref $ref"); die if $?; 72 | system("bowtie2 -x $ref -1 $R1 -2 $R2 -p $numcpus | samtools sort -T $$settings{tempdir}/samtoolssort -o $bam"); 73 | die if $?; 74 | system("samtools index $bam"); die if $?; 75 | 76 | system("pilon --genome '$ref' --frags $bam --output $$settings{tempdir}/pilon$i --changes --threads $numcpus --fix snps,indels"); 77 | die if $?; 78 | 79 | # update for the next iteration 80 | $ref="$$settings{tempdir}/pilon$i.fasta"; 81 | } 82 | 83 | return $ref; 84 | } 85 | 86 | sub usage{ 87 | "Polish an assembly with nanopore or illumina reads.\n". 88 | basename($0)." [--nanopore=nanopore.fastq] --illumina1=R1.fastq --illumina2=R2.fastq --outfile=out.fasta contigs.fasta 89 | " 90 | } 91 | 92 | -------------------------------------------------------------------------------- /qsub/launch_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o prediction.log 7 | #$ -j y 8 | #$ -N cgpPredict 9 | #$ -q all.q 10 | 11 | project=$1 12 | 13 | if [ "$project" == "" ]; then 14 | echo "Usage: " $(basename $0) " CGP-project/" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_pipeline predict -p "$project" --numcpus $NSLOTS 21 | if [ $? 
-gt 0 ]; then echo "ERROR with run_pipeline predict"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_prokka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 8-16 3 | #$ -S /bin/bash 4 | #$ -cwd -V 5 | #$ -o prokka.log -j y 6 | #$ -N prokka 7 | 8 | contigs=$1 9 | genome=$2 10 | 11 | source /etc/profile.d/modules.sh 12 | 13 | genus=${3-genus} 14 | species=${4-species} 15 | NSLOTS=${NSLOTS-1} 16 | 17 | if [ "$genome" == "" ]; then 18 | script=$(basename $0); 19 | echo "Usage: $script contigs.fasta genomename [genus species]" 20 | exit 1; 21 | fi 22 | module load prokka/1.13.3 23 | module load rnammer/1.2 24 | 25 | command="prokka --prefix $genome --compliant --locustag $genome --genus $genus --species $species --strain $genome --force --cpus $NSLOTS $contigs" 26 | eval $command 27 | -------------------------------------------------------------------------------- /qsub/launch_shovill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs a shovill assembly. Run with no options for usage. 
3 | # Author: Lee Katz 4 | 5 | #$ -S /bin/bash 6 | #$ -pe smp 4-16 7 | #$ -cwd -V 8 | #$ -o shovill.log 9 | #$ -j y 10 | #$ -N Shovill 11 | #$ -q all.q 12 | 13 | R1=$1 14 | R2=$2 15 | outdir=$3 16 | NSLOTS=${NSLOTS:=12} 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$outdir" == "" ]; then 22 | echo "Usage: $scriptname reads.1.fastq.gz reads.2.fastq.gz outdir" 23 | exit 0; 24 | fi; 25 | 26 | set -e 27 | set -u 28 | 29 | module load shovill 30 | 31 | shovill --check 32 | 33 | shovill --outdir $outdir --R1 $R1 --R2 $R2 --assembler skesa 34 | 35 | -------------------------------------------------------------------------------- /qsub/launch_skesa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 1-16 4 | #$ -cwd -V 5 | #$ -o skesa.log -j y 6 | #$ -N Skesa_2.0_2 7 | #$ -q all.q 8 | 9 | reads=$1 10 | fastaOut=$2 11 | 12 | if [ "$fastaOut" == "" ]; then 13 | echo "Usage: $0 shuffled.fastq.gz out.fasta" 14 | exit 1; 15 | fi; 16 | 17 | module load Skesa/2.0_2 18 | if [ $? -gt 0 ]; then echo "unable to load Skesa v2"; exit 1; fi; 19 | 20 | if [ -e "$fastaOut" ]; then 21 | echo "$fastaOut already exists. 
Exiting."; 22 | exit 1; 23 | fi 24 | 25 | NSLOTS=${NSLOTS:=1} 26 | skesa --cores $NSLOTS --fastq $reads --gz --use_paired_ends > $fastaOut 27 | 28 | -------------------------------------------------------------------------------- /qsub/launch_spades.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log 6 | #$ -j y 7 | #$ -N SPAdes_3.15.3 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | fastaOut=$3 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.fastq.gz output/ [out.fasta]" 16 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 17 | exit 1; 18 | fi; 19 | 20 | module load SPAdes/3.15.3 21 | if [ $? -gt 0 ]; then echo "unable to load SPAdes/3.15.3"; exit 1; fi; 22 | 23 | NSLOTS=${NSLOTS:=1} 24 | spades.py --12 $reads --careful -o $out -t $NSLOTS 25 | if [ $? -gt 0 ]; then echo "problem with SPAdes/3.15.3"; exit 1; fi; 26 | 27 | # Assembly metrics. Don't die if this script dies. It's not worth it. 28 | echo "# CG-Pipeline metrics" > $out/run_assembly_metrics.txt 29 | run_assembly_metrics.pl $out/scaffolds.fasta >> $out/run_assembly_metrics.txt 30 | 31 | if [ "$fastaOut" != "" ]; then 32 | cp -v "$out/scaffolds.fasta" $fastaOut 33 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 34 | rm -rf "$out"; 35 | if [ $? 
-gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 36 | fi 37 | 38 | -------------------------------------------------------------------------------- /qsub/launch_spades_SE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log -j y 6 | #$ -N SPAdes3.1.0 7 | 8 | reads=$1 9 | out=$2 10 | fasta=$3 11 | 12 | if [ "$out" == "" ]; then 13 | echo "Usage: $0 reads.fastq.gz output/ out.fasta" 14 | exit 1; 15 | fi; 16 | 17 | module load SPAdes/3.1.0 18 | if [ $? -gt 0 ]; then echo "unable to load spades 3.1.0"; exit 1; fi; 19 | 20 | NSLOTS=${NSLOTS:=1} 21 | 22 | spades.py -s $reads -o $out -t $NSLOTS 23 | if [ $? -gt 0 ]; then echo "problem with spades 3.1.0"; exit 1; fi; 24 | 25 | if [ "$fastaOut" != "" ]; then 26 | cp -v "$out/scaffolds.fasta" $fastaOut 27 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 28 | rm -rf "$out"; 29 | if [ $? -gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 30 | fi 31 | 32 | -------------------------------------------------------------------------------- /qsub/launch_spades_iontorrent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #$ -S /bin/bash 3 | #$ -pe smp 4-16 4 | #$ -cwd -V 5 | #$ -o spades.log 6 | #$ -j y 7 | #$ -N SPAdesIonTorrent3.6.2 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | fastaOut=$3 13 | 14 | if [ "$out" == "" ]; then 15 | echo "Usage: $0 reads.sff output/ [out.fasta]" 16 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 17 | exit 1; 18 | fi; 19 | 20 | module load SPAdes/3.6.2 21 | if [ $? -gt 0 ]; then echo "unable to load spades 3.6.2"; exit 1; fi; 22 | 23 | NSLOTS=${NSLOTS:=1} 24 | spades.py -s $reads --iontorrent -o $out -t $NSLOTS 25 | if [ $? 
-gt 0 ]; then echo "problem with spades 3.6.2"; exit 1; fi; 26 | 27 | if [ "$fastaOut" != "" ]; then 28 | cp -v "$out/scaffolds.fasta" $fastaOut 29 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 30 | rm -rf "$out"; 31 | if [ $? -gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 32 | fi 33 | -------------------------------------------------------------------------------- /qsub/launch_spades_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # Runs a spades assembly. Run with no options for usage. 3 | # Author: Lee Katz 4 | #Example: (for f in *R1_001.fastq.gz; do b=`basename $f _R1.fastq.gz`; r=`sed 's/R1/R2/' <<< $f`; qsub -N spades$b -o ./assemblies/log/b.spades.log ~/bin/launch_SPAdes_v3.11.0.sh $f $r ./assemblies/$b.spades3.11 ./assemblies/$b; done;) 5 | 6 | #$ -S /bin/bash 7 | #$ -pe smp 4-16 8 | #$ -cwd -V 9 | #$ -o spades.log 10 | #$ -j y 11 | #$ -N SPAdes3.15.3 12 | 13 | forward=$1 14 | reverse=$2 15 | out=$3 16 | fastaOut=$4 17 | 18 | source /etc/profile.d/modules.sh 19 | scriptname=$(basename $0); 20 | 21 | if [ "$out" == "" ]; then 22 | echo "Usage: $scriptname reads.1.fastq.gz reads.2.fastq.gz output/ [out.fasta]" 23 | echo " if out.fasta is given, then the output directory will be removed and the scaffolds.fasta file will be saved" 24 | exit 1; 25 | fi; 26 | 27 | module load SPAdes/3.15.3 28 | if [ $? -gt 0 ]; then echo "unable to load spades 3.15.3"; exit 1; fi; 29 | 30 | NSLOTS=${NSLOTS:=1} 31 | 32 | COMMAND="spades.py -1 $forward -2 $reverse --careful -o $out -t $NSLOTS" 33 | echo "$scriptname: $COMMAND" 34 | $COMMAND 35 | if [ $? -gt 0 ]; then echo "problem with spades 3.11.0"; exit 1; fi; 36 | 37 | if [ "$fastaOut" != "" ]; then 38 | cp -v "$out/scaffolds.fasta" $fastaOut 39 | if [ $? -gt 0 ]; then echo "problem with copying $out/scaffolds.fasta => $fastaOut"; exit 1; fi; 40 | rm -rf "$out"; 41 | if [ $? 
-gt 0 ]; then echo "problem with removing the directory $out"; exit 1; fi; 42 | fi 43 | -------------------------------------------------------------------------------- /qsub/launch_trimClean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #$ -S /bin/bash 4 | #$ -pe smp 10-16 5 | #$ -cwd -V 6 | #$ -o trimClean.log -j y 7 | #$ -N cgpTrimClean 8 | #$ -q all.q 9 | 10 | reads=$1 11 | out=$2 12 | 13 | if [ "$out" == "" ]; then 14 | echo "Usage: $0 reads.fastq.gz cleaned.fastq.gz" 15 | exit 1; 16 | fi; 17 | 18 | NSLOTS=${NSLOTS:=1} 19 | 20 | run_assembly_trimClean.pl -i $reads -o $out --auto --nosingletons --numcpus $NSLOTS 21 | if [ $? -gt 0 ]; then echo "ERROR with run_assembly_trimClean.pl"; exit 1; fi; 22 | -------------------------------------------------------------------------------- /qsub/launch_velvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -pe smp 16 3 | #$ -cwd -V 4 | #$ -o velvet.log -j y 5 | #$ -N velvet 6 | #$ -q all.q 7 | 8 | module () 9 | { 10 | eval `/usr/bin/modulecmd bash $*` 11 | } 12 | 13 | module load velvet/1.2.10; 14 | if [ $? -gt 0 ]; then echo "unable to load velvet/1.2.10"; exit 1; fi; 15 | 16 | reads=$1 17 | out=$2 18 | 19 | # number of cpus is either set by SGE, or is just 1 20 | #NSLOTS=${NSLOTS:=1} 21 | NSLOTS=${NSLOTS:=1} 22 | echo $NSLOTS 23 | 24 | if [ "$out" == "" ]; then 25 | echo "Usage: $0 reads.fastq.gz output/" 26 | exit 1; 27 | fi; 28 | 29 | command="$(which perl) $(which VelvetOptimiser.pl) -s 55 -e 99 -d $out -p $out -t $NSLOTS -f '-fastq.gz -shortPaired $reads'" 30 | eval $command 31 | if [ $? 
-gt 0 ]; then echo "problem with VelvetOptimiser"; echo $command; exit 1; fi; 32 | -------------------------------------------------------------------------------- /qsub/modules.csh: -------------------------------------------------------------------------------- 1 | if ($?tcsh) then 2 | set modules_shell="tcsh" 3 | else 4 | set modules_shell="csh" 5 | endif 6 | set exec_prefix='/usr/bin' 7 | 8 | set prefix="" 9 | set postfix="" 10 | 11 | if ( $?histchars ) then 12 | set histchar = `echo $histchars | cut -c1` 13 | set _histchars = $histchars 14 | 15 | set prefix = 'unset histchars;' 16 | set postfix = 'set histchars = $_histchars;' 17 | else 18 | set histchar = \! 19 | endif 20 | 21 | if ($?prompt) then 22 | set prefix = "$prefix"'set _prompt="$prompt";set prompt="";' 23 | set postfix = "$postfix"'set prompt="$_prompt";unset _prompt;' 24 | endif 25 | 26 | if ($?noglob) then 27 | set prefix = "$prefix""set noglob;" 28 | set postfix = "$postfix""unset noglob;" 29 | endif 30 | set postfix = "set _exit="'$status'"; $postfix; /usr/bin/test 0 = "'$_exit;' 31 | 32 | alias module $prefix'eval `'$exec_prefix'/modulecmd '$modules_shell' '$histchar'*`; '$postfix 33 | unset exec_prefix 34 | unset prefix 35 | unset postfix 36 | 37 | setenv MODULESHOME /usr/share/Modules 38 | 39 | if (! $?MODULEPATH ) then 40 | setenv MODULEPATH `sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 41 | endif 42 | 43 | if (! 
$?LOADEDMODULES ) then 44 | setenv LOADEDMODULES "" 45 | endif 46 | -------------------------------------------------------------------------------- /qsub/modules.sh: -------------------------------------------------------------------------------- 1 | 2 | module() { eval `/usr/bin/modulecmd bash $*`; } 3 | #export -f module 4 | 5 | MODULESHOME=/usr/share/Modules 6 | export MODULESHOME 7 | 8 | if [ "${LOADEDMODULES:-}" = "" ]; then 9 | LOADEDMODULES= 10 | export LOADEDMODULES 11 | fi 12 | 13 | ## july 2013 wgx9 chris dagdigian 14 | ## master node had incorrect modulepath for centos nodes and the correct 15 | ## path was not being set because the env was already present and defined 16 | ## commenting out the if-then test so we force the recreation of the 17 | ## proper centos-specific module file paths ... 18 | 19 | #if [ "${MODULEPATH:-}" = "" ]; then 20 | # MODULEPATH=`sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 21 | # export MODULEPATH 22 | #fi 23 | 24 | MODULEPATH=`sed -n 's/[ #].*$//; /./H; $ { x; s/^\n//; s/\n/:/g; p; }' ${MODULESHOME}/init/.modulespath` 25 | export MODULEPATH 26 | 27 | 28 | if [ ${BASH_VERSINFO:-0} -ge 3 ] && [ -r ${MODULESHOME}/init/bash_completion ]; then 29 | . 
${MODULESHOME}/init/bash_completion 30 | fi 31 | -------------------------------------------------------------------------------- /qsub/sub_unicycler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -e err.err 3 | #$ -o out.out 4 | #$ -N racon 5 | #$ -pe smp 8,16 6 | #$ -q short.q,all.q 7 | #$ -cwd 8 | 9 | # Author: Dhwani Batra 10 | 11 | source /etc/profile.d/modules.sh 12 | module purge 13 | unset PYTHONPATH 14 | module load Unicycler/0.4.4 15 | 16 | 17 | 18 | usage(){ 19 | 20 | echo -e "\nUSAGE: $(basename $0)\n"\ 21 | " -1 Full path to Illumina Read 1 \n"\ 22 | " -2 Full path to Illumina Read 2 \n"\ 23 | " -r Full path to Pacbio Reference \n" 24 | exit 1 25 | 26 | } 27 | 28 | SHORT1="" 29 | SHORT2="" 30 | REFERENCE="" 31 | 32 | while getopts 1:2:r: opt; do 33 | case $opt in 34 | 1) 35 | SHORT1=$OPTARG 36 | ;; 37 | 2) 38 | SHORT2=$OPTARG 39 | ;; 40 | r) 41 | REFERENCE=$OPTARG 42 | ;; 43 | 44 | esac 45 | done 46 | 47 | if [ -z "$SHORT1" ]; then 48 | echo -e "\nERROR: -1 is a required parameter." 49 | usage 50 | exit 1 51 | fi 52 | 53 | if [ -z "$SHORT2" ]; then 54 | echo -e "\nERROR: -2 is a required parameter." 55 | usage 56 | exit 1 57 | fi 58 | 59 | if [ -z "$REFERENCE" ]; then 60 | echo $REFERENCE 61 | echo -e "\nERROR: -r is a required parameter." 62 | usage 63 | exit 1 64 | fi 65 | 66 | if [ -z "$NSLOTS" ]; then 67 | NSLOTS=8 68 | fi 69 | 70 | LNAME=$(basename $REFERENCE) 71 | LABEL=${LNAME%%.fsa} 72 | mkdir $LABEL 73 | cd $LABEL 74 | 75 | 76 | unicycler_polish -1 ${SHORT1} -2 ${SHORT2} -a ${REFERENCE} --pilon /apps/x86_64/pilon/1.22/lib/pilon/pilon-1.22.jar --ale /apps/x86_64/ALE/ALE-20130717/src/ALE --threads $NSLOTS 77 | cd ..
78 | -------------------------------------------------------------------------------- /scripts/Kuhner-Felsenstein.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ref_tree=$1 4 | export query_tree=$2 5 | 6 | script=$(basename $0); 7 | if [ "$ref_tree" == "" ]; then 8 | echo "Usage: $script ref.dnd query.dnd" 9 | exit 1; 10 | fi; 11 | 12 | logmsg () { 13 | echo "$script: $@" >&2 14 | } 15 | 16 | # Check executables 17 | for exe in treedist randTrees.pl; do 18 | which $exe >/dev/null 2>&1; 19 | if [ $? -gt 0 ]; then 20 | logmsg "ERROR: I could not find $exe in your path!"; 21 | exit 1; 22 | fi; 23 | done; 24 | 25 | mkdir -p /tmp/$USER 26 | tmpdir=$(mktemp --directory --tmpdir=/tmp/$USER Kuhner-Felsenstein.XXXXXX) 27 | if [ $? -gt 0 ]; then logmsg "ERROR making temporary directory under /tmp/$USER"; exit 1; fi; 28 | logmsg "Temporary dir is $tmpdir"; 29 | 30 | cp $ref_tree $tmpdir/ &&\ 31 | cp $query_tree $tmpdir/ 32 | 33 | tmpRefTree="$tmpdir/$(basename $ref_tree)" 34 | tmpQueryTree="$tmpdir/$(basename $query_tree)" 35 | 36 | # Create a list of trees to compare against in $comparisonTrees: 37 | # The first tree is the reference tree 38 | # The next trees are randomly made trees from 39 | # the query tree. 40 | # Therefore, the question being answered is, is the query 41 | # closer to the ref than random trees? 42 | comparisonTrees="$tmpdir/compareAgainst.dnd"; 43 | cp $tmpRefTree $comparisonTrees 44 | randTrees.pl $tmpQueryTree --numTrees 1000 >> $comparisonTrees 45 | # We need the files to be named intree and intree2 because of inflexible treedist 46 | ln -s $tmpQueryTree "$tmpdir/intree" 47 | ln -s $comparisonTrees "$tmpdir/intree2" 48 | 49 | # Must do treedist in the temp directory because it pollutes the 50 | # local directory 51 | pushd $tmpdir > /dev/null 2>&1 52 | 53 | # Run treedist with the Kuhner and Felsenstein distance metric. 
54 | # If the option "D" were given, then it would be run with Robinson-Foulds 55 | #echo -ne 'D\n2\nL\nS\nY\n' | treedist > "$tmpdir/treedist.log" 2> "$tmpdir/treedist.out" 56 | echo -ne '2\nL\nS\nY\n' | treedist > "$tmpdir/treedist.log" 2> "$tmpdir/treedist.out" 57 | if [ $? -gt 0 ]; then logmsg "ERROR with treedist program: $(cat $tmpdir/treedist.log)"; exit 1; fi; 58 | 59 | # Find average and stdev 60 | cat outfile | perl -MStatistics::Descriptive -MMath::Gauss=cdf,pdf -MList::Util=sum -lane ' 61 | BEGIN{ 62 | my @F=split(/\s+/,<>); 63 | $obs=$F[2]; 64 | } 65 | next if($F[0] != 1); 66 | push(@difference,$F[2]); 67 | END{ 68 | $stat=Statistics::Descriptive::Full->new(); 69 | $stat->add_data(@difference); 70 | $num=@difference; 71 | $avg=$stat->mean; 72 | $stdev=$stat->standard_deviation; 73 | $var=$stdev**2; 74 | 75 | $Z=($obs - $avg)/$stdev; 76 | $p=cdf($obs,$avg,$stdev); 77 | 78 | # scientific and floating point formatting 79 | $_=sprintf("%0.2e",$_) for($p); 80 | $_=sprintf("%0.2f",$_) for($obs,$avg,$stdev,$Z); 81 | 82 | # Print results 83 | print join("\t",qw(ref_tree query_tree obs num avg stdev Z p)); 84 | print join("\t",$ENV{ref_tree},$ENV{query_tree},$obs, $num, $avg,$stdev,$Z,$p); 85 | } 86 | ' 87 | 88 | rm -rf $tmpdir 89 | 90 | -------------------------------------------------------------------------------- /scripts/MCM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # runs mauve contig mover 4 | # Author: Lee Katz 5 | 6 | set -e 7 | 8 | ref=$1 9 | draft=$2 10 | output=$3 11 | MAUVE=$(which Mauve) 12 | jar=`dirname $MAUVE`/Mauve.jar 13 | echo "JAR: $jar" 14 | 15 | if [ "$output" = "" ]; then 16 | echo "Usage: $0 ref.fasta draft.fasta outputDir"; 17 | exit 1; 18 | fi 19 | 20 | java -Xmx500m -cp $jar org.gel.mauve.contigs.ContigOrderer -output $output -ref $ref -draft $draft 21 | 22 | cd $output 23 | lastAlignment=$(\ls -t alignment*/alignment* | grep -P '\d$' | head -n 1) 24 | ln -s 
$lastAlignment ./alignment.xmfa 25 | cd .. 26 | echo "Best alignment can be found at $output/alignment.xmfa" 27 | -------------------------------------------------------------------------------- /scripts/alignmentToPhyloviz.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Bio::Perl; 7 | use Bio::AlignIO; 8 | use Getopt::Long; 9 | use File::Basename; 10 | use File::Slurp qw/read_file/; 11 | 12 | exit(main()); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(prefix=s defline-format=s help)); 17 | die usage() if($$settings{help}); 18 | 19 | my $prefix=$$settings{prefix}||"$0.out"; 20 | $$settings{'defline-format'}||="incremental"; 21 | my $alnFile=shift(@ARGV) || die "ERROR: need an alignment file\n".usage(); 22 | die "ERROR: you gave more than one alignment file: ".join(", ",@ARGV)."\n".usage() if(@ARGV>1); 23 | 24 | my($strainSeq)=seqToIdHash($alnFile,$settings); 25 | printResults($strainSeq,$prefix,$settings); 26 | 27 | return 0; 28 | } 29 | 30 | # Make a hash of sequence => [id1,id2,...] 
31 | sub seqToIdHash{ 32 | my($alnFile,$settings)=@_; 33 | my $aln=Bio::AlignIO->new(-file=>$alnFile)->next_aln; 34 | 35 | my %strainSeq; 36 | for my $seq($aln->each_seq){ 37 | push(@{$strainSeq{$seq->seq}},$seq->id); 38 | } 39 | return \%strainSeq; 40 | } 41 | 42 | sub printResults{ 43 | my($strainSeq,$prefix,$settings)=@_; 44 | open(ALN,">","$prefix.aln.fas") or die "ERROR: Could not write to alignment file $prefix.aln.fas:$!"; 45 | open(STS,">","$prefix.STs.txt") or die "ERROR: Could not write to alignment file $prefix.STs.txt:$!"; 46 | print STS join("\t",qw(ST IDs))."\n"; 47 | my $STcounter=0; 48 | while(my($sequence,$idArr)=each(%$strainSeq)){ 49 | $STcounter++; 50 | 51 | my $idStr=join("__",@$idArr); 52 | 53 | my $defline; 54 | if($$settings{'defline-format'} eq 'incremental'){ 55 | $defline=$STcounter; 56 | }elsif($$settings{'defline-format'} eq 'join'){ 57 | $defline=$idStr." ST:$STcounter"; 58 | } else{ 59 | die "ERROR: Could not understand defline-format parameter\n".usage(); 60 | } 61 | 62 | print ALN ">$defline\n$sequence\n"; 63 | print STS join("\t",$defline,$idStr)."\n"; 64 | } 65 | close ALN; close STS; 66 | print "Wrote $prefix.aln.fas and $prefix.STs.txt\n"; 67 | } 68 | sub usage{ 69 | local $0=fileparse($0); 70 | "Make an alignment suitable for PhyloViz. 71 | Converts an alignment into sequence types and the boiled down alignment, removing redundant ST entries. 
72 | Usage: $0 file.aln -p prefix 73 | -p prefix for output files: \$p.STs.aln and \$p.STs 74 | --defline-format incremental Options: incremental (default), or join 75 | " 76 | } 77 | -------------------------------------------------------------------------------- /scripts/anagramChecker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long qw/GetOptions/; 6 | use File::Basename qw/basename/; 7 | use Data::Dumper; 8 | 9 | local $0 = basename $0; 10 | 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | if($$settings{help} || @ARGV < 2){ 17 | die "Usage: $0 referenceWord queryWord [queryWord2...] 18 | 19 | This script checks for anagrams as compared to the reference word. 20 | However, it will report false positives if there differences in 21 | letter counts."; 22 | } 23 | 24 | my $bitwiseLetters = bitwiseLetters($settings); 25 | 26 | # Simplify to uppercase for all 27 | @ARGV = map{uc($_)} @ARGV; 28 | 29 | my $refWord=shift(@ARGV); 30 | my $refBitwise = wordToBitwise($refWord,$bitwiseLetters,$settings); 31 | for my $queryWord(@ARGV){ 32 | my $queryBitwise = wordToBitwise($queryWord,$bitwiseLetters,$settings); 33 | if($refBitwise == $queryBitwise){ 34 | print "$queryWord is an anagram of $refWord\n"; 35 | } else { 36 | print "$queryWord is not an anagram of $refWord\n"; 37 | } 38 | } 39 | 40 | return 0; 41 | } 42 | 43 | sub bitwiseLetters{ 44 | my($settings)=@_; 45 | 46 | # Make the resolution as high as 10 letters per word. 47 | my $numLetters=26 * 10; 48 | 49 | my %bitwiseLetter; 50 | my $ordOffset=ord("A"); 51 | for(my $i=0;$i<$numLetters;$i++){ 52 | # Mod to find the letter of the alphabet 53 | my $mod = $i % 26; 54 | # String multiplier, e.g., A x 3 = AAA 55 | my $multiplier = int($i / 26)+1; 56 | # The chr that corresponds to the letter extended by $multiplier. 
57 | my $key=chr($mod + $ordOffset) x $multiplier; 58 | 59 | # Power of 2 to take advantage of binary 60 | $bitwiseLetter{$key} = 2 ** $i; 61 | } 62 | 63 | return \%bitwiseLetter; 64 | } 65 | 66 | sub wordToBitwise{ 67 | my($word,$bitwiseLetters,$settings)=@_; 68 | 69 | my $bitwise=0; 70 | my $sortedLetters = join("",sort{$a cmp $b} split(//,$word)); 71 | 72 | while($sortedLetters=~/((.)\2*)/g){ 73 | $bitwise = $bitwise | $$bitwiseLetters{$1}; 74 | } 75 | 76 | return $bitwise; 77 | } 78 | -------------------------------------------------------------------------------- /scripts/art_profile.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Data::Dumper; 5 | use File::Basename qw/basename/; 6 | use Getopt::Long; 7 | use threads; 8 | use Thread::Queue; 9 | 10 | local $0=basename $0; 11 | sub logmsg {print STDERR "$0: @_\n";} 12 | exit(main()); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help numcpus=i)) or die $!; 17 | $$settings{numcpus}||=1; 18 | 19 | die usage() if(!@ARGV || $$settings{help}); 20 | 21 | # Find the counts for each fastq file 22 | my %counts; 23 | 24 | my $Q=Thread::Queue->new(@ARGV); 25 | my @thr; 26 | for(0..$$settings{numcpus}-1){ 27 | $Q->enqueue(undef); 28 | $thr[$_]=threads->new(\&getQualityCounts,$Q,$settings); 29 | } 30 | 31 | logmsg "All fastq files enqueued!"; 32 | for my $t(@thr){ 33 | logmsg "Waiting to join thread ".$t->tid; 34 | my $c = $t->join(); 35 | while(my($nt, $positionalQualityCounts)=each(%$c)){ 36 | while(my($pos, $qualityCounts)=each(%$positionalQualityCounts)){ 37 | while(my($quality, $count)=each(%$qualityCounts)){ 38 | $counts{$nt}{$pos}{$quality}+=$count; 39 | } 40 | } 41 | } 42 | } 43 | #print Dumper \%counts;die; 44 | 45 | # Print an ART format 46 | my @nt=sort(keys(%counts)); 47 | for my $nt (@nt){ 48 | my $positionalQualityCounts=$counts{$nt}; 49 | my @posArr=sort {$a<=>$b} 
keys(%$positionalQualityCounts); 50 | 51 | for my $pos (@posArr){ 52 | my @qualArr=sort {$a cmp $b} keys(%{$$positionalQualityCounts{$pos}}); 53 | 54 | # Print the first row for this position, the quality value 55 | print "$nt\t$pos"; 56 | for my $quality (@qualArr){ 57 | $quality //= chr(33); 58 | print "\t"; 59 | print ord($quality)-33; 60 | } 61 | print "\n"; 62 | 63 | # Print the second row for this position, the counts at the quality value 64 | print "$nt\t$pos"; 65 | for my $quality (@qualArr){ 66 | print "\t".$$positionalQualityCounts{$pos}{$quality}; 67 | } 68 | print "\n"; 69 | } 70 | 71 | } 72 | 73 | return 0; 74 | } 75 | 76 | sub getQualityCounts{ 77 | my($Q,$settings)=@_; 78 | 79 | my %counts; 80 | while(defined(my $fastq=$Q->dequeue)){ 81 | 82 | logmsg "Processing $fastq"; 83 | 84 | my $lineCounter=0; 85 | open(my $fh, "zcat $fastq | ") or die "ERROR: could not read $fastq: $!"; 86 | while(<$fh>){ 87 | my $seq=<$fh>; 88 | chomp($seq); 89 | my @seq = split(//, $seq); 90 | <$fh>; # burn the plus line 91 | my @qual = split(//, scalar(<$fh>)); 92 | # I _could_ chomp @qual but the last item won't be 93 | # reached if I do the for loop right. 
94 | 95 | my $readLength=@seq; 96 | for(my $i=0;$i<$readLength;$i++){ 97 | $counts{$seq[$i]}{$i}{$qual[$i]}++; 98 | } 99 | } 100 | close $fh; 101 | } 102 | 103 | return \%counts; 104 | } 105 | 106 | sub usage{ 107 | "Usage: $0 *_R1.fastq.gz > profile_R1.txt 108 | $0 *_R2.fastq.gz > profile_R2.txt 109 | --numcpus 1 Number of cpus 110 | " 111 | } 112 | -------------------------------------------------------------------------------- /scripts/bamStats.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long; 6 | use Data::Dumper; 7 | use File::Basename qw/basename/; 8 | 9 | local $0=basename $0; 10 | sub logmsg{print STDERR "$0: @_\n";} 11 | 12 | exit main(); 13 | 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help percent|percentage label=s)) or die $!; 17 | die usage() if($$settings{help}); 18 | die usage() if(-t STDIN); 19 | 20 | my $QC=bamMetrics($settings); 21 | 22 | my @header=sort {$a cmp $b} keys(%$QC); 23 | 24 | if($$settings{label}){ 25 | unshift(@header, 'label'); 26 | $$QC{label}=$$settings{label}; 27 | } 28 | 29 | print join("\t",@header)."\n"; 30 | for my $header(@header){ 31 | print $$QC{$header}."\t"; 32 | } 33 | print "\n"; 34 | 35 | return 0; 36 | } 37 | 38 | sub bamMetrics{ 39 | my($settings)=@_; 40 | 41 | my $numReads=0; 42 | my %QC; 43 | for(qw(simple-unmapped simple-improperPair combination-singletonMap combination-bothUnmapped combination-bothProperPair combination-wrongOrientationGoodInsertSize combination-mappedButWrongInsertSize)){ 44 | $QC{$_}=0; 45 | } 46 | 47 | while(<>){ 48 | $numReads++; 49 | chomp; 50 | my($seqid, $flag, $rname, $pos, $mapQ, $cigar, $rnext, $pnext, $tlen, $seq, $qual) = split(/\t/, $_); 51 | 52 | # Individual tags 53 | if($flag & 4){ 54 | $QC{'simple-unmapped'}++; 55 | } 56 | if(! 
($flag & 2)){ 57 | $QC{'simple-improperPair'}++; 58 | } 59 | 60 | # Combination tags 61 | if($flag =~ /^(?:73|133|89|121|165|181|101|117|153|185|69|137)$/){ 62 | $QC{'combination-singletonMap'}++; 63 | } elsif ($flag =~ /^(?:77|141)$/){ 64 | $QC{'combination-bothUnmapped'}++; 65 | } elsif($flag =~ /^(?:99|147|83|163)$/){ 66 | $QC{'combination-bothProperPair'}++; 67 | } elsif($flag =~ /^(?:67|131|115|179)$/){ 68 | $QC{'combination-wrongOrientationGoodInsertSize'}++; 69 | } elsif($flag =~ /^(?:81|161|97|145|65|129|113|177)$/){ 70 | $QC{'combination-mappedButWrongInsertSize'}++; 71 | } 72 | } 73 | 74 | if($$settings{percent}){ 75 | for my $metric(keys(%QC)){ 76 | $QC{$metric} = sprintf("%0.2f", $QC{$metric}/$numReads*100); 77 | } 78 | } 79 | 80 | return \%QC; 81 | } 82 | 83 | sub usage{ 84 | "$0: get QC information on a sam file 85 | Usage: samtools view file.bam | $0 86 | --percent View results in percentages 87 | " 88 | } 89 | -------------------------------------------------------------------------------- /scripts/blastAndExtract.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | 4 | use strict; 5 | use warnings; 6 | use Data::Dumper; 7 | use Bio::Perl; 8 | use Getopt::Long; 9 | 10 | exit(main()); 11 | 12 | sub main{ 13 | my $settings={}; 14 | die usage() if(!@ARGV); 15 | GetOptions($settings,qw(flanking=i name=s)); 16 | $$settings{flanking}||=0; 17 | $$settings{revcom} ||=0; 18 | my ($db,$query)=@ARGV[0,1]; 19 | die "ERROR: need db and query\n".usage() if(!$query || !$db); 20 | 21 | my($contig,$start,$stop)=blastAgainstDb($db,$query,$settings); 22 | my $seq=extractSeq($contig,$start,$stop,$db,$settings); 23 | print "$seq\n"; 24 | return 0; 25 | } 26 | 27 | sub extractSeq{ 28 | my($contig,$start,$stop,$db,$settings)=@_; 29 | my $fastaStr; 30 | if($start>$stop){ 31 | my $tmp=$start; 32 | $start=$stop; 33 | $stop=$tmp; 34 | $$settings{revcom}=1; 35 | } 36 | my
$in=Bio::SeqIO->new(-file=>$db); 37 | while(my $seq=$in->next_seq){ 38 | next if($seq->id ne $contig); 39 | $start=$start-$$settings{flanking}; 40 | $stop=$stop+$$settings{flanking}; 41 | $start=1 if($start<1); 42 | $stop=$seq->length if($stop>$seq->length); 43 | my $sequence=$seq->subseq($start,$stop); 44 | $sequence=~tr/ATCGatcg/TAGCtagc/ if($$settings{revcom}); 45 | $sequence=reverse($sequence) if($$settings{revcom}); 46 | $sequence=~s/(.{60})/$1\n/g; 47 | my $id=join("_",">$$settings{name}".$seq->id,$start,$stop); 48 | $fastaStr="$id\n$sequence"; 49 | } 50 | die "Could not create a fasta in contig $contig with start/stop=$start/$stop\n" if(!$fastaStr); 51 | return $fastaStr; 52 | } 53 | 54 | sub blastAgainstDb{ 55 | my($db,$query,$settings)=@_; 56 | my $command="legacy_blast.pl blastall -i '$query' -d '$db' -a 12 -e 0.05 -m 8 -p blastn -v 5 -b 5"; 57 | my @result=split(/\n/,`$command`); 58 | die if(!@result); 59 | @result=sort{ 60 | (split(/\t/,$b))[2]<=>(split(/\t/,$a))[2] 61 | } @result; 62 | 63 | my($contig,$start,$stop)=(split(/\t/,$result[0]))[1,8,9]; 64 | return ($contig,$start,$stop); 65 | } 66 | 67 | sub usage{ 68 | "Blasts a nucleotide sequence against a database and extracts the hit 69 | Usage: $0 database.fasta query.fasta > hit.fasta 70 | -f 100 to extract 100bp upstream/downstream 71 | -n custom genome name to put into the defline 72 | " 73 | } 74 | -------------------------------------------------------------------------------- /scripts/bp_jackknifeTrees.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Bio::TreeIO; 7 | use Bio::Tree::Statistics; 8 | use Getopt::Long; 9 | 10 | exit main(); 11 | 12 | sub main{ 13 | my $settings={}; 14 | GetOptions($settings,qw(help)) or die $!; 15 | 16 | if(!@ARGV){ 17 | die usage(); 18 | } 19 | 20 | print STDERR "Reading in files\n"; 21 | my $guideTree; # will be the first tree 22 | my 
@bs_tree=(); 23 | my $i; 24 | for my $file(@ARGV){ 25 | if(!-e $file|| !-s $file){ 26 | print STDERR "Not found or empty: $file "; 27 | next; 28 | } 29 | my $in=Bio::TreeIO->new(-file=>$file,-format=>"newick"); 30 | #while(my $tree=next_tree_fast($in)){ 31 | while(my $tree=$in->next_tree){ 32 | if(!$guideTree){ 33 | $guideTree = $tree; 34 | next; 35 | } 36 | push(@bs_tree, $tree); 37 | print STDERR "."; 38 | } 39 | } 40 | 41 | if(!$guideTree){ 42 | die "ERROR: no guide tree found"; 43 | } 44 | if(!@bs_tree){ 45 | die "ERROR: no jack knife trees found"; 46 | } 47 | 48 | print STDERR "\n"; 49 | print STDERR "Combining jack knife files\n"; 50 | my $biostat = Bio::Tree::Statistics->new(); 51 | #my $bsTree=$biostat->assess_bootstrap(\@bs_tree,$guideTree); 52 | my $bsTree = assess_bootstrap($biostat, \@bs_tree, $guideTree); 53 | print STDERR "Reading internal nodes\n"; 54 | for my $node($bsTree->get_nodes){ 55 | print STDERR "."; 56 | next if($node->is_Leaf); 57 | 58 | if(!$node->id){ 59 | $node->id($node->bootstrap); 60 | } 61 | } 62 | print STDERR "\n"; 63 | print $bsTree->as_text("newick")."\n"; 64 | 65 | return 0; 66 | } 67 | 68 | sub assess_bootstrap{ 69 | my ($self,$bs_trees,$guide_tree) = @_; 70 | my @consensus; 71 | 72 | # internal nodes are defined by their children 73 | 74 | my (%lookup,%internal); 75 | my $i = 0; 76 | for my $tree ( $guide_tree, @$bs_trees ) { 77 | # Do this as a top down approach, can probably be 78 | # improved by caching internal node states, but not going 79 | # to worry about it right now. 80 | 81 | my @allnodes = $tree->get_nodes; 82 | my @internalnodes = grep { ! 
$_->is_Leaf } @allnodes; 83 | for my $node ( @internalnodes ) { 84 | my @tips = sort map { $_->id } 85 | grep { $_->is_Leaf() } $node->get_all_Descendents; 86 | my $id = "(".join(",", @tips).")"; 87 | if( $i == 0 ) { 88 | $internal{$id} = $node->internal_id; 89 | } else { 90 | $lookup{$id}++; 91 | } 92 | } 93 | $i++; 94 | } 95 | $i--; # do not count the guide tree in the denominator 96 | 97 | my @save; 98 | for my $l ( keys %lookup ) { 99 | if( defined $internal{$l} ) {#&& $lookup{$l} > $min_seen ) { 100 | my $intnode = $guide_tree->find_node(-internal_id => $internal{$l}); 101 | $intnode->bootstrap(sprintf("%d",100 * $lookup{$l} / $i)); 102 | } 103 | } 104 | return $guide_tree; 105 | } 106 | 107 | sub usage{ 108 | "Usage: $0 guidetree.dnd jackknife.dnd [jackknife2.dnd...] > tree_with_confidence.nwk 109 | Where each jack knife tree can have multiple entries and the output tree 110 | will be a single entry with confidence values." 111 | } 112 | -------------------------------------------------------------------------------- /scripts/clusterDensityFromFastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use File::Basename qw/basename/; 7 | use Getopt::Long qw/GetOptions/; 8 | use POSIX qw/ceil/; 9 | use threads; 10 | use Thread::Queue; 11 | 12 | local $0 = basename $0; 13 | sub logmsg{print STDERR "$0: @_\n";} 14 | 15 | exit main(); 16 | sub main{ 17 | my $settings={}; 18 | GetOptions($settings,qw(help numcpus=i tile-size|size-of-tile=s)) or die $!; 19 | $$settings{'tile-size'} ||= 1; 20 | $$settings{numcpus}||=1; 21 | 22 | my $fastqPerThread = ceil(scalar(@ARGV) / $$settings{numcpus}); 23 | 24 | my $printQ = Thread::Queue->new(); 25 | $printQ->enqueue("File\tspots-per-mm2"); 26 | 27 | my @thr; 28 | for(my $i=0;$i<$$settings{numcpus};$i++){ 29 | my @fastq = splice(@ARGV,0,$fastqPerThread); 30 | $thr[$i] = threads->new(sub{ 31 | my($fastqArr, 
$printQ)=@_; 32 | for my $fastq(@$fastqArr){ 33 | logmsg $fastq; 34 | my $density = clusterDensity($fastq,$settings); 35 | $printQ->enqueue([$fastq, $density]); 36 | } 37 | return scalar(@fastq); 38 | }, \@fastq, $printQ); 39 | } 40 | 41 | # start the printer 42 | my $printerThread = threads->new(\&printer, $printQ); 43 | 44 | # join the threads 45 | for(@thr){ 46 | $_->join; 47 | } 48 | 49 | # Terminate multithreaded printing 50 | $printQ->enqueue(undef); 51 | $printerThread->join(); 52 | 53 | return 0; 54 | } 55 | 56 | sub printer{ 57 | my($Q)=@_; 58 | while(defined(my $toPrint = $Q->dequeue)){ 59 | if(ref($toPrint) eq 'ARRAY'){ 60 | print join("\t",@$toPrint)."\n"; 61 | } else { 62 | print "$toPrint\n"; 63 | } 64 | } 65 | } 66 | 67 | sub clusterDensity{ 68 | my($fastq,$settings)=@_; 69 | 70 | # Try to get some compile-time speedup 71 | my $colonRegex = qr/:/; 72 | my $whitespaceRegex = qr/\s+/; 73 | 74 | my %tileCount; 75 | open(my $fh, "-|", "zcat $fastq") or die "ERROR: could not zcat $fastq: $!"; 76 | while(my $header=<$fh>){ 77 | # Burn three lines. We're only looking at the header. 78 | <$fh>; 79 | <$fh>; 80 | <$fh>; 81 | 82 | chomp($header); 83 | my($firstPart,undef) = split($whitespaceRegex, $header); 84 | my($instrument, $run, $flowcell, $lane, $tile, $x, $y) = split($colonRegex, $firstPart); 85 | #my(undef, undef, undef, undef, $tile, $x, $y) = split($colonRegex, $firstPart); 86 | $tileCount{$tile}++; 87 | } 88 | close $fh; 89 | 90 | my $total=0; 91 | while(my($tile,$count)=each(%tileCount)){ 92 | $total+=$count; 93 | } 94 | my $averagePerTile = sprintf("%0.2f", $total/scalar(keys(%tileCount))/$$settings{'tile-size'}); 95 | 96 | return $averagePerTile; 97 | } 98 | 99 | sub usage{ 100 | "$0: Calculates cluster density of a fastq file with casava style headers 101 | Usage: $0 [options] *.fastq.gz 102 | 103 | --tile-size 1 Size of the tile in square mm. 
104 | --numcpus 1 105 | " 106 | } 107 | -------------------------------------------------------------------------------- /scripts/countATCG.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Count the number of nucleotides in a sequence file 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Getopt::Long; 9 | 10 | exit(main()); 11 | 12 | sub main{ 13 | 14 | my $settings={}; 15 | GetOptions($settings,qw(help discoverNts)) or die $!; 16 | 17 | die usage() if($$settings{help}); 18 | 19 | my @seq=@ARGV; 20 | die "ERROR: no sequence files given!\n".usage() if(!@seq); 21 | # Make an array of nts so that the ordering doesn't change 22 | my @nt=ntArray(\@seq,$settings); 23 | my @ntWithN=(@nt,"N"); 24 | 25 | # print the header 26 | print $_."\t" for("file",@ntWithN); 27 | print "\n"; 28 | 29 | # Count the ATCG for each parameter 30 | for my $seq(@seq){ 31 | count($seq,\@nt,\@ntWithN); 32 | } 33 | return 0; 34 | } 35 | 36 | sub ntArray{ 37 | my($seq,$settings)=@_; 38 | my @nt=qw(A T C G); 39 | if($$settings{discoverNts}){ 40 | # make a long string of nucleotides 41 | my $concatSeq=""; 42 | for my $seqfile(@$seq){ 43 | my $seqin=Bio::SeqIO->new(-file=>$seqfile); 44 | while(my $seqObj=$seqin->next_seq){ 45 | $concatSeq.=uc($seqObj->seq); 46 | } 47 | } 48 | # Figure out which nts are present 49 | my %nt; 50 | my $seqLength=length($concatSeq); 51 | for(my $i=0;$i<$seqLength;$i++){ 52 | $nt{substr($concatSeq,$i,1)}=1; 53 | } 54 | @nt=keys(%nt); 55 | } 56 | return @nt; 57 | } 58 | 59 | sub count{ 60 | my($arg,$ntArr,$ntWithN)=@_; 61 | # Read the sequence into an object 62 | my $in=Bio::SeqIO->new(-file=>$arg); 63 | 64 | my %num; 65 | while(my $seq=$in->next_seq){ 66 | $seq=$seq->seq; 67 | # get the counts of nts 68 | for my $nt(@$ntArr){ 69 | $num{$nt}=($seq=~s/$nt//gi); 70 | } 71 | $num{"N"}=length($seq); 72 | } 73 | print $arg."\t"; 74 | print $num{$_}."\t" for(@$ntWithN); # 
print values 75 | print "\n"; # newline to make it pretty 76 | $in->close; 77 | } 78 | 79 | sub usage{ 80 | "Counts the number of each nucleotide in a sequence file 81 | Usage: $0 file.fasta [file2.fasta ...] 82 | --discoverNts Look at the sequencing file first to figure out which Nts are present in the first place (slower) 83 | " 84 | } 85 | -------------------------------------------------------------------------------- /scripts/directoryDuration.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Give the approximate lenght of time it took to make all files in a directory 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | use File::Find qw/find/; 10 | 11 | local $0=basename $0; 12 | 13 | my $settings={}; 14 | GetOptions($settings,qw(help exclude=s include=s verbose)) or die $!; 15 | die usage() if(!@ARGV || $$settings{help}); 16 | my $exclude=$$settings{exclude}||0; 17 | my $include=$$settings{include}||0; 18 | 19 | for my $dir(@ARGV){ 20 | die "ERROR: $dir is not a directory" if(!-d $dir); 21 | 22 | my $oldest= ~0; 23 | my $newest=0; 24 | find({no_chdir=>1, wanted=>sub{ 25 | return if(-d $File::Find::name); 26 | if($exclude && $File::Find::name =~ /$exclude/){ 27 | print "Excluding: $File::Find::name\n" if($$settings{verbose}); 28 | return; 29 | } 30 | if($include && $File::Find::name !~ /$include/){ 31 | print "Not including: $File::Find::name\n" if($$settings{verbose}); 32 | return; 33 | } 34 | my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 35 | $atime,$mtime,$ctime,$blksize,$blocks) 36 | =stat($File::Find::name); 37 | 38 | if(!$mtime){ 39 | print "WARNING: no timestamp $File::Find::name\n" if($$settings{verbose}); 40 | return; 41 | } 42 | if($mtime < $oldest){ 43 | print "oldest $File::Find::name\n" if($$settings{verbose}); 44 | $oldest=$mtime; 45 | } 46 | if($mtime > $newest){ 47 | print "newest $File::Find::name\n" 
if($$settings{verbose}); 48 | $newest=$mtime; 49 | } 50 | }},$dir); 51 | 52 | my $duration=$newest-$oldest; 53 | print "$dir\t$duration\n"; 54 | } 55 | 56 | 57 | exit 0; 58 | 59 | sub usage{ 60 | "Usage: $0 dir 61 | Gives the number of seconds between the oldest and 62 | newest files 63 | 64 | --verbose 65 | --exclude PATTERN Supply a regex pattern to ignore 66 | certain filenames. 67 | " 68 | } 69 | -------------------------------------------------------------------------------- /scripts/downloadSrrRemotely.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Lee Katz 3 | 4 | #$ -cwd -V 5 | #$ -S /bin/sh 6 | #$ -q all.q -pe smp 1 7 | #$ -N downloadSrrViaRGN 8 | #$ -o download.log -j y 9 | 10 | ## CONFIGURATION 11 | RGN_ASCP_PATH="/opt/aspera/bin/ascp" 12 | ASCP_XOPTS="-v -QT -l640M -i /opt/aspera/etc/asperaweb_id_dsa.putty" 13 | RGN_URI="gzu2-rgntds@rgntds.cdc.gov" 14 | ## END CONFIGURATION 15 | ##################### 16 | 17 | SRR=$1 18 | NAME=$2 19 | OUTDIR=$3 20 | THISSCRIPT=$(basename $0); 21 | 22 | # Usage statement 23 | if [ "$NAME" == "" ]; then 24 | echo "Downloads a genome using the RGN, then sends it back to you, then decompresses it into split reads" 25 | echo "Usage: $THISSCRIPT SRR0123456 nameOfGenome outdir" 26 | echo "Example: $THISSCRIPT SRR1041486 FL_FLDACS-00090 ." 27 | exit 1; 28 | fi 29 | 30 | 31 | echo `date +'%H:%M:%S'`" Transferring the file to the remote computer from NCBI" 32 | THREE=${SRR:0:3} 33 | SIX=${SRR:0:6} 34 | ssh $RGN_URI "mkdir -p /tmp/$USER; $RGN_ASCP_PATH $ASCP_XOPTS anonftp@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/$THREE/$SIX/$SRR/$SRR.sra /tmp/$USER/$SRR.sra" 35 | if [ $? -gt 0 ]; then echo "ERROR with ascp on the remote computer!"; exit 1; fi; 36 | 37 | echo `date +'%H:%M:%S'`" Transferring the file back to this computer" 38 | rsync -e "ssh -x" --progress -a $RGN_URI:/tmp/$USER/$SRR.sra $OUTDIR/$NAME.sra 39 | if [ $? 
-gt 0 ]; then echo "ERROR with transferring the file to here using rsync"; exit 1; fi; 40 | 41 | echo `date +'%H:%M:%S'` "Decompressing the file into fastq.gz - this might take a while"; 42 | fastq-dump -v --defline-seq '@$ac_$sn[_$rn]/$ri' --defline-qual '+' --split-files -O . --gzip $OUTDIR/$NAME.sra 43 | if [ $? -gt 0 ]; then echo "ERROR with fastq-dump"; exit 1; fi; 44 | 45 | 46 | echo `date +'%H:%M:%S'`" Finished. Files will be found in $OUTDIR"; 47 | 48 | exit 0; 49 | 50 | -------------------------------------------------------------------------------- /scripts/exportBioNumericsFastaWithCoverage.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Splits up a bionumerics fasta file into contigs 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Bio::Tools::GuessSeqFormat; 9 | use File::Basename; 10 | use autodie; 11 | use Getopt::Long; 12 | use File::Basename qw/basename/; 13 | 14 | local $0 = basename($0); 15 | 16 | sub logmsg{ print STDERR "$0: @_\n"; } 17 | 18 | exit main(); 19 | 20 | sub main{ 21 | my $settings={}; 22 | GetOptions($settings,qw(help outdir=s)) or die $!; 23 | 24 | die usage() if(!@ARGV || $$settings{help}); 25 | 26 | if($$settings{outdir}){ 27 | if(! -d $$settings{outdir}){ 28 | mkdir $$settings{outdir}; 29 | } 30 | } 31 | 32 | for my $f(@ARGV){ 33 | printFasta($f, $settings); 34 | } 35 | 36 | return 0; 37 | } 38 | 39 | sub printFasta{ 40 | my($infile, $settings) = @_; 41 | 42 | # Because we are reading bionumerics files and do not 43 | # trust their extensions, make a better guess at their 44 | # format using the format guesser. 
45 | #my $formatGuesser=Bio::Tools::GuessSeqFormat->new(-file=>$infile); 46 | #my $format =$formatGuesser->guess; 47 | 48 | my $format = "fasta"; 49 | 50 | logmsg $infile; 51 | my $in=Bio::SeqIO->new(-file=>$infile,-format=>$format); 52 | while(my $seq=$in->next_seq){ 53 | my @seq=split(/\|/,$seq->seq); 54 | my $id=$seq->id; 55 | $id=~s/^denovo\|//; # remove 'denovo' since most exports seem to have that 56 | 57 | my $out=Bio::SeqIO->new(-format=>"fasta"); 58 | if($$settings{outdir}){ 59 | # For potential filenames, get a safe name 60 | my($SRR, $orig, $cov, $rep, $asm) = split(/_/, $id); 61 | 62 | my $outfile="$$settings{outdir}/${SRR}_${orig}_${cov}_${rep}_${asm}.fa"; 63 | if(-e $outfile){ 64 | #print "Collision: $infile => $outfile\n"; 65 | next; 66 | } 67 | $out=Bio::SeqIO->new(-format=>"fasta",-file=>">$outfile"); 68 | } 69 | for(my $i=1;$i<=@seq;$i++){ 70 | my $subseq=Bio::Seq->new(-seq=>$seq[$i-1],-id=>$id."_".$i); 71 | $out->write_seq($subseq); 72 | } 73 | $out->close; 74 | } 75 | } 76 | 77 | sub usage{ 78 | local $0=fileparse $0; 79 | "Usage: $0 bionumerics.fasta > out.fasta 80 | --outdir '' If given, all genomes will be written here. 81 | If blank, output will be sent to stdout. 82 | " 83 | } 84 | -------------------------------------------------------------------------------- /scripts/fastqDump-SE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SRR=$1 4 | 5 | R1="${SRR}.fastq.gz"; 6 | R1uncompressed="${SRR}.fastq" 7 | 8 | script=$(basename $0); 9 | if [ "$SRR" == "" ]; then 10 | echo "Downloads a fastq properly using fastq-dump" 11 | echo " Sorts the reads for maximum compression using fasten_sort." 12 | echo "Usage: $script SRR_accession" 13 | exit 1; 14 | fi 15 | 16 | set -e 17 | set -u 18 | 19 | if [ -e "${SRR}.fastq.gz" ]; then 20 | echo "${SRR}.fastq.gz is already present." 21 | exit 1 22 | fi 23 | if [ -e "$R1" ]; then 24 | echo "$R1 is already present." 
25 | exit 1 26 | fi 27 | 28 | module purge 29 | #module load sratoolkit/2.9.1 30 | module load sratoolkit/2.11.3 31 | 32 | # Check if fasten_sort is in the path and if not, quit 33 | echo "Checking dependency paths" 34 | which fasten_sort 35 | which fasterq-dump || which fastq-dump 36 | 37 | tempdir=$(mktemp --directory --tmpdir=$TMPDIR $(basename $0).XXXXXX) 38 | trap "{ rm -rf $tempdir; }" EXIT SIGINT SIGTERM 39 | echo "Files will temporarily be stored in $tempdir" 40 | 41 | # Decide whether to run fastq-dump or fasterq-dump 42 | fasterqDump="$(which fasterq-dump 2>/dev/null)"; 43 | if [ "$fasterqDump" == "" ]; then 44 | fastq-dump --accession $SRR --outdir $tempdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip 45 | if [ $? -gt 0 ]; then 46 | echo "ERROR with fastq-dump and $SRR" 47 | exit 1 48 | fi 49 | else 50 | cd $tempdir 51 | fasterq-dump $SRR --print-read-nr --threads 1 --outdir $tempdir --split-files --skip-technical 52 | if [ $? -gt 0 ]; then 53 | echo "ERROR with fasterq-dump and $SRR" 54 | exit 1 55 | fi 56 | if [ ! -e "$R1uncompressed" ]; then 57 | echo "ERROR: R1uncompressed not present in filename $R1uncompressed"; 58 | ls -lhA $tempdir 59 | exit 1; 60 | fi 61 | cd - 62 | 63 | # Compress fastq files 64 | for fastq in $tempdir/*.fastq; do 65 | # Remove quality defline with perl before compressing 66 | perl -lane ' 67 | if($. % 4 == 3){ 68 | $_="+"; 69 | } 70 | print; 71 | ' < $fastq > $fastq.tmp; 72 | mv $fastq.tmp $fastq 73 | done; 74 | fi 75 | 76 | # Intense compression 77 | mv -v $tempdir/$R1uncompressed $tempdir/unsorted.fastq 78 | cat $tempdir/unsorted.fastq | \ 79 | fasten_sort --sort-by SEQ | \ 80 | fasten_progress --print --id sort-reads --update-every 100000 | \ 81 | gzip -vc9 > $tempdir/$R1uncompressed.gz 82 | 83 | rm -v $tempdir/unsorted.fastq 84 | 85 | ls -lhd $tempdir 86 | ls -lh $tempdir/* 87 | mv -v $tempdir/$R1 . 
88 | 89 | -------------------------------------------------------------------------------- /scripts/fastqDump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SRR=$1 4 | 5 | R1="${SRR}_1.fastq.gz"; 6 | R2="${SRR}_2.fastq.gz"; 7 | R1uncompressed="${SRR}_1.fastq" 8 | R2uncompressed="${SRR}_2.fastq" 9 | 10 | script=$(basename $0); 11 | if [ "$SRR" == "" ]; then 12 | echo "Downloads a fastq properly using fastq-dump" 13 | echo " Sorts the reads for maximum compression using fasten_sort." 14 | echo "Usage: $script SRR_accession" 15 | exit 1; 16 | fi 17 | 18 | set -e 19 | set -u 20 | 21 | if [ -e "${SRR}.fastq.gz" ]; then 22 | echo "${SRR}.fastq.gz is already present." 23 | exit 1 24 | fi 25 | if [ -e "$R1" ]; then 26 | echo "$R1 is already present." 27 | exit 1 28 | fi 29 | 30 | module purge 31 | #module load sratoolkit/2.9.1 32 | module load sratoolkit/2.11.3 33 | 34 | # Check if fasten_sort is in the path and if not, quit 35 | echo "Checking dependency paths" 36 | which fasten_sort 37 | which fasterq-dump || which fastq-dump 38 | 39 | tempdir=$(mktemp --directory --tmpdir=$TMPDIR $(basename $0).XXXXXX) 40 | trap "{ rm -rf $tempdir; }" EXIT SIGINT SIGTERM 41 | echo "Files will temporarily be stored in $tempdir" 42 | 43 | # Decide whether to run fastq-dump or fasterq-dump 44 | fasterqDump="$(which fasterq-dump 2>/dev/null)"; 45 | if [ "$fasterqDump" == "" ]; then 46 | fastq-dump --accession $SRR --outdir $tempdir --defline-seq '@$ac.$si/$ri' --defline-qual '+' --split-files --skip-technical --dumpbase --clip 47 | if [ $? -gt 0 ]; then 48 | echo "ERROR with fastq-dump and $SRR" 49 | exit 1 50 | fi 51 | else 52 | cd $tempdir 53 | fasterq-dump $SRR --print-read-nr --threads 1 --outdir $tempdir --split-files --skip-technical 54 | if [ $? -gt 0 ]; then 55 | echo "ERROR with fasterq-dump and $SRR" 56 | exit 1 57 | fi 58 | if [ ! 
-e "$R1uncompressed" ]; then 59 | echo "ERROR: R1uncompressed not present in filename $R1uncompressed"; 60 | ls -lhA $tempdir 61 | exit 1; 62 | fi 63 | cd - 64 | 65 | # Compress fastq files 66 | for fastq in $tempdir/*.fastq; do 67 | # Remove quality defline with perl before compressing 68 | perl -lane ' 69 | if($. % 4 == 3){ 70 | $_="+"; 71 | } 72 | print; 73 | ' < $fastq > $fastq.tmp; 74 | mv $fastq.tmp $fastq 75 | done; 76 | fi 77 | 78 | # Intense compression 79 | #echo "DEBUG"; head -n 888 $tempdir/$R1uncompressed > $tempdir/unsorted_1.fastq 80 | #echo "DEBUG"; head -n 888 $tempdir/$R2uncompressed > $tempdir/unsorted_2.fastq 81 | mv -v $tempdir/$R1uncompressed $tempdir/unsorted_1.fastq 82 | mv -v $tempdir/$R2uncompressed $tempdir/unsorted_2.fastq 83 | cat $tempdir/unsorted_1.fastq $tempdir/unsorted_2.fastq | \ 84 | fasten_shuffle | \ 85 | fasten_sort --sort-by SEQ --paired-end | \ 86 | fasten_progress --print --id sort-reads --update-every 100000 | \ 87 | fasten_shuffle -d -1 $tempdir/$R1uncompressed -2 $tempdir/$R2uncompressed 88 | gzip -v9 $tempdir/$R1uncompressed $tempdir/$R2uncompressed 89 | rm -v $tempdir/unsorted_1.fastq $tempdir/unsorted_2.fastq 90 | 91 | ls -lhd $tempdir 92 | ls -lh $tempdir/* 93 | mv -v $tempdir/{$R1,$R2} . 
94 | 
95 | 
-------------------------------------------------------------------------------- /scripts/fastqMaxCompression.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Recursively find fastq files under a directory and recompress them in place
# at gzip's maximum compression level (-9).

set -e

DIR=$1

if [ "$DIR" == "" ]; then
  echo "Finds fastq.gz files and runs max compression on them";
  echo "Usage: $0 dir"
  exit 1
fi

TMP=$(mktemp --directory FASTQMAXCOMPRESSION.XXXXXX --tmpdir=$TMPDIR)
trap "{ rm -rf $TMP; }" EXIT
export TMP

# Recompress already-gzipped fastq files, unless `file` reports that a file is
# already at max compression. Parentheses make -print0 apply to both -iname
# alternatives; -print0/-0 keeps filenames with whitespace intact.
find "$DIR" \( -iname '*.fastq.gz' -o -iname '*.fq.gz' \) -print0 | xargs -0 -P 1 -n 1 bash -c '
  if [ "$(file "$0" | grep -m 1 -o "max compression" | head -n 1)" != "" ]; then
    echo "Skipping $0 bc it is already max compressed"
    exit 0
  fi

  originalSize=$(du "$0" | cut -f 1)

  tmpfile=$(mktemp --tmpdir=$TMP MAX.XXXXXX --suffix=.fastq.gz)
  trap "{ rm -f $tmpfile; }" EXIT

  echo "$0 => $tmpfile"
  gzip -dc "$0" | gzip -9c > $tmpfile && \
    mv -v $tmpfile "$0"

  newsize=$(du "$0" | cut -f 1);
  savings=$(printf "%0.2f" $(echo "$newsize/$originalSize" | bc -l));

  echo "New file is $savings of original"
'

# Compress any remaining uncompressed fastq files.
# Fixed: the original expression was `-name '*.fastq' -or '*.fq'`, which is
# invalid find syntax (the second pattern was missing its -name primary and
# find rejects a bare string there), so this second pass always errored out.
find "$DIR" \( -name '*.fastq' -o -name '*.fq' \) -print0 | xargs -0 -P 1 -n 1 bash -c '
  tmpfile=$(mktemp --tmpdir=$TMP MAX.XXXXXX --suffix=.fastq.gz)
  trap "{ rm -f $tmpfile; }" EXIT

  gzip -c9 "$0" > $tmpfile && \
    mv $tmpfile "$0.gz" && \
    rm "$0"
'

-------------------------------------------------------------------------------- /scripts/fastqToFastaQual.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

# Convert a fastq to a fasta/qual combo using BioPerl, with some Linux commands

use Bio::Perl;
use Data::Dumper;
use strict;
use warnings;
use threads;
use Thread::Queue;
use Getopt::Long;

my $settings={};

$|=1;
my %numSequences; # static for a
subroutine 17 | 18 | exit(main()); 19 | 20 | sub main{ 21 | my $usage="Usage: $0 -i inputFastqFile [-n numCpus -q outputQualfile -f outputFastaFile]"; 22 | die($usage) if(@ARGV<1); 23 | 24 | GetOptions($settings,('numCpus=s','input=s','qualOut=s','fastaOut=s','help')); 25 | die $usage if($$settings{help}); 26 | 27 | my $file=$$settings{input}||die("input parameter missing"); 28 | my $outfasta=$$settings{fastaOut}||"$file.fasta"; 29 | my $outqual=$$settings{qualOut}||"$file.qual"; 30 | my $numCpus=$$settings{numCpus}||1; 31 | 32 | my @subfile=splitFastq($file,$numCpus); 33 | for my $f(@subfile){ 34 | threads->create(\&convert,$f,"$f.fasta","$f.qual"); 35 | } 36 | $_->join for (threads->list); 37 | 38 | # join the sub files together 39 | joinFastqFiles(\@subfile,$file); 40 | 41 | return 1; 42 | } 43 | 44 | sub convert{ 45 | my($file,$outfasta,$outqual)=@_; 46 | 47 | my $numSequences=numSequences($file); 48 | my $reportEvery=int($numSequences/100) || 1; 49 | print "$numSequences sequences to convert in $file\n"; 50 | 51 | my $in=Bio::SeqIO->new(-file=>$file,-format=>"fastq-illumina"); 52 | my $seqOut=Bio::SeqIO->new(-file=>">$outfasta",-format=>"fasta"); 53 | my $qualOut=Bio::SeqIO->new(-file=>">$outqual",-format=>"qual"); 54 | my $seqCount=0; 55 | my $percentDone=0; 56 | while(my $seq=$in->next_seq){ 57 | $seqOut->write_seq($seq); 58 | $qualOut->write_seq($seq); 59 | $seqCount++; 60 | if($seqCount%$reportEvery == 0){ 61 | $percentDone++; 62 | print "$percentDone%.."; 63 | } 64 | } 65 | print "Done with subfile $file.\n"; 66 | return 1; 67 | } 68 | 69 | sub joinFastqFiles{ 70 | my($subfile,$outfileBasename)=@_; 71 | my($command,$subfasta,$subqual); 72 | 73 | # fasta 74 | $subfasta.="$_.fasta " for(@$subfile); 75 | $command="cat $subfasta > $outfileBasename.fasta"; 76 | system($command); 77 | 78 | # qual 79 | $subqual.="$_.qual " for (@$subfile); 80 | $command="cat $subqual > $outfileBasename.qual"; 81 | system($command); 82 | 83 | return 1; 84 | } 85 | 86 | sub 
splitFastq{ 87 | my($file,$numCpus)=@_; 88 | my $prefix="FQ"; # for fastq 89 | my $numSequences=numSequences($file); 90 | my $numSequencesPerFile=int($numSequences/$numCpus); 91 | my $numSequencesPerFileRemainder=$numSequences % $numCpus; 92 | my $numLinesPerFile=$numSequencesPerFile*4; # four lines per read 93 | system("rm -r tmp;mkdir tmp;"); 94 | system("split -l $numLinesPerFile $file 'tmp/FQ'"); 95 | 96 | return glob("tmp/FQ*"); 97 | } 98 | 99 | 100 | sub numSequences{ 101 | my $file=shift; 102 | return $numSequences{$file} if($numSequences{$file}); 103 | my $num=`grep -c '^\@' $file`; 104 | chomp($num); 105 | $numSequences{$file}=$num; 106 | return $num; 107 | } 108 | -------------------------------------------------------------------------------- /scripts/filterKrakenOutput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long qw/GetOptions/; 6 | use File::Basename qw/basename/; 7 | use File::Temp qw/tempdir/; 8 | use Data::Dumper; 9 | 10 | local $0=basename $0; 11 | 12 | sub logmsg{print STDERR "$0: @_\n"} 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help tempdir=s taxid=s)) or die $!; 18 | die usage() if(!@ARGV); 19 | die "ERROR: need taxid" if(!defined $$settings{taxid}); 20 | $$settings{tempdir}||=tempdir("$0.XXXXXX", TMPDIR=>1, CLEANUP=>1); 21 | 22 | my @taxid = split(/,/, $$settings{taxid}); 23 | my @fastq = @ARGV; 24 | 25 | my $regex = "^".join('$|^', @taxid)."\$"; 26 | $regex = qr/$regex/; 27 | 28 | my %readid = (); 29 | while(){ 30 | my(undef, $readid, $taxid) = split(/\t/, $_); 31 | if($taxid =~ $regex){ 32 | $readid{$readid} = 1; 33 | } 34 | } 35 | 36 | for my $f(@fastq){ 37 | logmsg "Reading $f and filtering for $$settings{taxid}"; 38 | open(my $fh, "zcat $f | ") or die "ERROR: could not gunzip $f: $!"; 39 | while(my $id = <$fh>){ 40 | my $entry = $id; 41 | $entry.=<$fh> for(1..3); 42 | 43 | 
$id =~ s/^@|\s+.*$//g; # remove @ and anything after whitespace 44 | 45 | if($readid{$id}){ 46 | print $entry; 47 | } 48 | } 49 | close $fh; 50 | } 51 | 52 | return 0; 53 | } 54 | 55 | # cat out.kraken | perl -MData::Dumper -lane 'BEGIN{open($fh, "zcat SE-le_S12_L001_R1_001.fastq.gz | ") or die $!; while(my $id=<$fh>){$entry=$id; for(1..3){$entry.=<$fh>;} $id=~s/\s.*//; $id=~s/^\@//; chomp($id); $entry{$id}=$entry; } } my(undef,$readid,$taxid)=@F; next if($taxid !~ /^561$|^562$|^83334$/); print $entry{$readid};' | grep . > R1.subset.fastq 56 | 57 | sub usage{ 58 | "$0: Filter for reads matching a given taxon using kraken raw results 59 | usage: $0 --taxid=taxid in.fastq.gz < kraken.out > out.fastq 60 | 61 | --taxid The taxon ID from NCBI (required) 62 | Can be comma-separated 63 | " 64 | } 65 | 66 | -------------------------------------------------------------------------------- /scripts/findpids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Borrowed from 4 | # https://unix.stackexchange.com/questions/294299/how-to-renice-all-threads-and-children-of-one-process-on-linux 5 | 6 | if [ "$#" -eq 0 ]; then 7 | echo "Finds all children pids of a process" 8 | echo " Usage: $(basename $0) 012345" 9 | exit 1 10 | fi 11 | 12 | PID_LIST= 13 | findpids() { 14 | for pid in /proc/$1/task/* ; do 15 | pid="$(basename "$pid")" 16 | PID_LIST="$PID_LIST$pid " 17 | if [ ! -e "/proc/$1/task/$pid/children" ]; then 18 | continue; 19 | fi 20 | for cpid in $(cat /proc/$1/task/$pid/children 2>/dev/null) ; do 21 | findpids $cpid 22 | done 23 | done 24 | } 25 | 26 | for pid in $@; do 27 | 28 | if [ ! 
-e "/proc/$pid/task" ]; then
    # Fixed: the loop variable is $pid; the original used $1 here and in the
    # findpids call below, which silently ignored every argument after the
    # first and always reported on/descended from the first pid only.
    echo "ERROR: could not find pid $pid in the process list";
    exit 1
  fi

  findpids $pid
done

echo $PID_LIST
-------------------------------------------------------------------------------- /scripts/fixKsnpVcf.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

use strict;
use warnings;
use Bio::Perl;
use Getopt::Long;
use Data::Dumper;
use constant reportEvery=>1000;

sub logmsg{print STDERR "@_\n";}
exit main();

sub main{
  my $settings={};
  GetOptions($settings,qw(ref|reference=s help)) or die $!;
  die usage() if($$settings{help} || !$$settings{ref});

  my %seq;
  my $in=Bio::SeqIO->new(-file=>$$settings{ref});
  while(my $seq=$in->next_seq){
    $seq{$seq->id}=uc($seq->seq);
  }

  my $lineCount=0;
  my $numFixed=0;
  while(<>){
    # Print headers
    if(/^#/){
      print;
      next;
    }

    # Fix VCF lines
    $lineCount++;
    chomp;
    my @F=split /\t/;
    $F[1]||=0;
    $F[2]=uc($F[2]);

    # Fix up the kmer line and print it
    $numFixed += !! fixPosition(\@F,\%seq,$settings);

    if($lineCount % reportEvery == 0){
      my $percent=int($numFixed/$lineCount * 100);
      logmsg "Have fixed $numFixed out of $lineCount ($percent%)";
    }
  }

  my $percent=int($numFixed/$lineCount * 100);
  logmsg "Fixed $numFixed out of $lineCount ($percent%)";
}

sub fixPosition{
  my($F,$seqHash,$settings)=@_;

  # Remove the old Chrom/pos information from the VCF line.
  # It's not what we want anyway.
  splice(@$F,0,2);

  # Figure out where the snp is within the kmer to help
  # with the genomic position later on.
  my $dotIndex=index($$F[0],'.');
  # Also need to search with the reverse complement.
my $revcom=revcom($$F[0])->seq;

  # Keep track of whether or not there are matches
  my $numMatches=0;
  for my $id(keys(%$seqHash)){
    while($$seqHash{$id}=~/($$F[0]|$revcom)/g){
      my $pos=length($`)+$dotIndex;
      print join("\t",$id,$pos,@$F)."\n";
      $numMatches++;
    }
  }

  logmsg "WARNING: I could not find kmer $$F[0] in $$settings{ref}" if($numMatches < 1);
  return $numMatches;
}

sub usage{
  "Usage: $0 -ref reference.fasta < ksnp.vcf > fixed.vcf
"
}
-------------------------------------------------------------------------------- /scripts/fixProkkaHeader.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl
# Renumber the sequence IDs in an assembly file so that every record kept is
# named contigNNNNNN, optionally dropping records below a minimum length.
# Output is written as genbank to stdout.

use strict;
use warnings;
use Bio::Perl;
use File::Basename qw/fileparse basename dirname/;
use Getopt::Long;

exit main();

sub main{
  my $settings={};
  GetOptions($settings,qw(help min_length|min-length=i)) or die $!;
  die usage() if(!@ARGV || $$settings{help});
  $$settings{min_length}||=1;

  my($infile)=@ARGV;

  # -verbose=>-1 silences bioperl warnings while it guesses the input format
  my $reader=Bio::SeqIO->new(-file=>$infile,-verbose=>-1);
  # No -file argument: records are written to stdout
  my $writer=Bio::SeqIO->new(-format=>"genbank");

  my $contigCounter=0;
  while(my $record=$reader->next_seq){
    # Records shorter than the cutoff are dropped and do not use up a number
    next if($record->length < $$settings{min_length});
    $record->id(sprintf("contig%06d",++$contigCounter));
    $writer->write_seq($record);
  }
  return 0;
}

sub usage{
  local $0=basename $0;
  "$0: Fixes headers in genbank files
  Usage: $0 in.gbk > out.gbk
  --min_length 1 Minimum length of a contig
"
}
-------------------------------------------------------------------------------- /scripts/flattenTree.pl: --------------------------------------------------------------------------------
#!/usr/bin/env perl

use warnings;
use strict;
use Data::Dumper;
use Bio::TreeIO;
use Getopt::Long;
use File::Basename qw/basename/;
use Scalar::Util qw/looks_like_number/;

# Print a message to STDERR, prefixed with the script's basename.
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}
exit(main());

# Parse options and flatten every tree in every input file.
# NOTE(review): --help is parsed by GetOptions but never acted on;
# usage() is only reachable if callers invoke it elsewhere — confirm.
sub main{
  my $settings={};
  GetOptions($settings,qw(help debug confidence|bootstrap|min-confidence=f)) or die $!;
  $$settings{confidence}||=0;

  for my $file(@ARGV){
    my $in = Bio::TreeIO->new(-file=>$file);
    # A file may contain multiple trees; each is flattened and printed.
    while(my $tree = $in->next_tree){
      flattenTree($tree, $$settings{confidence}, $settings);
    }
  }

  return 0;
}

# Collapse (flatten) internal nodes whose confidence (stored as the
# node *id* on internal nodes, as many newick writers do) is below
# $minConfidence: the child is re-attached to its grandparent and its
# branch length absorbs the bypassed ancestor's branch length.
# Afterwards, linear paths are contracted and any leftover former
# internal nodes (leaves whose id is purely numeric, i.e. a bootstrap
# value) are deleted. Prints the resulting tree as newick to STDOUT.
sub flattenTree{
  my($tree, $minConfidence, $settings)=@_;

  for my $leaf($tree->get_nodes()){
    next if(!$leaf->is_Leaf());

    # Lineage ordered leaf-first: ($leaf, parent, grandparent, ... root).
    my @lineage = ($leaf,reverse($tree->get_lineage_nodes($leaf)));
    my $numLineage = @lineage;

    # For a flattening to work, a node must have a
    # grandparent so that there is a "root" node.
    # TODO: add a pseudo root node and remove it later.
    for(my $i=0;$i<$numLineage-2;$i++){
      #next if(!defined($lineage[$i]->ancestor));

      # The parent's id holds the support value for the split.
      my $confidence = $lineage[$i+1]->id;
      $confidence //= 0;
      # Non-numeric ids (named internal nodes) are never collapsed.
      if(!looks_like_number($confidence)){
        next;
      }

      if($confidence < $minConfidence){
        if($$settings{debug}){
          logmsg $lineage[$i]->id." ".$confidence." < ".$minConfidence;
        }
        # Branch length increases by the ancestor's branch
        # length, which we will now bypass.
        my $branch_length = $lineage[$i]->branch_length;
        my $anc_branch_length = $lineage[$i+1]->branch_length || 0;
        $lineage[$i]->branch_length(
          $branch_length + $anc_branch_length
        );
        # Bump this node up to being a descendent of the
        # ancestor's ancestor.
        # NOTE(review): the bypassed parent is left in place here and is
        # cleaned up by the contract/remove loop below — confirm that
        # deeper lineage entries are not stale after this reattachment.
        $lineage[$i]->ancestor(
          $lineage[$i+2]
        );
      }
    }
  }

  # Repeat until stable: contracting paths can expose new numeric-id
  # leaves (former internal nodes), and removing those can create new
  # linear paths.
  my $numRemoved = 1;
  while($numRemoved > 0){
    $numRemoved = 0;
    # Remove singleton paths
    $tree->contract_linear_paths;
    # Remove dead ancestor nodes
    for my $leaf($tree->get_nodes){
      next if(!$leaf->is_Leaf);

      # A leaf with a purely numeric id is assumed to be a leftover
      # internal node carrying a bootstrap value, not a real taxon.
      if(looks_like_number($leaf->id)){
        $tree->remove_Node($leaf);
        $numRemoved++;
      }
    }
  }

  print $tree->as_text('newick')."\n";
}

sub usage{
  # NOTE(review): no `local` here, so $0 is clobbered globally once
  # usage() is called (harmless if the script exits right after).
  $0=basename $0;
  "$0: flattens a tree using node confidence scores
  NOTE: leaves with number-only identifiers will be removed.

  Usage: $0 tree.dnd [tree2.dnd...] > out.dnd
  --confidence 0  Minimum confidence for flattening a tree
  --debug         Print debugging information to stderr
  "
}
#!/usr/bin/env perl
# kaptivate_wrapper.pl: run Kaptive K- and O-locus typing on a set of
# fasta assemblies and print one tab-delimited line per assembly:
#   filename <tab> K call <tab> O call

use warnings;
use strict;
use Data::Dumper;
use Getopt::Long;
use File::Basename qw/basename/;
use File::Copy qw/cp/;
use File::Temp qw/tempdir/;

use version 0.77;
our $VERSION = '0.1.1';

local $0 = basename $0;
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}
exit(main());

sub main{
  my $settings={};
  GetOptions($settings,qw(help db=s numcpus=i)) or die $!;
  usage() if($$settings{help});

  $$settings{tempdir} //= tempdir("kaptive.XXXXXX", TMPDIR=>1, CLEANUP=>1);
  $$settings{db} or die "ERROR: need --db set to the database with the kaptive database";
  $$settings{numcpus} ||= 1;

  for my $fasta(@ARGV){
    my $outdir = runKaptive($fasta, $$settings{db}, $settings);

    # kaptive.py's stdout was captured into k.log/o.log; the call we
    # want is on the first line of each.
    my $k = firstLine("$outdir/k.log");
    my $o = firstLine("$outdir/o.log");

    # Strip everything before the locus call itself (K.../O...)
    $k =~ s/.*?K/K/;
    $o =~ s/.*?O/O/;
    print join("\t", basename($fasta), $k, $o)."\n";
  }

  return 0;
}

# Return the first line of a file, chomped; dies if unreadable.
sub firstLine{
  my($file) = @_;
  open(my $fh, "<", $file) or die "ERROR: could not read $file: $!";
  my $line = <$fh>;
  close $fh;
  $line //= "";
  chomp($line);
  return $line;
}

# Stage one assembly into its own in/out directories under the temp
# dir, run kaptive.py against the O and K databases, and return the
# output directory containing o.log and k.log.
sub runKaptive{
  my($fasta, $db, $settings) = @_;
  logmsg "Running Kaptive on $fasta";

  # Staging area named after the assembly
  my $tmpdir = "$$settings{tempdir}/".basename($fasta, qw(.fasta .fa));
  # ROBUSTNESS: mkdir used to be unchecked; a collision or permission
  # error would have surfaced much later as a confusing cp/system error.
  mkdir $tmpdir or die "ERROR: could not mkdir $tmpdir: $!";

  # Input and output directories
  my $tmpdirIn = "$tmpdir/in";
  my $tmpdirOut = "$tmpdir/out";
  mkdir $tmpdirIn or die "ERROR: could not mkdir $tmpdirIn: $!";
  mkdir $tmpdirOut or die "ERROR: could not mkdir $tmpdirOut: $!";

  my $tmpfasta = "$tmpdirIn/in.fasta";
  cp($fasta, $tmpfasta) or die "ERROR: could not copy $fasta to $tmpfasta: $!";

  my $oGbk = "$db/VibrioPara_Kaptivedb_O.gbk";
  my $kGbk = "$db/VibrioPara_Kaptivedb_K.gbk";

  # BUGFIX: `die if $?` reported no context at all; say which
  # invocation failed and with what exit code.
  system("kaptive.py --threads $$settings{numcpus} -k $oGbk -a $tmpfasta -o $tmpdirOut/o > $tmpdirOut/o.log");
  die "ERROR: kaptive.py failed on the O database for $fasta (exit ".($?>>8).")" if $?;
  system("kaptive.py --threads $$settings{numcpus} -k $kGbk -a $tmpfasta -o $tmpdirOut/k > $tmpdirOut/k.log");
  die "ERROR: kaptive.py failed on the K database for $fasta (exit ".($?>>8).")" if $?;

  return $tmpdirOut;
}

sub usage{
  print "$0: runs Kaptive on a set of fasta files
Usage: $0 [options] *.fasta > out.tsv
  --db      Database directory for Kaptive containing *.gbk
  --numcpus Number of threads to use (default: 1)
  --help    This useful help menu
\n";
  exit 0;
}
# Emulate the kraken-translate format with tab-delimited columns:
#   count, root-most parent node, ..., last child node.
# Reads a kraken2 report (see the kraken manual's sample-report format)
# and prints one line per taxon to STDOUT.
sub translateKraken2{
  my($infile) = @_;

  # $parent[$level] holds the most recently seen taxon name at that
  # taxonomic depth; element 0 is almost always "root"/"unclassified".
  my @parent = ();

  open(my $fh, "<", $infile) or die "ERROR: could not open $infile: $!";
  while(my $line = <$fh>){
    # Trim the trailing newline/whitespace. The name field keeps its
    # own indentation because it sits after a tab.
    $line =~ s/^\s+|\s+$//g;

    # Report fields (see kraken manual):
    # percent, clade read count, direct read count, rank code,
    # NCBI taxid, indented scientific name.
    my @F = split(/\t/, $line);
    my($percent, $readsUmbrella, $readsSpecific, $rank, $taxid, $nameWithIndentation) = @F;

    # Kraken reports indent the name by TWO spaces per level.
    # BUGFIX: each single space used to count as one level, which left
    # every odd index of @parent unset and emitted empty/undef columns.
    my $childLevel = 0;
    if($nameWithIndentation =~ /^( +)/){
      $childLevel = int(length($1) / 2);
    }

    # Strip the indentation and normalize internal whitespace to '_'
    my $name = $nameWithIndentation;
    $name =~ s/^\s+//;
    $name =~ s/\s+/_/g;

    # Record this node at its level; shallower ancestors stay in place.
    $parent[$childLevel] = $name;

    # Lineage for this row: every ancestor from root down to this node.
    my @taxaField = @parent[0..$childLevel];

    print join("\t", $readsSpecific, @taxaField)."\n";
  }
  close $fh;
}

sub usage{
  print "$0: changes kraken report to a format for ktImportText in Krona
Usage: $0 [options] kraken.report
  --help This useful help menu

Output is tab delimited:
  * count of reads
  * parent node 1
  * ...
  * last child node (usually genus/species)
";
  exit 0;
}
# Transform a kSNP3 SNPs_all file into VCF 4.1 printed on STDOUT.
# $infile: path to SNPs_all (tab-delimited: ID kmer variant x genome;
#          loci are separated by blank lines).
# $genomeList: arrayref of genome names, defining the sample columns.
# Loci are only emitted when they have >=2 ALT alleles seen in >=2 genomes.
sub SNPs_allToVcf{
  my($infile,$genomeList,$settings)=@_;

  my @nt=qw(A C G T);
  my $numGenomes=scalar(@$genomeList);

  local $0=basename $0;
  print "##fileformat=VCFv4.1\n##source=kSNP3, $0\n";
  # BUGFIX: the column header line must begin with '#CHROM' (single #)
  # per the VCF spec; '##CHROM' is treated as a meta line by parsers.
  print join("\t",'#CHROM',qw(POS ID REF ALT QUAL FILTER INFO FORMAT),@$genomeList)."\n";

  # A row of zero genotype calls, used to (re)initialize each locus
  my @zeroVariantTags=(0) x $numGenomes;

  # Map genome name -> its column index
  my %genomeIndex;
  @genomeIndex{@$genomeList}=(0..$#$genomeList);

  my @GT=@zeroVariantTags;
  my %altIndex=();
  my @ALT=('.');  # index 0 is a placeholder so real ALTs start at GT 1
  my($id,$kmer,$variant,$x,$genome)=('.') x 5;

  # Emit one VCF data line for the locus currently buffered in
  # @GT/@ALT/$kmer (if it passes the filters), then reset the buffers.
  # BUGFIX: the old code `next`ed past the reset when a locus failed
  # the filters, leaking its genotypes into the next locus.
  my $emit = sub {
    if(scalar(@ALT) >= 2 && (grep{$_ > 0} @GT) >= 2){
      my @alt = @ALT[1..$#ALT]; # drop the placeholder '.'
      print join("\t",'.','.',$kmer,'.',join(",",@alt), '.', 'PASS', "NS=$numGenomes", 'GT', @GT)."\n";
    }
    @GT=@zeroVariantTags;
    %altIndex=();
    @ALT=('.');
  };

  # BUGFIX: the read loop was `while(){` (the <IN> readline was lost),
  # which never read the file; use a lexical filehandle instead.
  open(my $fh, "<", $infile) or die "ERROR: could not open $infile for reading: $!";
  while(my $line = <$fh>){
    $line =~ s/^\s+|\s+$//g; # whitespace trim

    # A blank line separates loci: flush the buffered locus.
    if($line =~ /^$/){
      $emit->();
      next;
    }

    ($id,$kmer,$variant,$x,$genome)=split(/\t/,$line);
    $id=uc($id);
    $variant=uc($variant);

    # Assign each distinct allele the next GT index for this locus
    if(!defined($altIndex{$variant})){
      $altIndex{$variant}=scalar(@ALT);
      push(@ALT,$variant);
    }
    $GT[$genomeIndex{$genome}]=$altIndex{$variant};
  }
  close $fh;

  # BUGFIX: flush the final locus even when the file does not end in a
  # blank line; it used to be silently dropped.
  $emit->();
}

sub usage{
  local $0=basename $0;
  "$0: transform a kSNP3 output into a vcf file
  Usage: $0 kSNP3.out/SNPs_all > kSNP3.vcf
    SNPs_all file is formatted with the tab-delimited fields
    ID kmer variant x genomeName
  "
}
# Print a message to STDERR prefixed with the script's basename.
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}

# Entry point: read mash sketches, build a presence/absence
# pseudo-alignment, and print it as fasta to STDOUT.
sub main{
  my $settings={};
  GetOptions($settings,qw(help presence=s absence=s)) or die $!;

  $$settings{presence} //= "1";
  $$settings{absence} //= "0";

  die usage($settings) if($$settings{help} || !@ARGV);

  # Sort the inputs so every downstream step sees the same
  # deterministic order for the pseudo alignment, regardless of
  # shell glob order.
  my @sketchFile = sort(@ARGV);

  # Find presence/absence of hashes
  logmsg "Finding presence/absence of ".scalar(@sketchFile)." files";
  my $presence = readSketches(\@sketchFile, $settings);

  # Make sequences out of the hashes
  logmsg "Determining the pseudosequence for each input file";
  logmsg "Present nucleotides will be $$settings{presence} and absent nucleotides will be $$settings{absence}";
  my $seqs = determinePseudoSequences(\@sketchFile, $presence, $settings);

  # make an actual alignment string
  logmsg "Making the alignment from sequence";
  my $aln = makeAlignment(\@sketchFile, $seqs, $settings);

  print "$aln";

  return 0;
}

# Build %presence: $presence{hashInt}{file} = 1 for every sketch hash
# found in each file. Prints one progress dot per file to STDERR.
sub readSketches{
  my($sketches, $settings) = @_;

  my %presence;
  for my $file(@$sketches){
    print STDERR ".";
    my $msh = Bio::Sketch::Mash->new($file);
    $presence{$_}{$file} = 1 for(@{ $$msh{sketches}[0]{hashes} });
  }
  print STDERR "\n";

  return \%presence;
}

# For each input file, build a pseudo-sequence with one character per
# hash integer: the "presence" character if the file contains that
# hash, else the "absence" character.
sub determinePseudoSequences{
  my($infiles, $p, $settings) = @_;

  my($yes, $no) = ($$settings{presence}, $$settings{absence});
  my %pseudoSeq;

  # Numeric sort keeps the column order of the alignment stable.
  for my $hashInt(sort{$a<=>$b} keys(%$p)){
    for my $file(@$infiles){
      $pseudoSeq{$file} .= $$p{$hashInt}{$file} ? $yes : $no;
    }
  }

  return \%pseudoSeq;
}

# Render the pseudo-sequences as a fasta-formatted string, one record
# per input file, in input order.
sub makeAlignment{
  my($infiles, $seqs, $settings) = @_;

  return join("", map{ ">$_\n".$$seqs{$_}."\n" } @$infiles);
}
#!/usr/bin/env perl
# md5sumDir.pl: compute a single MD5 over the contents of every file
# under the given directories (recursively), independent of file order.
# Per-file digests are computed in parallel, sorted, and hashed again.
use strict;
use warnings;
use Digest::MD5 qw/md5_hex/;
use threads;

# Number of worker threads; override with the NUMCPUS environment
# variable (generalizes the previously hard-coded 24).
my $numcpus = $ENV{NUMCPUS} || 24;

my @file;
for my $dir(@ARGV){
  # Quote the directory so paths with spaces survive the shell.
  push(@file,
    `find "$dir" -type f`
  );
}
chomp(@file);

# Distribute files round-robin-ish: each thread takes a contiguous slice.
my $num_files_per_thread = int(scalar(@file)/$numcpus) + 1;
my @thr;
for my $i(0..$numcpus - 1){
  my @subfile = splice(@file, 0, $num_files_per_thread);
  $thr[$i] = threads->new(sub{
    my @hex;
    for my $file(@subfile){
      # BUGFIX: read bytes, not text. File::Slurp's read_file defaulted
      # to text mode, which can corrupt digests of binary files on some
      # platforms; :raw also drops the third-party dependency.
      open(my $fh, '<:raw', $file) or die "ERROR: could not read $file: $!";
      my $content = do { local $/; <$fh> };
      close $fh;
      push(@hex, md5_hex($content));
    }
    return \@hex;
  });
}

# Collect per-file digests from all workers
my @hex;
for my $thr(@thr){
  my $hexSubArr = $thr->join;
  push(@hex, @$hexSubArr);
}
# Sorting makes the final digest independent of find/thread ordering.
my $finalHex = md5_hex(join("\n", sort @hex)."\n");
print $finalHex."\n";
#!/usr/bin/env perl
# Lee Katz
# Moves one or more symbolic links into a target directory, recreating
# each link there as a relative path to the same real file, and only
# then removing the original link.

use strict;
use warnings FATAL=>'all';
use File::Copy qw(mv);
use Getopt::Long qw(GetOptions);
use File::Spec;
use Cwd qw/realpath getcwd/;
use File::Basename;

my $target;
GetOptions('target-directory=s' => \$target);
die "$0 -t target_dir symlink1 symlink2 symlink3\n" unless $target && -d $target;

my $origDir=getcwd;
for (@ARGV) {
  unless (-l $_) {
    warn "$_ is not a symlink\n";
    next;
  }
  my $filename=fileparse $_;
  # Resolve the link to the real file before moving anything
  my $absPath=realpath($_);
  # ROBUSTNESS: chdir used to be unchecked; a failure would have made
  # the relative path (and the unlink below) operate in the wrong place.
  chdir $target or die "ERROR: could not chdir to $target: $!";
  my $relPath=File::Spec->abs2rel($absPath);
  # BUGFIX: verify the new link was actually created before deleting
  # the old one; previously a failed symlink() still led to unlink(),
  # losing the only reference to the target.
  my $created = symlink($relPath, $filename);
  chdir $origDir or die "ERROR: could not chdir back to $origDir: $!";
  if(!$created){
    warn "ERROR: could not create symlink $filename in $target; leaving $_ in place\n";
    next;
  }
  unlink $_ or warn "WARNING: created the new link but could not remove $_: $!\n";
}
# Read a bionumerics MLST export into a hash-of-hashes:
#   $allele{sampleName}{locusName} = allele call
# The first column of each row is the sample name; the remaining
# columns are matched to the locus names from the header row.
sub readAlleles{
  my($file, $settings) = @_;

  my %allele;

  open(my $fh, $file) or die "ERROR reading $file: $!";
  my $header = <$fh>;
  # BUGFIX: without this chomp the last locus name kept its trailing
  # newline, so the last column was stored under a "name\n" key.
  chomp($header);
  my @header = split(/\t/, $header);
  shift(@header); # assume the first column is the key and disregard it otherwise
  while(<$fh>){
    chomp;
    my @F = split /\t/;
    my $name = shift(@F);
    my %F;
    @F{@header} = @F;

    $allele{$name} = \%F;
  }
  close $fh;

  return \%allele;
}
#!/usr/bin/env perl
# parseMultiblast.partialAnswer.pl: parse multiblast text output.
# Collects the query "Table of genes" into %query and hit details into
# %hit, then dumps %query. (NOTE: %hit is gathered but not printed.)
use strict;
use warnings;
use Data::Dumper;

my %query=();
my %hit=();
my $section="";
while(<>){ # read line by line
  if(/Table of genes/){
    $section="Table of genes";
    while(<>){
      # The section ends on a blank line
      if(/^\s*$/){
        last;
      }

      chomp; # remove whitespace
      my($locus, $start, $stop, $strand, $annotation)=split(/\s+/);
      $query{$locus}={
        start =>$start,
        stop =>$stop,
        strand =>$strand,
        annotation=>$annotation,
      };

    }
  }

  # Redundant with the details section
  elsif(/Significant hits/){
    while(<>){
      if(/Details/){
        $section="Details";
        parseDetailsSection(\%hit, \%query);
        last;
      }
    }
  }
}

print Dumper \%query;

# Parse the per-hit "Details" section: source, hit counts, scores, and
# each hit's own table of genes, accumulating into $hit.
sub parseDetailsSection{
  my($hit,$query)=@_;

  my $currentHit="";
  while(<>){
    if(/^\s*$/){
      next;
    }

    chomp;
    # A hit header looks like "1. <name>"
    if(/\d+\.\s+(\S+)/){
      $currentHit=$1;
      # Source is on the next line.
      my $source = scalar(<>);
      chomp $source;
      $source=~s/^Source: //;
      $$hit{$currentHit}{source}=$source;
    }

    # A regex double meaning with elipses: a commentary
    # that this is too wordy but it also simply matches
    # on at least three characters.
    if(/^Number of proteins with BLAST hits...+(\d+)/){
      $$hit{$currentHit}{numhits}=$1;
      $$hit{$currentHit}{multiblastscore}=scalar(<>);
      $$hit{$currentHit}{multiblastscore}=~s/\s+|\D+//g; # trim and remove non-digits
      # BUGFIX: this statement ended with `scalar(<>)i` — a stray 'i'
      # instead of a semicolon, which is a compile error.
      $$hit{$currentHit}{blastscore}=scalar(<>);
      $$hit{$currentHit}{blastscore}=~s/\s+|\D+//g; # trim and remove non-digits
    }
    elsif(/Table of genes.../){
      while(<>){
        if(/^\s*$/){
          last;
        }
        chomp;
        my($locus, $start, $stop, $strand, $annotation)=split(/\s+/);
        $$hit{$currentHit}{query}{$locus}={
          start => $start,
          stop => $stop,
          strand => $strand,
          annotation=> $annotation,
        };
      }

    }
  }
}
31 | if(!$re){ 32 | my $EnzName=ucfirst($enzName); 33 | $re=$all_collection->get_enzyme($EnzName); 34 | logmsg "Tried transforming $enzName to $EnzName"; 35 | } 36 | if(!$re){ 37 | my $enzN4me=$enzName; 38 | $enzN4me=~s/1(I*)$/I$1/g; # replace tail ones with Is 39 | $re=$all_collection->get_enzyme($enzN4me); 40 | logmsg "Tried transforming $enzName to $enzN4me"; 41 | } 42 | if(!$re){ 43 | die "ERROR: I do not understand enzyme $enzName"; 44 | } 45 | push(@enz,$re); 46 | } 47 | 48 | my $counter=0; 49 | for my $gbk(@seq){ 50 | my $in=Bio::SeqIO->new(-file=>$gbk); 51 | while(my $seq=$in->next_seq){ 52 | my $seqLength=$seq->length; 53 | my $ra=Bio::Restriction::Analysis->new(-seq=>$seq); 54 | 55 | if($$settings{outtype} eq 'sizes'){ 56 | my @fragments=map{length($$_{seq})} $ra->fragment_maps(@enz); 57 | for(@fragments){ 58 | print join("\t",$seq->id,$_)."\n"; 59 | } 60 | } elsif($$settings{outtype} eq 'bed'){ 61 | for my $re(@enz){ 62 | my @pos=$ra->positions($re->name); 63 | for my $pos(@pos){ 64 | # I'm not 100% sure why this position math works, but 65 | # it matches up with what Apollo genome browser does. 66 | my $start=$pos-$re->cut+1; 67 | my $end=$start+$re->recognition_length-1; 68 | print join("\t",$seq->id,$start,$end,$re->name.++$counter)."\n"; 69 | } 70 | } 71 | } else { 72 | die "ERROR: I don't understand outtype $$settings{outtype}"; 73 | } 74 | } 75 | } 76 | 77 | return 0; 78 | } 79 | 80 | sub usage{ 81 | local $0=basename $0; 82 | "Usage: $0 *.fasta > restrictionAnalysis.bed 83 | --enzyme AscI The enzyme to digest with. Can suppy 84 | multiple --enzyme arguments. 85 | --outtype bed Outputs a bed file of cut size coordinates. 86 | If 'sizes' is supplied instead, then 87 | fragment sizes will be output. 
88 | " 89 | } 90 | 91 | -------------------------------------------------------------------------------- /scripts/phylipDistToTallSkinny.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | 9 | local $0 = basename $0; 10 | sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";} 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | die usage() if($$settings{help} || !@ARGV); 17 | 18 | my @file = @ARGV; 19 | my $numFiles = @file; 20 | my %dist; 21 | for my $file(@file){ 22 | $dist{$file} = readPhylip($file, $settings); 23 | } 24 | 25 | my @taxon = keys(%{ $dist{$file[0]} }); 26 | my $numTaxa = @taxon; 27 | 28 | print join("\t", "taxon1", "taxon2", @file)."\n"; 29 | for(my $i=0;$i<$numTaxa;$i++){ 30 | for(my $j=0;$j<$numTaxa;$j++){ 31 | print $taxon[$i]."\t".$taxon[$j]; 32 | for(my $k=0;$k<$numFiles;$k++){ 33 | my $singleDist = $dist{$file[$k]}{$taxon[$i]}{$taxon[$j]}; 34 | if(!defined($singleDist)){ 35 | $singleDist = $dist{$file[$k]}{$taxon[$j]}{$taxon[$i]}; 36 | } 37 | if(!defined($singleDist)){ 38 | die "ERROR: distance not found:\n".Dumper [$k,$file[$k]], [$i,$taxon[$i]], [$j, $taxon[$j]]; 39 | } 40 | print "\t".$singleDist; 41 | } 42 | print "\n"; 43 | } 44 | } 45 | 46 | return 0; 47 | } 48 | 49 | sub readPhylip{ 50 | my($file, $settings)=@_; 51 | 52 | my %dist; 53 | my %distArr; 54 | my @taxon; 55 | open(my $fh, $file) or die "ERROR: could not read $file: $!"; 56 | my $numTaxa = <$fh>; 57 | $numTaxa =~ s/^\s+|\s+$//g; 58 | while(<$fh>){ 59 | chomp; 60 | my($taxon, @dist) = split(/\s+/, $_); 61 | $distArr{$taxon} = \@dist; 62 | push(@taxon, $taxon); 63 | } 64 | 65 | my $actualNumTaxa = @taxon; 66 | if($actualNumTaxa != $numTaxa){ 67 | die "ERROR: in $file, reported number of taxa does not match number of taxa found"; 68 | } 
69 | 70 | # Now that we know all the taxa in the list, go back 71 | # and fill in the 2d hash 72 | while(my($refTaxon, $distances) = each(%distArr)){ 73 | for(my $i=0;$i<$numTaxa;$i++){ 74 | my $queryTaxon = $taxon[$i]; 75 | $dist{$refTaxon}{$queryTaxon} = $$distances[$i]; 76 | } 77 | } 78 | 79 | return \%dist; 80 | } 81 | 82 | sub usage{ 83 | " 84 | $0: changes phylip distance files to a single tall/skinny format 85 | Usage: $0 [options] file1.phylip [file2.phylip...] 86 | --help This useful help menu 87 | 88 | " 89 | } 90 | -------------------------------------------------------------------------------- /scripts/phylogeneticOrder.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Bio::Perl; 6 | use Bio::TreeIO; 7 | use Getopt::Long; 8 | use Data::Dumper; 9 | 10 | exit main(); 11 | 12 | sub main{ 13 | my $settings={}; 14 | GetOptions($settings,qw(help root=s)); 15 | $$settings{root}||=""; 16 | die usage() if($$settings{help}); 17 | 18 | my(@tree)=@ARGV; 19 | die "ERROR: need a tree file!\n".usage() if(!@tree); 20 | 21 | for my $tree(@tree){ 22 | printPhylogeneticOrder($tree,$settings); 23 | } 24 | return 0; 25 | } 26 | 27 | sub printPhylogeneticOrder{ 28 | my($tree,$settings)=@_; 29 | 30 | my $numtaxa=0; 31 | my $in=Bio::TreeIO->new(-file=>$tree); 32 | while(my $tree=$in->next_tree){ 33 | reroot($tree,$settings) if($$settings{root}); 34 | for my $node($tree->get_nodes(-order=>"depth")){ # other choice: "breadth" 35 | next if(!$node->is_Leaf); 36 | my $id=$node->id; 37 | $id=~s/^'|'$//g; # remove single quotes at beginning/end that bioperl adds 38 | print "$id\n"; 39 | } 40 | } 41 | $in->close; 42 | } 43 | 44 | sub reroot{ 45 | my($tree,$settings)=@_; 46 | # TODO validate that {root} eq 'midpoint' or a node name 47 | if($$settings{root} =~/^longest$/i){ 48 | # converge on a longest branch 49 | _rerootLongestBranch($tree,$settings) for(1..10); 50 | } else { 51 | # 
# Parse options, remove the requested taxa from the tree, and print the
# pruned tree as newick on STDOUT.
sub main{
  my $settings={};
  GetOptions($settings,qw(help tree=s)) or die $!;

  die usage() if($$settings{help});
  $$settings{tree} || die "ERROR: no tree was given:\n".usage();

  my @remove=@ARGV;
  die "ERROR: need to remove at least taxon\n".usage() if(@remove < 1);

  my $treeObj=safeRemove($$settings{tree},\@remove,$settings);

  # No -file/-fh argument: the writer prints to STDOUT.
  my $out=Bio::TreeIO->new(-format=>"newick");
  $out->write_tree($treeObj);
  print "\n"; # just because newick files don't have newlines for some reason

  return 0;
}

# Load the first tree from the given file and delegate removal.
# Param $tree:   path to a tree file readable by Bio::TreeIO
# Param $remove: arrayref of leaf ids to remove
# Returns the pruned Bio::Tree::Tree object.
sub safeRemove{
  my($tree,$remove,$settings)=@_;

  my $treeObj=Bio::TreeIO->new(-file=>$tree)->next_tree;

  $treeObj=removeTaxa($treeObj,$remove,$settings);

  return $treeObj;
}

# Remove each requested taxon from the tree, then contract any linear
# (singleton) paths that the removals created. Dies if a requested
# taxon is not a leaf of the tree or if removal fails.
sub removeTaxa{
  my($tree,$remove,$settings)=@_;

  # Index the tree up front: leaf ids for validation, ancestor nodes
  # kept around for the (currently disabled) cleanup pass below.
  my %leaf_node_id=();
  my %ancestor_node=();
  my @ancestor_node=();
  my @node = $tree->get_nodes;
  for my $node(@node){
    if($node->is_Leaf()){
      $leaf_node_id{$node->id}=1;
    } else {
      push(@ancestor_node, $node);
      $ancestor_node{$node}=1;
    }
  }

  for my $taxon(@$remove){
    die "ERROR: taxon $taxon does not exist in the tree!" if(!$leaf_node_id{$taxon});
    # NOTE(review): remove_Node() is given the leaf *id* string, not a
    # Bio::Tree::Node object — confirm the installed BioPerl accepts
    # ids here; otherwise the "could not remove" branch always fires.
    my $safely_removed=$tree->remove_Node($taxon);
    if(!$safely_removed){
      die "ERROR: could not remove $taxon safely";
    }
  }

  # Collapse single-child internal nodes left behind by the removals
  $tree->contract_linear_paths(1);

  # Now remove all nodes that were ancestors but are now leaves
  # (disabled; see removeUselessNewLeafNodes below)
  #my $nodes_were_removed=1;
  #while($nodes_were_removed){
  #  $nodes_were_removed = removeUselessNewLeafNodes($tree,\@ancestor_node);
  #  last;
  #}

  return $tree;
}

# Remove any former internal node that became a leaf after pruning.
# Returns the number of nodes removed so callers can loop to a fixed
# point. (Currently unused — only referenced from commented-out code.)
sub removeUselessNewLeafNodes{
  my($tree,$ancestor_node)=@_;

  my $nodesRemovedCounter=0;
  for my $node(@$ancestor_node){
    # If an ancestor node is now a leaf, prune it too
    if($node->is_Leaf()){
      my $safely_removed=$tree->remove_Node($node);
      if(!$safely_removed){
        die "ERROR: could not remove a useless ancestor node safely";
      }
      $nodesRemovedCounter++;
    }
  }
  return $nodesRemovedCounter;
}
102 | " 103 | } 104 | -------------------------------------------------------------------------------- /scripts/pwdLinux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the Linux path from a windows path 4 | WINDOWS=$1 5 | 6 | # Change backslashes to forward slashes 7 | LINUX=$(sed 's|\\|/|g' <<< $WINDOWS) 8 | # Remove extra leading slashes 9 | LINUX=$(sed 's|^/\+|/|' <<< $LINUX) 10 | 11 | # Change the domain name 12 | LINUX=$(sed 's|^/data.biotech.cdc.gov/|/scicomp/|' <<< $LINUX); 13 | 14 | # Print the final linux path 15 | echo $LINUX; 16 | -------------------------------------------------------------------------------- /scripts/pwdWindows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the Windows network path 4 | 5 | # Get the actual path with resolved symlinks 6 | pwd=$(pwd -P); 7 | if [ "$1" != "" ]; then 8 | pwd=$(realpath $1) 9 | fi 10 | # remove scicomp and leading slash 11 | pwd=$(sed 's|^/scicomp||' <<< $pwd); 12 | # is this in the home directory? 13 | pwd=$(sed "s|^/home/$USER|/home|" <<< $pwd); 14 | # / to \ 15 | pwd=$(sed 's|/|\\|g' <<< $pwd); 16 | # tack on the domain name 17 | pwd="\\\\data.biotech.cdc.gov$pwd" 18 | 19 | 20 | echo $pwd; 21 | -------------------------------------------------------------------------------- /scripts/qsubStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Author: Lee Katz 4 | # Figures out some quick metrics from SGE 5 | 6 | # Take a snapshot of qstat. 7 | QSTAT=$(qstat -u '*') 8 | if [ $? 
-gt 0 ]; then 9 | echo "ERROR with qstat" >&2 10 | exit 1; 11 | fi 12 | 13 | QSTAT=$(echo "$QSTAT"|tail -n +3) # qstat, minus the header 14 | 15 | # How many of the cluster's slots I'm taking 16 | # echo "$QSTAT" |tail -n +3| perl -MData::Dumper -e 'while(<>){s/^\s+|\s+$//g; @F=split /\s+/; next if($F[4] ne 'r'); $slots=$F[8]; if($F[3] eq $ENV{USER}){$mine+=$slots;} $total+=$slots; } print "$mine out of $total\n";' 17 | 18 | # who is the current hog 19 | echo "$QSTAT" | perl -lane ' 20 | BEGIN{print "USER\tSLOTS";} 21 | next if(!$F[3] || !$F[8]); 22 | next if($F[4] !~ /^R?r$/); 23 | $slot{$F[3]}+=$F[8]; END{@user=sort{$slot{$b}<=>$slot{$a} || $a cmp $b} keys(%slot); print "$_\t$slot{$_}" for @user;}' | column -t 24 | -------------------------------------------------------------------------------- /scripts/randFastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Randomizes the order of fastq reads 3 | # 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Long qw/GetOptions/; 8 | use Data::Dumper qw/Dumper/; 9 | use File::Basename qw/fileparse/; 10 | use List::Util qw/shuffle/; 11 | 12 | local $0=fileparse $0; 13 | sub logmsg { print "$0: @_\n";} 14 | 15 | exit main(); 16 | 17 | sub main{ 18 | my $settings={}; 19 | GetOptions($settings,qw(help pe|paired-end freq|frequency=f)) or die $!; 20 | $$settings{freq}||=1; 21 | die usage() if($$settings{help}); 22 | 23 | my @fastq=@ARGV; 24 | die usage() if(!@fastq); 25 | 26 | my $reads=readFastqs(\@fastq,$settings); 27 | 28 | printRandomReads($reads,$settings); 29 | 30 | return 0; 31 | } 32 | 33 | sub readFastqs{ 34 | my($fastq,$settings)=@_; 35 | 36 | my $linesPerEntry=4; 37 | if($$settings{pe}){ 38 | $linesPerEntry=8; 39 | } 40 | 41 | # Get this out of the hash in case it helps with speed 42 | my $freq=$$settings{freq}; 43 | 44 | my @reads; 45 | for my $f(@$fastq){ 46 | my($name,$dir,$ext)=fileparse($f,qw(.gz)); 47 | my $fastqFh; 48 | if($ext eq '.gz'){ 
49 | open($fastqFh,"zcat $f |") or die "ERROR: could not zcat $f for reading: $!"; 50 | } else { 51 | open($fastqFh,$f) or die "ERROR: could not open $f: $!"; 52 | } 53 | while(my $entry=<$fastqFh>){ 54 | for(2..$linesPerEntry){ 55 | $entry.=<$fastqFh>; 56 | } 57 | 58 | # Randomly skip reads if a random number is greater 59 | # than the user-defined threshold. 60 | next if(rand() > $freq); 61 | 62 | push(@reads,$entry); 63 | } 64 | close $fastqFh; 65 | } 66 | 67 | return \@reads; 68 | } 69 | 70 | sub printRandomReads{ 71 | my($reads,$settings)=@_; 72 | 73 | for my $entry(shuffle(@$reads)){ 74 | print $entry; 75 | } 76 | } 77 | 78 | sub usage{ 79 | "$0: randomize the order of reads in a fastq file 80 | Usage: $0 file.fastq[.gz] [file2.fastq...] > rand.fastq 81 | 82 | --paired-end If the file is interleaved 83 | --frequency 1 Frequency of reads to keep (values: 0-1) 84 | " 85 | } 86 | 87 | -------------------------------------------------------------------------------- /scripts/replaceReadsWithReference.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | 9 | local $0 = basename $0; 10 | sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";} 11 | exit(main()); 12 | 13 | sub main{ 14 | my $settings={}; 15 | GetOptions($settings,qw(help)) or die $!; 16 | usage() if($$settings{help} || @ARGV < 2); 17 | 18 | my $refFasta = shift(@ARGV); 19 | 20 | for my $bam (@ARGV){ 21 | printFastq($bam, $refFasta, $settings); 22 | } 23 | 24 | return 0; 25 | } 26 | 27 | # Print the fastq file from the bam 28 | sub printFastq{ 29 | my($bam, $refFasta, $settings) = @_; 30 | 31 | open(my $fh, "samtools sort -n '$bam' | samtools view -f 1 |") or die "ERROR using samtools view on $bam: $!"; 32 | while(my $line = <$fh>){ 33 | chomp($line); 34 | my($qname, $flag, $rname, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, 
$seq, $qual) = 35 | split(/\t/, $line); 36 | 37 | # add /1 or /2 38 | if($flag & 0x40){ 39 | $qname .= "/1"; 40 | } 41 | if($flag & 0x80){ 42 | $qname .= "/2"; 43 | } 44 | 45 | # if the read is not unmapped then replace with reference 46 | if(! ($flag & 0x4)){ 47 | my $refHit = referenceHit($rname, $pos, $cigar, $refFasta, $settings); 48 | $seq = $refHit; 49 | $qname .= " replaced"; 50 | } 51 | 52 | # Sanity check to match the lengths of seq and qual 53 | if(length($seq) != length($qual)){ 54 | my$bp = length($seq) - length($qual); 55 | logmsg "WARNING: seq is not the same length as qual for $qname($bp bp)"; 56 | # Adjust to match shortest length 57 | if(length($seq) > length($qual)){ 58 | $seq = substr($seq, 0, length($qual)); 59 | }else{ 60 | $qual = substr($qual, 0, length($seq)); 61 | } 62 | 63 | } 64 | 65 | print "\@$qname\n$seq\n+\n$qual\n"; 66 | } 67 | } 68 | 69 | # Get the sequence of the reference genome at the mapped position 70 | sub referenceHit{ 71 | my($rname, $pos, $cigar, $refFasta, $settings) = @_; 72 | 73 | # Determine length from cigar 74 | # TODO other operation codes like [NSHP=X] 75 | my $length = 0; 76 | while($cigar =~ /(\d+)(\w)/g){ 77 | my $code = $2; 78 | my $int = $1; 79 | if($code eq 'M'){ 80 | $length+=$int; 81 | } elsif($code eq 'D') { 82 | $length+=$int; 83 | } elsif($code eq 'I') { 84 | $length+=0; 85 | } else { 86 | die "ERROR: cigar string has a $code which I do not know how to interpret. Here is the full cigar string: $cigar"; 87 | } 88 | } 89 | 90 | if($length < 1){ 91 | die "INTERNAL ERROR: length of reference hit for this mapped read is <1" . 
Dumper \@_; 92 | } 93 | 94 | # Grab the reference hit 95 | my $stopPos = $pos + $length - 1; 96 | my $refHit = `samtools faidx $refFasta '$rname:$pos-$stopPos' | tail -n +2`; 97 | die "ERROR running samtools faidx on $refFasta" if $?; 98 | chomp($refHit); 99 | $refHit =~ s/\n//g; 100 | $refHit =~ tr/[a-z]/[A-Z]/; # uppercase 101 | 102 | return $refHit; 103 | } 104 | 105 | sub usage{ 106 | print "$0: print a bam as a fastq file, replacing reads with the reference genome 107 | Usage: $0 [options] ref.fasta *.bam > out.fastq 108 | --help This useful help menu 109 | "; 110 | exit 0; 111 | } 112 | -------------------------------------------------------------------------------- /scripts/representativeTaxa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | use Bio::TreeIO; 9 | 10 | local $0 = basename $0; 11 | sub logmsg { print STDERR "$0: @_\n";} 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | 17 | my $settings={}; 18 | GetOptions($settings,qw(help cluster-distance|distance|max-distance=f)) or die $!; 19 | $$settings{'cluster-distance'}||=0.001; 20 | 21 | die usage() if(!@ARGV); 22 | 23 | for my $file(@ARGV){ 24 | findRepresentatives($file,$settings); 25 | } 26 | 27 | return 0; 28 | } 29 | 30 | sub findRepresentatives{ 31 | my($file,$settings)=@_; 32 | 33 | my $tree=Bio::TreeIO->new(-file=>$file)->next_tree; 34 | 35 | my @taxon = grep {$_->is_Leaf} $tree->get_nodes(); 36 | my $numnodes=@taxon; 37 | 38 | # Find distances between all genomes 39 | logmsg "Finding distances between all taxa"; 40 | my %distance; 41 | for(my $i=0;$i<$numnodes;$i++){ 42 | print STDERR "."; 43 | my $taxonName1=$taxon[$i]->id; 44 | for(my $j=$i+1; $j<$numnodes; $j++){ 45 | my $taxonName2=$taxon[$j]->id; 46 | my $distance=distanceBetweenTwoNodes($tree,$taxon[$i],$taxon[$j]); 47 | $distance{$taxonName1}{$taxonName2} = 
$distance; 48 | $distance{$taxonName2}{$taxonName1} = $distance; 49 | } 50 | } 51 | print STDERR "\n"; 52 | 53 | 54 | # Cluster the taxa by distance 55 | # The index of %cluster is the representative genome, 56 | # and all other genomes have to be within X distance 57 | # of it. 58 | my %cluster; 59 | my $cluster_counter=0; 60 | for(my $i=0;$i<$numnodes;$i++){ 61 | my $taxonName=$taxon[$i]->id; 62 | my $is_representative_taxon=1; 63 | for my $representative (keys(%cluster)) { 64 | if($distance{$taxonName}{$representative} < $$settings{'cluster-distance'}){ 65 | push(@{ $cluster{$representative} }, $taxonName); 66 | $is_representative_taxon=0; 67 | last; 68 | } 69 | } 70 | 71 | if($is_representative_taxon){ 72 | $cluster{$taxonName}=[$taxonName]; 73 | } 74 | } 75 | 76 | for my $members(values(%cluster)){ 77 | print join("\t",@$members)."\n"; 78 | } 79 | 80 | logmsg "Found ".scalar(keys(%cluster))." clusters"; 81 | 82 | } 83 | 84 | # http://cpansearch.perl.org/src/CJFIELDS/BioPerl-1.007002/Bio/Tree/TreeFunctionsI.pm 85 | # -> sub distance 86 | # without error checking to speed it up 87 | sub distanceBetweenTwoNodes{ 88 | my($tree,$node1,$node2)=@_; 89 | 90 | my $lca = $tree->get_lca($node1,$node2); 91 | my $cumul_dist = 0; 92 | foreach my $current_node ($node1,$node2){ 93 | do { 94 | $cumul_dist += $current_node->branch_length; 95 | 96 | $current_node = $current_node->ancestor || last; 97 | 98 | } while($current_node ne $lca); 99 | } 100 | 101 | return $cumul_dist; 102 | } 103 | 104 | sub usage{ 105 | "$0: Find representative taxa in each tree. Assumes one 106 | tree per tree file. 107 | 108 | Usage: $0 [options] tree.dnd [tree2.dnd...] 
109 | 110 | --cluster-distance 0.001 The max distance between every 111 | taxon in a cluster 112 | " 113 | } 114 | -------------------------------------------------------------------------------- /scripts/rerootTree.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use Data::Dumper; 6 | use Bio::TreeIO; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | use List::Util qw/sum/; 10 | use Statistics::Descriptive; 11 | use File::Temp qw/tempdir tempfile/; 12 | 13 | local $0=basename $0; 14 | sub logmsg{print STDERR "$0: @_\n";} 15 | exit(main()); 16 | 17 | sub main{ 18 | my $settings={}; 19 | GetOptions($settings,qw(help root-on|root-with|root=s)) or die $!; 20 | 21 | die usage() if(!@ARGV || $$settings{help}); 22 | 23 | $$settings{'root-on'} || die "ERROR: parameter --root-on is required. --help for more information."; 24 | 25 | my(@query)=@ARGV; 26 | my $treeout=Bio::TreeIO->new(-format=>"newick"); 27 | 28 | # For each query, reroot and print 29 | for my $q(@query){ 30 | my $treein = Bio::TreeIO->new(-file=>$q,-format=>"newick"); 31 | my $treeCounter=0; 32 | while(my $treeObject = $treein->next_tree){ 33 | $treeCounter++; 34 | my @leaves = sort {$a->id cmp $b->id} grep {$_->is_Leaf} $treeObject->get_nodes; 35 | if(scalar(@leaves) < 2){ 36 | if(scalar($treeObject->get_nodes) < 3){ 37 | logmsg "Skipping: only ".scalar($treeObject->get_nodes)." nodes found in tree $treeCounter in $q"; 38 | logmsg " Possible reason: empty line in tree file"; 39 | next; 40 | } 41 | die "ERROR: there are fewer than 2 leaves on tree $treeCounter in $q:\n".Dumper [map{$_->id} @leaves]; 42 | } 43 | my @node = grep {$_->id eq $$settings{'root-on'}} @leaves; 44 | if(@node > 1){ 45 | die "ERROR: found multiple nodes named ".$$settings{'root-on'}." 
in $q"; 46 | } 47 | logmsg "Rerooting tree $treeCounter in $q"; 48 | 49 | my $was_rerooted=$treeObject->reroot($node[0]); 50 | if(!$was_rerooted){ 51 | die "ERROR: could not reroot tree $treeCounter in $q"; 52 | } 53 | 54 | $treeout->write_tree($treeObject); 55 | 56 | } 57 | } 58 | 59 | return 0; 60 | } 61 | 62 | sub usage{ 63 | "$0: Roots a set of trees on the same leaf. 64 | Output trees will be in the same order as tree parameters 65 | Usage: $0 --root-on LEAF tree.dnd [tree2.dnd...] > trees.dnd 66 | --root-on '' The name of the leaf 67 | " 68 | } 69 | 70 | -------------------------------------------------------------------------------- /scripts/rowMath.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | #Author: Lee Katz 3 | 4 | use strict; 5 | use warnings; 6 | use Data::Dumper; 7 | use Getopt::Long; 8 | use File::Basename qw/basename/; 9 | 10 | local $0=basename $0; 11 | sub logmsg{print STDERR "$0: @_\n";} 12 | 13 | exit main(); 14 | sub main{ 15 | my $settings={}; 16 | GetOptions($settings,qw(help test operation=s)); 17 | die usage() if($$settings{help}); 18 | $$settings{operation}||="\$next - \$cur"; 19 | #$$settings{operation}=quotemeta($$settings{operation}); 20 | 21 | if($$settings{test}){ 22 | test($settings); 23 | } else { 24 | printDifferences($settings); 25 | } 26 | return 0; 27 | } 28 | 29 | sub printDifferences{ 30 | my($settings)=@_; 31 | 32 | my $cur=<>; chomp($cur); 33 | while(my $next=<>){ 34 | chomp($next); 35 | 36 | # do the math 37 | my $answer=eval($$settings{operation}); 38 | if($@){ 39 | die "ERROR: $$settings{operation} resulted in a failure: $@"; 40 | } 41 | print "$answer\n"; 42 | $cur=$next; 43 | } 44 | } 45 | 46 | sub test{ 47 | my($settings)=@_; 48 | my $cmd=" echo -e '1\n5\n7\n33\n33\n33\n37' | $0"; 49 | logmsg "COMMAND:\n====\n$cmd\n===="; 50 | system($cmd); 51 | } 52 | 53 | sub usage{ 54 | "Calculates the difference between rows or a custom arithmetic 55 | Usage: sort 
-n numbers.txt | $0 > difference.txt 56 | -o 'custom arithmetic' 57 | Variables: \$cur is the first row in the iteration 58 | \$next is the second row 59 | Example: \$next - \$cur 60 | " 61 | } 62 | 63 | -------------------------------------------------------------------------------- /scripts/splitBionumericsFasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Author: Lee Katz 3 | # Splits up a bionumerics fasta file into contigs 4 | 5 | use strict; 6 | use warnings; 7 | use Bio::Perl; 8 | use Bio::Tools::GuessSeqFormat; 9 | use File::Basename; 10 | use autodie; 11 | use Getopt::Long; 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help outdir=s)) or die $!; 18 | 19 | die usage() if(!@ARGV || $$settings{help}); 20 | 21 | if($$settings{outdir}){ 22 | mkdir $$settings{outdir}; 23 | } 24 | 25 | # Because we are reading bionumerics files and do not 26 | # trust their extensions, make a better guess at their 27 | # format using the format guesser. 
28 | my $formatGuesser=Bio::Tools::GuessSeqFormat->new(-file=>$ARGV[0]); 29 | my $format =$formatGuesser->guess; 30 | 31 | my $in=Bio::SeqIO->new(-file=>$ARGV[0],-format=>$format); 32 | while(my $seq=$in->next_seq){ 33 | my @seq=split(/\|/,$seq->seq); 34 | my $id=$seq->id; 35 | $id=~s/^denovo\|//; # remove 'denovo' since most exports seem to have that 36 | print STDERR "$id\n"; 37 | 38 | my $out=Bio::SeqIO->new(-format=>"fasta"); 39 | if($$settings{outdir}){ 40 | # For potential filenames, get a safe name 41 | my $id_safe=$id; 42 | $id_safe=~s/[^\w\d]//g; # remove non words, non letters 43 | 44 | my $outfile="$$settings{outdir}/$id_safe.fasta"; 45 | $out=Bio::SeqIO->new(-format=>"fasta",-file=>">$outfile"); 46 | } 47 | for(my $i=1;$i<=@seq;$i++){ 48 | my $subseq=Bio::Seq->new(-seq=>$seq[$i-1],-id=>$id."_".$i); 49 | $out->write_seq($subseq); 50 | } 51 | $out->close; 52 | } 53 | 54 | return 0; 55 | } 56 | 57 | sub usage{ 58 | local $0=fileparse $0; 59 | "Usage: $0 bionumerics.fasta > out.fasta 60 | --outdir '' If given, all genomes will be written here. 61 | If blank, output will be sent to stdout. 
62 | " 63 | } 64 | -------------------------------------------------------------------------------- /scripts/splitPolytomies.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Bio::Perl; 5 | use Bio::TreeIO; 6 | use Getopt::Long; 7 | use File::Basename qw/basename/; 8 | use Math::Round qw/ceil/; 9 | 10 | local $0=basename($0); 11 | sub logmsg{print STDERR "$0: @_\n"} 12 | 13 | exit main(); 14 | 15 | sub main{ 16 | my $settings={}; 17 | GetOptions($settings,qw(help)); 18 | 19 | my $$settings{bootstrap}//=70; 20 | die usage() if($$settings{help}); 21 | 22 | my @tree=@ARGV; 23 | for my $tree(@tree){ 24 | my $in=Bio::TreeIO->new(-file=>$tree); 25 | while(my $treeObj=$in->next_tree){ 26 | $treeObj=splitPolytomies($treeObj,undef,$settings); 27 | } 28 | } 29 | 30 | return 0; 31 | } 32 | 33 | sub splitPolytomies{ 34 | my($treeObj,$node,$settings)=@_; 35 | 36 | my $node ||= $treeObj->get_root_node; 37 | 38 | my @descs = $node->each_Descendent; 39 | if (@descs > 2) { 40 | # Many nodes have no identifying names, a simple warning is probably 41 | # enough. 
42 | 43 | $treeObj->warn("Node has more than two descendants\nWill do an arbitrary balanced split"); 44 | my @working = @descs; 45 | # create an even set of artifical nodes on which to later hang the descs 46 | my $half = ceil(@working / 2); 47 | my @artificials; 48 | while ($half > 1) { 49 | my @this_level; 50 | foreach my $top_node (@artificials || $node) { 51 | for (1..2) { 52 | my $art = $top_node->new(-id => "artificial_".++$treeObj->{_art_num}); 53 | $top_node->add_Descendent($art); 54 | push(@this_level, $art); 55 | } 56 | } 57 | @artificials = @this_level; 58 | $half--; 59 | } 60 | # attach two descs to each artifical leaf 61 | foreach my $art (@artificials) { 62 | for (1..2) { 63 | my $desc = shift(@working) || $node->new(-id => "artificial_".++$treeObj->{_art_num}); 64 | $desc->ancestor($art); 65 | } 66 | } 67 | } 68 | elsif (@descs == 1) { 69 | # ensure that all nodes have 2 descs 70 | $node->add_Descendent($node->new(-id => "artificial_".++$treeObj->{_art_num})); 71 | } 72 | # recurse 73 | foreach my $desc (@descs) { 74 | splitPolytomies($treeObj,$desc,$settings); 75 | } 76 | 77 | return $treeObj; 78 | } 79 | 80 | sub usage{ 81 | "$0: split polytomies in a predictable way 82 | Usage: $0 tree.dnd [tree2.dnd...] 
> out.dnd 83 | --bootstrap 70 The minimum bootstrap value where a 84 | clade will be considered a polytomy 85 | " 86 | } 87 | -------------------------------------------------------------------------------- /scripts/tanglegram_ape.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Authors: Beau Bruce and Weidong Gu 3 | # Modified by Lee Katz 4 | 5 | library(argparse, quietly=TRUE) 6 | parser <- ArgumentParser() 7 | parser$add_argument("-t1", "--tree1", default=FALSE, 8 | help="First tree") 9 | parser$add_argument("-t2", "--tree2", default=FALSE, 10 | help="Second tree") 11 | parser$add_argument("-o", "--outfile", default=FALSE, 12 | help="output png file") 13 | args <- parser$parse_args() 14 | 15 | treefile1 <- args$tree1 16 | treefile2 <- args$tree2 17 | outfile <- args$outfile 18 | 19 | myReturn <- suppressPackageStartupMessages(c( 20 | library(phytools, quietly=TRUE), 21 | library(ape, quietly=TRUE) 22 | )); 23 | 24 | outbreak <- read.delim('/scicomp/home/gzu2/projects/mashtree/data/katzEtAl/Lyve-SET/outbreakStatus.tsv', 25 | sep="\t", header=T, stringsAsFactors=F) 26 | tree1 <- ladderize(midpoint.root(read.tree(treefile1))) 27 | tree2 <- ladderize(midpoint.root(read.tree(treefile2))) 28 | 29 | #tree1 <- reorder(tree1, "postorder") 30 | #tree2 <- reorder(tree2, "postorder") 31 | 32 | # Default minimum length 33 | min_length <- 0.000000000000000000001 34 | tree1$edge.length[ tree1$edge.length < min_length ] <- min_length 35 | tree2$edge.length[ tree2$edge.length < min_length ] <- min_length 36 | 37 | outbreakIndex <- match(outbreak$sample[outbreak$outbreak== 1],tree1$tip.label) 38 | nonoutbreakIndex <- match(outbreak$sample[outbreak$outbreak== 0],tree1$tip.label) 39 | maybeoutbreakIndex <- match(outbreak$sample[outbreak$outbreak==-1],tree1$tip.label) 40 | 41 | myColors <- c() 42 | myColors[outbreakIndex] <- 'red' 43 | myColors[nonoutbreakIndex] <- 'blue' 44 | myColors[maybeoutbreakIndex] <- 'gray' 
45 | 46 | association <- cbind(tree1$tip.label, tree1$tip.label) 47 | png(outfile); 48 | cophyloplot(tree1, tree2, assoc = association, space = 100, length.line=0, gap=1, show.tip.label=F, col = myColors); 49 | myReturn <- dev.off(); 50 | 51 | -------------------------------------------------------------------------------- /scripts/tanglegram_code.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Authors: Beau Bruce and Weidong Gu 3 | # Modified by Lee Katz 4 | 5 | library(argparse, quietly=TRUE) 6 | parser <- ArgumentParser() 7 | parser$add_argument("-t1", "--tree1", default=FALSE, 8 | help="First tree") 9 | parser$add_argument("-t2", "--tree2", default=FALSE, 10 | help="Second tree") 11 | parser$add_argument("-o", "--outfile", default=FALSE, 12 | help="output png file") 13 | args <- parser$parse_args() 14 | 15 | treefile1 <- args$tree1 16 | treefile2 <- args$tree2 17 | outfile <- args$outfile 18 | 19 | print("Loading libraries") 20 | #library(phytools, quietly=TRUE) 21 | #library(dendextend,quietly=TRUE) 22 | #library(ape, quietly=TRUE) 23 | myReturn <- suppressPackageStartupMessages(c( 24 | library(phytools, quietly=TRUE), 25 | library(dendextend,quietly=TRUE), 26 | library(ape, quietly=TRUE) 27 | )); 28 | 29 | #eid_iso=read.delim('\\\\cdc.gov\\project\\CCID_NCZVED_DFBMD_EDEB\\Analytics\\Weidong\\LM model\\ncbi_access_dat.csv',sep=',',header=T,stringsAsFactors = F) 30 | outbreak <- read.delim('/scicomp/home/gzu2/projects/mashtree/data/katzEtAl/Lyve-SET/outbreakStatus.tsv', 31 | sep="\t", header=T, stringsAsFactors=F) 32 | tree1 <- reorder(midpoint.root(read.tree(treefile1)), order = "cladewise") 33 | tree2 <- reorder(midpoint.root(read.tree(treefile2)), order = "cladewise") 34 | 35 | dend_tree1 <- force.ultrametric(tree1) 36 | dend_tree2 <- force.ultrametric(tree2) 37 | 38 | min_length <- 0.000000000000000000001 39 | dend_tree1$edge.length[ dend_tree1$edge.length < min_length ] <- min_length 40 | 
dend_tree2$edge.length[ dend_tree2$edge.length < min_length ] <- min_length 41 | 42 | dend_tree1=(midpoint.root(dend_tree1)) 43 | dend_tree2=(midpoint.root(dend_tree2)) 44 | 45 | my.col=c('blue','brown','green','pink','red') 46 | 47 | outbreakIndex <- match(outbreak$sample[outbreak$outbreak== 1],dend_tree1$tip.label) 48 | nonoutbreakIndex <- match(outbreak$sample[outbreak$outbreak== 0],dend_tree1$tip.label) 49 | maybeoutbreakIndex <- match(outbreak$sample[outbreak$outbreak==-1],dend_tree1$tip.label) 50 | myColors <- c() 51 | myColors[outbreakIndex] <- 'red' 52 | myColors[nonoutbreakIndex] <- 'blue' 53 | myColors[maybeoutbreakIndex] <- 'green' 54 | #outbreak$color[ outbreakIndex ] <- 'red' 55 | #outbreak$color[nonoutbreakIndex]<- 'blue' 56 | #col.s=my.col[as.factor(conn_l_col)] 57 | 58 | print("untangle") 59 | dendl <- dendextend::untangle(as.dendrogram(dend_tree1), 60 | as.dendrogram(dend_tree2), 61 | method = "step2side") 62 | #method = "labels") 63 | #method = "ladderize") 64 | #method = "random") 65 | #method = "step1side") 66 | #method = "DendSer") 67 | 68 | # Make the branches look nice 69 | dendl %>% set("branches_lwd", 1) %>% 70 | set("labels_col", "white") -> dendl 71 | 72 | print("entanglement..."); 73 | myEntanglement <- entanglement(dendl) 74 | cophenetic <- cor.dendlist(dendl, method = "cophenetic") 75 | baker <- cor.dendlist(dendl, method = "baker") 76 | 77 | # Start off the viz 78 | png(outfile) 79 | tanglegram(dendl, 80 | main_left='Lyve-SET', 81 | main_right='Mashtree', 82 | lab.cex=0.3, 83 | highlight_distinct_edges = FALSE, 84 | color_lines=myColors 85 | ) 86 | #myReturn <- text("SOMETHING", x=1, y=1) 87 | myReturn <- dev.off(); 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /scripts/translate-kraken-contigs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; 6 | 7 | my %length; 
8 | my %percentage; 9 | open(my $fh, $ARGV[0]) or die "ERROR: could not read $ARGV[0]: $!"; 10 | while(<$fh>){ 11 | chomp; 12 | my($classified,$seqname,$taxid,$length,$kmerTaxid)=split(/\t/,$_); 13 | if($classified eq 'U'){ 14 | $percentage{'unclassified'}+=$length; 15 | } else { 16 | $length{$seqname}=$length; 17 | } 18 | } 19 | close $fh; 20 | 21 | 22 | # kraken-translate but tally all the sequence lengths 23 | open(my $translateFh, "kraken-translate $ARGV[0] | ") or die "ERROR: could not run kraken-translate on $ARGV[0]:$!"; 24 | while(<$translateFh>){ 25 | chomp; 26 | my($seqname,$taxonomyString)=split(/\t/,$_); 27 | $taxonomyString=~s/\s+/_/g; 28 | $taxonomyString=~s/;/\t/g; 29 | $percentage{$taxonomyString}+=$length{$seqname}; 30 | } 31 | close $translateFh; 32 | 33 | # Make the file 34 | while(my($taxonomyString,$sliceOfPie)=each(%percentage)){ 35 | print join("\t",$sliceOfPie,$taxonomyString)."\n"; 36 | } 37 | -------------------------------------------------------------------------------- /scripts/treeDistanceMatrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://www.biostars.org/p/6661/#142113 3 | use strict; 4 | use warnings; 5 | use Bio::TreeIO; 6 | use Data::Dumper; 7 | 8 | sub logmsg{print STDERR "$0: @_\n";} 9 | 10 | die "Usage: $0 tree.dnd" if(!@ARGV || $ARGV[0]=~/\-+h/); 11 | my $treeObj = Bio::TreeIO->new(-file=>$ARGV[0])->next_tree; 12 | $treeObj->force_binary; 13 | my $tree = $treeObj->as_text("newick")."\n"; 14 | #my $tree = $treeObj->simplify_to_leaves_string(); 15 | chomp($tree); 16 | 17 | die "Usage: $0 tree.dnd" if(!$tree); 18 | 19 | ##record the distance of parentheses 20 | my %dis; 21 | my $par = -1; 22 | my @current; 23 | while($tree =~ /./g) 24 | {if ($& eq '(') 25 | {$par ++; 26 | next if $par == 0; 27 | $current[$#current+1] = $par; 28 | } 29 | elsif($& eq ')') 30 | {(my $tem) = $' =~ /:(\d+\.\d+|\d+)/; 31 | next if $#current == -1; 32 | 
$dis{'node_'.$current[$#current]} = $tem; 33 | pop @current; 34 | } 35 | } 36 | 37 | ##record the distance of leaves 38 | my @order; 39 | while ($tree =~ /([^\(\):,]+):(\d+\.\d+|\d+)/g) 40 | {$dis{$1} = $2; 41 | $order[$#order+1] = $1; 42 | } 43 | 44 | ##record parents of leaves 45 | my %pare; 46 | @current = (); 47 | $par = -1; 48 | while($tree =~ /(\(|\)|([^\(\):,]+):)/g) 49 | {if ($& eq '(') 50 | {$par ++; 51 | next if $par == 0; 52 | $current[$#current+1] = $par; 53 | } 54 | elsif($& eq ')') 55 | {pop @current; 56 | } 57 | else{map {$pare{$2}{$_} = 1} @current; 58 | $pare{$2} = [@current]; 59 | } 60 | } 61 | 62 | ##Distance matrix 63 | my %dis2; 64 | foreach my $i (0..$#order) 65 | {foreach my $j ($i..$#order) 66 | {if ($i == $j) 67 | {$dis2{$order[$i]}{$order[$j]} = 0; 68 | } 69 | else{my $tem = $dis{$order[$i]} + $dis{$order[$j]}; 70 | my $tem2 = -1; 71 | foreach my $k (0..$#{$pare{$order[$i]}}) 72 | {last if ($k > $#{$pare{$order[$j]}}); 73 | if ($pare{$order[$i]}[$k] eq $pare{$order[$j]}[$k]) 74 | {$tem2 = $k; 75 | } 76 | } 77 | if ($#{$pare{$order[$i]}} != -1) 78 | {map {$tem += $dis{'node_'.$_}} map {$pare{$order[$i]}[$_]} ($tem2+1)..$#{$pare{$order[$i]}}; 79 | } 80 | if ($#{$pare{$order[$j]}} != -1) 81 | {map {$tem += $dis{'node_'.$_}} map {$pare{$order[$j]}[$_]} ($tem2+1)..$#{$pare{$order[$j]}}; 82 | } 83 | $dis2{$order[$i]}{$order[$j]} = $dis2{$order[$j]}{$order[$i]} = $tem; 84 | } 85 | } 86 | } 87 | 88 | ##output 89 | print join("\t",'',@order),"\n"; 90 | foreach my $i (@order) 91 | {print join("\t",$i,map {$dis2{$i}{$_}} @order),"\n"; 92 | } 93 | -------------------------------------------------------------------------------- /scripts/ttrToMiSeq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Transform a TTR simulation to a MiSeq run 4 | 5 | if [ "$2" == "" ]; then 6 | echo "Transform a TTR directory to a MiSeq run" 7 | echo "Usage: $0 TTR out.miseq" 8 | exit 1; 9 | fi 10 | 11 | IN=$1 12 
| OUT=$2 13 | 14 | mkdir -p /tmp/$USER 15 | tmpdir=$(mktemp --directory --tmpdir=/tmp/$USER ttrToMiSeq.XXXXXX) 16 | 17 | 18 | # Sample sheet 19 | RUNNAME=$(basename IN) 20 | READLENGTH=$(grep read_length $IN/TTR.cfg | grep -o [0-9]*) 21 | if [ $? -gt 0 ]; then echo "ERROR reading $IN/TTR.cfg"; exit 1; fi; 22 | DATE=$(date +'%m/%d/%Y') 23 | 24 | CSV="$tmpdir/SampleSheet.csv" 25 | echo -ne "[Header] 26 | IEMFileVersion,4 27 | Investigator Name,TreeToReads 28 | Experiment Name,$RUNNAME 29 | Date,$DATE 30 | Workflow,GenerateFASTQ 31 | Application,FASTQ Only 32 | Assay,Nextera XT 33 | Description,$RUNNAME 34 | Chemistry,Amplicon 35 | 36 | [Reads] 37 | $READLENGTH 38 | $READLENGTH 39 | 40 | [Settings] 41 | ReverseComplement,0 42 | Adapter ATCGATCGATCG 43 | 44 | [Data] 45 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description 46 | " > $CSV 47 | 48 | SAMPLE=$(ls $IN/fastq | xargs -n 1 basename) 49 | for i in $SAMPLE; do 50 | echo -e "$i,,$RUNNAME,A01,N801,ATCGAAA,S801,ATCGAAA,1," >> $CSV 51 | done 52 | 53 | # Fastq files 54 | FASTQDIR="$tmpdir/Data/Intensities/BaseCalls" 55 | mkdir -p $FASTQDIR 56 | 57 | for i in $SAMPLE; do 58 | set -e 59 | cp -v $IN/fastq/$i/*_1.fq.gz $FASTQDIR/${i}_S1_L001_R1_001.fastq.gz 60 | cp -v $IN/fastq/$i/*_2.fq.gz $FASTQDIR/${i}_S1_L001_R2_001.fastq.gz 61 | set +e 62 | done; 63 | 64 | echo "$tmpdir -> $OUT" 65 | mv $tmpdir $OUT 66 | 67 | -------------------------------------------------------------------------------- /tests/all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | scriptDir=$(dirname $0); 6 | export PATH=$PATH:$scriptDir/../scripts 7 | 8 | echo PATH 9 | echo $PATH 10 | echo PATH 11 | 12 | which randTrees.pl 13 | 14 | # Find all unit tests under this directory 15 | # and simply run them with -e and -x 16 | executables=$(find $(dirname $0)/unittests -maxdepth 1 -type f -name '*.sh') 17 | for exe in 
#!/bin/bash

# Run every unit-test script under tests/unittests/ and exit nonzero
# on the first failure.

set -e

scriptDir=$(dirname "$0");
export PATH=$PATH:$scriptDir/../scripts

# Debug aid for CI logs: confirm ../scripts made it onto the PATH.
echo PATH
echo "$PATH"
echo PATH

# Fail fast (via set -e) if the scripts dir is not actually on the PATH.
which randTrees.pl

# Find all unit tests under this directory and simply run them.
executables=$(find "$scriptDir/unittests" -maxdepth 1 -type f -name '*.sh')
for exe in $executables; do
  # Bugfix: under `set -e` a failing $exe aborted the script before the
  # old `if [ $? -gt 0 ]` could run (dead code). Testing the command
  # directly keeps the explicit exit path live.
  if ! $exe; then
    exit 1
  fi
done;

exit 0
-------------------------------------------------------------------------------- /tests/unittests/input/kendall-colijn1.dnd: -------------------------------------------------------------------------------- 1 | ((A:1.2,B:0.8):0.5,(C:0.8,D:1):1.1); 2 | -------------------------------------------------------------------------------- /tests/unittests/input/kendall-colijn2.dnd: -------------------------------------------------------------------------------- 1 | (((A:0.8,B:1.4):0.3,C:0.7):0.9,D:1); 2 | 3 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/FA1090/kraken.report: -------------------------------------------------------------------------------- 1 | 0.00 0 0 U 0 unclassified 2 | 100.00 43078 6 - 1 root 3 | 99.99 43072 7 - 131567 cellular organisms 4 | 99.97 43065 122 D 2 Bacteria 5 | 99.69 42943 406 P 1224 Proteobacteria 6 | 98.74 42537 107 C 28216 Betaproteobacteria 7 | 98.50 42430 38 O 206351 Neisseriales 8 | 98.41 42392 365 F 481 Neisseriaceae 9 | 97.56 42027 22725 G 482 Neisseria 10 | 44.81 19302 18696 S 485 Neisseria gonorrhoeae 11 | 1.41 606 606 - 242231 Neisseria gonorrhoeae FA 1090 12 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/FA1090/kraken.taxonomy: -------------------------------------------------------------------------------- 1 | 22725 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria 2 | 18696 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria Neisseria gonorrhoeae 3 | 606 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae Neisseria Neisseria gonorrhoeae Neisseria gonorrhoeae FA 1090 4 | 406 root cellular organisms Bacteria Proteobacteria 5 | 365 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales Neisseriaceae 6 | 122 root cellular organisms 
Bacteria 7 | 107 root cellular organisms Bacteria Proteobacteria Betaproteobacteria 8 | 38 root cellular organisms Bacteria Proteobacteria Betaproteobacteria Neisseriales 9 | 7 root cellular organisms 10 | 6 root 11 | 0 12 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/contaminated/kraken.filtered.report: -------------------------------------------------------------------------------- 1 | 100.00 111249 91 - 1 root 2 | 99.92 111158 1 - 131567 cellular organisms 3 | 99.92 111157 64 D 2 Bacteria 4 | 99.86 111093 101 P 1224 Proteobacteria 5 | 61.18 68060 22 C 1236 Gammaproteobacteria 6 | 61.16 68038 7 O 118969 Legionellales 7 | 61.15 68031 1 F 444 Legionellaceae 8 | 61.15 68030 87 G 445 Legionella 9 | 61.07 67943 67943 S 446 Legionella pneumophila 10 | 38.59 42932 47 C 28216 Betaproteobacteria 11 | 38.55 42885 0 O 206351 Neisseriales 12 | 38.55 42885 0 F 481 Neisseriaceae 13 | 38.55 42885 13056 G 482 Neisseria 14 | 26.81 29829 29829 S 485 Neisseria gonorrhoeae 15 | -------------------------------------------------------------------------------- /tests/unittests/input/kraken/contaminated/kraken.report: -------------------------------------------------------------------------------- 1 | 0.00 0 0 U 0 unclassified 2 | 100.00 111249 91 - 1 root 3 | 99.92 111158 1 - 131567 cellular organisms 4 | 99.92 111157 64 D 2 Bacteria 5 | 99.86 111093 101 P 1224 Proteobacteria 6 | 61.18 68060 22 C 1236 Gammaproteobacteria 7 | 61.16 68038 7 O 118969 Legionellales 8 | 61.15 68031 1 F 444 Legionellaceae 9 | 61.15 68030 87 G 445 Legionella 10 | 61.07 67943 67943 S 446 Legionella pneumophila 11 | 38.59 42932 47 C 28216 Betaproteobacteria 12 | 38.55 42885 0 O 206351 Neisseriales 13 | 38.55 42885 0 F 481 Neisseriaceae 14 | 38.55 42885 13056 G 482 Neisseria 15 | 26.81 29829 29829 S 485 Neisseria gonorrhoeae 16 | -------------------------------------------------------------------------------- 
#!/bin/bash
# Bats unit test for scripts/randTrees.pl: generate one random tree from a
# known 4-taxon Newick input and sanity-check the size of the output.

dir=$BATS_TEST_DIRNAME
# Put the repo's scripts dir first on the PATH for this test run.
export PATH=$dir/../../scripts:$PATH

@test "randTrees" {
  # One random tree derived from input/kendall-colijn1.dnd.
  tree=$($dir/../../scripts/randTrees.pl --numTrees 1 $dir/input/kendall-colijn1.dnd)
  # NOTE(review): $tree is deliberately(?) unquoted in the herestring, so
  # word-splitting collapses internal whitespace before the bytes are
  # counted; the 140-150 window below presumably assumes that — confirm
  # before quoting.
  bytes=$(wc -c <<< $tree)
  # Random branch lengths make the exact size nondeterministic, so assert
  # a plausible byte window rather than an exact value.
  [ "$bytes" -ge 140 ]

  [ "$bytes" -le 150 ]

}