├── .gitmodules
├── pipeline
    ├── runBismark.sh
    ├── runProjectPermission.sh
    ├── runHTSeqStats.sh
    ├── runBuscoPlant.sh
    ├── runFastQCCounts.sh
    ├── runMeme.sh
    ├── runRmarkdown.sh
    ├── sortMeRnaV2Stats.sh
    ├── runGzip.sh
    ├── runXxhashsum.sh
    ├── runGetPasaFasta.sh
    ├── runSamtools_miRNAclusters.sh
    ├── submit_nextflow_RNAseq.sh
    ├── runCmd.sh
    ├── runMinion.sh
    ├── runQCprot.sh
    ├── run_ppfinder.sh
    ├── run_makeblastdb_pglauca.sh
    ├── runVCFPlot.sh
    ├── run_makeblastdb_ptaeda.sh
    ├── run_array_repeatmasker2.sh
    ├── run_array_repeatmasker.sh
    ├── runDexSeqCount.sh
    ├── run_HMMbuild.sh
    ├── runKnitR.sh
    ├── runParseMacs2.sh
    ├── run_ClustalW.sh
    ├── run_check_offtarget_for_guide_RNAs.sh
    ├── run_array_repeatmasker2coredump.sh
    ├── runCreateDEXSeqReference.sh
    ├── runRemoveBlankLines.sh
    ├── runSTARGenomeLoad.sh
    ├── runSTARGenomeRemove.sh
    ├── runAbyssBloom.sh
    ├── runPasaSeqclean.sh
    ├── runMetaxa2.sh
    ├── runReaper.sh
    ├── runMuscle.sh
    ├── runPASA_GFF3_validator.sh
    ├── runSamtoolsFlagstat.sh
    ├── runSampleN_SE.sh
    ├── runPrepInfomap.sh
    ├── runTrim.sh
    ├── join_targets.sh
    ├── runBedToolsGCov.sh
    ├── runTremulaTranslate.sh
    ├── runThresholdNetwork.sh
    ├── runDiamondMakedb.sh
    ├── runAsArray.sh
    ├── runSamtoolsFaidx.sh
    ├── runDISCOVARdeNovo.sh
    ├── runSampleN.sh
    ├── runSamtoolsIdxstats.sh
    ├── runPicardSamToFastq.sh
    ├── runFRC.sh
    ├── runBwaIndex.sh
    ├── runSortmernaDennis.sh
    ├── runSeidrThreshold.sh
    ├── runGATK_CombineGVCFs.sh
    ├── runPearsonSpearmanCorrelation.sh
    ├── runInfomap.sh
    ├── runCLR.sh
    ├── runSRnaWorkBenchFilter.sh
    ├── runGenomeTools.sh
    ├── runBedToolsBamToFastq.sh
    ├── runBESST.sh
    ├── runMiRBase_SS.sh
    ├── runGMAPIndex.sh
    ├── runFastQC.sh
    ├── runBamSubset.sh
    ├── runPasaLoadAnnotation.sh
    ├── runGeneNetworkRPreparation.sh
    ├── runCPC2.sh
    ├── runRepeatMasker.sh
    ├── runMultiQC.sh
    ├── runAnova.sh
    ├── runSSPACE-LR.sh
    ├── runStarFusion.sh
    ├── runCuffcompare.sh
    ├── runFusionInspector.sh
    ├── runTrinityTransDecoder.sh
    ├── runBAMtoCRAM.sh
    ├── runBedToolsIntersect.sh
    ├── runSamtoolsMerge.sh
    ├── runVsearchMergePairs.sh
    ├── runFastQCMultiviewer.sh
    ├── runSraFastqDump.sh
    ├── runMinimap2.sh
    ├── runFastQValidator.sh
    ├── runMmseq.sh
    ├── runRepeatModeler.sh
    ├── runCNCI.sh
    ├── runPicardCreateSequenceDictionary.sh
    ├── runMarkDuplicates.sh
    ├── runSUPPA2PsiPerEvent.sh
    ├── runSUPPA2PsiPerIsoform.sh
    ├── runSalmonStats.sh
    ├── runGeneNetworkRAggregate.sh
    ├── runGeneNetworkRThreshold.sh
    ├── runPyfasta.sh
    ├── runSeidrBackbone.sh
    ├── runBedToolsSubtract.sh
    ├── runITSx.sh
    ├── runTaxonomicClassification.sh
    ├── runKmergenie.sh
    ├── runKallistoStats.sh
    ├── runSamtoolsIndex.sh
    ├── runRePair.sh
    ├── runJellyfishHisto.sh
    ├── runBgzipTabix.sh
    ├── runPLEK.sh
    ├── runGENIE3.sh
    ├── runTaxonomyUpdate.sh
    ├── runNutil.sh
    ├── runJellyfishBc.sh
    ├── runUsearch.sh
    ├── runCuffmerge.sh
    ├── runGATK_SplitNCigarReads.sh
    ├── runGROM.sh
    ├── runCleanTrinity.sh
    ├── runGatkRealignerTargetCreator.sh
    ├── runKallistoIndex.sh
    ├── runPicardAddOrReplaceReadGroups.sh
    ├── runTIGLM.sh
    ├── runAssemblathonStat.sh
    ├── runBlastFormatDb.sh
    ├── runSeidrAggregate.sh
    ├── runGATK_GenotypeGVCFs.sh
    ├── runNarromi.sh
    ├── runGATK_CombineVariants.sh
    ├── runPlaac.sh
    ├── runSnpEff.sh
    ├── runTrimmomaticSeStats.sh
    ├── runBedToolsCoverage.sh
    ├── runGeneNetworkRRun.sh
    ├── runGATK_VariantFiltration.sh
    ├── run_psf.sh
    ├── runTrimmomaticStats.sh
    ├── runGatkFastaAlternateReferenceMaker.sh
    ├── runJBrowse2.sh
    ├── runBamtoFastQ.sh
    ├── runSTARStats.sh
    ├── runDemultiplex.sh
    ├── runUpdateNCBI.sh
    ├── runSeidrRoc.sh
    ├── runSamtoolsSort.sh
    ├── runShortstack.sh
    ├── runPicardMarkDuplicatesWithMateCigar.sh
    ├── runSamtools_split_primary.sh
    ├── runGATK_IndelRealigner.sh
    └── runSwestoreSync.sh
├── VERSION.info
├── src
    ├── R
    │   ├── rmd.R
    │   ├── percentile.R
    │   ├── GeneNet.R
    │   ├── try opt.R
    │   ├── getCoverage.R
    │   ├── ARACNE.R
    │   ├── parseUniRef90IDs.R
    │   ├── reverseFastq.R
    │   ├── misoPePlot.R
    │   ├── convertTemplates.R
    │   ├── mailR.R
    │   ├── WgcnaClusterPlot.R
    │   ├── plotSft.R
    │   ├── GC_percent_from_fasta.R
    │   ├── extractGff3Subset.R
    │   ├── rfam5SKrakenPrep.R
    │   ├── gopher2-example.R
    │   ├── deviseSequenceFromGFF.R
    │   ├── createGeneAnnotation.R
    │   ├── enaCsvEdit.R
    │   ├── updateAspenVcfv1.0.R
    │   └── plotVCFQual.R
    ├── bash
    │   ├── try-catch.sh
    │   ├── updateTaxonomySqlite.sh
    │   ├── functions.sh
    │   └── seidr-aggregate-kebnekaise.sh
    └── python
    │   └── fastQCmultiviewer.py
├── templates
    ├── R
    │   ├── bulogo2.png
    │   ├── style.css
    │   ├── empty.R
    │   ├── seidrPageRank.R
    │   ├── footer.html
    │   └── header.html
    └── bash
    │   ├── submitSeidrBackbone.sh
    │   ├── submitSeidrAggregate.sh
    │   ├── submitSeidrRoc.sh
    │   ├── runTemplate.sh
    │   └── template.sh
├── container
    ├── apptainer
    │   ├── macs3.def
    │   ├── tagdust.def
    │   ├── plink20.def
    │   ├── kallisto.def
    │   ├── plink19.def
    │   ├── swarm.def
    │   ├── MCScanX.def
    │   ├── casoffinder.def
    │   ├── seidr.def
    │   ├── angsd.def
    │   └── velocyto.def
    └── docker
    │   └── Dockerfile_diamond
├── nextflow
    ├── config
    │   └── upscb.config
    └── template
    │   ├── rnaseq_spruce_v2.json
    │   ├── rnaseq_tomato_v4.json
    │   ├── rnaseq_arabidopsis_araport11.json
    │   ├── rnaseq_microtom_xue.json
    │   ├── rnaseq_T89_v1.json
    │   ├── rnaseq_microtom_shirasawa.json
    │   ├── rnaseq_lupin_v2.json
    │   └── rnaseq_lupin_v1.json
└── .gitignore


/.gitmodules:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pipeline/runBismark.sh:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/VERSION.info:
--------------------------------------------------------------------------------
1 | UPSCb-common: 1.2.1
2 | 


--------------------------------------------------------------------------------
/src/R/rmd.R:
--------------------------------------------------------------------------------
#' Relative mean absolute deviation.
#'
#' Averages the absolute distances of the values to their mean and divides
#' that average by the mean itself.
#'
#' @param x numeric values.
#' @return a single number: mean(|x - mean(x)|) / mean(x).
rmd <- function(x){
  centre <- mean(x)
  spread <- mean(abs(x - centre))
  spread / centre
}
4 | 


--------------------------------------------------------------------------------
/templates/R/bulogo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UPSCb/UPSCb-common/HEAD/templates/R/bulogo2.png


--------------------------------------------------------------------------------
/src/R/percentile.R:
--------------------------------------------------------------------------------
#' Percentiles of a vector.
#'
#' Thin wrapper around quantile() whose default probabilities are the 101
#' values 0, 0.01, ..., 1.
#'
#' @param x values handed to quantile().
#' @param probs probabilities to evaluate; defaults to seq(0,1,.01).
#' @param ... further arguments forwarded to quantile() (for instance na.rm
#'   or type). They used to be accepted but silently discarded.
#' @return the result of quantile() for the requested probabilities.
"percentile" <- function(x,probs=seq(0,1,.01),...){
  quantile(x,probs=probs,...)
}


--------------------------------------------------------------------------------
/pipeline/runProjectPermission.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | sudo find . -type d -exec chmod 771 "{}" \;
4 | sudo find . -type d -exec chmod g+s "{}" \;
5 | sudo find . -type f -exec chmod 664 "{}" \;
6 | 
7 | 


--------------------------------------------------------------------------------
/pipeline/runHTSeqStats.sh:
--------------------------------------------------------------------------------
1 | find . -name "*.txt" | xargs -I {} bash -c 'echo $0 $(grep __no_feature $0) $(grep __alignment_not_unique $0)' {}
2 | find . -name "*.txt" -exec awk 'BEGIN{sum=0}{sum+=$2}END{print sum}' "{}" \;
3 | 
4 | 


--------------------------------------------------------------------------------
/pipeline/runBuscoPlant.sh:
--------------------------------------------------------------------------------
module load bioinfo-tools
module load busco

# Usage: $0 <name> <fasta> <out>
#   <name>   handed to BUSCO through -o
#   <fasta>  handed to BUSCO through -in
#   <out>    directory entered before BUSCO is started
# The BUSCO mode is fixed through MODE below (the original note lists
# "-m OGS" and "-m trans" as the possible values).
# only runs on plants
MODE="trans"

# The output directory is the third positional argument. The previous
# `cd $out` expanded a variable that was never assigned, i.e. it ran a bare
# `cd` and silently ignored <out>.
out=${3:?"Usage: $0 <name> <fasta> <out>"}
cd "$out" || exit 1
python3 "$BUSCO_PATH/BUSCO_plants.py" -o "$1" -in "$2" -l "$BUSCO_DATA/plantae" -m "$MODE"


--------------------------------------------------------------------------------
/templates/R/style.css:
--------------------------------------------------------------------------------
 1 | body .main-container {
 2 |   max-width: 98% !important; 
 3 |   margin-left: 0px;
 4 |   margin-right: 20px;
 5 | }
 6 | 
 7 | img {
 8 |     display:block;
 9 |     float:none;
10 |     margin-left:auto;
11 |     margin-right:auto;
12 | }
13 | 
14 | .btn-group { display: none; }


--------------------------------------------------------------------------------
/pipeline/runFastQCCounts.sh:
--------------------------------------------------------------------------------
1 | find $1 -name "fastqc_data.txt" | xargs -I {} bash -c 'echo $0 $(grep "Total Sequences" $0)' {}
2 | 
3 | #find . -name "fastqc_data.txt" | xargs -I {} bash -c 'echo $0 $(grep "Total Sequences" $0)' {} | awk '{id=$1;gsub("\\./|_[1,2]_fastqc.*","",id);print id, $4}' | sort | uniq


--------------------------------------------------------------------------------
/pipeline/runMeme.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p all
 4 | #SBATCH -n 8
 5 | #SBATCH -t 2-00:00:00
 6 | 
 7 | # module load bioinfo-tools Reaper 
 8 | set -ex
 9 | 
10 | meme $1.$SLURM_ARRAY_TASK_ID -dna -oc $2$SLURM_ARRAY_TASK_ID -mod anr -evt 0.05 -maxsize 3500000 -maxw 30 -nmotifs 100 -bfile $3 -p 8
11 | 


--------------------------------------------------------------------------------
/pipeline/runRmarkdown.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | 
 4 | usage () {
 5 | echo "Usage:"
 6 | echo "runRmarkdown.sh <my_R_script.R>"
 7 | echo
 8 | }
 9 | 
10 | if [ ! $# == 1 -o ! -f $1 ]; then
11 | 	usage
12 | 	exit 1
13 | fi
14 | 
15 | module load R
16 | Rscript -e "library(rmarkdown); render(commandArgs(TRUE))" $1
17 | 


--------------------------------------------------------------------------------
/src/R/GeneNet.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(trailingOnly = TRUE)
 2 | setwd(args[1])
 3 | library(GeneNet)
 4 | library(data.table)
 5 | dat <- fread(args[2])
 6 | out <- args[3]
 7 | 
 8 | pcor.dyn = ggm.estimate.pcor(as.matrix(dat), method = "dynamic")
 9 | write.table(abs(pcor.dyn), out, quote = FALSE,
10 |             col.names = FALSE, row.names = FALSE, sep = "\t")
11 | 


--------------------------------------------------------------------------------
/pipeline/sortMeRnaV2Stats.sh:
--------------------------------------------------------------------------------
# Summarise the *.log files of the current directory (SortMeRNA v2 logs,
# going by the script name).

# Lines holding a percentage that mention "passing": print an identifier built
# from the 4th and 5th "_"-separated fields of the grep output together with
# one value, with parentheses and commas stripped.
grep "%" *.log | grep passing | awk -F_ '{print $4"_"$5,$8}' | awk '{print $1,$10}' | sed 's:[(,)]::g'

# All other percentage lines (neither "passing" nor "failing") are reduced to
# "<identifier> <value>" pairs and stored in file.txt.
grep "%" *.log | grep -v passing | grep -v failing | awk -F_ '{print $4"_"$5,$9}' | awk '{print $1,$3}' > file.txt

# The two statements below are R code. They used to stand here as bare shell
# lines, which the shell cannot parse, so they are now evaluated by Rscript:
# file.txt is read back and its values are arranged one row per identifier.
Rscript -e '
dat <- read.delim(sep=" ",file="file.txt",stringsAsFactors = FALSE,header=FALSE)
do.call(rbind,split(dat[,2],dat[,1]))
'
5 | 


--------------------------------------------------------------------------------
/pipeline/runGzip.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 06:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## stop on error
 7 | set -e
 8 | 
 9 | if [ $# == 0 ]; then
10 |     echo "This function takes one file as argument"
11 |     exit 1
12 | fi
13 | 
14 | if [ ! -f $1 ]; then
15 |     echo "The provided file: $1 does not exist"
16 |     exit 1
17 | fi
18 | 
19 | gzip -f $1
20 | 
21 | 


--------------------------------------------------------------------------------
/pipeline/runXxhashsum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 06:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## stop on error
 7 | set -e
 8 | 
 9 | if [ $# == 0 ]; then
10 |     echo "This function takes one file as argument"
11 |     exit 1
12 | fi
13 | 
14 | if [ ! -f "$1" ]; then
15 |     echo "The provided file: $1 does not exist"
16 |     exit 1
17 | fi
18 | 
19 | ~/bin/xxhashsum -f "$1"
20 | 


--------------------------------------------------------------------------------
/pipeline/runGetPasaFasta.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ####
 8 | #	A runner part of the novel genen and long non coding RNA pipeline
 9 | #	python get_fasta_seq.py <gff_file> <fasta_file> <output>
10 | ####
11 | 
12 | ## stop on error but be verbose
13 | set -e
14 | set -x
15 | 
16 | python $UPSCb/src/python/novel_genes/get_fasta_seq.py $1 $2 $3


--------------------------------------------------------------------------------
/pipeline/runSamtools_miRNAclusters.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | 
 6 | module load bioinfo-tools
 7 | module load samtools/1.3.1
 8 | 
 9 | file=$1
10 | bed=$2
11 | outdir=$3
12 | name=$4
13 | 
14 | # extract alignments in miRNA loci regions
15 | # extract columns with sequence names and genomic location
16 | samtools view -L $bed $file | cut -f 1,3,4 > $outdir/$name.miRNA.txt
17 | 


--------------------------------------------------------------------------------
/templates/R/empty.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "CHANGEME"
 3 | #' author: "CHANGEME"
 4 | #' date: "`r Sys.Date()`"
 5 | #' output:
 6 | #'  html_document:
 7 | #'    toc: true
 8 | #'    number_sections: true
 9 | #'    code_folding: hide
10 | #' ---
11 | #' # Setup
12 | #' * Libraries
13 | suppressPackageStartupMessages({
14 |   
15 | })
16 | 
17 | #' # Session Info
18 | #' ```{r session info, echo=FALSE}
19 | #' sessionInfo()
20 | #' ```
21 | 


--------------------------------------------------------------------------------
/src/R/try opt.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(library(optparse))
 2 | 
 3 | Main <- function(){
 4 |   ### ================ main
 5 |   ## define the arguments
 6 |   option_list <- list(
 7 |     make_option(c("-op", "--output_prefix"),dest="op", type="character", default="",
 8 |                 help="The output prefix, if wanted"))
 9 |   opt <- parse_args(OptionParser(option_list=option_list))
10 |   
11 |   return(opt$op)
12 | }  
13 | Main()
14 | 


--------------------------------------------------------------------------------
/pipeline/submit_nextflow_RNAseq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l 
 2 | #SBATCH -p nextflow
 3 | #SBATCH -t 72:00:00
 4 | #SBATCH -A SLURM_project_Code
 5 | #SBATCH -o log_rnaseq.out
 6 | #SBATCH -e log_rnaseq.err
 7 | 
 8 | set -eu -o pipefail
 9 | 
10 | nextflow run nf-core/rnaseq -r 3.19.0 \
11 | -profile singularity,upscb -c "nextflow/upscb.config" \
12 | -params-file "nextflow/nf-params.json"  \
13 | -with-trace -with-report "report_rnaseq.html" \
14 | -resume
15 | 


--------------------------------------------------------------------------------
/container/apptainer/macs3.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 |     build-essential \
11 |     python3-dev \
12 |     python3-pip \
13 |     python3
14 | apt-get clean
15 | 
16 | cd ~
17 | pip install numpy scipy scikit-learn hmmlearn Cython cykhash
18 | pip install macs3
19 | pip cache purge
20 | 
21 | %runscript
22 | macs3 "$@"
23 | 


--------------------------------------------------------------------------------
/src/R/getCoverage.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(require(GenomicRanges))
 2 | 
 3 | args <- commandArgs(trailingOnly=TRUE);
 4 | 
 5 | inf <- args[1]
 6 | chr <- args[2]
 7 | st <- as.integer(args[3])
 8 | en <- as.integer(args[4])
 9 | 
10 | load(inf)
11 | 
12 | gr <- grep(".GR$",ls(),value=TRUE)
13 | 
14 | target <- GRanges(chr,IRanges(st,en))
15 | res <- sort(subsetByOverlaps(get(gr), target))
16 | 
17 | write.table(as.data.frame(res), sep="\t", quote=FALSE)
18 | 


--------------------------------------------------------------------------------
/container/apptainer/tagdust.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:18.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 |     build-essential \
11 |     autoconf \
12 |     git
13 | apt-get clean
14 | 
15 | cd ~
16 | git clone https://github.com/TimoLassmann/tagdust.git
17 | cd tagdust
18 | ./autogen.sh
19 | ./configure
20 | make
21 | make check
22 | make install
23 | 
24 | %runscript
25 | tagdust "$@"
26 | 


--------------------------------------------------------------------------------
/container/apptainer/plink20.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 |     unzip \
11 |     wget
12 | apt-get clean
13 | 
14 | cd ~
15 | wget https://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_20231123.zip
16 | unzip plink2_linux_x86_64_20231123.zip
17 | rm plink2_linux_x86_64_20231123.zip
18 | cp plink2 /usr/local/bin/
19 | 
20 | %runscript
21 | plink2 "$@"
22 | 


--------------------------------------------------------------------------------
/container/apptainer/kallisto.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 | build-essential \
11 | cmake \
12 | zlib1g-dev \
13 | libhdf5-dev \
14 | git
15 | apt-get clean
16 | 
17 | cd ~
18 | git clone https://github.com/pachterlab/kallisto.git
19 | cd kallisto
20 | mkdir build
21 | cd build
22 | cmake ..
23 | make
24 | make install
25 | 
26 | %runscript
27 | kallisto "$@"
28 | 


--------------------------------------------------------------------------------
/pipeline/runCmd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | 
 5 | set -ex
 6 | 
 7 | usage(){
 8 |   echo >&2 \
 9 | "
10 | Usage: $(basename $0) <script> <additional arguments>
11 | 
12 | This runs a script using the provided arguments. The script has to be executable.
13 | 
14 | "
15 |   exit 1
16 | }
17 | 
18 | # check that the script exists
19 | if [ ! -f $1 ]; then
20 |   echo 
21 |   usage
22 | fi
23 | script=$1
24 | shift
25 | 
26 | # run the job
27 | $script $@
28 | 


--------------------------------------------------------------------------------
/src/R/ARACNE.R:
--------------------------------------------------------------------------------
 1 | library(minet)
 2 | setwd(args[2])
 3 | args <- commandArgs(trailingOnly = TRUE)
 4 | exp.dat <- fread(args[1])
 5 | mim <- build.mim(as.matrix(exp.dat))
 6 | ar <- aracne(mim)
 7 | arl <- ar[lower.tri(ar)]
 8 | 
 9 | gn <- colnames(ar)
10 | fout <- "predictions.txt"
11 | arli <- 1
12 | 
13 | for(i in 2:nrow(ar)){
14 |   for(j in 1:(i-1)){
15 |     if(arl[arli] > 0){
16 |       cat(gn[i],gn[j],arl[arli],"\n",append=TRUE,file=fout,sep="\t")
17 |     }
18 |     arli <- arli + 1
19 |   }
20 | }


--------------------------------------------------------------------------------
/pipeline/runMinion.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main -n 1
 4 | #SBATCH -t 12:00:00
 5 | 
 6 | module load bioinfo-tools Reaper
 7 | set -ex
 8 | 
 9 | infile=$1
10 | dir=$2
11 | 
12 | if [ ! -f $infile ]; then
13 |     echo "invalid file"
14 |     exit 1
15 | fi
16 | 
17 | if [ ! -d $dir ]; then
18 |     echo "invalid directory"
19 |     exit 1
20 | fi
21 | 
22 | name=$(basename $infile)
23 | 
24 | minion search-adapter -i $infile > $dir/${name/.fastq.gz/.adapter_mn.txt}
25 | 


--------------------------------------------------------------------------------
/src/bash/try-catch.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # from https://gist.github.com/e7d/e43e6586c1c2ecb67ae2
 4 | 
 5 | function try()
 6 | {
 7 |     [[ $- = *e* ]]; SAVED_OPT_E=$?
 8 |     set +e
 9 | }
10 | 
11 | function throw()
12 | {
13 |     exit $1
14 | }
15 | 
16 | function catch()
17 | {
18 |     export ex_code=$?
19 |     (( $SAVED_OPT_E )) && set +e
20 |     return $ex_code
21 | }
22 | 
23 | function throwErrors()
24 | {
25 |     set -e
26 | }
27 | 
28 | function ignoreErrors()
29 | {
30 |     set +e
31 | }
32 | 


--------------------------------------------------------------------------------
/container/apptainer/plink19.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 |     unzip \
11 |     wget
12 | apt-get clean
13 | 
14 | cd ~
15 | wget https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231018.zip
16 | unzip plink_linux_x86_64_20231018.zip
17 | rm plink_linux_x86_64_20231018.zip
18 | cp plink /usr/local/bin/
19 | cp prettify /usr/local/bin/
20 | 
21 | %runscript
22 | plink "$@"
23 | 


--------------------------------------------------------------------------------
/container/apptainer/swarm.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install \
10 |     wget
11 | apt-get clean
12 | 
13 | cd ~
14 | wget https://github.com/torognes/swarm/releases/download/v3.1.4/swarm-3.1.4-linux-x86_64.tar.gz
15 | tar -xzvf swarm-3.1.4-linux-x86_64.tar.gz
16 | rm swarm-3.1.4-linux-x86_64.tar.gz
17 | cp swarm-3.1.4-linux-x86_64/bin/swarm /usr/local/bin/
18 | 
19 | %runscript
20 | swarm "$@"
21 | 


--------------------------------------------------------------------------------
/pipeline/runQCprot.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p all
 3 | #SBATCH -n 8 --mem=150G -t 2-00:00:00
 4 | ## no mail at the time
 5 | ###SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error
 8 | set -e
 9 | 
10 | ## be verbose and extend the commands
11 | set -x
12 | 
13 | ## load the modules
14 | module load bioinfo-tools
15 | module load blast
16 | 
17 | blastx -query $1/Trinity.format.fasta \
18 |          -db $2 -out $3/blastxprot.outfmt6 \
19 |          -evalue 1e-20 -num_threads 8 -max_target_seqs 1 -outfmt 6
20 | 
21 | 


--------------------------------------------------------------------------------
/pipeline/run_ppfinder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error and be verbose in the output
 8 | set -e -x
 9 | 
10 | # load the modules
11 | module load bioinfo-tools ppfinder
12 | 
13 | # usage function
14 | usage(){
15 | echo >&2 \
16 | "
17 | 	Usage: $0 
18 | 		Parameters can be changed in parameter.file
19 | "
20 | 	exit 1
21 | }
22 | 
23 | # run the command
24 | cd $2
25 | ppfinder
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/container/docker/Dockerfile_diamond:
--------------------------------------------------------------------------------
 1 | ROM ubuntu:24.04
 2 | 
 3 | MAINTAINER Nicolas Delhomme (nicolas.delhomme@umu.se)
 4 | 
 5 | ARG DMD_VERSION=2.1.16
 6 | 
 7 | RUN apt update && apt upgrade -y && apt install -y curl
 8 | 
 9 | # Download and extract Diamond
10 | RUN curl -L -O https://github.com/bbuchfink/diamond/releases/download/v2.1.16/diamond-linux64.tar.gz && \
11 |     tar -zxf diamond-linux64.tar.gz
12 | 
13 | # Copy binaries & libraries to system paths
14 | RUN mv diamond /usr/local/bin/
15 | 
16 | ENTRYPOINT ["diamond"]
17 | 


--------------------------------------------------------------------------------
/pipeline/run_makeblastdb_pglauca.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error and be verbose in the output
 8 | set -e -x
 9 | 
10 | # load the modules
11 | module load bioinfo-tools
12 | module load blast/2.2.29+
13 | 
14 | # run the command
15 | makeblastdb -in /mnt/picea/projects/spruce/pipeline/psari_data/picea_glauca/PG29-scaffolds.fa -dbtype nucl -out /mnt/picea/projects/spruce/pipeline/psari_data/picea_glauca/db_pglauca_genome/PG29-scaffolds.fa
16 | 
17 | 


--------------------------------------------------------------------------------
/pipeline/runVCFPlot.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p main -n 1
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH --mail-type ALL
 6 | 
 7 | module load R/3.1.0
 8 | 
 9 | usage() {
10 |     if [[ ! -z $1 ]]; then
11 |         echo >&2 $1
12 |     fi
13 |     echo >&2 "usage: $0 input.vcf[.gz] outdir title"
14 |     exit 1
15 | }
16 | 
17 | [[ $# -lt 3 ]] && usage "ERROR: too few arguments"
18 | [[ ! -f $1 ]] && usage "ERROR: file not found: $1"
19 | [[ ! -d $2 ]] && usage "ERROR: output directory not found: $2"
20 | 
21 | Rscript ../../../src/R/plotVCFQual.R "$1" "$2" "$3"
22 | 


--------------------------------------------------------------------------------
/nextflow/config/upscb.config:
--------------------------------------------------------------------------------
 1 | // upsc general profile
 2 | profiles {
 3 |     upscb{
 4 |         process {
 5 |             executor       = 'slurm'
 6 |             clusterOptions = "-A <account>"
 7 |         }
 8 |         singularity {
 9 |             enabled = true
10 |             cacheDir = '/mnt/reference/nf-core_apptainer_containers/'
11 |         }
12 |         memory         = { 20.GB * task.attempt }
13 |         cpus           = { 2 * task.attempt }
14 |         time           = { 48.h * task.attempt }
15 |     }
16 | }
17 | 
18 | workDir = 'data/work'
19 | 


--------------------------------------------------------------------------------
/pipeline/run_makeblastdb_ptaeda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error and be verbose in the output
 8 | set -e -x
 9 | 
10 | # load the modules
11 | module load bioinfo-tools
12 | module load blast/2.2.29+
13 | 
14 | # run the command
15 | makeblastdb -in /mnt/picea/storage/reference/Pinus-taeda/v1.01/fasta/ptaeda.v1.01-genome-collapsed-for-STAR.fa -dbtype nucl -out /mnt/picea/projects/spruce/pipeline/psari_data/pinus_taeda/db_blastplus/ptaeda.v1.01-genome.fa
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/pipeline/run_array_repeatmasker2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -J arrayrepeatmasker.job
 4 | #SBATCH -p main
 5 | #SBATCH -c 8
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## stop on error and be verbose in the output
 9 | set -e -x
10 | 
11 | # load the modules
12 | module load bioinfo-tools 
13 | module load RepeatMasker
14 | 
15 | # usage function
16 | 
17 | usage(){
18 | echo >&2 \
19 | "
20 | 	Usage: $0 <link to genome>
21 | 
22 | "
23 | 	exit 1
24 | }
25 | 
26 | # run the command # -dir $3 
27 | RepeatMasker $1.$SLURM_ARRAY_TASK_ID.masked -e $2 -pa $3 -qq -lib $4
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/container/apptainer/MCScanX.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:20.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | apt-get update && \
 9 | apt-get -y install tzdata && \
10 | apt-get -y install \
11 |     build-essential \
12 |     default-jdk \
13 |     git
14 | apt-get clean
15 | 
16 | cd ~
17 | git clone https://github.com/wyp1125/MCScanX.git
18 | cd MCScanX
19 | make
20 | cp MCScanX /usr/local/bin/
21 | cp MCScanX_h /usr/local/bin/
22 | cp duplicate_gene_classifier /usr/local/bin/
23 | cp downstream_analyses/* /usr/local/bin/
24 | 
25 | %runscript
26 | MCScanX "$@"
27 | 


--------------------------------------------------------------------------------
/pipeline/run_array_repeatmasker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -J arrayrepeatmasker.job
 4 | #SBATCH -p main
 5 | #SBATCH -c 8
 6 | #SBATCH -t 7-00:00:00
 7 | #SBATCH --mail-type=ALL
 8 | 
 9 | ## stop on error and be verbose in the output
10 | set -e -x
11 | 
12 | # load the modules
13 | module load bioinfo-tools 
14 | module load RepeatMasker
15 | 
16 | # usage function
17 | 
18 | usage(){
19 | echo >&2 \
20 | "
21 | 	Usage: $0 <link to genome>
22 | 
23 | "
24 | 	exit 1
25 | }
26 | 
27 | # run the command # -dir $3 
28 | RepeatMasker $1.$SLURM_ARRAY_TASK_ID -e $2 -pa $3 -qq -lib $4
29 | 
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/pipeline/runDexSeqCount.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 0-2:00:00
 5 | #SBATCH --mail-type=ALL
 6 | #SBATCH --mail-user=david.sundell@umu.se
 7 | 
 8 | module load bioinfo-tools
 9 | module load HTSeq/0.6.1
10 | 
11 | ## abort on error
12 | set -e
13 | 
14 | ## usage
15 | 
16 | echo "runDEXSeq_count.sh input_file input_gff"
17 | 
18 | #python ~/script/python/dexseq_count.py $1 $2 -p yes -f bam -s no
19 | 
20 | name1=${1##*/}
21 | name=${name1%.bam}
22 | 
23 | 
24 | #echo $3/$name.txt
25 | python ~/Git/UPSCb/src/python/dexseq_count.py $2 $1 $3/$name".txt" -p yes -f bam -s no
26 | 


--------------------------------------------------------------------------------
/container/apptainer/casoffinder.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | export DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | %post
 9 | # %environment only applies when the container runs, not while it is built,
10 | # so the apt frontend is set here as well to keep apt-get from prompting
11 | export DEBIAN_FRONTEND=noninteractive
12 | 
13 | # install the packages
14 | apt-get update && \
15 | apt-get -y install \
16 |     opencl-headers \
17 |     pocl-opencl-icd \
18 |     libgomp1 \
19 |     wget \
20 |     unzip
21 | apt-get clean
22 | 
23 | # download the cas-offinder 2.4.1 release and move the binary to /usr/local/bin
24 | cd ~
25 | wget https://github.com/snugel/cas-offinder/releases/download/2.4.1/cas-offinder_linux_x86-64.zip
26 | unzip cas-offinder_linux_x86-64.zip
27 | chmod +x cas-offinder
28 | mv cas-offinder /usr/local/bin/
29 | 
30 | %runscript
31 | cas-offinder "$@"


--------------------------------------------------------------------------------
/pipeline/run_HMMbuild.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 8
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run hmmbuild from within a working directory.
 8 | 
 9 | ## stop on error and be verbose in the output
10 | set -e -x
11 | 
12 | # load the modules
13 | module load bioinfo-tools
14 | module load hmmer
15 | 
16 | # usage function: print the expected arguments on stderr and exit 1
17 | usage(){
18 | echo >&2 \
19 | "
20 | 	Usage: $0 <hmm file out> <alignment file> <working directory>
21 | 
22 | 	Relative file paths are resolved from within the working directory
23 | "
24 | 	exit 1
25 | }
26 | 
27 | # check values
28 | 
29 | if [ $# != 3 ]; then
30 |     echo "This function requires 3 arguments."
31 |     usage
32 | fi
33 | 
34 | # run the command; hmmbuild is started from within the third argument
35 | cd "$3"
36 | hmmbuild "$1" "$2"


--------------------------------------------------------------------------------
/pipeline/runKnitR.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p all
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -n 1
 5 | #SBATCH -t 2-00:00:00
 6 | 
 7 | # Render the R script given as sole argument with rmarkdown::render
 8 | 
 9 | # R is provided as a module
10 | module load R
11 | 
12 | # helper functions (abort is used below)
13 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
14 | 
15 | # usage text
16 | USAGETXT="
17 | Usage: $0 <R script to knit>
18 | "
19 | 
20 | # guard: exactly one argument is expected, the file to knit
21 | [ $# -eq 1 ] || abort "This script expects one argument, the file to knit"
22 | 
23 | # the file is handed over to R as a trailing command line argument
24 | Rscript -e "require(methods);rmarkdown::render(commandArgs(trailingOnly=TRUE)[1])" $1


--------------------------------------------------------------------------------
/pipeline/runParseMacs2.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 24:00:00
 4 | 
 5 | # Run parseMacs2.R from within the directory given as first argument.
 6 | # The UPSCb environment variable is used to locate the R script.
 7 | 
 8 | # be verbose and stop on error
 9 | set -eux
10 | 
11 | # test
12 | #isEnvVarSet $UPSCb
13 | 
14 | # load module
15 | module load R
16 | 
17 | # ${1:-} lets the message below be printed when no argument is given
18 | # (a bare $1 would abort first because of set -u); the quotes make an
19 | # empty value fail the directory test instead of silently passing it
20 | if [ ! -d "${1:-}" ]; then
21 |     echo "The first argument needs to be a directory"
22 |     exit 1
23 | fi
24 | # exit 1 = exit with any error (normal case would be "exit 0" -> no error)
25 | 
26 | # one parameter from the command line: the saturation analysis directory. $1=1st argument on the command line
27 | cd "$1"
28 | Rscript --vanilla "$UPSCb/projects/DAP-Seq/src/R/parseMacs2.R"


--------------------------------------------------------------------------------
/pipeline/run_ClustalW.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 8
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run clustalw on a fasta file from within a working directory.
 8 | 
 9 | ## stop on error and be verbose in the output
10 | set -e -x
11 | 
12 | # load the modules
13 | # NOTE(review): the module loaded is ClustalO while the command run below is
14 | # clustalw - confirm that this module provides clustalw
15 | module load bioinfo-tools
16 | module load ClustalO
17 | 
18 | # usage function: print the expected arguments on stderr and exit 1
19 | usage(){
20 | echo >&2 \
21 | "
22 | 	Usage: $0 <fasta file to align> <working directory>
23 | 
24 | 	A relative fasta path is resolved from within the working directory
25 | "
26 | 	exit 1
27 | }
28 | 
29 | # check values
30 | 
31 | if [ $# != 2 ]; then
32 |     echo "This function requires 2 arguments."
33 |     usage
34 | fi
35 | 
36 | # run the command; clustalw is started from within the second argument
37 | cd "$2"
38 | clustalw "$1"


--------------------------------------------------------------------------------
/src/R/parseUniRef90IDs.R:
--------------------------------------------------------------------------------
 1 | # Parse the UniRef90 ID lines into a three column table and save it both
 2 | # as an rds object and as a delimited text file
 3 | library(tidyverse)
 4 | 
 5 | # input and output locations
 6 | annot.dir <- "/mnt/picea/storage/reference/UniRef90/201908/annotation"
 7 | id.file <- file.path(annot.dir,"uniref90.id")
 8 | rds.file <- file.path(annot.dir,"uniref90.id.rds")
 9 | txt.file <- file.path(annot.dir,"uniref90_id-table.txt")
10 | 
11 | # chunk callback: keep the three capture groups of the pattern, i.e. the
12 | # text between ">" and the first space, the Tax= value and the TaxID= digits
13 | parse.chunk <- function(lines,pos){
14 |   hits <- str_match(lines,">([^ ]+) .*Tax=(.*) TaxID=(\\d+).*")
15 |   hits[,2:4]
16 | }
17 | 
18 | # read the file one million lines at a time
19 | df <- read_lines_chunked(id.file,
20 |                          callback=DataFrameCallback$new(parse.chunk),
21 |                          chunk_size=1e6,
22 |                          progress=TRUE)
23 | 
24 | # export
25 | saveRDS(df,file=rds.file)
26 | 
27 | write_delim(as.data.frame(df),col.names=FALSE,path=txt.file)


--------------------------------------------------------------------------------
/pipeline/run_check_offtarget_for_guide_RNAs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | 
 4 | # Template: BLAST guide RNA sequences against a genome (blastn-short).
 5 | # Replace the four upper case placeholders below before running it.
 6 | 
 7 | set -eu
 8 | 
 9 | # You need a singularity container for blast, a genome fasta file,
10 | # a fasta file with the sequence of the used gRNAs and the name of an output file
11 | container=$(realpath SINGULARITY_CONTAINER)
12 | genome=$(realpath GENOME_FASTA_FILE)
13 | gRNA=$(realpath gRNA_FASTA_FILE)
14 | output="NAME_OUTPUT_FILE"
15 | 
16 | # build the nucleotide BLAST database next to the genome fasta file
17 | apptainer exec -B /mnt "$container" makeblastdb -in "$genome" -dbtype nucl
18 | 
19 | # search the gRNAs against it; -outfmt 7 is the tabular format with comments
20 | apptainer exec -B /mnt "$container" blastn -query "$gRNA" \
21 | -db "$genome" -task blastn-short -outfmt 7 -out "$output"


--------------------------------------------------------------------------------
/templates/R/seidrPageRank.R:
--------------------------------------------------------------------------------
 1 | # PageRank of the nodes of a filtered seidr backbone network
 2 | library(here)
 3 | library(igraph)
 4 | library(readr)
 5 | 
 6 | # edge table: columns 1 and 2 are the nodes, column 3 the edge type
 7 | sf <- read_tsv(here("data/seidr/backbone/backbone-2-percent-filtered.txt"),
 8 |                col_names=FALSE,col_types=cols(.default=col_character()),
 9 |                show_col_types=FALSE)
10 | 
11 | # build a directed graph from the edges of a type, columns in the given order
12 | edgeGraph <- function(type,columns){
13 |   graph.edgelist(as.matrix(sf[sf$X3==type,columns]),directed=TRUE)
14 | }
15 | 
16 | # undirected edges are added once in each direction
17 | graf <- union(edgeGraph("Directed",1:2),
18 |               edgeGraph("Undirected",1:2),
19 |               edgeGraph("Undirected",2:1))
20 | 
21 | pr <- page_rank(graf)$vector
22 | pr


--------------------------------------------------------------------------------
/pipeline/run_array_repeatmasker2coredump.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -J arrayrepeatmasker.job
 4 | #SBATCH -p main
 5 | #SBATCH -c 8
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## Run RepeatMasker on a single file.
 9 | 
10 | ## stop on error and be verbose in the output
11 | set -e -x
12 | 
13 | # load the modules
14 | module load bioinfo-tools
15 | module load RepeatMasker
16 | 
17 | # usage function: print the expected arguments on stderr and exit 1
18 | usage(){
19 | echo >&2 \
20 | "
21 | 	Usage: $0 <link to genome> <search engine> <number of processes> <repeat library>
22 | 
23 | "
24 | 	exit 1
25 | }
26 | 
27 | # check values: the four positional arguments are all used below
28 | if [ $# -lt 4 ]; then
29 |     echo >&2 "This function requires 4 arguments."
30 |     usage
31 | fi
32 | 
33 | # run the command # -dir $3
34 | #RepeatMasker $1.$SLURM_ARRAY_TASK_ID.masked_core.fasta -e $2 -pa $3 -qq -lib $4
35 | RepeatMasker "$1" -e "$2" -pa "$3" -qq -lib "$4"


--------------------------------------------------------------------------------
/pipeline/runCreateDEXSeqReference.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 0-1:00:00
 5 | #SBATCH --mail-type=ALL
 6 | #SBATCH --mail-user=david.sundell@umu.se
 7 | 
 8 | ## Run dexseq_prepare_annotation.py on an annotation file.
 9 | ## Both arguments are optional; without them the TAIR10 paths below are used.
10 | 
11 | module load bioinfo-tools
12 | module load python/2.6.6
13 | 
14 | ## abort on error
15 | set -e
16 | 
17 | ## usage
18 | 
19 | echo "runCreateDEXSeqReference.sh [input_gff] [output_gff]"
20 | 
21 | ## defaults: the paths this script was originally hard-coded for
22 | in_gff=${1:-/mnt/picea/storage/reference/Arabidopsis-thaliana/TAIR10/gff/TAIR10_GFF3_genes_transposons.gtf}
23 | out_gff=${2:-/mnt/picea/projects/docker/upsc2017/jBrowse/srobert/dr4-resistant-mutant/TAIR10_GFF3_MY.gff}
24 | 
25 | python /mnt/picea/home/ishutava/Git/UPSCb/src/python/dexseq_prepare_annotation.py "$in_gff" "$out_gff"


--------------------------------------------------------------------------------
/container/apptainer/seidr.def:
--------------------------------------------------------------------------------
 1 | Bootstrap: docker
 2 | From: fedora:31
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post
 8 | # compilers, build tools and the development libraries
 9 | # (glpk-devel is deliberately left out)
10 | dnf update -y && dnf -y install \
11 |   gcc gcc-c++ gcc-gfortran \
12 |   cmake git \
13 |   boost-devel coin-or-Clp-devel armadillo-devel zlib-devel
14 | dnf clean all
15 | 
16 | # get the sources, including the submodules, into the home directory
17 | git clone --recursive https://github.com/bschiffthaler/seidr ~/seidr
18 | 
19 | # out of source release build, then install
20 | mkdir ~/seidr/build
21 | cd ~/seidr/build
22 | cmake -DCMAKE_BUILD_TYPE=Release -DSEIDR_WITH_MPI=ON -DNARROMI_USE_CLP=ON ..
23 | make
24 | make install
25 | 
26 | %runscript
27 | seidr "$@"


--------------------------------------------------------------------------------
/pipeline/runRemoveBlankLines.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Remove the empty lines of a gzipped file, in place.
 9 | 
10 | ## stop on error; pipefail makes a failure of zcat or sed fail the whole
11 | ## pipeline, so that a partial result never replaces the input file
12 | set -ex
13 | set -o pipefail
14 | 
15 | ## usage: print the expected argument on stderr and exit 1
16 | usage(){
17 | echo >&2 \
18 | "
19 | 	Usage: $0 <gz file>
20 | "
21 | 	exit 1
22 | }
23 | 
24 | ## we get file as input
25 | if [ $# != 1 ]; then
26 |     echo "This function takes one file as argument"
27 |     usage
28 | fi
29 | 
30 | if [ ! -f "$1" ]; then
31 |     echo "The first argument needs to be an existing gzipped file"
32 |     usage
33 | fi
34 | 
35 | # proceed: write to a temporary file first, then replace the input
36 | zcat "$1" | sed '/^$/d' | gzip > "$1.tmp"
37 | mv "$1.tmp" "$1"


--------------------------------------------------------------------------------
/pipeline/runSTARGenomeLoad.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 02:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run STAR with --genomeLoad LoadAndExit on an index directory
 8 | 
 9 | ## print the usage on stderr and exit with status 1
10 | usage(){
11 | echo >&2 \
12 | "
13 | 	Usage: $0 <genome>
14 | 	
15 | 	Arguments:
16 |                 genome: The genome STAR index directoy
17 | "
18 | 	exit 1
19 | }
20 | 
21 | ## exactly one argument is expected
22 | [ $# != 1 ] && {
23 |     echo "This function takes one argument the STAR genome"
24 |     usage
25 | }
26 | 
27 | ## and it has to be a directory
28 | [ ! -d $1 ] && {
29 |     echo "The first argument needs to point to a valid STAR index directory"
30 |     usage
31 | }
32 | 
33 | ## hand the index directory over to STAR
34 | STAR --genomeDir $1 --genomeLoad LoadAndExit


--------------------------------------------------------------------------------
/pipeline/runSTARGenomeRemove.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 01:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run STAR with --genomeLoad Remove on an index directory
 8 | 
 9 | ## print the usage on stderr and exit with status 1
10 | usage(){
11 | echo >&2 \
12 | "
13 | 	Usage: $0 <genome>
14 | 	
15 | 	Arguments:
16 |                 genome: The genome STAR index directoy
17 | "
18 | 	exit 1
19 | }
20 | 
21 | ## exactly one argument is expected
22 | [ $# != 1 ] && {
23 |     echo "This function takes one argument the STAR genome"
24 |     usage
25 | }
26 | 
27 | ## and it has to be a directory
28 | [ ! -d $1 ] && {
29 |     echo "The first argument needs to point to a valid STAR index directory"
30 |     usage
31 | }
32 | 
33 | ## remove the genome
34 | STAR --genomeDir $1 --genomeLoad Remove


--------------------------------------------------------------------------------
/pipeline/runAbyssBloom.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -c 8
 3 | 
 4 | # Run "abyss-bloom build" for a given k on a set of read files.
 5 | 
 6 | module load bioinfo-tools ABySS
 7 | 
 8 | # print the expected arguments and exit with status 1
 9 | usage ()
10 | {
11 |     echo "runAbyssBloom.sh <k> <out file> <read1> <read2> ..."
12 |     echo
13 |     exit 1
14 | }
15 | 
16 | # k, the output file and at least one read file are needed
17 | if [ $# -lt 3 ]; then
18 |     usage
19 | fi
20 | 
21 | K=$1
22 | OUT=$2
23 | 
24 | # =~ only exists in [[ ]]; the regular expression has to stay unquoted
25 | if [[ ! $K =~ ^[0-9]+$ ]]; then
26 |     echo "First argument has to be a number"
27 |     usage
28 | fi
29 | 
30 | shift 2
31 | 
32 | # what is left are the read files; "$@" keeps each path a single word
33 | for f in "$@"; do
34 |     if [ ! -f "$f" ];then
35 | 	echo "$f is not a valid file"
36 | 	usage
37 |     fi
38 | done
39 | 
40 | if [ ! -d "$(dirname "$OUT")" ];then
41 |     echo "Creating out directory"
42 |     mkdir -p "$(dirname "$OUT")"
43 | fi
44 | 
45 | # -t 8 matches the 8 cpus requested from SLURM above
46 | abyss-bloom build -k "$K" -t 8 "$OUT" "$@"


--------------------------------------------------------------------------------
/pipeline/runPasaSeqclean.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run seqclean on a fasta file
 8 | 
 9 | ## stop on error
10 | set -ex
11 | 
12 | ## modules
13 | module load bioinfo-tools pasa
14 | 
15 | ## print the usage on stderr and exit with status 1
16 | usage(){
17 |     echo >&2 \
18 | "
19 |     Usage: $0 <fasta file>
20 | " 
21 |     exit 1
22 | }
23 | 
24 | ## exactly one argument is expected
25 | [ $# != 1 ] && {
26 |     echo "This function takes one fasta file as argument"
27 |     usage
28 | }
29 | 
30 | ## and it has to be a file
31 | [ ! -f $1 ] && {
32 |     echo "The first argument needs to be an existing fasta file"
33 |     usage
34 | }
35 | 
36 | ## clean the sequences
37 | seqclean $1


--------------------------------------------------------------------------------
/pipeline/runMetaxa2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -c 2
 4 | #SBATCH --mem=8G
 5 | #SBATCH -p main
 6 | 
 7 | # Run metaxa2 (-g ssu) on a pair of read files, from within the output
 8 | # directory.
 9 | 
10 | # print the expected arguments and exit with status 1
11 | usage(){
12 |     echo "runMetaxa2.sh <forward_reads> <reverse_reads> <output_directory>"
13 |     exit 1
14 | }
15 | 
16 | # all three arguments are mandatory
17 | if [ $# -lt 3 ]; then
18 |     usage
19 | fi
20 | 
21 | frw_reads=$1
22 | rev_reads=$2
23 | out_dir=$3
24 | 
25 | # quoted, so that an empty value fails the test instead of passing it
26 | if [ ! -e "$frw_reads" ]; then
27 |     usage
28 | fi
29 | 
30 | if [ ! -e "$rev_reads" ]; then
31 |     usage
32 | fi
33 | 
34 | if [ ! -d "$out_dir" ]; then
35 |     usage
36 | fi
37 | 
38 | cd "$out_dir" || exit 1
39 | 
40 | module load bioinfo-tools metaxa2
41 | 
42 | # the database directory is looked up next to the metaxa2 executable
43 | DBDIR=$(dirname "$(which metaxa2)")/metaxa2_db
44 | 
45 | # --cpu 2 matches the 2 cpus requested from SLURM above
46 | metaxa2 -g ssu -1 "$frw_reads" -2 "$rev_reads" -o "$out_dir" -d "$DBDIR" --cpu 2


--------------------------------------------------------------------------------
/pipeline/runReaper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main -n 1
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # module load bioinfo-tools Reaper 
 7 | set -ex
 8 | 
 9 | infile=$1
10 | dir=$2
11 | shift 2
12 | 
13 | if [ ! -f $infile ]; then
14 |     echo "invalid file"
15 |     exit 1
16 | fi
17 | 
18 | if [ ! -d $dir ]; then
19 |     echo "invalid directory"
20 |     exit 1
21 | fi
22 | 
23 | name=$(basename $infile)
24 | 
25 | reaper -geom no-bc -i $infile -3pa TGGAATTCTCGGG -basename $dir/${name/.fastq.gz/} -nnn-check 1/1 -3pa ""
26 | #reaper -i $infile -3pa TGGAATTCTCGGGTGCCAAGG -geom no-bc -basename $dir/${name/.fastq.gz/}  -nnn-check 1/1 -3pa "" -tabu TGGAATTCTCGGG $@
27 | 


--------------------------------------------------------------------------------
/src/R/reverseFastq.R:
--------------------------------------------------------------------------------
 1 | ## libs
 2 | library("ShortRead")
 3 | 
 4 | ## a warning
 5 | message("That script is hardcoded as h..l, so if you need it, edit or enhance :-)")
 6 | 
 7 | ## the directory holding the fastq files
 8 | setwd("/mnt/picea/storage/projects/07_Sd_ludwigii_Project/fastq/454/8k")
 9 | 
10 | ## a reverse method for ShortReadQ objects: the reads are reverse
11 | ## complemented and the quality strings reversed
12 | revShortReadQ <- function(x,...){
13 |   x@sread <- reverseComplement(sread(x))
14 |   x@quality@quality <- reverse(quality(quality(x)))
15 |   return(x)
16 | }
17 | setMethod(f="reverse",signature="ShortReadQ",definition=revShortReadQ)
18 | 
19 | ## process both files; the output is uncompressed and prefixed with rev_
20 | for(fq in c("trim_13_1.fastq.gz","trim_15_1.fastq.gz")){
21 |   writeFastq(reverse(readFastq(fq)),file=paste0("rev_",sub("\\.gz$","",fq)))
22 | }


--------------------------------------------------------------------------------
/src/R/misoPePlot.R:
--------------------------------------------------------------------------------
 1 | ## the directory searched for the insert_len files
 2 | setwd("/gulo/proj_nobackup/b2012243/data/accdata/MISO/PE_distribution")
 3 | 
 4 | ## read the first field of every insert_len file (the header) and split it
 5 | ## on "=" and "," into one row per file
 6 | insert.files <- dir(path=".",pattern="*.insert_len$",recursive=TRUE,full.names=TRUE)
 7 | headers <- sapply(insert.files,scan,what="character",n=1)
 8 | mat <- do.call(rbind,strsplit(headers,"=|,"))
 9 | 
10 | ## one boxplot per value; the values sit in the even columns of the matrix
11 | panels <- c("mean"=2,"sd"=4,"dispersion"=6,"number of reads"=8)
12 | 
13 | png(file="PE-fragment-size-distribution-boxplots.png",width=600,height=600,pointsize=16)
14 | par(mfrow=c(2,2))
15 | for(lab in names(panels)){
16 |   boxplot(as.numeric(mat[,panels[[lab]]]),xlab=lab)
17 | }
18 | dev.off()


--------------------------------------------------------------------------------
/src/R/convertTemplates.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Template conversion"
 3 | #' author: "Nicolas Delhomme"
 4 | #' date: "`r Sys.Date()`"
 5 | #' output:
 6 | #'  html_document:
 7 | #'    toc: true
 8 | #'    number_sections: true
 9 | #'    code_folding: hide
10 | #' ---
11 | #' # Setup
12 | #' * Libraries
13 | suppressPackageStartupMessages({
14 |   library(here)
15 |   library(knitr)
16 | })
17 | 
18 | #' # Rmd to R
19 | #' 
20 | #' ## Differential Expression
21 | # the R script is written next to the Rmd it is extracted from
22 | rmd <- here("template/R/DifferentialExpression_WithGOenrichment.Rmd")
23 | rscript <- here("template/R/DifferentialExpression_WithGOenrichment.R")
24 | purl(rmd,output=rscript,documentation=2)
25 | 
26 | #' # Session Info
27 | #' ```{r session info, echo=FALSE}
28 | #' sessionInfo()
29 | #' ```


--------------------------------------------------------------------------------
/pipeline/runMuscle.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main -n 1
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH --mem=100G
 5 | 
 6 | # Align the sequences of a fasta file with muscle (-diags -maxiters 1).
 7 | 
 8 | # stop on error
 9 | set -ex
10 | 
11 | # usage
12 | export USAGETXT=\
13 | "
14 | Usage: $0 <FASTA> <OUTFILE>
15 | "
16 | 
17 | # load functions (abort and isExec are used below)
18 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
19 | 
20 | # checks; $# is the number of arguments
21 | if [ $# -ne 2 ]; then
22 |   abort "This script expects 2 arguments."
23 | fi
24 | 
25 | if [ ! -f "$1" ]; then
26 |   abort "The fasta file does not exist"
27 | fi
28 | 
29 | # the output file is the second argument
30 | if [ ! -d "$(dirname "$2")" ]; then
31 |   abort "The directory of the output file does not exist"
32 | fi
33 | 
34 | isExec muscle
35 | 
36 | # run
37 | muscle -in "$1" -out "$2" -diags -maxiters 1


--------------------------------------------------------------------------------
/pipeline/runPASA_GFF3_validator.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run the PASA gff3 validator on a gff file
 8 | 
 9 | ## stop on error, be verbose
10 | set -ex
11 | 
12 | ## load modules
13 | module load bioinfo-tools pasa
14 | 
15 | ## print the usage on stderr and exit with status 1
16 | usage(){
17 |     echo >&2 "Usage: $0 <gff file>" 
18 |     exit 1
19 | }
20 | 
21 | ## exactly one argument is expected
22 | [ $# != 1 ] && {
23 |     echo "This function take one argument, the gff file path"
24 |     usage
25 | }
26 | 
27 | ## and it has to be a file
28 | [ ! -f $1 ] && {
29 |     echo "The argument shoud be a valid file path to a gff file"
30 |     usage
31 | }
32 | 
33 | ## the validator script is located through the PASAHOME variable
34 | perl $PASAHOME/misc_utilities/pasa_gff3_validator.pl $1


--------------------------------------------------------------------------------
/pipeline/runSamtoolsFlagstat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mem=4GB
 5 | #SBATCH -t 01:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## Write the samtools flagstat report of <in.bam> to <in>.stats
 9 | 
10 | ## stop on error
11 | set -ex
12 | 
13 | # source helpers (abort is used below)
14 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
15 | 
16 | # usage 
17 | USAGETXT=\
18 | "
19 | 	Usage: $0 <in.bam>
20 | "
21 | 
22 | ## we get one file as input
23 | if [ $# != 1 ]; then
24 |     abort "This function takes one file as argument"
25 | fi
26 | 
27 | ## stop here: the redirection below would otherwise still create the output
28 | if [ ! -f "$1" ]; then
29 |     abort "The first argument needs to be an existing bam file"
30 | fi
31 | 
32 | ## define the output file: only a trailing .bam is replaced, and a file
33 | ## without that extension gets .stats appended, so the output name can
34 | ## never be the input name
35 | new=${1%.bam}.stats
36 | 
37 | ## get the flagstat report
38 | samtools flagstat "$1" > "$new"


--------------------------------------------------------------------------------
/pipeline/runSampleN_SE.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # usage 
 4 | USAGETXT=\
 5 | "
 6 | runSampleN_SE <out> <file> <subset size in M reads>
 7 | 
 8 | Note:
 9 |   Adapted to SE by mquevedo
10 | "
11 |  
12 | 
13 | # sanity check
14 | if [ $# -ne 3 ]; then
15 |   echo "This script expects 3 arguments"
16 |   usage
17 | fi
18 | 
19 | # the vars
20 | 
21 | out=$1
22 | nam=$2
23 | fnam=$(basename ${nam/_trim.fq.gz/}) 
24 | subset=$3
25 | 
26 | echo
27 | # sanity check
28 | if [ ! -d $out ]; then
29 |   echo "The first arg needs to be the output dir"
30 |   usage
31 | fi
32 | 
33 | if [ ! -f $nam ]; then
34 |   echo "The second arg needs to be the file"
35 |   usage
36 | fi
37 | 
38 | # run
39 | sampleN -n `expr $subset "*" 1000000` -o $out/$fnam"_"$subset"M" $nam 
40 | 
41 | 


--------------------------------------------------------------------------------
/templates/R/footer.html:
--------------------------------------------------------------------------------
 1 | &nbsp;
 2 | <hr />
 3 | <p align="center">
 4 | <img src="bulogo2.png" alt="drawing" style="width:200px;"/>
 5 | </p>
 6 | 
 7 | <p style="text-align: center;">Created by <a href="YOURLINK">YOURNAME</a></p>
 8 | <p style="text-align: center;"><a href="mailto:YOUREMAIL?">YOUREMAIL</a></p>
 9 | 
10 | <!-- Icon library: Font Awesome 6.2.0 stylesheet; the upper case YOUR... values in this template are placeholders to replace -->
11 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.2.0/css/all.min.css">
12 | 
13 | <!-- Font Awesome 4.7.0 stylesheet and the icon links using it. NOTE(review): two Font Awesome versions are loaded, confirm both are needed -->
14 | <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
15 | <p style="text-align: center;">
16 |     <a href="YOURLINKEDIN" class="fa fa-linkedin"></a>
17 |     <a href="YOURGITHUB" class="fa fa-github"></a>
18 | </p>
19 | 
20 | &nbsp;


--------------------------------------------------------------------------------
/container/apptainer/angsd.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | export DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | %post
 9 | # %environment only applies when the container runs, not while it is built,
10 | # so the apt frontend is set here as well to keep apt-get from prompting
11 | export DEBIAN_FRONTEND=noninteractive
12 | 
13 | # install the packages
14 | apt-get update && \
15 | apt-get -y install \
16 |     build-essential \
17 |     wget \
18 |     make \
19 |     zlib1g-dev \
20 |     libhdf5-dev \
21 |     curl \
22 |     bzip2 \
23 |     libncurses5-dev \
24 |     libncursesw5-dev \
25 |     libbz2-dev \
26 |     liblzma-dev
27 | apt-get clean
28 | 
29 | # download and unpack the angsd 0.940 sources
30 | wget http://popgen.dk/software/download/angsd/angsd0.940.tar.gz
31 | tar xf angsd0.940.tar.gz
32 | rm angsd0.940.tar.gz
33 | 
34 | # htslib is built first, angsd is then built against it (HTSSRC)
35 | cd htslib
36 | make
37 | cd ..
38 | 
39 | cd angsd
40 | make HTSSRC=../htslib
41 | cd ..
42 | 
43 | cp angsd/angsd /usr/local/bin/
44 | 
45 | %runscript
46 | angsd "$@"


--------------------------------------------------------------------------------
/pipeline/runPrepInfomap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | inf=$1
 4 | prep="/mnt/picea/home/bastian/tmp/prep"
 5 | back="/mnt/picea/home/bastian/tmp/back"
 6 | info="/mnt/picea/home/bastian/tmp/Infomap"
 7 | outdir=$2
 8 | bn=$(basename $inf)
 9 | 
10 | $prep $inf map > $outdir/$bn.map
11 | $prep $inf print > $outdir/$bn.infomap.txt
12 | $info -z -i link-list --markov-time 0.01 $outdir/$bn.infomap.txt $outdir
13 | $back $outdir/$bn.map $outdir/$bn.infomap.tree > $outdir/$bn.final.txt
14 | 
15 | nc=$(head -n 1 $outdir/$bn.final.txt | wc -w)
16 | nc=$(expr $nc - 3)
17 | 
18 | >$outdir/$bn.final.h.txt
19 | for f in $(seq 1 $nc); do echo -ne "C$f\t" >>  $outdir/$bn.final.h.txt; done
20 | echo -e "Score\tIID\tID" >>  $outdir/$bn.final.h.txt
21 | cat  $outdir/$bn.final.txt >>  $outdir/$bn.final.h.txt
22 | 


--------------------------------------------------------------------------------
/pipeline/runTrim.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -A b2010064
 3 | #SBATCH -n 8
 4 | #SBATCH -t 1-0:00:00
 5 | #SBATCH --mail-user david.sundell@plantphys.umu.se
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | prog="/home/davidsu/opt/Trimmomatic-0.22/trimmomatic_0.22.sh"
 9 | 
10 | ####
11 | ##		Run trimmomatic
12 | ###
13 | 
14 | ## Usage: runTrim.sh inptFolder forwardReads reverseReads outputName
15 | ## The trimmed reads are written to .../david_pipeline/<outputName>/trimmed
16 | 
17 | if [ $# -lt 4 ]; then
18 | 	echo >&2 "Usage: runTrim.sh inptFolder forwardReads reverseReads outputName"
19 | 	exit 1
20 | fi
21 | 
22 | cd "$1" || exit 1
23 | OUT="/proj/b2010064/nobackup/david_pipeline/$4/trimmed"
24 | ## the output names are the input names up to the .fq extension
25 | n1=${2%.fq*}
26 | n2=${3%.fq*}
27 | in1=$2
28 | in2=$3
29 | ## symbolic links are resolved to the file they point to
30 | if [ -h "$2" ]; then
31 | 	in1=$(readlink -f "$2")
32 | fi
33 | if [ -h "$3" ]; then
34 | 	in2=$(readlink -f "$3")
35 | fi
36 | 
37 | #run
38 | sh "$prog" "$in1" "$in2" "$OUT/${n1}_FP.fq.gz" "$OUT/${n1}_FU.fq.gz" "$OUT/${n2}_FP.fq.gz" "$OUT/${n2}_FU.fq.gz" ILLUMINACLIP:"/proj/b2010064/analysis/illuminaClippingPoly.fa":2:40:14 MINLEN:50


--------------------------------------------------------------------------------
/pipeline/join_targets.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | dir=/mnt/picea/home/katja/aspen_sRNA/Potra/ShortStack/targets
 4 | 
 5 | cat $dir/psRNATarget_*_Potra_4.txt >> $dir/targets_all_Potra_4.txt
 6 | grep "miRNA_[0-9]" $dir/targets_all_Potra_4.txt > $dir/targets_all_Potra.tmp
 7 | grep "miRNA_Acc." $dir/psRNATarget_21nt_Potra_4.txt > $dir/targets_all_Potra_4.txt
 8 | cat $dir/targets_all_Potra.tmp >> $dir/targets_all_Potra_4.txt
 9 | rm $dir/targets_all_Potra.tmp
10 | 
11 | cat $dir/psRNATarget_*_Potri_4.txt >> $dir/targets_all_Potri_4.txt
12 | grep "miRNA_[0-9]" $dir/targets_all_Potri_4.txt > $dir/targets_all_Potri.tmp
13 | grep "miRNA_Acc." $dir/psRNATarget_21nt_Potri_4.txt > $dir/targets_all_Potri_4.txt
14 | cat $dir/targets_all_Potri.tmp >> $dir/targets_all_Potri_4.txt
15 | rm $dir/targets_all_Potri.tmp
16 | 


--------------------------------------------------------------------------------
/templates/bash/submitSeidrBackbone.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | account=CHANGEME
 4 | email=CHANGEME
 5 | 
 6 | # define the thresholds
 7 | thresholds=( 2.33 2.05 1.88 1.75 1.64 1.55 1.48 1.41 1.34 1.28 )
 8 | 
 9 | # Load the tools
10 | module load bioinfo-tools seidr-devel
11 | 
12 | # process the argument
13 | network=$(realpath ../data/seidr/aggregate/aggregated.sf)
14 | out=$(realpath ../data/seidr/backbone)
15 | 
16 | if [ ! -d $out ]; then
17 |   mkdir -p $out
18 | fi
19 | 
20 | # submit
21 | for i in {0..9}; do
22 |   j=$(expr $i + 1)
23 |   sbatch -A $account --mail-user=$email \
24 |   -o $out/backbone-${j}-percent.out \
25 |   -e $out/backbone-${j}-percent.err -J bb-${j}  \
26 |   ../UPSCb-common/pipeline/runSeidrBackbone.sh $network ${thresholds[$i]} \
27 |   $out/backbone-${j}-percent.sf
28 | done
29 | 


--------------------------------------------------------------------------------
/pipeline/runBedToolsGCov.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Write the bedtools genomecov (-max 1) table of a bam file to
 9 | ## <out dir>/<bam basename without .bam>.txt
10 | 
11 | ## stop on error
12 | set -e
13 | 
14 | ## we get one dir and one file as input
15 | if [ $# != 2 ]; then
16 |     echo "This function takes one directories and one bam file as arguments"
17 |     echo "Usage: sbatch runBedToolsGCov.sh <out dir> <in.bam>"
18 |     exit 1
19 | fi
20 | 
21 | ## both checks stop the script; going on would only fail later on
22 | if [ ! -d "$1" ]; then
23 |     echo "The first argument needs to be an existing directory"
24 |     exit 1
25 | fi
26 | 
27 | if [ ! -f "$2" ]; then
28 |     echo "The second argument needs to be an existing bam file"
29 |     exit 1
30 | fi
31 | nam=$(basename "${2//.bam/}")
32 | 
33 | ## get the coverage table
34 | bedtools genomecov -ibam "$2" -max 1 > "$1/$nam.txt"


--------------------------------------------------------------------------------
/pipeline/runTremulaTranslate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -t 40:00
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Convert a BAM file with tremula_scaffold_bamconvert.pl and a translation
 8 | # table, and write the result as BAM.
 9 | 
10 | # this has to come after the #SBATCH lines: sbatch stops reading them at
11 | # the first command of the script
12 | set -eu
13 | 
14 | if [ $# -lt 3 ]; then
15 |     echo >&2 "usage: $0 <bam> <transtable> <out.bam>"
16 |     exit 1
17 | fi
18 | 
19 | inbam=$1
20 | transtable=$2
21 | outbam=$3
22 | 
23 | if [[ ! -f $inbam ]]; then
24 |     echo "Could not find BAM file"
25 |     exit 1
26 | fi
27 | 
28 | if [[ ! -f $transtable ]]; then
29 |     echo "Could not find translation table"
30 |     exit 1
31 | fi
32 | 
33 | if [[ ! -d $(dirname "$outbam") ]]; then
34 |     echo "Could not find directory for output"
35 |     exit 1
36 | fi
37 | 
38 | # ${UPSCb:-} lets the message be printed when the variable is unset
39 | # (a bare $UPSCb would abort first because of set -u)
40 | if [[ -z ${UPSCb:-} ]]; then
41 |     echo "The UPSCb environment variable needs to be set"
42 |     exit 1
43 | fi
44 | 
45 | module load bioinfo-tools samtools
46 | 
47 | # the alignments are handed to the perl script with their header (-h)
48 | perl "$UPSCb/src/perl/tremula_scaffold_bamconvert.pl" <(samtools view -h "$inbam") "$transtable" | \
49 |     samtools view -hSb - > "$outbam"


--------------------------------------------------------------------------------
/pipeline/runThresholdNetwork.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mem=100G
 3 | #SBATCH --mail-type=all
 4 | #SBATCH -n 1
 5 | #SBATCH -J scThresh
 6 | 
 7 | # Run the scgraph threshold tool on an input file and redirect its
 8 | # standard output to the output file.
 9 | 
10 | #Make sure we have igraph
11 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
12 | 
13 | # print the expected arguments and exit with status 1
14 | usage(){
15 |     echo "Usage: runThresholdNetwork.sh <input file> <output file> ['other options']"
16 |     echo "Important: Other options must be quoted"
17 | 
18 |     exit 1
19 | }
20 | 
21 | # two mandatory arguments and at most one (quoted) string of options
22 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then
23 |     usage
24 | fi
25 | 
26 | INF=$1
27 | OUTF=$2
28 | OPTS=${3:-}
29 | 
30 | if [ ! -f "$INF" ];then
31 |     usage
32 | fi
33 | 
34 | if [ -z "${UPSCb:-}" ];then
35 |     echo "You must set the UPSCb environment variable for this script"
36 |     usage
37 | fi
38 | 
39 | # $OPTS is left unquoted on purpose: it holds several options in one string
40 | "$UPSCb/src/cpp/scgraph/threshold" -i "$INF" $OPTS > "$OUTF"


--------------------------------------------------------------------------------
/pipeline/runDiamondMakedb.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=FAIL
 5 | #SBATCH -t 1:00:00
 6 | 
 7 | # Build a diamond database from a fasta file, using a singularity container.
 8 | 
 9 | set -eux
10 | 
11 | # helper functions (abort is used below)
12 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
13 | 
14 | USAGETXT=\
15 | "
16 |   Usage: $0 <singularity diamond container> <fasta> <indexName>
17 | "
18 | 
19 | [[ $# -ne 3 ]] && abort "This script expects 3 arguments"
20 | 
21 | [[ ! -f $1 ]] && abort "The singularity container file does not exist"
22 | 
23 | [[ ! -f $2 ]] && abort "The input fasta file does not exist"
24 | 
25 | [[ ! -d $(dirname "$3") ]] && abort "The output directory for the index does not exist"
26 | 
27 | # ${...:-} keeps set -u from aborting before the message can be printed
28 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
29 | 
30 | singularity exec "$1" diamond makedb --in "$2" -d "$3"


--------------------------------------------------------------------------------
/pipeline/runAsArray.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | 
 5 | set -ex
 6 | 
 7 | # print the usage on stderr and exit with status 1
 8 | usage(){
 9 |   echo >&2 \
10 | "
11 | Usage: $(basename $0) <script> <file list> <additional arguments>
12 | 
13 | This runs an array of the script using as arguments:
14 | (i) iteratively a line of file list (i.e. specific arguments, such as filename(s))
15 | (ii) common arguments provided as additional arguments on the command line
16 | 
17 | The line used is selected by SLURM_ARRAY_TASK_ID, counting from 0
18 | (e.g. sbatch --array=0-9 for a file list of 10 lines)
19 | "
20 | exit 1
21 | }
22 | 
23 | # the script and the file list are mandatory
24 | if [ $# -lt 2 ]; then
25 |   echo
26 |   usage
27 | fi
28 | 
29 | # check that the script exists
30 | if [ ! -f "$1" ]; then
31 |   echo 
32 |   usage
33 | fi
34 | script=$1
35 | shift
36 | 
37 | # check that the file list exists
38 | if [ ! -f "$1" ]; then
39 |   echo 
40 |   usage
41 | fi
42 | 
43 | # read the file list, one array element per line
44 | readarray -t array < "$1"
45 | shift
46 | 
47 | # the task ID selects the line; it has to be set and within the list
48 | if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then
49 |   echo >&2 "SLURM_ARRAY_TASK_ID is not set, submit this script with sbatch --array"
50 |   usage
51 | fi
52 | 
53 | if [ "$SLURM_ARRAY_TASK_ID" -ge "${#array[@]}" ]; then
54 |   echo >&2 "The array task ID $SLURM_ARRAY_TASK_ID is beyond the last line of the file list"
55 |   usage
56 | fi
57 | 
58 | # run the jobs; the line is left unquoted on purpose so that it is split
59 | # into separate arguments, the common arguments are passed on unchanged
60 | bash "$script" ${array[$SLURM_ARRAY_TASK_ID]} "$@"


--------------------------------------------------------------------------------
/src/R/mailR.R:
--------------------------------------------------------------------------------
 1 | # Send an e-mail from R through the system 'mail' command.
 2 | #
 3 | # to:      recipient address (character)
 4 | # subject: subject line (character, at most 2000 characters)
 5 | # msg:     message body (character, at most 20 MB)
 6 | #
 7 | # Stops with an error when an argument is not character, when the address does
 8 | # not match the e-mail pattern below or when a size limit is exceeded.
 9 | # Returns the value of system() for the assembled command.
10 | mailR <- function(to,subject,msg) {
11 |   library(stringr)
12 |   if(!is.character(to) || !is.character(subject) || !is.character(msg)) {
13 |     stop("Please supply only character arguments to this function")
14 |   }
15 |   reg.test <- str_detect(to,"[A-Za-z0-9\\.]+\\@[A-Za-z0-9]+\\.[A-Za-z0-9\\.]+")
16 |   if(!reg.test){
17 |     stop("Your email doesn't appear to be valid")
18 |   }
19 |   if(nchar(subject)>2000){
20 |     stop("Please restrict your subject to 2000 or less characters")
21 |   }
22 |   if(as.integer(object.size(msg))/1024^2>20){
23 |     stop("Please restrict your message to be 20 MB or less")
24 |   }
25 |   # shQuote() single-quotes every value for the shell. The values used to be
26 |   # pasted between double quotes, where the shell still expands (and runs)
27 |   # $(...), backticks and $VAR found in the message, subject or address.
28 |   # Double quotes in the text no longer need to be replaced and are kept.
29 |   cmd <- paste("printf '%s\\n'",shQuote(msg),
30 |                "| mail -s",shQuote(subject),
31 |                "-a",shQuote("From: RStudio"),
32 |                "--to",shQuote(to))
33 |   system(cmd)
34 | }


--------------------------------------------------------------------------------
/pipeline/runSamtoolsFaidx.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Index a fasta file with 'samtools faidx' run from a singularity container
 8 | 
 9 | ## stop on error
10 | set -eu
11 | 
12 | # functions; abort is not defined in this script, so it is expected to come from here
13 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
14 | 
15 | ## a usage function
16 | USAGETXT=\
17 | "
18 | Usage: $0 <samtools singularity container> <fasta file>
19 | "
20 | 
21 | # safeguards
22 | [[ $# != 2 ]] && abort "This function takes two arguments"
23 | 
24 | [[ ! -f "$1" ]] && abort "The first argument needs to be the singularity container file"
25 | 
26 | # the message used to talk about the first argument and a bam file
27 | [[ ! -f "$2" ]] && abort "The second argument needs to be an existing fasta file"
28 | 
29 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
30 | 
31 | ## create the index
32 | singularity exec "$1" samtools faidx "$2"
33 | 


--------------------------------------------------------------------------------
/pipeline/runDISCOVARdeNovo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=ALL
 3 | 
 4 | # Run DiscovarDeNovo on a comma separated list of read files, writing into an
 5 | # existing output directory. Further arguments are passed on to DiscovarDeNovo.
 6 | module load jemalloc gcc
 7 | 
 8 | usage () 
 9 | {
10 | 	echo "runDISCOVARdeNovo.sh <fastq1,fastq2,fastq3,fastq4...> <out_dir> <Discovar options>"
11 | 	echo
12 | 	echo "Note: This script expects the input FASTQ (or SAM/BAM) reads in a"
13 | 	echo "comma separated list just as DISCOVAR does"
14 | 	exit 1
15 | }
16 | 
17 | #Argn check
18 | 
19 | if [ $# -lt 2 ]; then
20 | 	usage
21 | fi
22 | 
23 | # File and dir errors
24 | # split the list on the commas only, so that a file name with spaces survives
25 | IFS=',' read -r -a reads <<< "$1"
26 | for f in "${reads[@]}"; do
27 | 	# an empty entry (e.g. a trailing comma) was ignored before, keep doing so
28 | 	if [ -z "$f" ]; then
29 | 		continue
30 | 	fi
31 | 	if [ ! -f "$f" ]; then
32 | 		echo "File $f not found. Exiting"
33 | 		usage
34 | 	fi
35 | done
36 | 
37 | if [ ! -d "$2" ]; then
38 | 	echo "Directory $2 not found. Exiting"
39 | 	usage
40 | fi
41 | 
42 | # Arg assignments
43 | INF=$1
44 | OUTD=$2
45 | 
46 | shift 2
47 | 
48 | # the remaining arguments are handed over one by one ("$@"); flattened into a
49 | # single string first, an option containing spaces would be split again
50 | DiscovarDeNovo "READS=$INF" "OUT_DIR=$OUTD" "$@"
51 | 


--------------------------------------------------------------------------------
/pipeline/runSampleN.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Subsample the read pair <in>/<file prefix>_1.fq.gz and _2.fq.gz with sampleN;
 4 | # the subset size is given in million reads.
 5 | 
 6 | usage(){
 7 |   echo "$0 <in> <out> <file prefix> <subset size in M reads>"
 8 |   exit 1
 9 | } 
10 | 
11 | # sanity check
12 | if [ $# -ne 4 ]; then
13 |   echo "This script expects 4 arguments"
14 |   usage
15 | fi
16 | 
17 | # the vars
18 | in=$1
19 | out=$2
20 | nam=$3
21 | subset=$4
22 | 
23 | # sanity check
24 | if [ ! -d "$in" ]; then
25 |   echo "The first arg needs to be the input dir"
26 |   usage
27 | fi
28 | 
29 | if [ ! -d "$out" ]; then
30 |   echo "The second arg needs to be the output dir"
31 |   usage
32 | fi
33 | 
34 | if [ ! -f "$in/${nam}_1.fq.gz" ]; then
35 |   echo "The third arg needs to be the prefix (without _1.fq.gz) of the input files"
36 |   usage
37 | fi
38 | 
39 | # the reverse file is handed to sampleN as well, so it has to exist too
40 | if [ ! -f "$in/${nam}_2.fq.gz" ]; then
41 |   echo "The input dir needs to contain ${nam}_2.fq.gz as well"
42 |   usage
43 | fi
44 | 
45 | # the size is multiplied below, so it has to be a whole number
46 | if [[ ! $subset =~ ^[0-9]+$ ]]; then
47 |   echo "The fourth arg needs to be the subset size as a whole number of million reads"
48 |   usage
49 | fi
50 | 
51 | # run
52 | # 10# forces base 10, so that a size written with a leading zero is not read as octal
53 | sampleN -l "$out/${nam}_${subset}.log.txt.gz" -n $(( 10#$subset * 1000000 )) \
54 |   -o "$out/${nam}_${subset}_million" "$in/${nam}_1.fq.gz" "$in/${nam}_2.fq.gz"
55 | 


--------------------------------------------------------------------------------
/templates/bash/submitSeidrAggregate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | # Template: link every *.sf file found under ../data/seidr/results into
 4 | # ../data/seidr/sf and submit runSeidrAggregate.sh on these links.
 5 | # Set account and mail before using it.
 6 | 
 7 | account=CHANGEME
 8 | mail=CHANGEME
 9 | 
10 | # input
11 | base=$(realpath ../data/seidr)
12 | out=$(realpath ../data/seidr/aggregate)
13 | 
14 | # helpers; abort is not defined in this script, so it is expected to come from here
15 | source ../UPSCb-common/src/bash/functions.sh
16 | 
17 | # modules
18 | module load bioinfo-tools seidr-devel
19 | 
20 | # directories
21 | if [ ! -d "$base/results" ]; then
22 |   abort "Your directory structure is unexpected"
23 | fi
24 | 
25 | if [ ! -d "$base/sf" ]; then
26 |   mkdir -p "$base/sf"
27 | fi
28 | 
29 | # create links
30 | find "$base/results" -name "*.sf" -exec ln -sf -t "$base/sf" "{}" \;
31 | 
32 | if [ ! -d "$out" ]; then
33 |   mkdir -p "$out"
34 | fi
35 | 
36 | # submit
37 | # stdout goes to aggregate.out; -e and -o used to name the same .err file
38 | sbatch -A "$account" --mem=128GB --mail-user="$mail" \
39 |   -e "$out/aggregate.err" -o "$out/aggregate.out" \
40 |   -J aggregate ../UPSCb-common/pipeline/runSeidrAggregate.sh "$out" "$base"/sf/*.sf
41 | 


--------------------------------------------------------------------------------
/pipeline/runSamtoolsIdxstats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Write columns 1 and 3 of the 'samtools idxstats' output of a bam file to
 9 | ## <out dir>/<bam name without .bam>.txt
10 | 
11 | ## stop on error, also when samtools fails in front of the pipe
12 | set -eo pipefail
13 | 
14 | ## modules
15 | module load bioinfo-tools
16 | module load samtools/0.1.19
17 | 
18 | ## we get one dir and one file as input
19 | if [ $# != 2 ]; then
20 |     echo "This function takes one directories and one bam file as arguments"
21 |     echo "Usage: sbatch runSamtoolsIdxstats.sh <out dir> <in.bam>"
22 |     exit 1
23 | fi
24 | 
25 | ## both checks below used to print their message and carry on
26 | if [ ! -d "$1" ]; then
27 |     echo "The first argument needs to be an existing directory"
28 |     exit 1
29 | fi
30 | 
31 | if [ ! -f "$2" ]; then
32 |     echo "The second argument needs to be an existing bam file"
33 |     exit 1
34 | fi
35 | 
36 | ## the output is named after the bam file, without its .bam suffix
37 | nam=$(basename "$2" .bam)
38 | 
39 | ## get the coverage table
40 | samtools idxstats "$2" | cut -f 1,3 > "$1/$nam.txt"
41 | 


--------------------------------------------------------------------------------
/pipeline/runPicardSamToFastq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -t 6:00:00
 3 | #SBATCH -p main -n 2
 4 | #SBATCH --mail-type=FAIL
 5 | 
 6 | # Extract the reads of a bam file into <out>/<name>_1.fq.gz and
 7 | # <out>/<name>_2.fq.gz with Picard SamToFastq (PICARD_ROOT has to be set).
 8 | 
 9 | set -eu
10 | 
11 | # setup; abort is not defined in this script, so it is expected to come from here
12 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
13 | 
14 | # usage
15 | USAGETXT=\
16 | "
17 | Usage: $0 <bam> <out>
18 | "
19 | 
20 | # validity
21 | [[ $# -ne 2 ]] && abort "This script expects 2 arguments"
22 | [[ ! -f "$1" ]] && abort "The first argument needs to be a bam file"
23 | [[ ! -d "$2" ]] && abort "The second argument needs to be a directory"
24 | 
25 | # extract sample name
26 | # only the .bam suffix of the file name is removed; the substitution used
27 | # before removed the first .bam found anywhere in the path
28 | fnam=$(basename "$1" .bam)
29 | 
30 | # extract fastq
31 | java -jar "$PICARD_ROOT/picard.jar" SamToFastq \
32 | -I "$1" -F "$2/${fnam}_1.fq" -F2 "$2/${fnam}_2.fq" -FU "$2/${fnam}.fq" \
33 | --VALIDATION_STRINGENCY LENIENT
34 | 
35 | # compress
36 | gzip -f "$2/${fnam}_1.fq"
37 | gzip -f "$2/${fnam}_2.fq"
38 | 
39 | # clean
40 | # -f: a missing unpaired file must not fail the job once the pairs are written
41 | rm -f "$2/${fnam}.fq"
42 | 


--------------------------------------------------------------------------------
/pipeline/runFRC.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | ## Run FRC on an alignment (bam) file from within the output directory
 4 | 
 5 | ## report error
 6 | set -e
 7 | 
 8 | ## be verbose 
 9 | set -x
10 | 
11 | ## usage
12 | usage(){
13 | echo >&2 \
14 | "
15 |      Usage: runFRC.sh <alignment bam> <out dir>
16 |      Note: At the moment it only accept --pe-sam
17 | "
18 | exit 1
19 | }
20 | 
21 | ## check params
22 | if [ $# != 2 ]; then
23 |     echo "This script needs two parameters"
24 |     usage
25 | fi
26 | 
27 | if [ ! -f "$1" ]; then
28 |     echo "The first parameter should be a bam file"
29 |     usage
30 | fi
31 | 
32 | if [ ! -d "$2" ]; then
33 |     echo "The second argument has to be an existing output directory"
34 |     usage
35 | fi
36 | 
37 | # load modules
38 | module load bioinfo-tools FRC
39 | 
40 | ## the script changes into the output directory below, after which a relative
41 | ## bam path would not be found any more: make it absolute first
42 | case "$1" in
43 |     /*) bam=$1 ;;
44 |     *) bam=$PWD/$1 ;;
45 | esac
46 | 
47 | ## run
48 | cd "$2"
49 | FRC --pe-sam "$bam" \
50 | --pe-max-insert 425 --genome-size 11350000 --CEstats-PE-min -5 \
51 | --CEstats-PE-max 5.5 --output i425g1135Cmin5Cmx55
52 | 


--------------------------------------------------------------------------------
/container/apptainer/velocyto.def:
--------------------------------------------------------------------------------
 1 | BootStrap: docker
 2 | From: ubuntu:22.04
 3 | 
 4 | %environment
 5 | export LC_ALL=C
 6 | 
 7 | %post 
 8 | # the build is unattended: never wait for a package configuration dialog
 9 | export DEBIAN_FRONTEND=noninteractive
10 | 
11 | # build tools, python and the libraries needed to compile samtools
12 | apt-get update && \
13 | apt-get -y install \
14 |     build-essential \
15 |     python3-dev \
16 |     python3-pip \
17 |     python3 \
18 |     zlib1g-dev \
19 |     libhdf5-dev \
20 |     curl \
21 |     bzip2 \
22 |     git \
23 |     libncurses5-dev \
24 |     libncursesw5-dev \
25 |     libbz2-dev \
26 |     liblzma-dev \
27 |     wget
28 | apt-get clean
29 | # the package lists are only needed while installing
30 | rm -rf /var/lib/apt/lists/*
31 | 
32 | # samtools 1.18, built from the release tarball
33 | cd ~
34 | wget https://github.com/samtools/samtools/releases/download/1.18/samtools-1.18.tar.bz2
35 | tar -xvf samtools-1.18.tar.bz2
36 | rm samtools-1.18.tar.bz2
37 | cd samtools-1.18
38 | ./configure
39 | make
40 | make install
41 | 
42 | # the build tree is not needed any more once samtools is installed
43 | cd ~
44 | rm -rf samtools-1.18
45 | 
46 | # velocyto; the first pip call installs packages ahead of it
47 | pip install numpy scipy cython numba matplotlib scikit-learn h5py click
48 | pip install velocyto
49 | pip cache purge
50 | 
51 | %runscript
52 | velocyto "$@"
53 | 


--------------------------------------------------------------------------------
/pipeline/runBwaIndex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=END,FAIL
 3 | #SBATCH -p main -w picea
 4 | #SBATCH --mem=264GB
 5 | #SBATCH -t 12:00:00
 6 | 
 7 | # failsafe
 8 | set -eu
 9 | 
10 | # load helpers
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | # usage
14 | USAGETXT=\
15 | "
16 |     runBwaIndex.sh <bwa singularity container> <Genome Fasta> <Output Dir>
17 | "
18 | 
19 | # sanity
20 | [[ $# -ne 3 ]] && abort "The script expects three arguments."
21 | [[ ! -f $1 ]] && abort "BWA singularity container not found"
22 | [[ ! -f $2 ]] && abort "FASTA input not found"
23 | [[ ! -d $3 ]] && abort "OUTPUT directory not found"
24 | 
25 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
26 | 
27 | # prep
28 | ln -sf $2 $3
29 | BNAM=$(basename $2)
30 | singularity exec $1 bwa index $3/$BNAM
31 | 


--------------------------------------------------------------------------------
/pipeline/runSortmernaDennis.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -c 4
 4 | #SBATCH --mem=8G
 5 | #SBATCH -p main
 6 | 
 7 | # Merge a read pair, run sortmerna on the merged file and unmerge it into the
 8 | # output directory under the names of the input files.
 9 | 
10 | usage(){
11 |     echo "runSortmerna.sh <forward_reads> <reverse_reads> <output_directory>"
12 |     # this used to read 'exit l' (the letter), which is not a valid exit status
13 |     exit 1
14 | }
15 | 
16 | # without this check missing arguments went unnoticed
17 | if [ $# -ne 3 ]
18 | then
19 |     usage
20 | fi
21 | 
22 | frw_reads=$1
23 | rev_reads=$2
24 | out_dir=$3
25 | 
26 | if [ ! -e "$frw_reads" ]
27 | then
28 |     usage
29 | fi
30 | 
31 | if [ ! -e "$rev_reads" ]
32 | then
33 |     usage
34 | fi
35 | 
36 | if [ ! -d "$out_dir" ]
37 | then
38 |     usage
39 | fi
40 | 
41 | module load bioinfo-tools sortmerna
42 | 
43 | merged_reads=${out_dir}/merged.fastq
44 | 
45 | merge-paired-reads.sh "$frw_reads" "$rev_reads" "$merged_reads"
46 | 
47 | # SORTMERNADB is not set in this script (presumably by the sortmerna module); it is left unquoted as before
48 | sortmerna --ref $SORTMERNADB -a 4 --log TRUE -paired_in TRUE --reads "$merged_reads" > "${out_dir}/summarysortmerna.log"
49 | 
50 | # the output names are the input file names; they used to be taken from the
51 | # 10th '/'-separated field of the path, which only fits one directory depth
52 | # NOTE(review): the file unmerged here is the merged input, not a sortmerna output - confirm this is intended
53 | unmerge-paired-reads.sh "$merged_reads" "${out_dir}/$(basename "$frw_reads")" "${out_dir}/$(basename "$rev_reads")"
54 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # Example code in package build process
 9 | *-Ex.R
10 | 
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | 
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | 
17 | # RStudio files
18 | .Rproj.user/
19 | *.Rproj
20 | 
21 | # produced vignettes
22 | vignettes/*.html
23 | vignettes/*.pdf
24 | 
25 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
26 | .httr-oauth
27 | 
28 | # knitr and R markdown default cache directories
29 | /*_cache/
30 | /cache/
31 | 
32 | # Temporary files created by R markdown
33 | *.utf8.md
34 | *.knit.md
35 | 
36 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
37 | rsconnect/
38 | 
39 | # data, etc.
40 | data
41 | data/
42 | analysis
43 | analysis/
44 | reference
45 | reference/
46 | singularity
47 | singularity/
48 | empty.html
49 | 


--------------------------------------------------------------------------------
/pipeline/runSeidrThreshold.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A facility
 3 | #SBATCH -p main -n 16
 4 | #SBATCH -t 12:00:00
 5 | #SBATCH --mail-type=ALL
 6 | #SBATCH --mem=96GB
 7 | 
 8 | # Run 'seidr threshold' on a seidr file, using CPU threads
 9 | 
10 | # variables
11 | CPU=16
12 | 
13 | # usage
14 | USAGETXT=\
15 | "
16 |   Usage: $0 <seidr file> <output filename>
17 |   
18 | "
19 | 
20 | # sanity
21 | # quoted, so that the test does not depend on how an empty or
22 | # space-containing value is split
23 | if [ -z "${UPSCb:-}" ]; then
24 |   echo "Set your UPSCb environment variable"
25 |   exit 1
26 | fi
27 | 
28 | # helpers; abort and isExec are not defined in this script, so they are expected to come from here
29 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
30 | 
31 | isExec seidr
32 | 
33 | if [ $# -ne 2 ]; then
34 |   abort "This script expects 2 arguments"
35 | fi
36 | 
37 | if [ ! -f "$1" ]; then
38 |   abort "The first argument needs to be an existing file"
39 | fi
40 | 
41 | if [ ! -d "$(dirname "$2")" ]; then
42 |   abort "The second argument directory needs to exist"
43 | fi
44 | 
45 | # run
46 | export OMP_NUM_THREADS=$CPU
47 | seidr threshold -n 10000 -m 0 -M 1 -O "$CPU" -o "$2" "$1"
48 | 


--------------------------------------------------------------------------------
/pipeline/runGATK_CombineGVCFs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 10:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Combine one or more gvcf files into a single file with 'gatk CombineGVCFs'
 8 | 
 9 | set -eux
10 | 
11 | # helper; abort, isExec and usage are not defined in this script, so they are
12 | # expected to come from here
13 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
14 | 
15 | USAGETXT=\
16 | "
17 | Usage: $0 <ref.fa> <out.vcf> <gvcf> [<gvcf> ...]
18 | 
19 | Notes:This script is GATK v4 compatible and GATK V3 incompatible.
20 | "
21 | 
22 | # check
23 | isExec gatk
24 | 
25 | if [ $# -lt 3 ]; then
26 |     usage
27 | fi
28 | 
29 | if [ ! -f "$1" ]; then
30 |     abort "ERROR: could not find reference: '$1'"
31 | fi
32 | 
33 | ref=$1
34 | shift
35 | 
36 | out=$1
37 | shift
38 | 
39 | # one '-V <file>' pair per gvcf
40 | variants=()
41 | for gvcf in "$@"; do
42 |     if [ ! -f "$gvcf" ]; then
43 |         abort "ERROR: file not found: '$gvcf'"
44 |     fi
45 |     # stored as two words: '-V file' as a single word only worked because the
46 |     # array was expanded unquoted, which also split file names with spaces
47 |     variants+=(-V "$gvcf")
48 | done
49 | 
50 | # combine the GVCFs
51 | gatk CombineGVCFs -R "$ref" "${variants[@]}" -O "$out"
52 | 


--------------------------------------------------------------------------------
/pipeline/runPearsonSpearmanCorrelation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | # Run getcor on a transposed matrix and a gene list from within the output dir
 7 | 
 8 | set -ex
 9 | 
10 | usage(){
11 |   echo >&2 \
12 |   "
13 |   This script expects three arguments: the transposed matrix, the gene list and the output dir.
14 |   The transposed matrix should have no column nor row names.
15 |   "
16 |   exit 1
17 | }
18 | 
19 | # print the absolute form of a path (relative paths are taken from the
20 | # current directory)
21 | abspath(){
22 |   case "$1" in
23 |     /*) echo "$1" ;;
24 |     *) echo "$PWD/$1" ;;
25 |   esac
26 | }
27 | 
28 | if [ $# != 3 ]; then
29 |   echo "ERROR: This script expect three arguments"
30 |   usage
31 | fi
32 | 
33 | if [ ! -f "$1" ]; then
34 |   echo "ERROR: The first argument should be an existing file"
35 |   usage
36 | fi
37 | 
38 | if [ ! -f "$2" ]; then
39 |   echo "ERROR: The second argument should be an existing file"
40 |   usage
41 | fi
42 | 
43 | if [ ! -d "$3" ]; then
44 |   echo "ERROR: The third argument should be an existing directory"
45 |   usage
46 | fi
47 | 
48 | # the script changes into the output dir below, after which relative input
49 | # paths would not be found any more: make them absolute first
50 | matrix=$(abspath "$1")
51 | genes=$(abspath "$2")
52 | 
53 | # run
54 | cd "$3"
55 | ~bastian/Git/geneNetworkR/src/corutil/bin/getcor "$matrix" "$genes"
56 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_spruce_v2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Picea-abies/v2.0/fasta/Picab02_chromosomes_and_unplaced.fa.gz",
 5 |     "gff": "<reference>/Picea-abies/v2.0/gff3/Picab02_230926_at01_all_sorted.gff.gz",
 6 |     "transcript_fasta": "<reference>/Picea-abies/v2.0/fasta/Picab02_230926_at01_all_mRNA.fa.gz",
 7 |     "salmon_index": "<reference>/Picea-abies/v2.0/indices/salmon/Picab02_230926_at01_all_mRNA_salmon-version-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": true,
14 |     "skip_alignment": true
15 | }
16 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_tomato_v4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Solanum-lycopersicum/v4.0/fasta/Slycopersicum_691_SL4.0.fa.gz",
 5 |     "gff": "<reference>/Solanum-lycopersicum/v4.0/gff3/Slycopersicum_691_ITAG4.0.gene_exons.gff.gz",
 6 |     "transcript_fasta": "<reference>/Solanum-lycopersicum/v4.0/fasta/Slycopersicum_691_ITAG4.0.transcript.fa.gz",
 7 |     "salmon_index": "<reference>/Solanum-lycopersicum/v4.0/indices/salmon_with_decoy_v_1_9_0",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": false,
14 |     "skip_alignment": true
15 | }


--------------------------------------------------------------------------------
/pipeline/runInfomap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Run Infomap on a pajek (.net) graph file from within the output directory
 8 | 
 9 | ## stop on error and be verbose in the output
10 | set -e -x
11 | 
12 | # load the modules
13 | module load bioinfo-tools InfoMap
14 | 
15 | # usage function
16 | usage(){
17 | echo >&2 \
18 | "
19 | 	Usage: $0 <pajek (.net) graph file> <out dir>
20 | 
21 | 	Notes:
22 | 		The script only accept parjek formatted graph files
23 | "
24 | 	exit 1
25 | }
26 | 
27 | # check the arguments
28 | if [ $# -ne 2 ]; then
29 | 	echo "This script expects two arguments"
30 | 	usage
31 | fi
32 | 
33 | if [ ! -f "$1" ]; then
34 | 	echo "The pajek graph file: $1 does not exist"
35 | 	usage
36 | fi
37 | 
38 | # the previous test never triggered: the space in front of ']' was missing
39 | # (the test itself failed) and it compared the extension, which comes
40 | # without its dot, to ".net"
41 | if [[ "$1" != *.net ]]; then
42 |   echo "The graph file needs to be a pajek file, i.e. have a .net extension"
43 |   usage
44 | fi
45 | 
46 | if [ ! -d "$2" ]; then
47 |   echo "The output directory: $2 does not exist"
48 |   usage
49 | fi
50 | 
51 | # the script changes into the output directory below, after which a relative
52 | # graph path would not be found any more: make it absolute first
53 | case "$1" in
54 |   /*) graph=$1 ;;
55 |   *) graph=$PWD/$1 ;;
56 | esac
57 | 
58 | # run the command
59 | cd "$2"
60 | Infomap -i pajek -u "$graph" .
61 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_arabidopsis_araport11.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Arabidopsis-thaliana/TAIR10/fasta/genome.fa.gz",
 5 |     "gtf": "<reference>/Arabidopsis-thaliana/ARAPORT11/2024_release/Araport11_October2024.gtf.gz",
 6 |     "transcript_fasta": "<reference>/Arabidopsis-thaliana/ARAPORT11/2024_release/Araport11_cdna_20241007_from_gtf.fasta.gz",
 7 |     "salmon_index": "<reference>/Arabidopsis-thaliana/ARAPORT11/2024_release/Salmon_Index",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": true,
14 |     "skip_alignment": true
15 | }
16 | 


--------------------------------------------------------------------------------
/pipeline/runCLR.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | # Run the CLR.R script on a transposed matrix from within the output dir,
 7 | # into which genepair and CLR.R are linked first
 8 | 
 9 | set -ex
10 | 
11 | usage(){
12 |   echo >&2 \
13 |   "
14 |   This script expects two arguments: the transposed matrix and the output dir.
15 |   The transposed matrix should have no column nor row names.
16 |   "
17 |   exit 1
18 | }
19 | 
20 | if [ $# != 2 ]; then
21 |   echo "ERROR: This script expects two arguments"
22 |   usage
23 | fi
24 | 
25 | if [ ! -f "$1" ]; then
26 |   echo "ERROR: The first argument should be an existing file"
27 |   usage
28 | fi
29 | 
30 | if [ ! -d "$2" ]; then
31 |   echo "ERROR: The second argument should be an existing directory"
32 |   usage
33 | fi
34 | 
35 | # both paths below start from UPSCb; unset, they would point at /src/...
36 | if [ -z "${UPSCb:-}" ]; then
37 |   echo "ERROR: You must set the UPSCb environment variable for this script"
38 |   usage
39 | fi
40 | 
41 | # global vars
42 | gp=$UPSCb/src/c/genepair
43 | rs=$UPSCb/src/NetworkCrowd/CLR/CLR.R
44 | 
45 | # load module
46 | module load R
47 | 
48 | # the script changes into the output dir below, after which a relative
49 | # matrix path would not be found any more: make it absolute first
50 | case "$1" in
51 |   /*) data=$1 ;;
52 |   *) data=$PWD/$1 ;;
53 | esac
54 | 
55 | # create the structure
56 | cd "$2"
57 | ln -sf "$gp" .
58 | ln -sf "$rs" .
59 | 
60 | # run
61 | Rscript --vanilla "$rs" --data "$data"
62 | 


--------------------------------------------------------------------------------
/pipeline/runSRnaWorkBenchFilter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH -t 1-00:00:00
 6 | 
 7 | # Run the 'filter' tool of the sRNA workbench on a fasta file with a given
 8 | # parameter file; the result is written to <output>/<name>_filtered.fa
 9 | 
10 | set -ex
11 | 
12 | # module load bioinfo-tools java
13 | 
14 | usage(){
15 | echo >&2 \
16 | "
17 | 	Usage: $0 file config output
18 | "
19 | 	exit 1
20 | }
21 | 
22 | if [ $# -ne 3 ]; then
23 | 	echo "This file expects 3 arguments"
24 | 	usage
25 | fi 
26 | 
27 | if [ ! -f "$1" ]; then
28 | 	echo "The first argument should be an existing file"
29 | 	usage
30 | fi
31 | f=$1
32 | 
33 | # the check used to read $cfg before it was assigned, so it never triggered
34 | if [ ! -f "$2" ]; then
35 | 	echo "The second argument should be an existing file"
36 | 	usage
37 | fi
38 | cfg=$2
39 | 
40 | if [ ! -d "$3" ]; then
41 | 	echo "The third argument should be an existing dir"
42 | 	usage
43 | fi
44 | out=$3
45 | 
46 | # the output is named after the input, without its .lane.clean.fa suffix
47 | name=$(basename "$f" .lane.clean.fa)
48 | java -Xmx8G -jar /mnt/picea/Modules/apps/bioinfo/srna-workbench/3.2/Workbench.jar -tool filter -srna_file "$f" -out_file "$out/${name}_filtered.fa" -params "$cfg"
49 | 


--------------------------------------------------------------------------------
/pipeline/runGenomeTools.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p core
 3 | #SBATCH -n 2
 4 | #SBATCH --mem=16GB
 5 | #SBATCH --mail-type=END,FAIL
 6 | #SBATCH -t 12:00:00
 7 | 
 8 | # Rewrite a gff file with 'gt gff3' (genome tools) run from a singularity container
 9 | 
10 | # stop on error and undefined vars
11 | set -eu
12 | 
13 | # source helpers; abort is not defined in this script, so it is expected to come from here
14 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
15 | 
16 | USAGETXT=\
17 | "
18 | Usage: $0 <singularity genometools> <output gff file> <input gff file>
19 | "
20 | ## arguments
21 | [[ $# -ne 3 ]] && abort "This script takes three arguments"
22 | [[ ! -f "$1" ]] && abort "The first argument needs to be an existing genome tools singularity container file."
23 | # both tests below used -d on the argument itself, i.e. they asked for
24 | # directories where files are expected and rejected every valid call
25 | [[ ! -d "$(dirname "$2")" ]] && abort "The second argument needs to be the output gff filename, in an existing directory."
26 | [[ ! -f "$3" ]] && abort "The third argument needs to be an existing input gff file."
27 | 
28 | ## start
29 | singularity exec "$1" gt gff3 -force -tidy yes -addintrons yes -addids yes \
30 | -fixregionboundaries yes -retainids yes -sort yes -checkids yes -o "$2" "$3"
31 | 


--------------------------------------------------------------------------------
/src/R/WgcnaClusterPlot.R:
--------------------------------------------------------------------------------
 1 | "WgcnaClusterPlot" <- function(dat,xlabels=NULL,...){
 2 |   
 3 |   ## Draw a clusterplot of a matrix with a fixed colour scheme.
 4 |   ## dat:     the matrix to plot
 5 |   ## xlabels: the labels handed to clusterplot; the column names of dat when NULL
 6 |   ## ...:     passed on to clusterplot
 7 |   ## Returns TRUE, invisibly.
 8 |   
 9 |   ## the LSD package has to load and the input has to be a matrix
10 |   stopifnot(require(LSD))
11 |   stopifnot(is.matrix(dat))
12 |   
13 |   ## fall back on the column names when no labels were given
14 |   if(is.null(xlabels)){
15 |     message("No labels were provided, defining them")
16 |     xlabels <- colnames(dat)
17 |     message("Setting the col names as labels")
18 |   }
19 |   
20 |   ## colour scheme: a 9 step palette and the colours of the quartile lines
21 |   palette9 <- colorRampPalette(c("dodgerblue3","lightcyan3","lightgrey"))(9)
22 |   quartile.colours <- c("black","darkgrey","darkgrey")
23 |   
24 |   ## plot
25 |   clusterplot(dat,colpal=palette9,quartiles.col=quartile.colours,
26 |               xlabels=xlabels,ylab="vst expression",xlab="",...)
27 |   
28 |   ## return
29 |   invisible(TRUE)
30 | }


--------------------------------------------------------------------------------
/pipeline/runBedToolsBamToFastq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 1-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Name-sort a bam file and extract its reads into two gzipped fastq files
 9 | 
10 | ## stop on error
11 | set -ex
12 | 
13 | ## load the modules
14 | module load bioinfo-tools
15 | module load BEDTools
16 | module load samtools
17 | 
18 | ## we get one bam file and two fastq file names as input
19 | usage(){
20 |     echo >&2 \
21 |     "Usage: $0 <bam file> <fastq fwd file> <fastq rev file>"
22 |     exit 1
23 | }
24 | 
25 | if [ $# != 3 ]; then
26 |   echo "This function requires 3 arguments"
27 |   usage;
28 | fi
29 | 
30 | if [ ! -f "$1" ]; then
31 |     echo "The first argument needs to be an existing file"
32 |     usage;
33 | fi
34 | 
35 | ## the name-sorted copy is written next to the input; only the .bam suffix
36 | ## is replaced (the substitution used before replaced every .bam in the path)
37 | prefix=${1%.bam}.nsorted
38 | 
39 | ## samtools
40 | ## NOTE(review): '<in.bam> <out.prefix>' is the legacy 'samtools sort' syntax - confirm that the loaded samtools module accepts it
41 | samtools sort -n "$1" "$prefix"
42 | 
43 | ## extract the fastq files
44 | bedtools bamtofastq -i "$prefix.bam" -fq "$2" -fq2 "$3"
45 | 
46 | ## cleanup
47 | rm "$prefix.bam"
48 | gzip -f "$2"
49 | gzip -f "$3"
50 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_microtom_xue.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Solanum-lycopersicum/Micro-Tom/Xue/fasta/microTom.genome.fa.gz",
 5 |     "gff": "<reference>/Solanum-lycopersicum/Micro-Tom/Xue/gff3/ordered_Xue_specified_strand_with_recovery.gff.gz",
 6 |     "transcript_fasta": "<reference>/Solanum-lycopersicum/Micro-Tom/Xue/fasta/transcripts.fa.gz",
 7 |     "salmon_index": "<reference>/Solanum-lycopersicum/Micro-Tom/Xue/indexes/salmon/genome.transcripts_with-decoy_salmon-version-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": false,
14 |     "skip_alignment": true
15 | }
16 | 


--------------------------------------------------------------------------------
/pipeline/runBESST.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | ## Run runBESST on a genome (scaffold) fasta file and an indexed bam file
 4 | 
 5 | ## report error
 6 | set -e
 7 | 
 8 | ## be verbose 
 9 | set -x
10 | 
11 | ## usage
12 | usage(){
13 | echo >&2 \
14 | "
15 |      Usage: runBESST.sh <genome fasta> <alignment bam> <out dir>
16 | "
17 | exit 1
18 | }
19 | 
20 | ## check params
21 | if [ $# != 3 ]; then
22 |     echo "This script needs three parameters"
23 |     usage
24 | fi
25 | 
26 | if [ ! -f "$1" ]; then
27 |     echo "The first parameter has to be the genome (scaffold) fasta file"
28 |     usage
29 | fi
30 | 
31 | if [ ! -f "$2" ]; then
32 |     echo "The second parameter should be a bam file"
33 |     usage
34 | fi
35 | 
36 | ## the message used to name $1.bai, while the file tested is $2.bai
37 | if [ ! -f "$2.bai" ]; then
38 |     echo "The bam file should be indexed, i.e. a $2.bai file should exist."
39 |     usage
40 | fi
41 | 
42 | if [ ! -d "$3" ]; then
43 |     echo "The third argument has to be an existing output directory"
44 |     usage
45 | fi
46 | 
47 | # load modules
48 | module load bioinfo-tools BESST
49 | 
50 | ## run
51 | runBESST -c "$1" -f "$2" -o "$3"
52 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_T89_v1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Populus-tremula_X_Populus-tremuloides/v2.0/fasta/all.genome.fa.gz",
 5 |     "gff": "<reference>/Populus-tremula_X_Populus-tremuloides/v2.0/gff3/all.genome.gff.gz",
 6 |     "transcript_fasta": "<reference>/Populus-tremula_X_Populus-tremuloides/v2.0/fasta/genome.cds.fa.gz",
 7 |     "salmon_index": "<reference>/Populus-tremula_X_Populus-tremuloides/v2.0/indices/salmon/genome.mRNA.w.putative.pseudogene_with-decoy_salmon-version-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": false,
14 |     "skip_alignment": true
15 | }
16 | 


--------------------------------------------------------------------------------
/pipeline/runMiRBase_SS.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mail-type=all
 3 | 
 4 | # Align the *.precursor_T.fa and *.mature_T.fa sequences to hairpin_T.fa with
 5 | # bwa aln / samse and keep the records whose third column is not '*'
 6 | 
 7 | module load bioinfo-tools bwa/0.7.10
 8 | 
 9 | # stop at the first failing step rather than processing incomplete files
10 | set -e
11 | 
12 | dir_ref=/mnt/picea/storage/reference/miRBase/v21/indices/BWA
13 | dir_seq=/mnt/picea/home/katja/aspen_sRNA/Potra/ShortStack/miRBase
14 | outdir=/mnt/picea/home/katja/aspen_sRNA/Potra/ShortStack/miRBase
15 | mkdir -p "$outdir"
16 | 
17 | bwa aln "$dir_ref/hairpin_T.fa" "$dir_seq"/*.precursor_T.fa > "$outdir/Potra_SS_hairpin.sai"
18 | bwa samse -n 200 "$dir_ref/hairpin_T.fa" "$outdir/Potra_SS_hairpin.sai" "$dir_seq"/*.precursor_T.fa > "$outdir/Potra_SS_hairpin.sam"
19 | 
20 | bwa aln "$dir_ref/hairpin_T.fa" "$dir_seq"/*.mature_T.fa > "$outdir/Potra_SS_miRNA.sai"
21 | bwa samse -n 200 "$dir_ref/hairpin_T.fa" "$outdir/Potra_SS_miRNA.sai" "$dir_seq"/*.mature_T.fa > "$outdir/Potra_SS_miRNA.sam"
22 | 
23 | # only lines starting with '@' are dropped as header lines (grep -v "@"
24 | # dropped every line containing an '@' anywhere); "*" needs no backslash
25 | # in an awk string
26 | awk '!/^@/ && $3 != "*"' "$outdir/Potra_SS_hairpin.sam" > "$outdir/Potra_SS_hairpin.mapped"
27 | awk '!/^@/ && $3 != "*"' "$outdir/Potra_SS_miRNA.sam" > "$outdir/Potra_SS_miRNA.mapped"
28 | 


--------------------------------------------------------------------------------
/pipeline/runGMAPIndex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -t 24:00:00
 4 | #SBATCH --mail-type=ALL
 5 | ## SBATCH --mem=24G
 6 | 
 7 | ## stop on error and be verbose
 8 | set -ex
 9 | 
10 | # ## load the modules
11 | #module load bioinfo-tools gmap-gsnap
12 | 
13 | ## usage: print the synopsis on stderr and exit with status 1
14 | usage(){
15 | echo >&2 \
16 | "
17 | 	Usage: runGMAPIndex.sh <index dir> <index name> <fasta file>
18 | "
19 | 	exit 1
20 | }
21 | 
22 | ## we get one dir, one token and one file as input
23 | if [ $# != 3 ]; then
24 |     echo "This function takes one directory, one token and one file as arguments"
25 |     usage
26 | fi
27 | 
28 | if [ ! -d "$1" ]; then
29 |     echo "The first argument needs to be the GMAP index directory"
30 |     usage
31 | fi
32 | 
33 | if [ ! -f "$3" ]; then
34 |     echo "The third argument needs to be a fasta file"
35 |     usage
36 | fi
37 | 
38 | ## run GMAP
39 | echo Indexing
40 | 
41 | ## run (arguments are quoted so that paths holding blanks stay single words)
42 | gmap_build -D "$1" -d "$2" "$3"
43 | 
44 | ## fix permission: make the index <index dir>/<index name> group writable
45 | chmod -R g+w "$1/$2"
46 | 
47 | ##
48 | echo Done
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/src/R/plotSft.R:
--------------------------------------------------------------------------------
 1 | plotSFT <- function(sft,powers=1:35,ymin=NULL,ymax=NULL){
 2 |   
 3 |   ## WGCNA has to be available
 4 |   stopifnot(require(WGCNA))
 5 |   
 6 |   ## x: first column of the fit indices
 7 |   ## y: second column, signed by the negated sign of the third column
 8 |   ##    (this is what the y axis labels as the signed R^2)
 9 |   fit <- sft$fitIndices
10 |   soft.power <- fit[,1]
11 |   signed.fit <- -sign(fit[,3])*fit[,2]
12 |   
13 |   ## y limits: the data range, unless ymin and / or ymax override it
14 |   ## (either override has to lie within [-1,1])
15 |   ylim <- range(signed.fit)
16 |   if(! is.null(ymin)){
17 |     stopifnot(ymin>=-1 & ymin <=1)
18 |     ylim[1] <- ymin
19 |   }
20 |   if(! is.null(ymax)){
21 |     stopifnot(ymax>=-1 & ymax <=1)
22 |     ylim[2] <- ymax
23 |   }
24 |   
25 |   ## empty canvas first, the points are then drawn as text labels
26 |   plot(0,0,
27 |        type="n",
28 |        xlim=range(soft.power),
29 |        ylim=ylim,
30 |        xlab="Soft Threshold (power)",
31 |        ylab="Scale Free Topology Model Fit,signed R^2",
32 |        main="Scale independence")
33 |   text(x=soft.power,y=signed.fit,labels=powers,cex=1,col="red")
34 |   
35 |   ## guide lines at +/- 0.8 and +/- 0.9
36 |   abline(h=c(-0.8,0.8),col='skyblue')
37 |   abline(h=c(-0.9,0.9),col='darkolivegreen2')
38 |   
39 |   ## nothing useful to return
40 |   invisible(TRUE)
41 | }
42 | 


--------------------------------------------------------------------------------
/pipeline/runFastQC.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 3:00:00
 5 | #SBATCH --mail-type=END,FAIL
 6 | 
 7 | # fail on ERROR
 8 | set -eux
 9 | 
10 | # load helpers (abort is used below and is not defined in this file)
11 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
12 | 
13 | # vars
14 | OPTIONS="--noextract"
15 | CPU=1
16 | 
17 | # usage
18 | USAGETXT=\
19 | "
20 |  $0 <singularity image> <outputFolder> <fastq file>
21 | "
22 | 
23 | ## arguments: exactly three are required
24 | [[ $# -ne 3 ]] && abort "This script takes three arguments"
25 | 
26 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing singularity fastqc container file"
27 | 
28 | ## enforce singularity
29 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
30 | 
31 | [[ ! -d $2 ]] && abort "The second argument needs to be an existing directory"
32 | 
33 | [[ ! -f $3 ]] && abort "The third argument needs to be a fastq file"
34 | 
35 | ## start ($OPTIONS is left unquoted on purpose: it may hold several options)
36 | singularity exec "$1" fastqc --outdir "$2" -t "$CPU" $OPTIONS "$3"
37 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_microtom_shirasawa.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Solanum-lycopersicum/Micro-Tom/Shirasawa/SLM_r2.0/fasta/SLM_r2.0.pmol.fasta.gz",
 5 |     "gtf": "<reference>/Solanum-lycopersicum/Micro-Tom/Shirasawa/SLM_r2.0/gtf/SLM_r2.0.pmol.filtered.gtf.gz",
 6 |     "transcript_fasta": "<reference>/Solanum-lycopersicum/Micro-Tom/Shirasawa/SLM_r2.0/fasta/genome.transcripts.fa.gz",
 7 |     "salmon_index": "<reference>/Solanum-lycopersicum/Micro-Tom/Shirasawa/SLM_r2.0/indices/salmon/genome.transcripts_with-decoy_salmon-version-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": false,
14 |     "skip_alignment": true
15 | }


--------------------------------------------------------------------------------
/pipeline/runBamSubset.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | ## stop on error
 4 | set -e
 5 | 
 6 | ## be verbose and extend the commands
 7 | set -x
 8 | 
 9 | ## usage: print the synopsis on stderr and exit with status 1
10 | usage(){
11 | echo >&2 \
12 | "
13 | 	Usage: runBamSubset.sh <contig list file> <bam file>
14 | 
15 | 	The <contig list file> should contain one contig name per line
16 | "
17 | 	exit 1
18 | }
19 | 
20 | if [ $# != 2 ]; then
21 |     echo "This function takes two files as arguments"
22 |     usage
23 | fi
24 | 
25 | if [ ! -f "$1" ]; then
26 |     echo "The first argument needs to be an existing file listing the contigs"
27 |     usage
28 | fi
29 | 
30 | if [ ! -f "$2" ]; then
31 |     echo "The second argument needs to be an existing bam file"
32 |     usage
33 | fi
34 | 
35 | ## get the out name: only the trailing .bam is replaced, a .bam elsewhere in the path is left alone
36 | out=${2%.bam}_Subset
37 | 
38 | ## the awk one liner: single-field lines fill the contig set (flagged, not counted, so duplicated names still match); @SQ lines and alignments are kept for listed contigs only, other header lines are always kept
39 | samtools view -h "$2" | awk '{if(NF==1){ctg[$1]=1} else {if((x=index($1,"@")) > 0){if($1 == "@SQ"){sn=$2;sub("SN:","",sn);if (ctg[sn]==1){print $0}}else{print $0}} else {if (ctg[$3]==1){print $0}}}}' "$1" - | samtools view -bS - | samtools sort - "$out"


--------------------------------------------------------------------------------
/pipeline/runPasaLoadAnnotation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 04:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error
 8 | set -ex
 9 | 
10 | ## modules
11 | module load bioinfo-tools pasa
12 | 
13 | ## a usage function: print the synopsis on stderr and exit with status 1
14 | usage(){
15 |     echo >&2 \
16 | "
17 |     Usage: $0 <config file> <genome fasta file> <gff3 file>
18 | " 
19 |     exit 1
20 | }
21 | 
22 | ## we get three files as input
23 | if [ $# != 3 ]; then
24 |     echo "This function takes one config file, one fasta file and one gff3 file as argument"
25 |     usage
26 | fi
27 | 
28 | if [ ! -f "$1" ]; then
29 |     echo "The first argument needs to be an existing config file"
30 |     usage
31 | fi
32 | 
33 | if [ ! -f "$2" ]; then
34 |     echo "The second argument needs to be an existing fasta file"
35 |     usage
36 | fi
37 | 
38 | if [ ! -f "$3" ]; then
39 |     echo "The third argument needs to be an existing gff3 file"
40 |     usage
41 | fi
42 | 
43 | ## execute (PASAHOME is not set in this file; presumably exported by the pasa module)
44 | "$PASAHOME/scripts/Load_Current_Gene_Annotations.dbi" -c "$1" -g "$2" -P "$3"
45 | 


--------------------------------------------------------------------------------
/pipeline/runGeneNetworkRPreparation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 1
 4 | #SBATCH --mail-type=ALL
 5 | # stop on error and trace the commands
 6 | set -ex
 7 | 
 8 | # usage: print the synopsis on stderr and exit with status 1
 9 | usage(){
10 | echo >&2 \
11 | "
12 | 	Usage: $0 <expression matrix> <metadata table> <output dir>
13 | 	
14 | 	The expression matrix and metadata table have to be in tab delimited format
15 | 	and may be gzipped.
16 | "
17 | 	exit 1
18 | }
19 | 
20 | # check: three arguments, two existing files and one existing directory
21 | if [ $# != 3 ]; then
22 |   echo "This script expects 3 arguments"
23 |   usage
24 | fi
25 | 
26 | if [ ! -f $1 ]; then
27 |   echo "The first argument must be a file"
28 |   usage
29 | fi
30 | 
31 | if [ ! -f $2 ]; then
32 |   echo "The second argument must be a file"
33 |   usage
34 | fi
35 | 
36 | if [ ! -d $3 ]; then
37 |   echo "The third argument must be a directory"
38 |   usage
39 | fi
40 | 
41 | # get the exec: path of geneNetworkR-preparation.R inside the installed geneNetworkR package
42 | module load R
43 | exeR=`Rscript -e 'cat(system.file("R","geneNetworkR-preparation.R",package="geneNetworkR"))'`
44 | # NOTE(review): Rscript treats every -e as an R expression, so "-e $1" below is probably
45 | # evaluated as code instead of being handed to the spun script as an option - confirm
46 | # run with knitr
47 | Rscript -e "library(knitr); spin('$exeR')" -e $1 -m $2 -f $3
48 | 
49 | 


--------------------------------------------------------------------------------
/pipeline/runCPC2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # stop on error, be verbose and expand the commands
 7 | set -e -x
 8 | 
 9 | # source helpers (usage and abort are used below and are not defined in this file)
10 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
11 | 
12 | ## usage
13 | USAGETXT=\
14 | "
15 | 	Usage: runCPC2.sh <Trinity.fasta> <out dir>
16 | 	
17 | 	Options:
18 | 	            -i    input file
19 | 	            -o    output file 
20 | 	            -r    also check the reverse strand [Default: FALSE]
21 | 	            
22 | "
23 | 
24 | # Check
25 | if [ $# -ne 2 ]; then
26 |     echo "This function needs 2 arguments"
27 |     usage
28 | fi
29 | 
30 | if [ ! -f "$1" ]; then
31 |   abort "The first argument needs to be the trinity fasta filepath"
32 | fi
33 | 
34 | if [ ! -d "$(dirname "$2")" ]; then
35 |     abort "The second argument (output dir) parent directory does not exist"
36 | fi
37 | 
38 | # run CPC2 from the parent directory of the output, /mnt being mounted in the container
39 | cd "$(dirname "$2")"
40 | docker run --rm -v /mnt:/mnt \
41 | delhomme/upscb-lncrna CPC2.py \
42 | -r TRUE -i "$1" -o "$2"
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_lupin_v2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Lupinus-albus/v2.0/fasta/Lupinus_Ribotaper_genome_reannot.fasta.gz",
 5 |     "gtf": "<reference>/Lupinus-albus/v2.0/gtf/Lalbus_Ribotaper_final_strand_changed.filtered.gtf.gz",
 6 |     "transcript_fasta": "<reference>/Lupinus-albus/v2.0/fasta/Lalbus_Ribotaper_final_strand_changed_mRNA.fa.gz",
 7 |     "salmon_index": "<reference>/Lupinus-albus/v2.0/indices/salmon/Lalbus_Ribotaper_final_strand_changed_mRNA_with_decoy_salmon-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": true,
14 |     "skip_alignment": true,
15 |     "skip_gtf_filter": true,
16 |     "skip_gtf_transcript_filter": true
17 | }
18 | 


--------------------------------------------------------------------------------
/pipeline/runRepeatMasker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -J repeatmasker
 3 | #SBATCH -p main
 4 | #SBATCH -c 8
 5 | #SBATCH -t 7-00:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## stop on error and be verbose in the output
 9 | set -e -x
10 | 
11 | # usage txt
12 | export USAGETXT=\
13 | "
14 | 	Usage: $0 <genome> <engine> <outdir> [options to RM]
15 |   Note: a default option is -qq, set another option to overwrite
16 | "
17 | 
18 | OPT="-qq"
19 | 
20 | # common function (abort is used below and is not defined in this file)
21 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
22 | 
23 | if [ "$#" -lt 3 ]; then
24 |   abort "This function expects at least three arguments"
25 | fi
26 | 
27 | genome=$1
28 | shift
29 | if [ ! -f "$genome" ]; then
30 |   abort "This function expects a fasta file as first argument"
31 | fi
32 | 
33 | engine=$1
34 | shift
35 | 
36 | outdir=$1
37 | shift
38 | if [ ! -d "$outdir" ]; then
39 |   abort "The output directory needs to exist"
40 | fi
41 | # whatever is left replaces the default option; a single extra option counts too
42 | if [ "$#" -gt 0 ]; then
43 |   OPT="$*"
44 | fi
45 | 
46 | # run ($OPT is left unquoted on purpose: it may hold several options)
47 | RepeatMasker "$genome" -e "$engine" -pa 8 -dir "$outdir" $OPT
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/pipeline/runMultiQC.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH -t 02:00:00
 6 | 
 7 | # stop on error but be verbose
 8 | set -eux
 9 | 
10 | # load helpers (abort is used below and is not defined in this file)
11 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
12 | 
13 | #  Run MultiQC
14 | USAGETXT=\
15 | "Usage: $(basename "$0") <singularity image> <analysis directory> <output directory>"
16 | 
17 | ## arguments
18 | [[ $# -lt 3 ]] &&  abort "This script takes three arguments"
19 | 
20 | ## input file
21 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing singularity multiqc container file"
22 | 
23 | ## enforce singularity
24 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
25 | 
26 | # directory
27 | [[ ! -d $2 ]] && abort "The second argument needs to be an existing analysis directory."
28 | 
29 | # output
30 | [[ ! -d $3 ]] && abort "The third argument needs to be an existing output directory."
31 | 
32 | ## start (arguments are quoted so that paths holding blanks stay single words)
33 | singularity exec "$1" multiqc -o "$3" "$2"
34 | 


--------------------------------------------------------------------------------
/src/R/GC_percent_from_fasta.R:
--------------------------------------------------------------------------------
 1 | ## both packages are required; each require() is tested
 2 | ## (a braced block would only hand its last value to stopifnot)
 3 | suppressPackageStartupMessages(
 4 |   stopifnot(
 5 |     require(tidyverse),
 6 |     require(Biostrings)
 7 |   )
 8 | )
 9 | 
10 | ## GC percent per gene from a fasta input with gene wise sequences
11 | ## Gene name to be truncated until first space
12 | ##
13 | ## fasta_file: a single character string, handed to Biostrings::readDNAStringSet
14 | ## returns: a tibble with the columns Gene and GC_percent, the latter being
15 | ##          100 * (G + C) / (A + C + G + T), i.e. other letters are ignored
16 | 
17 | gc_from_fasta <- function(fasta_file) {
18 |   # check input
19 |   stopifnot(is.character(fasta_file), length(fasta_file) == 1)
20 |   
21 |   # read fasta
22 |   fasta <- Biostrings::readDNAStringSet(fasta_file)
23 |   
24 |   # base frequencies, one row per sequence
25 |   freq <- Biostrings::alphabetFrequency(
26 |     fasta,
27 |     baseOnly = TRUE,
28 |     collapse = FALSE
29 |   )
30 |   
31 |   # the arithmetic is vectorised over the rows, no rowwise() needed;
32 |   # T is the thymine column here, not the TRUE shorthand
33 |   GC_percent <- freq %>% 
34 |     as.data.frame(row.names = sub(" .*", "", names(fasta))) %>% 
35 |     rownames_to_column(var = "Gene") %>% 
36 |     as_tibble() %>% 
37 |     mutate(GC = G + C,
38 |            AGCT = A + G + C + T,
39 |            GC_percent = (GC/AGCT)*100) %>%
40 |     dplyr::select(Gene, GC_percent)
41 |   
42 |   return(GC_percent)
43 | }
44 | 


--------------------------------------------------------------------------------
/templates/bash/submitSeidrRoc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | account=CHANGEME
 4 | email=CHANGEME
 5 | 
 6 | # Load the tools
 7 | module load bioinfo-tools seidr-devel
 8 | 
 9 | # process the argument
10 | pgs=CHANGEME
11 | ngs=CHANGEME
12 | # example: 
13 | # pgs=$(realpath ../goldStandard/Picea-abies_KEGG-based-positive-gold-standard.tsv)
14 | # ngs=$(realpath ../goldStandard/Picea-abies_KEGG-based-negative-gold-standard.tsv)
15 | 
16 | bb=$(realpath ../data/seidr/aggregate/aggregated.sf)
17 | indir=$(realpath ../data/seidr/backbone)
18 | out=$(realpath ../data/seidr/roc)
19 | 
20 | # make sure the output directory exists
21 | mkdir -p "$out"
22 | 
23 | # link the aggregated network next to the other network files, unless the link is already there
24 | bbnam=$(basename "$bb")
25 | if [ ! -h "$indir/$bbnam" ]; then
26 |   ln -sf -t "$indir" "$bb"
27 | fi
28 | 
29 | # find the network files (NUL delimited, so that no file name gets word-split)
30 | while IFS= read -r -d '' f; do
31 |   # only the trailing .sf is dropped from the name
32 |   fnam=$(basename "$f" .sf)
33 | 
34 |   # run the roc on all; stdin is detached so that sbatch cannot consume the file list
35 |   sbatch -A "$account" --mail-user="$email" \
36 |   -o "$out/${fnam}_roc.out" -e "$out/${fnam}_roc.err" \
37 |   ../UPSCb-common/pipeline/runSeidrRoc.sh "$f" "$pgs" "$ngs" \
38 |   "$out/${fnam}_roc.tsv" < /dev/null
39 | done < <(find "$indir" -name "*.sf" -print0)
40 | 


--------------------------------------------------------------------------------
/pipeline/runAnova.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | ## TODO add an option to make the name optional
 9 | 
10 | ## usage: print the help on stderr and exit with status 1
11 | usage(){
12 |   echo >&2 \
13 |   "
14 |   This script expects three arguments: the transposed matrix, the gene list and the output dir.
15 |   The transposed matrix should have no column nor row names.
16 |   "
17 |   exit 1
18 | }
19 | 
20 | if [ $# != 3 ]; then
21 |   echo "ERROR: This script expect three arguments"
22 |   usage
23 | fi
24 | 
25 | if [ ! -f "$1" ]; then
26 |   echo "ERROR: The first argument should be an existing file"
27 |   usage
28 | fi
29 | 
30 | if [ ! -f "$2" ]; then
31 |   echo "ERROR: The second argument should be an existing file"
32 |   usage
33 | fi
34 | 
35 | if [ ! -d "$3" ]; then
36 |   echo "ERROR: The third argument should be an existing directory"
37 |   usage
38 | fi
39 | 
40 | # resolve the inputs before changing directory: relative paths would otherwise
41 | # give dangling links and a failing head call
42 | matrix=$(realpath "$1")
43 | genes=$(realpath "$2")
44 | 
45 | # create the structure
46 | cd "$3"
47 | ln -sf "$matrix" NetworkE_expression_data.tsv
48 | ln -sf "$genes" NetworkE_chip_features.tsv
49 | 
50 | # number of words on the first line of the matrix
51 | nrow=$(head -1 "$matrix" | wc -w)
52 | 
53 | ~bastian/Git/geneNetworkR/src/anova/anova NetworkE 1 $nrow
54 | 


--------------------------------------------------------------------------------
/pipeline/runSSPACE-LR.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --mem=20G
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -c 1
 5 | 
 6 | set -ex
 7 | 
 8 | module load perl bioinfo-tools sspace
 9 | 
10 | # print the synopsis and exit with status 1
11 | usage()
12 | {
13 | 	echo "Usage:$0 <contigs file> <out dir> <gzipped pacbio reads>"
14 | 	exit 1
15 | }
16 | 
17 | # at least one read file is needed: with none, cat / zcat would wait on stdin
18 | if [ $# -lt 3 ]; then
19 | 	usage
20 | fi
21 | 
22 | CONTIGS=$1
23 | OUT=$2
24 | shift 2
25 | PB=( "$@" )
26 | 
27 | for f in "${PB[@]}"; do
28 | 	if [ ! -f "$f" ]; then
29 | 	echo "$f not a valid file"
30 | 	exit 1
31 | 	fi
32 | done
33 | 
34 | if [ ! -f "$CONTIGS" ]; then
35 | 	echo "Contigs file $CONTIGS does not exist"
36 | 	exit 1
37 | fi
38 | 
39 | if [ ! -d "$OUT" ]; then
40 | 	echo "Out dir $OUT does not exist"
41 | 	exit 1
42 | fi
43 | 
44 | # the first read file decides on the extension and on the compression
45 | F1=${PB[0]}
46 | EXT="${F1##*.}"
47 | 
48 | if [[ $EXT == "gz" ]]; then
49 | 	EXT=${F1%.gz}
50 | 	EXT=${EXT##*.}
51 | 	GZIP=1
52 | else
53 | 	GZIP=0
54 | fi
55 | 
56 | # all reads are concatenated in one temporary file, removed on any exit path
57 | TMP="$OUT/tmp.ct.$EXT"
58 | trap 'rm -f -- "$TMP"' EXIT
59 | 
60 | if [ $GZIP -eq 1 ];then
61 | 	zcat "${PB[@]}" > "$TMP"
62 | else
63 | 	cat "${PB[@]}" > "$TMP"
64 | fi
65 | 
66 | perl "$(which SSPACE-LongRead.pl)" -c "$CONTIGS" -p "$TMP" -b "$OUT"
67 | 


--------------------------------------------------------------------------------
/pipeline/runStarFusion.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 8
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error and be verbose in the output
 8 | set -e -x
 9 | 
10 | # load the modules
11 | module load bioinfo-tools star-fusion samtools
12 | 
13 | # OPTIONS (CPU is not used by the command below)
14 | CPU=8
15 | 
16 | # usage function: print the synopsis on stderr and exit with status 1
17 | usage(){
18 | echo >&2 \
19 | "
20 | 	Usage: $0 <star-fusion index> <star chimeric file> <out dir>
21 | "
22 | 	exit 1
23 | }
24 | 
25 | # check the arguments (quoted, so that a missing argument fails the test instead of slipping through)
26 | if [ ! -d "$1" ]; then
27 | 	echo "The star-fusion index dir: $1 does not exist"
28 | 	usage
29 | fi
30 | 
31 | if [ ! -f "$1/_ref_cdna.fasta.bowtie_idx.ok" ]; then
32 | 	echo "The star-fusion index directory: $1 does not seem to be a valid index directory"
33 | 	usage
34 | fi
35 | 
36 | if [ ! -f "$2" ]; then
37 | 	echo "The star chimeric file: $2 does not exist"
38 | 	usage
39 | fi
40 | 
41 | if [ ! -d "$3" ]; then
42 |   echo "The output directory: $3 does not exist"
43 |   usage
44 | fi
45 | 
46 | # run the commands
47 | STAR-Fusion --genome_lib_dir "$1" \
48 |              -J "$2" \
49 |              --output_dir "$3"
50 | 


--------------------------------------------------------------------------------
/pipeline/runCuffcompare.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 0-00:10:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## stop on error
 7 | set -e
 8 | 
 9 | ## 
10 | echo Loading
11 | export LD_LIBRARY_PATH=~delhomme/lib
12 | 
13 | ##
14 | echo Checking
15 | 
16 | ## we get two dir and two files as input
17 | if [ $# != 4 ]; then
18 |     echo "This function takes two directories and two files as arguments."
19 |     echo "Usage: sbatch runCuffcompare.sh <in dir> <out dir> <gene gff3> <genome softmasked fasta>"
20 |     exit 1
21 | fi
22 | 
23 | ## every failed check aborts: reporting only would let cuffcompare run on invalid input
24 | if [ ! -d "$1" ]; then
25 |     echo "The first argument needs to be an existing directory"
26 |     exit 1
27 | fi
28 | 
29 | if [ ! -d "$2" ]; then
30 |     echo "The second argument needs to be an existing directory"
31 |     exit 1
32 | fi
33 | 
34 | if [ ! -f "$3" ]; then
35 |     echo "The third argument needs to be an existing file"
36 |     exit 1
37 | fi
38 | 
39 | if [ ! -f "$4" ]; then
40 |     echo "The forth argument needs to be an existing file"
41 |     exit 1
42 | fi
43 | 
44 | ##
45 | echo Starting
46 | 
47 | ## compare every <in dir>/*/*_transcripts.gtf against the reference; stdout and stderr are kept in the out dir
48 | cuffcompare -r "$3" -R -C -V -s "$4" -o "$2" "$1"/*/*_transcripts.gtf > "$2/cuffcompare.txt" 2> "$2/cuffcompare.err"
49 | 
50 | ##
51 | echo Done
52 | 
53 | 


--------------------------------------------------------------------------------
/pipeline/runFusionInspector.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 8
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error and be verbose in the output
 8 | set -e -x
 9 | 
10 | # load the modules
11 | module load bioinfo-tools star-fusion samtools
12 | 
13 | # OPTIONS (CPU is not used by the command below)
14 | CPU=8
15 | 
16 | # usage function: print the synopsis on stderr and exit with status 1
17 | usage(){
18 | echo >&2 \
19 | "
20 | 	Usage: $0 <star-fusion index> <star chimeric file> <out dir>
21 | "
22 | 	exit 1
23 | }
24 | 
25 | # check the arguments (quoted, so that a missing argument fails the test instead of slipping through)
26 | if [ ! -d "$1" ]; then
27 | 	echo "The star-fusion index dir: $1 does not exist"
28 | 	usage
29 | fi
30 | 
31 | if [ ! -f "$1/_ref_cdna.fasta.bowtie_idx.ok" ]; then
32 | 	echo "The star-fusion index directory: $1 does not seem to be a valid index directory"
33 | 	usage
34 | fi
35 | 
36 | if [ ! -f "$2" ]; then
37 | 	echo "The star chimeric file: $2 does not exist"
38 | 	usage
39 | fi
40 | 
41 | if [ ! -d "$3" ]; then
42 |   echo "The output directory: $3 does not exist"
43 |   usage
44 | fi
45 | 
46 | # run the commands - NOTE(review): this calls STAR-Fusion, not FusionInspector, despite the file name - confirm
47 | STAR-Fusion --genome_lib_dir "$1" \
48 |              -J "$2" \
49 |              --output_dir "$3"
50 | 


--------------------------------------------------------------------------------
/nextflow/template/rnaseq_lupin_v1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "input": "<project>/doc/sample_sheet.csv",
 3 |     "outdir": "data",
 4 |     "fasta": "<reference>/Lupinus-albus/v1.0/fasta/Lalbus-20171117r1.genome_N-to-A.fasta.gz",
 5 |     "gtf": "<reference>/Lupinus-albus/v1.0/gtf/Lalbus-20171117r1-v1.annot-all-features_addedUTRs.filtered.gtf.gz",
 6 |     "transcript_fasta": "<reference>/Lupinus-albus/v1.0/fasta/Lalbus-20171117r1-v1.annot-all-features_addedUTRs.transcripts.fa.gz",
 7 |     "salmon_index": "<reference>/Lupinus-albus/v1.0/indices/salmon/Lalbus-20171117r1-v1.annot-all-features_addedUTRs.transcripts_with-decoy_salmon-1-dot-10-dot-3",
 8 |     "remove_ribo_rna": true,
 9 |     "ribo_database_manifest": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/sortmerna_manifest.txt",
10 |     "sortmerna_index": "<reference>/rRNA/sortmerna/v4.3.4/nextflow/idx",
11 |     "pseudo_aligner": "salmon",
12 |     "extra_salmon_quant_args": "--dumpEq --numGibbsSamples 30 --gcBias --seqBias --posBias",
13 |     "save_non_ribo_reads": true,
14 |     "skip_alignment": true,
15 |     "skip_gtf_filter": true,
16 |     "skip_gtf_transcript_filter": true
17 | }
18 | 


--------------------------------------------------------------------------------
/src/R/extractGff3Subset.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Extract Gff3 subtract"
 3 | #' author: "Nicolas Delhomme"
 4 | #' date: "`r Sys.Date()`"
 5 | #' output:
 6 | #'  html_document:
 7 | #'    toc: true
 8 | #'    number_sections: true
 9 | #' ---
10 | #' # Setup
11 | #' Set the working dir
12 | setwd("~/")
13 | #' ```{r set up, echo=FALSE}
14 | #' knitr::opts_knit$set(root.dir="~/")
15 | #' ```
16 | 
17 | #' Libraries
18 | suppressPackageStartupMessages(library(genomeIntervals))
19 | 
20 | #' Source the helper file
21 | source("~/Git/UPSCb/src/R/gff3Utilities.R") # presumably provides extractFromGff3UsingGeneIDs - confirm
22 | 
23 | #' Read the gff3 file
24 | gff3 <- readGff3("/mnt/picea/storage/reference/Picea-abies/v1.0/GBrowse/Pabies1.0/Gene_Prediction_Transcript_assemblies/Eugene.gff3",
25 |                  quiet=TRUE)
26 | 
27 | #' Define the gene list
28 | gene.list <- c("MA...","MA...") # NOTE(review): these look like placeholder IDs to be replaced - confirm
29 | 
30 | #' # Extract
31 | #' The subset of interest
32 | subgff3 <- extractFromGff3UsingGeneIDs(gff3=gff3,IDs=gene.list)
33 | 
34 | #' And save it
35 | writeGff3(subgff3,file="A..FILE..NAME") # NOTE(review): looks like a placeholder file name - set it before running
36 | 
37 | #' # Session Info
38 | #' ```{r session info, echo=FALSE}
39 | #' sessionInfo()
40 | #' ```
41 | #' 
42 | 


--------------------------------------------------------------------------------
/src/R/rfam5SKrakenPrep.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(Biostrings)
 3 |   library(tidyverse)
 4 | })
 5 | # Rfam release and the location of its fasta files (an ftp URL, despite the variable name)
 6 | RFAM_VERSION <- "14.4"
 7 | HTTPS_SERVER <- file.path("ftp://ftp.ebi.ac.uk/pub/databases/Rfam",RFAM_VERSION,"fasta_files")
 8 | # shell commands that were presumably used to prepare the names.dmp read below - confirm
 9 | # Taxonomy <- "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
10 | # tar -zxf taxdump.tar.gz names.dmp
11 | # sed -i 's:\t::g' names.dmp
12 | # sed -i 's:"::g' names.dmp
13 | # sed -i 's:|$::g' names.dmp
14 | # read names.dmp chunk-wise, keeping the ID and Name columns of the "scientific name" rows only
15 | Tax <- read_delim_chunked("/mnt/picea/storage/reference/Taxonomy/20210226/names.dmp",
16 |                           callback=DataFrameCallback$new(function(chunk,pos){chunk %>% filter(Type=="scientific name") %>% select(c("ID","Name"))}),
17 |                           delim="|",
18 |                   col_names=c("ID","Name","Description","Type"),
19 |                   col_types=cols(ID=col_double(),.default=col_character()))
20 | 
21 | # sequences of the Rfam family RF00001, read straight from the server
22 | R5S <- readDNAStringSet(file.path(HTTPS_SERVER,"RF00001.fa.gz"))
23 | # NOTE(review): str_spli looks like a truncated str_split call; the next line probably fails as written - confirm
24 | names(R5S) %>% head %>% str_spli
25 | # sequences of the Rfam family RF00002
26 | R5.8S <- readDNAStringSet(file.path(HTTPS_SERVER,"RF00002.fa.gz"))
27 | names(R5.8S)
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/pipeline/runTrinityTransDecoder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # stop on error, be verbose and expand the commands
 7 | set -e -x
 8 | 
 9 | # source helpers (usage and abort are used below and are not defined in this file)
10 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
11 | 
12 | ## usage
13 | USAGETXT=\
14 | "
15 | 	Usage: runTrinityTransDecoder.sh <Trinity.fasta> <out dir>
16 | "
17 | 
18 | # Check
19 | if [ $# -ne 2 ]; then
20 |     echo "This function needs 2 arguments"
21 |     usage
22 | fi
23 | 
24 | if [ ! -f "$1" ]; then
25 |   abort "The first argument needs to be the trinity fasta filepath"
26 | fi
27 | 
28 | if [ ! -d "$2" ]; then
29 |     abort "The second argument (output dir) needs to be an existing directory"
30 | fi
31 | 
32 | # run; the fasta is resolved before the cd, a relative path would not be found afterwards
33 | fasta=$(realpath "$1")
34 | cd "$2"
35 | singularity exec --bind /mnt:/mnt /mnt/picea/projects/singularity/trinity-trinotate-berlin2018.simg \
36 | /usr/local/src/TransDecoder/TransDecoder.LongOrfs -t "$fasta"
37 | 
38 | singularity exec --bind /mnt:/mnt /mnt/picea/projects/singularity/trinity-trinotate-berlin2018.simg \
39 | /usr/local/src/TransDecoder/TransDecoder.Predict -t "$fasta"
40 | 


--------------------------------------------------------------------------------
/src/bash/updateTaxonomySqlite.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | # set the dir
 5 | DIR=/mnt/picea/storage/reference/Taxonomy/`date "+%Y%m%d"`
 6 | mkdir $DIR
 7 | cd $DIR
 8 | 
 9 | # retrieve the data
10 | wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
11 | wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
12 | wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz
13 | 
14 | # extract what we need
15 | tar -zxf taxdump.tar.gz nodes.dmp names.dmp
16 | find . -name "gi*.dmp.gz" | xargs -P 2 -I{} gunzip {}
17 | 
18 | # update some table dumps
19 | sed -i 's:\t::g' nodes.dmp
20 | sed -i 's:"::g' nodes.dmp
21 | sed -i 's:|$::g' nodes.dmp
22 | sed -i 's:\t::g' names.dmp
23 | sed -i 's:"::g' names.dmp
24 | sed -i 's:|$::g' names.dmp
25 | 
26 | # create and populate the database
27 | sqlite3 taxonomy.sqlite < $UPSCb/src/sql/taxonomy-update.sql
28 | 
29 | # update the database dynamically (fix some awkwardities in the taxonomy tables)
30 | module load R
31 | Rscript ~/Git/UPSCb/src/R/updateTaxonomyDivisionTable.R
32 | 
33 | # clean up
34 | rm names.dmp nodes.dmp
35 | find . -name "*.dmp" | xargs -P 2 -I{} gzip {}
36 | 


--------------------------------------------------------------------------------
/pipeline/runBAMtoCRAM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 2-00:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | # usage txt
 9 | export USAGETXT=\
10 | "
11 | 	Usage: $0 [options] <genome fasta> <bam file>
12 | "
13 | 
14 | # common function (usage, abort and isExec are used below and are not defined in this file)
15 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
16 | 
17 | # options
18 | while getopts h option
19 | do
20 |   case "$option" in
21 |       h) usage;;
22 |       ?) usage;;
23 |   esac
24 | done
25 | shift $((OPTIND - 1))
26 | 
27 | # check the arguments
28 | if [ "$#" != 2 ]; then
29 |   abort "This function expects 2 arguments"
30 | fi
31 | 
32 | if [ ! -f "$1" ]; then
33 |   abort "This function expects a fasta file as first argument"
34 | fi
35 | 
36 | if [ ! -f "$2" ]; then
37 |   abort "This function expects a bam file"
38 | fi
39 | 
40 | # the cram name is derived from the .bam suffix: with any other name the output
41 | # would be the input itself, which is first overwritten and then deleted
42 | if [[ "$2" != *.bam ]]; then
43 |   abort "The bam file name needs to end with .bam"
44 | fi
45 | 
46 | # check tool
47 | isExec samtools
48 | 
49 | # run
50 | in=$2
51 | fasta=$1
52 | out=${in%.bam}.cram
53 | 
54 | samtools view -C -T "$fasta" -o "$out" "$in"
55 | 
56 | samtools index "$out"
57 | 
58 | # the original bam and its index are removed once the cram exists
59 | if [ -f "$out" ]; then
60 |     rm "$in"
61 | fi
62 | 
63 | if [ -f "$in.bai" ]; then
64 |     rm "$in.bai"
65 | fi
66 | 
59 | 


--------------------------------------------------------------------------------
/pipeline/runBedToolsIntersect.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## stop on error
 9 | set -ex
10 | 
11 | ## we get two files and one dir as input, optionally followed by bedtools intersect options
12 | export USAGETXT="Usage: $0 <a file> <b file> <out dir> [bed intersect option]"
13 | 
14 | # load functions (abort is used below and is not defined in this file)
15 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
16 | 
17 | # test the param
18 | if [ "$#" -lt 3 ]; then
19 |   abort "This function requires 3 arguments"
20 | fi
21 | 
22 | if [ ! -f "$1" ]; then
23 |     abort "The first argument needs to be an existing file"
24 | fi
25 | a=$1
26 | shift
27 | 
28 | if [ ! -f "$1" ]; then
29 |     abort "The second argument needs to be an existing file"
30 | fi
31 | b=$1;
32 | shift;
33 | 
34 | if [ ! -d "$1" ]; then
35 |     abort "The third argument needs to be an existing directory"
36 | fi
37 | dir=$1;
38 | shift;
39 | 
40 | # combine the filename for the output: <a without extension>-<b without extension>.tsv
41 | outfile="$dir/$(basename "${a%.*}")-$(basename "${b%.*}").tsv"
42 | 
43 | ## get the intersect results; the remaining arguments are handed over as separate words
44 | bedtools intersect "$@" -a "$a" -b "$b" > "$outfile"
45 | 


--------------------------------------------------------------------------------
/pipeline/runSamtoolsMerge.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 12:00:00
 5 | #SBATCH --mail-type=END,FAIL
 6 | 
 7 | # be verbose and print
 8 | set -ex
 9 | 
10 | # functions
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | # test
14 | #isEnvVarSet $UPSCb
15 | 
16 | # usage
17 | USAGETXT=\
18 | "
19 | Usage: runSamtoolsMerge.sh <samtools singularity container> <output bam file> <input bamfile 1> <input bamfile 2> ... <input bamfile n>
20 | "
21 | 
22 | [[ $# -lt 4 ]] && abort "The script expects at least four arguments"
23 | 
24 | [[ ! -f $1 ]] && abort "The singularity container needs to be a file"
25 | singularity=$1
26 | shift
27 | 
28 | [[ ! -d $(dirname $1) ]] && abort "The output directory of the output file does not exist."
29 | out=$1
30 | shift
31 | 
32 | for f in $@; do
33 |   [[ ! -f $f ]] && abort "The input BAM $f does not exist"
34 | done
35 | 
36 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
37 | 
38 | #run samtools merge
39 | singularity exec $singularity samtools merge $out $@
40 | 


--------------------------------------------------------------------------------
/pipeline/runVsearchMergePairs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # be safe (-e stop on error; -u stop if undefined variable, -x be verbose)
 8 | set -eux
 9 | 
10 | # load some functions
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | USAGETXT=\
14 | "
15 | Usage: runVsearchMergePair.sh <forward fastq file> <reverse fastq file> <output directory>
16 | "
17 | 
18 | # process the arguments
19 | if [ $# -ne 3 ]; then
20 |     abort "This script expects 3 arguments"
21 | fi
22 | 
23 | if [ ! -f $1 ]; then
24 |   abort "The first argument needs to be a file"
25 | fi
26 | 
27 | if [ ! -f $2 ]; then
28 |   abort "The second argument needs to be a file"
29 | fi
30 | 
31 | if [ ! -d $3 ]; then
32 |   abort "The third argument needs to be a directory"
33 | fi
34 | 
35 | # run
36 | fnam=$(basename ${1/_1.fastq.gz/})
37 | vsearch --fastq_mergepairs $1 --reverse $2 --fastq_allowmergestagger \
38 | --fastaout $3/$fnam.fa --fastaout_notmerged_fwd $3/${fnam}_1.fa --fastaout_notmerged_rev $3/${fnam}_2.fa
39 | 
40 | # think of compressing the output
41 | 


--------------------------------------------------------------------------------
/src/R/gopher2-example.R:
--------------------------------------------------------------------------------
# Example: running enrichment tests with gopher() on a list of genes.

# dat = your list of genes of interest (a subset of a population)
dat <- scan("~/Git/UPSCb/projects/facility/doc/tiggy-gene-example.txt",what="character")

# bg = your population (think about what defines it); skip=1 skips the first line of the file
bg <- scan("~/Git/UPSCb/projects/facility/doc/Tiggy.spruce.background.txt",what="character",skip=1)

# we suppress the package startup messages emitted while sourcing the helper
# (not good practice, but we know what we're doing) - remember to adjust the path
suppressPackageStartupMessages(source("~delhomme/Git/UPSCb-common/src/R/gopher.R"))

# we just quantify the run time
# task has to be a list
# task can take any value from: go, kegg, mapman and pfam
system.time(enrichment <- gopher(dat,task = list("go","kegg","pfam"),background = bg,url="pabies"))

# if no background is given (i.e. the whole population is to be used)
system.time(enrichment <- gopher(dat,task = list("go","kegg","pfam"),url="pabies"))

# enrichment will contain a list of tibbles (a "type of" data.frame), one per "task"

# for go, you can for example export the GO ID and the FDR to a file and then 
# upload that to REVIGO (http://revigo.irb.hr)


--------------------------------------------------------------------------------
/pipeline/runFastQCMultiviewer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | 
 5 | usage(){
 6 |     echo >&2 \
 7 | "
 8 |     runFastQCMultiviewer.sh <fastqc dir>
 9 | 
10 |     Arguments:
11 |         fastqc dir - the directory containing the FastQC reports
12 | 
13 |     Note:
14 |         The UPSCb Environment Variable needs to be set to your
15 |         Git UPSCb checkout dir.
16 | 
17 |     Details:
18 |         It unzip the fastqc files into a directory called multiview
19 |         and create a multiview.html file.
20 | "
21 | exit 1
22 | }
23 | 
24 | if [ $# -ne 1 ]; then
25 |     echo "This function takes one argument"
26 |     usage
27 | fi
28 | 
29 | if [ ! -d $1 ]; then
30 |     echo "The first argument needs to be the directory containing the FastQC reports"
31 |     usage
32 | fi
33 | 
34 | py=${SLURM_SUBMIT_DIR:-$(dirname $0)}/../src/python/fastQCmultiviewer.py
35 | 
36 | if [ ! -f $py ]; then
37 |     echo "Fixme; the .. part of the path, also below..."
38 |     usage
39 | fi
40 | 
41 | mkdir -p $1/multiview
42 | 
43 | find $1 -name "*.zip" -type f -exec unzip -f -d $1/multiview "{}" \;
44 | 
45 | python $py -out_file $1/multiview.html -in_dir $1/multiview
46 | 
47 | 


--------------------------------------------------------------------------------
/pipeline/runSraFastqDump.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 02:00:00
 5 | #SBATCH --mem=6GB
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## stop on error and be verbose
 9 | set -ex
10 | 
11 | ## usage
12 | usage(){
13 |   echo >&2 \
14 |   "Usage: $(basename $0) <file to be converted> <output directory>"
15 |   exit 1
16 | }
17 | 
18 | ## check number of arguments
19 | if [ $# -lt 2 ]; then
20 |    echo "This script takes two arguments"
21 |    usage
22 | fi
23 | 
24 | ## check input file 
25 | if [ ! -f $1 ]; then
26 |     echo "The first argument needs to be an existing sra file, please verify file path."
27 |     usage
28 | fi
29 | 
30 | ## check output directory
31 | if [ ! -d $2 ]; then
32 | 	echo "The second argument needs to be an existing output directory."
33 | 	usage
34 | fi
35 | 
36 | ## start
37 | fastq-dump --gzip --split-3 -O $2 $1
38 | # --split-3: create 3 files, one for forward reads, one for reverse, and one for unpaired reads.
39 | # --I --split-files: produces two fastq files (--split-files) containing ".1" and ".2" read suffices (-I) for paired-end data
40 | # --gzip: compress output using gzip
41 | # - O: output directory
42 | 


--------------------------------------------------------------------------------
/pipeline/runMinimap2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main -n 20
 3 | #SBATCH -t 2-00:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | # usage txt
 9 | export USAGETXT=\
10 | "
11 | 	Usage: $0 [options] <target fasta> <query fasta> <outdir>
12 | 	
13 | 	Options: -c the number of CPU to use
14 | "
15 | 
16 | # common function
17 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
18 | 
19 | # vars
20 | CPU=20
21 | 
22 | # options
23 | while getopts c:h option
24 | do
25 |   case "$option" in
26 |       c) CPU=$OPTARG;;
27 |       h) usage;;
28 |       ?) usage;;
29 |   esac
30 | done
31 | shift `expr $OPTIND - 1`
32 | 
33 | # check the arguments
34 | if [ "$#" != 3 ]; then
35 |   abort "This function expects three arguments"
36 | fi
37 | 
38 | if [ ! -f $1 ]; then
39 |   abort "This function expects a fasta file as first argument"
40 | fi
41 | 
42 | if [ ! -f $2 ]; then
43 |   abort "This function expects a fasta file as second argument"
44 | fi
45 | 
46 | if [ ! -d $3 ]; then
47 |   abort "The output directory needs to exist"
48 | fi
49 | 
50 | isExec minimap2
51 | 
52 | # run
53 | cd $outdir
54 | minimap2 -t $CPU -x map-pb -a $1 $2 > $3/$(basename ${1/.fa*/})
55 | 


--------------------------------------------------------------------------------
/pipeline/runFastQValidator.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main -n 1
 4 | #SBATCH -t 0-01:00:00
 5 | #SBATCH --mail-type=ALL
 6 | #SBATCH --mem=16GB
 7 | 
 8 | ## load the module if it exists
 9 | module load bioinfo-tools && module load fastQvalidator || {
10 |   if ! hash fastQValidator 2>/dev/null; then
11 |     echo "fastQValidator was not found in your path" 1>&2
12 |     exit 1
13 |   fi
14 | }
15 | 
16 | usage() {
17 |   echo "usage: `basename $0` <fastq>
18 | 
19 | Run fastQValidator on a FASTQ file. Prints output on stdout and
20 | exits with a non-zero exit status if the input file does not
21 | conform to the standard.
22 | 
23 | ARGUMENTS:
24 |     fastq   a FASTQ file, can be gzipped
25 | 
26 | NOTES:
27 |     fastQValidator must lie in your PATH" 1>&2
28 |   exit 1
29 | }
30 | 
31 | ## stop on error
32 | set -e
33 | 
34 | ## check
35 | if [ $# != 1 ]; then
36 |     echo "This function takes one argument: a fastq filename" 1>&2
37 |     usage
38 | fi
39 | 
40 | if [ ! -f $1 ]; then
41 |     echo "The fastq filename you provided does not exist" 1>&2
42 |     usage
43 | fi
44 | 
45 | ## we print 1000 errors, should be enough
46 | fastQValidator --noeof --file $1 --printableErrors 1000
47 | 


--------------------------------------------------------------------------------
/pipeline/runMmseq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 8:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error but be verbose
 8 | set -ex
 9 | 
10 | usage(){
11 | echo >&2 \
12 | "
13 | 	Usage: $0 [option] <out dir> <genome fasta> <in bam>
14 | "
15 | 	exit 1
16 | }
17 | 
18 | ## executable
19 | module load bioinfo-tools
20 | module load mmseq
21 | 
22 | ## arguments
23 | if [ $# != 3 ]; then
24 |    echo "This script takes three arguments: one out dir, one reference fasta file and one input bam file"
25 |    exit 1
26 | fi
27 | 
28 | ## input files
29 | if [ ! -d $1 ]; then
30 | 	echo "The first argument needs to be an existing directory"
31 | 	usage
32 | fi
33 | 
34 | if [ ! -f $2 ]; then
35 | 	echo "The second argument needs to be an existing fastq(.gz) file"
36 | 	usage
37 | fi
38 | 
39 | ## bowtie index
40 | if [ ! -f $3 ]; then
41 | 	echo "The third argument needs to be an existing bam file"
42 | 	usage
43 | fi
44 | 
45 | ## create the outfile name
46 | outfile=$1/`basename ${3//.bam/}`
47 | 
48 | ## start
49 | ## then run bam2hits
50 | bam2hits -m "(\S+)\s*.*" 1 1 $2 $3 > $outfile.hits
51 | 
52 | ## then run
53 | mmseq $outfile.hits $outfile
54 | 
55 | 


--------------------------------------------------------------------------------
/pipeline/runRepeatModeler.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -J repeatModeler
 3 | #SBATCH -p main
 4 | #SBATCH -c 8
 5 | #SBATCH -t 7-00:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | ## stop on error and be verbose in the output
 9 | set -e -x
10 | CPU=8
11 | 
12 | # usage txt
13 | export USAGETXT=\
14 | "
15 | 	Usage: $0 [options] <repeatModelerDB> <outdir>
16 | 	
17 | 	Options: -c the number of CPU to use
18 | "
19 | 
20 | # common function
21 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
22 | 
23 | # options
24 | while getopts c:h option
25 | do
26 |   case "$option" in
27 |       c) CPU=$OPTARG;;
28 |       h) usage;;
29 |       ?) usage;;
30 |   esac
31 | done
32 | shift `expr $OPTIND - 1`
33 | 
34 | # check the arguments
35 | if [ "$#" != 2 ]; then
36 |   abort "This function expects at least two arguments"
37 | fi
38 | 
39 | genome=$1
40 | shift
41 | if [ ! -f $genome.nhr ]; then
42 |   abort "This function expects a database prefix as first argument"
43 | fi
44 | 
45 | outdir=$1
46 | shift
47 | if [ ! -d $outdir ]; then
48 |   abort "The output directory needs to exist"
49 | fi
50 | 
51 | # run
52 | cd $outdir
53 | RepeatModeler -pa $(expr $CPU - 1) -database $genome
54 | 


--------------------------------------------------------------------------------
/pipeline/runCNCI.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # stop on error, be verbose and expand the commands
 7 | set -e -x
 8 | 
 9 | # source helpers
10 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
11 | 
12 | ## usage
13 | USAGETXT=\
14 | "
15 | 	Usage: runCNCI.sh <Trinity.fasta> <out dir>
16 | 	
17 | 	Options:
18 | 	            -f    input file
19 | 	            -o    output file 
20 | 	            -p    (parallel) assign the running CUP numbers
21 | 	            -m    (model) assign the classification model ("re" for vertebrates; "pl" for plants)
22 | 	            
23 | "
24 | 
25 | # Check
26 | if [ $# -ne 2 ]; then
27 |     echo "This function needs 2 arguments"
28 |     usage
29 | fi
30 | 
31 | if [ ! -f $1 ]; then
32 |   abort "The first argument needs to be the trinity fasta filepath"
33 | fi
34 | 
35 | if [ ! -d $2 ]; then
36 |     abort "The second argument (output dir) needs to be an existing directory"
37 | fi
38 | 
39 | # run CNCI
40 | cd $2
41 | 
42 | singularity exec --bind /mnt:/mnt \
43 | /mnt/picea/projects/singularity/delhomme-upscb-lncrna.simg CNCI.py \
44 | -p 12 -m pl -f $1 -o $2
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/pipeline/runPicardCreateSequenceDictionary.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error
 8 | set -e
 9 | 
10 | ## modules
11 | module load bioinfo-tools Picard-tools
12 | 
13 | ## a usage function
14 | usage(){
15 |     echo >&2 \
16 | "
17 |     Usage: $0 [options] <fasta file>
18 | 
19 |     Options:
20 |             -s the species to be added to the SP tag
21 | " 
22 |     exit 1
23 | }
24 | 
25 | ## VARS
26 | OPTIONS=""
27 | 
28 | ## get the options
29 | while getopts s: option
30 | do
31 |     case "$option" in
32 | 	s) OPTIONS="SPECIES=$OPTARG";;
33 | 	\?) ## unknown flag
34 | 	    usage;;
35 |   esac
36 | done
37 | shift `expr $OPTIND - 1`
38 | 
39 | 
40 | ## we get one file as input
41 | if [ $# != 1 ]; then
42 |     echo "This function takes one fasta file as argument"
43 |     usage
44 | fi
45 | 
46 | if [ ! -f $1 ]; then
47 |     echo "The first argument needs to be an existing fasta file"
48 |     usage
49 | fi
50 | 
51 | ## create the output
52 | out=${1//.f*a/.dict}
53 | 
54 | ## create the index
55 | java -jar  $PICARD_TOOLS_DIR/picard.jar CreateSequenceDictionary REFERENCE=$1 OUTPUT=$out $OPTIONS
56 | 


--------------------------------------------------------------------------------
/pipeline/runMarkDuplicates.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH -p main
 5 | #SBATCH -n 1
 6 | 
 7 | set -e
 8 | 
 9 | module load java
10 | module load bioinfo-tools
11 | module load samtools
12 | #module load picard
13 | module load Picard-tools
14 | 
15 | THREADS=1
16 | 
17 | if [ -z $PICARD_HOME ]; then
18 |     echo >&2 "Could not find picard tools"
19 |     exit 1
20 | fi
21 | 
22 | # default
23 | JavaMem=6G
24 | 
25 | if [ $# -ne 2 ]; then
26 |     echo "Usage: $0 <BAM file> <output directory>" 1>&2
27 |     exit 1
28 | fi
29 | 
30 | if [ ! -f $1 ]; then
31 |     echo "Could not find BAM file '$1'" 1>&2
32 |     exit 1
33 | fi
34 | inbam=$1
35 | 
36 | if [ ! -d $2 ]; then
37 |     echo "Could not find directory '$2'" 1>&2
38 |     exit 1
39 | fi
40 | outdir=$2
41 | 
42 | sname=`basename "${inbam/_[st]*[st]*_STAR.bam/}"`
43 | name_out=`basename "${inbam/.bam/}"`
44 | 
45 | # Run MarkDuplicates
46 | java -Xmx${JavaMem} -XX:ParallelGCThreads=$THREADS -jar $PICARD_TOOLS_DIR/picard.jar MarkDuplicates \
47 |     ASSUME_SORTED=true \
48 |     INPUT=$inbam \
49 |     OUTPUT=$outdir/${name_out}_mkdup.bam \
50 |     METRICS_FILE=$outdir/${sname}_mkdup.metrics \
51 |     CREATE_INDEX=true
52 | 
53 | 


--------------------------------------------------------------------------------
/pipeline/runSUPPA2PsiPerEvent.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH --time=1-00:00:00
 4 | #SBATCH --job-name="suppa2"
 5 | #SBATCH --mail-type=END,FAIL
 6 | 
 7 | # sanity
 8 | set -eu
 9 | 
10 | # functions
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | # usage
14 | USAGETXT=\
15 | "
16 |   Synopsis [options] $0 <suppa2 singularity container> <ioe file> <count tsv> <out dir>
17 | "
18 | 
19 | # checks
20 | [[ $# -ne 4 ]] && abort "This script expects four arguments"
21 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing file path to a singularity container"
22 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
23 | [[ ! -f $2 ]] && abort "The second argument needs to be an existing file path to an ioe annotation file"
24 | [[ ! -f $3 ]] && abort "The third argument needs to be an existing file path to a count tsv file"
25 | [[ ! -d $4 ]] && abort "The fourth argument needs to be an existing directory"
26 | 
27 | # run
28 | singularity exec $1 \
29 | suppa.py psiPerEvent \
30 | -i $2 \
31 | -o $4/$(basename ${3/.tsv/})_$(basename ${2/.ioe/}) \
32 | -e $3
33 | 


--------------------------------------------------------------------------------
/pipeline/runSUPPA2PsiPerIsoform.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH --time=1-00:00:00
 4 | #SBATCH --job-name="suppa2"
 5 | #SBATCH --mail-type=END,FAIL
 6 | 
 7 | # sanity
 8 | set -eu
 9 | 
10 | # functions
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | # usage
14 | USAGETXT=\
15 | "
16 |   Synopsis [options] $0 <suppa2 singularity container> <gtf file> <count tsv> <out dir>
17 | "
18 | 
19 | # checks
20 | [[ $# -ne 4 ]] && abort "This script expects four arguments"
21 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing file path to a singularity container"
22 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
23 | [[ ! -f $2 ]] && abort "The second argument needs to be an existing file path to a gtf annotation file"
24 | [[ ! -f $3 ]] && abort "The third argument needs to be an existing file path to a count tsv file"
25 | [[ ! -d $4 ]] && abort "The fourth argument needs to be an existing directory"
26 | 
27 | # run
28 | singularity exec $1 \
29 | suppa.py psiPerIsoform \
30 | -g $2 \
31 | -o $4/$(basename ${3/.tsv/})_$(basename ${2/.gtf/}) \
32 | -e $3
33 | 


--------------------------------------------------------------------------------
/pipeline/runSalmonStats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | usage(){
 4 |     echo >&2 \
 5 | "
 6 |     runSalmonStats.sh <Salmon dir>
 7 | 
 8 |     Arguments:
 9 |         Salmon dir - the directory containing the Salmon stderr files
10 | 
11 |     Note:
12 |         The UPSCb Environment Variable needs to be set to your
13 |         Git UPSCb checkout dir.
14 | 
15 |     Details:
16 |         It reads the kallisto stderr files and create a text delimited file
17 |         containing the sample name and number of pseudoalignments
18 | "
19 | exit 1
20 | }
21 | 
22 | if [ $# -ne 1 ]; then
23 |     echo "This function takes one argument"
24 |     usage
25 | fi
26 | 
27 | if [ ! -d $1 ]; then
28 |     echo "The first argument needs to be the directory containing the Salmon reports"
29 |     usage
30 | fi
31 | 
32 | if [ -z $UPSCb ]; then
33 |     echo "The UPSCb environment variable needs to be set."
34 |     usage
35 | fi
36 | 
37 | if [ ! -f $UPSCb/pipeline/runSalmonStats.sh ]; then
38 |     echo "Either your UPSC env. var. is not set correctly or your checkout is too old."
39 |     usage
40 | fi
41 | 
42 | cd $1
43 | grep Counted *.err | awk '{smpl=$1;gsub(/_sortmerna.*$/,"",smpl);val=$6;print smpl,val}' > SalmonStats.txt
44 | 


--------------------------------------------------------------------------------
/pipeline/runGeneNetworkRAggregate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | # usage
 9 | usage(){
10 | echo >&2 \
11 | "
12 | 	Usage: $0 [options] <output dir>
13 | 
14 | 	Options:
15 |           -c the number of cores to parallelise over
16 | "
17 | 	exit 1
18 | }
19 | 
20 | # define global vars
21 | CPU=1
22 | 
23 | # manage options
24 | while getopts c: option
25 | do
26 |   case "$option" in
27 | 	    c) CPU=$OPTARG;;
28 | 	    \?) ## unknown flag
29 | 		usage;;
30 |   esac
31 | done
32 | shift `expr $OPTIND - 1`
33 | 
34 | # check
35 | if [ $# != 1 ]; then
36 |   echo "This script expects 1 argument"
37 |   usage
38 | fi
39 | 
40 | if [ ! -d $1 ]; then
41 |   echo "The first argument must be a directory"
42 |   usage
43 | fi
44 | 
45 | if [ ! -f $1/Data/sexp.rda ]; then
46 |   echo "The directory has to be a valid geneNetworkR directory."
47 |   echo "Run geneNetworkRPreparation.R first."
48 |   usage
49 | fi
50 | 
51 | # get the exec
52 | module load R
53 | exeR=`Rscript -e 'cat(system.file("R","geneNetworkR-aggregate.R",package="geneNetworkR"))'`
54 | 
55 | # run with knitr
56 | Rscript -e "library(knitr); spin('$exeR')" -c $CPU -f $1
57 | 
58 | 


--------------------------------------------------------------------------------
/pipeline/runGeneNetworkRThreshold.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | # usage
 9 | usage(){
10 | echo >&2 \
11 | "
12 | 	Usage: $0 [options] <output dir>
13 | 
14 | 	Options:
15 |           -c the number of cores to parallelise over
16 | "
17 | 	exit 1
18 | }
19 | 
20 | # define global vars
21 | CPU=1
22 | 
23 | # manage options
24 | while getopts c: option
25 | do
26 |   case "$option" in
27 | 	    c) CPU=$OPTARG;;
28 | 	    \?) ## unknown flag
29 | 		usage;;
30 |   esac
31 | done
32 | shift `expr $OPTIND - 1`
33 | 
34 | # check
35 | if [ $# != 1 ]; then
36 |   echo "This script expects 1 argument"
37 |   usage
38 | fi
39 | 
40 | if [ ! -d $1 ]; then
41 |   echo "The first argument must be a directory"
42 |   usage
43 | fi
44 | 
45 | if [ ! -f $1/Data/sexp.rda ]; then
46 |   echo "The directory has to be a valid geneNetworkR directory."
47 |   echo "Run geneNetworkRPreparation.R first."
48 |   usage
49 | fi
50 | 
51 | # get the exec
52 | module load R
53 | exeR=`Rscript -e 'cat(system.file("R","geneNetworkR-threshold.R",package="geneNetworkR"))'`
54 | 
55 | # run with knitr
56 | Rscript -e "library(knitr); spin('$exeR')" -c $CPU -f $1
57 | 
58 | 


--------------------------------------------------------------------------------
/src/bash/functions.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This file is meant to contain only functions to be sourced as
 4 | # source $UPSCb/src/bash/functions.sh
 5 | 
 6 | ### ---------------------------------------------------------------------------
 7 | ## logic functions
 8 | 
 9 | usage(){
10 |   echo >&2 "$USAGETXT"
11 |   exit 1;
12 | }
13 | 
# Report an error on stderr, then hand over to usage (which prints the usage
# text and exits).
# Arguments: $1 - the error message
abort() {
  echo "$1" 1>&2
  usage
}
18 | 
19 | ### ---------------------------------------------------------------------------
20 | ## array functions
21 | 
22 | # from https://stackoverflow.com/questions/3685970/check-if-a-bash-array-contains-a-value
# Tell whether a value is among a list of values.
# Arguments: $1  - the value to look for
#            ... - the values to search
# Outputs:   "0" on stdout when $1 equals one of the values, "1" otherwise
#            (the answer is printed; it is not the return status)
containsElement () {
  local needle="$1" candidate
  for candidate in "${@:2}"; do
    if [[ "$candidate" == "$needle" ]]; then
      echo 0
      return 0
    fi
  done
  echo 1
}
30 | 
31 | ### ---------------------------------------------------------------------------
32 | ## preflight functions
# Make sure a tool is available: succeed when the command named in $1 resolves
# to an executable file on the PATH, otherwise abort (error message, usage
# text, exit).
# Arguments: $1 - name of the command to look up
# Globals:   tool (written) - resolved path, empty when the lookup failed;
#            kept global so that any caller reading it afterwards still works
# Returns:   0 when the tool is available
isExec () {
  # `type -P` is the builtin PATH lookup; `|| true` prevents a failed lookup
  # from terminating callers running under `set -e` before the message below
  # gets printed
  tool=$(type -P "$1" 2>/dev/null) || true
  if [ -n "$tool" ] && [ -f "$tool" ] && [ -x "$tool" ]; then
    return 0
  else
    # name the requested tool: $tool is empty whenever the lookup failed
    abort "The tool $1 is not available."
  fi
}
41 | 
# Abort when an environment variable is empty or unset.
# The function receives the variable's VALUE (e.g. `isEnvVarSet "$UPSCb"`).
# Arguments: $1 - the value of the variable to check
#            $2 - optional: the variable's name, only used in the error message
# Returns:   0 when the value is non-empty; otherwise calls abort
isEnvVarSet () {
  # -z, not -n: the error must fire when the value is missing (the test used
  # to be inverted and aborted for every variable that was set)
  if [ -z "${1:-}" ]; then
    abort "The environment variable ${2:-} is not set"
  fi
}
47 | 
48 | 


--------------------------------------------------------------------------------
/pipeline/runPyfasta.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -l
 2 | #SBATCH -p main -n 2
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## we need slightly more than 3GB of RAM :-\
 7 | 
 8 | ## stop on error
 9 | set -e
10 | 
11 | if [ $# != 3 ]; then
12 |     echo "The argument should be the fasta full filename, the output directory and the number of chunks"
13 |     exit 1
14 | fi
15 | 
16 | if [ ! -f $1 ]; then
17 |     echo "The  fasta filename you provided does not exist"
18 |     exit 1
19 | fi
20 | 
21 | if [ ! -d $2 ]; then
22 |     echo "The  directory name you provided does not exist"
23 |     exit 1
24 | fi
25 | 
26 | if [ -z $3 ]; then
27 |     echo "The second argument should be an integer value describing the final number of chunks expected"
28 |     exit 1
29 | fi
30 | 
31 | if [ "$3" -lt "1" ]; then
32 |     echo "The second argument should be an integer value larger than 1 describing the final number of chunks expected"
33 |     exit 1
34 | fi
35 | 
36 | ## cd in the out dir
37 | cd $2
38 | 
39 | ## copy if it does not exist
40 | if [ ! -f  $2/`basename $1` ]; then
41 | cp $1 $2
42 | fi
43 | 
44 | ## create the chunks
45 | pyfasta split -n $3 $2/`basename $1`
46 | 
47 | ## rm the file
48 | rm $2/`basename $1`
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/pipeline/runSeidrBackbone.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A facility
 3 | #SBATCH -t 4:00:00
 4 | #SBATCH -p main -n 2
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | set -ex
 8 | 
 9 | # variables
10 | # we use 2 because of hyperthreading
11 | # to force the use of a virtual core instead of a logical one, we could
12 | # sbatch -n 1 -c 1 OMP_NUM_THREADS=1 seidr backbone -O 1
13 | CPU=2
14 | 
15 | # helper functions
16 | source ../UPSCb-common/src/bash/functions.sh
17 | 
18 | # usage
19 | USAGETXT=\
20 | "
21 |   Usage: $0 <seidr file> <threshold> <output filename>
22 |   
23 |   Note: the threshold is the quantile value from a normal distribution,
24 |   so a backbone of 1% is qnorm(0.99) = 2.33. 10% is 1.28. etc.
25 | "
26 | 
27 | # sanity
28 | isExec seidr
29 | if [ $? -ne 0 ]; then
30 |   abort "seidr is not available. Install it, or load the module"
31 | fi
32 | 
33 | if [ $# -ne 3 ]; then
34 |   abort "This script expects 3 arguments"
35 | fi
36 | 
37 | if [ ! -f $1 ]; then
38 |   abort "The first argument needs to be an existing file"
39 | fi
40 | 
41 | if [ ! -d $(dirname $3) ]; then
42 |   abort "The third argument directory needs to exist"
43 | fi
44 | 
45 | # run
46 | export OMP_NUM_THREADS=$CPU
47 | seidr backbone -F $2 -o $3 $1
48 | 
49 | 


--------------------------------------------------------------------------------
/pipeline/runBedToolsSubtract.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## stop on error
 9 | set -ex
10 | 
11 | ## load the modules
12 | module load bioinfo-tools
13 | module load BEDTools
14 | 
15 | ## we get one dir and one file as input
16 | usage(){
17 |     echo >&2 \
18 |     " Usage: $0 <a file> <b file> <out dir> [bed subtract option] 
19 |     "
20 |     exit 1
21 | }
22 | 
23 | if [ "$#" -lt 3 ]; then
24 |   echo "This function requires 3 arguments"
25 |   usage;
26 | fi
27 | 
28 | if [ ! -f $1 ]; then
29 |     echo "The first argument needs to be an existing file"    
30 |     usage;
31 | fi
32 | a=$1
33 | shift
34 | 
35 | if [ ! -f $1 ]; then
36 |     echo "The second argument needs to be an existing file"
37 |     usage;
38 | fi
39 | b=$1;
40 | shift;
41 | 
42 | if [ ! -d $1 ]; then
43 |     echo "The third argument needs to be an existing directory"
44 |     usage;
45 | fi
46 | dir=$1;
47 | shift;
48 | 
49 | # combine the filename for the output
50 | outfile=$dir/`basename ${a%.*}`-`basename ${b%.*}`."${a##*.}"
51 | 
52 | ## get the subtracted results
53 | bedtools subtract $@ -a $a -b $b > $outfile
54 | 


--------------------------------------------------------------------------------
/pipeline/runITSx.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 12
 4 | #SBATCH -t 1-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # be safe (-e stop on error; -u stop if undefined variable, -x be verbose)
 8 | set -eux
 9 | 
10 | # load some functions
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | CPU=12
14 | 
15 | USAGETXT=\
16 | "
17 | Usage: runITSx.sh <mergedpairs fasta file> <forward fasta file> <output directory> <ITSx HMMs directory)
18 | "
19 | 
20 | # process the arguments
21 | if [ $# -ne 4 ]; then
22 |         abort "This script expects 3 arguments"
23 | fi
24 |      
25 | if [ ! -f $1 ]; then
26 |      abort "The first argument needs to be a file"
27 | fi
28 |      
29 | if [ ! -f $2 ]; then
30 |      abort "The second argument needs to be a file"
31 | fi
32 |      
33 | if [ ! -d $3 ]; then
34 |      abort "The third argument needs to be a directory"
35 | fi
36 |      
37 | if [ ! -d $4 ]; then
38 |      abort "The forth argument needs to be a directory"
39 | fi
40 |      
41 | # merge the input
42 | input=$3/$(basename ${1/.fa/-tmp.fa})
43 | cat $1 $2 > $input
44 |      
45 | # run
46 | ITSx -i $input -o ${input/-tmp.fa/-ITSx.fa} -p $4 --cpu $CPU
47 |      
48 | # clean
49 | rm $input
50 | 


--------------------------------------------------------------------------------
/pipeline/runTaxonomicClassification.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 12
 4 | #SBATCH -t 1-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Taxonomic classification of a fasta file with "vsearch --sintax".
 8 | # The tab separated result is written to the output directory, named after
 9 | # the input file with its first ".fa" replaced by ".tsv".
10 | 
11 | # be safe (-e stop on error; -u stop if undefined variable, -x be verbose)
12 | set -eux
13 | 
14 | # modules
15 | module load bioinfo-tools vsearch/2.13.0
16 | 
17 | # load some functions (abort is expected to be defined there)
18 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
19 | 
20 | # defaults
21 | CUTOFF=0.9
22 | 
23 | # usage
24 | USAGETXT=\
25 | "
26 | Usage: $0 <ITS2.fa> <database> <output directory>
27 | 
28 | TODO: think that we might want a single file resulting from the dereplication, clustering, etc.
29 | "
30 | 
31 | # process the arguments; exactly the three used below are expected
32 | if [ $# -ne 3 ]; then
33 |   abort "This script expects 3 arguments"
34 | fi
35 | 
36 | if [ ! -f "$1" ]; then
37 |   abort "The first argument needs to be a file"
38 | fi
39 | 
40 | if [ ! -f "$2" ]; then
41 |   abort "The second argument needs to be a file"
42 | fi
43 | 
44 | if [ ! -d "$3" ]; then
45 |   abort "The third argument needs to be a directory"
46 | fi
47 | 
48 | # name the output after the input file name only: keeping the directory part
49 | # of the input would append it to the output directory
50 | inname=$(basename "$1")
51 | out="$3/${inname/.fa/.tsv}"
52 | 
53 | # run
54 | vsearch --sintax "$1" --db "$2" --sintax_cutoff "$CUTOFF" --tabbedout "$out"
55 | 
56 | # think of compressing the output


--------------------------------------------------------------------------------
/pipeline/runKmergenie.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -c 8
 3 | 
 4 | # Run kmergenie (--diploid, 8 threads) on one or more read files.
 5 | # The read files are handed to kmergenie as a temporary file of file names.
 6 | 
 7 | module load bioinfo-tools kmergenie
 8 | 
 9 | # print the synopsis on stderr and stop with status 1
10 | usage ()
11 | {
12 |   echo "runKmergenie.sh <READ1> [READ2...] <OUT_DIR>" >&2
13 |   echo "" >&2
14 |   exit 1
15 | }
16 | 
17 | # remember the call for the log lines at the end
18 | CALL="$*"
19 | 
20 | if [ $# -lt 2 ]; then
21 |   echo -e "\e[91m[ERROR] The minimum number of arguments is 2\e[39m" >&2
22 |   usage
23 | fi
24 | 
25 | # the last argument is the output directory, every argument before it is a read file
26 | readFilesIn=("${@:1:$#-1}")
27 | OUT_DIR="${!#}"
28 | 
29 | if [ ! -d "$OUT_DIR" ]; then
30 |   echo -e "\e[33m[WARN] $OUT_DIR not a directory... creating\e[39m" >&2
31 |   mkdir -p "$OUT_DIR"
32 | fi
33 | 
34 | for f in "${readFilesIn[@]}"; do
35 |   if [ ! -f "$f" ]; then
36 |     echo -e "\e[91m[ERROR] \"$f\" is not a valid file\e[39m" >&2
37 |     usage
38 |   fi
39 | done
40 | 
41 | # mktemp replaces the deprecated tempfile; the trap removes the list on every
42 | # exit path, including a failing kmergenie
43 | TMF=$(mktemp) || exit 1
44 | trap 'rm -f -- "$TMF"' EXIT
45 | echo "$TMF"
46 | for f in "${readFilesIn[@]}"; do
47 |   readlink -f "$f" >> "$TMF"
48 | done
49 | BN=$(basename "${readFilesIn[0]}")
50 | 
51 | echo -e "[INFO] Shell:\t$0 $CALL" >&2
52 | echo -e "[INFO] Exec: kmergenie --diploid -t 8 $TMF -o ${OUT_DIR}/${BN}_hist" >&2
53 | echo -e "[INFO] PWD:  \t$PWD" >&2
54 | kmergenie --diploid -t 8 "$TMF" -o "${OUT_DIR}/${BN}_hist"


--------------------------------------------------------------------------------
/pipeline/runKallistoStats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Gather the "pseudoaligned" lines of the *.err files of a directory into
 4 | # kallistoStats.txt (sample name and value, one line per match).
 5 | 
 6 | # print the help on stderr and stop with status 1
 7 | usage(){
 8 |     echo >&2 \
 9 | "
10 |     runkallistoStats.sh <kallisto dir>
11 | 
12 |     Arguments:
13 |         kallisto dir - the directory containing the kallisto stderr files
14 | 
15 |     Note:
16 |         The UPSCb Environment Variable needs to be set to your
17 |         Git UPSCb checkout dir.
18 | 
19 |     Details:
20 |         It reads the kallisto stderr files and create a text delimited file 
21 |         containing the sample name and number of pseudoalignments
22 |         
23 | "
24 | exit 1
25 | }
26 | 
27 | if [ $# -ne 1 ]; then
28 |     echo >&2 "This function takes one argument"
29 |     usage
30 | fi
31 | 
32 | if [ ! -d "$1" ]; then
33 |     echo >&2 "The first argument needs to be the directory containing the kallisto reports"
34 |     usage
35 | fi
36 | 
37 | if [ -z "${UPSCb:-}" ]; then
38 |     echo >&2 "The UPSCb environment variable needs to be set."
39 |     usage
40 | fi
41 | 
42 | if [ ! -f "$UPSCb/pipeline/runKallistoStats.sh" ]; then
43 |     echo >&2 "Either your UPSC env. var. is not set correctly or your checkout is too old."
44 |     usage
45 | fi
46 | 
47 | # do not write the result into the wrong directory
48 | cd "$1" || exit 1
49 | 
50 | # -H keeps the "file:" prefix even when a single .err file matches; the awk
51 | # program derives the sample name from field 1 and reads the value from
52 | # field 5, so the field layout has to be the same in both cases
53 | grep -H pseudoaligned -- *.err | awk '{smpl=$1;gsub(/_kallisto.*$/,"",smpl);val=$5;gsub(/,/,"",val);print smpl,val}' > kallistoStats.txt


--------------------------------------------------------------------------------
/pipeline/runSamtoolsIndex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 00:10:00
 5 | #SBATCH --mail-type=END,FAIL
 6 | 
 7 | # Index a bam file with the samtools of a singularity container.
 8 | 
 9 | # stop on error
10 | set -eu
11 | 
12 | # functions (abort and usage are expected to be defined there)
13 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
14 | 
15 | # Vars
16 | CSI=
17 | 
18 | # a usage function
19 | USAGETXT=\
20 | "
21 | Usage: $0 [options] <samtools singularity container> <bam file>
22 | 
23 | Options: -c generate CSI index (for large, e.g. the spruce, genomes)
24 | 
25 | "
26 | 
27 | # get the options
28 | while getopts ch option
29 | do
30 |   case "$option" in
31 |     c) CSI="-c";;
32 |     h) usage;;
33 |     \?) ## unknown flag
34 |       abort "unknown option";;
35 |   esac
36 | done
37 | shift $((OPTIND - 1))
38 | 
39 | # safeguards
40 | [[ $# != 2 ]] && abort "This function takes two arguments"
41 | 
42 | [[ ! -f $1 ]] && abort "The first argument needs to be the singularity container file"
43 | 
44 | [[ ! -f $2 ]] && abort "The second argument needs to be an existing bam file"
45 | 
46 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
47 | 
48 | # create the index; CSI stays unquoted so that it vanishes when it is empty
49 | singularity exec "$1" samtools index $CSI "$2"


--------------------------------------------------------------------------------
/pipeline/runRePair.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 2
 4 | #SBATCH -t 12:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Re-pair a forward/reverse fastq couple with rePairFastq.R and gzip the
 8 | # four _paired_ / _single_ files named after the inputs.
 9 | 
10 | ## stop on error
11 | set -e
12 | 
13 | ##
14 | echo Checking
15 | 
16 | ## we get two files as input
17 | if [ $# != 2 ]; then
18 |     echo "This function takes two files as arguments"
19 |     echo "Usage: sbatch runRePair.sh <forward fastq> <reverse fastq>"
20 |     exit 1
21 | fi
22 | 
23 | ## a missing input is fatal
24 | if [ ! -f "$1" ]; then
25 |     echo "The first argument needs to be an existing fastq file"
26 |     exit 1
27 | fi
28 | 
29 | if [ ! -f "$2" ]; then
30 |     echo "The second argument needs to be an existing fastq file"
31 |     exit 1
32 | fi
33 | 
34 | # set the UPSCb env. var.
35 | UPSCb=${UPSCb:-$(pwd)}/../UPSCb-common
36 | 
37 | ##
38 | echo Pairing
39 | 
40 | ## check the file order
41 | Rscript "$UPSCb/src/R/rePairFastq.R" -f "$1" -r "$2" -v
42 | 
43 | ##
44 | echo Gzipping
45 | 
46 | ## compress the output files
47 | ## the format holds a single %s: printf repeats it for every name, so each
48 | ## name gets its own NUL terminator ("%s\0%s" glued the 2nd and 3rd names)
49 | printf '%s\0' \
50 |   "${1/1./_paired_1.}" "${2/2./_paired_2.}" \
51 |   "${1/1./_single_1.}" "${2/2./_single_2.}" \
52 |   | xargs -0 -I {} -P 4 gzip -f {}
53 | 
54 | ##
55 | echo Done


--------------------------------------------------------------------------------
/pipeline/runJellyfishHisto.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 12
 3 | #SBATCH --mail-type=FAIL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # NOTE(review): despite its name and its usage text (<in.jf>), this script
 7 | # runs "jellyfish bc" on records read from its first argument with
 8 | # "samtools view" - confirm whether a "jellyfish histo" call was intended.
 9 | 
10 | set -eux
11 | 
12 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
13 | 
14 | # vars
15 | OPTARG=
16 | OPTIND=
17 | OPTIONS="-f"
18 | # these defaults mirror runJellyfishBc.sh; they must exist before USAGETXT
19 | # is expanded, otherwise "set -u" aborts the script right there
20 | CANON="-C"
21 | KMERSIZE=25
22 | HASHSIZE=100G
23 | CPUS=12
24 | 
25 | ## usage
26 | USAGETXT=\
27 | "
28 | 	Usage: runJellyfishHisto.sh [options] <in.jf> <out.dir>
29 | 	
30 | 	Options:
31 | 	  -C turn off canonical mode (on by default)
32 | 	  -m kmer size (default $KMERSIZE)
33 | 	  -s hash size (defaut $HASHSIZE)
34 | 	  -t threads (default $CPUS)
35 | "
36 | 
37 | while getopts Cm:s:t: option
38 | do
39 |   case "$option" in
40 |     C) CANON="";;
41 |     m) KMERSIZE=$OPTARG;;
42 |     s) HASHSIZE=$OPTARG;;
43 |     t) CPUS=$OPTARG;;
44 |     \?) ## unknown flag
45 |       usage;;
46 |   esac
47 | done
48 | shift $((OPTIND - 1))
49 | 
50 | # sanity
51 | [[ $# -ne 2 ]] && abort "This script expects 2 arguments"
52 | 
53 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing file"
54 | 
55 | [[ ! -d $2 ]] && abort "The second argument needs to be an existing directory"
56 | 
57 | isExec jellyfish
58 | isExec samtools
59 | 
60 | # CANON stays unquoted: when empty it must not turn into an empty argument
61 | jellyfish bc -m "$KMERSIZE" -s "$HASHSIZE" $CANON -o "$2/$(basename "${1/.bam/.bc}")" -t "$CPUS" <(samtools view "$1" | awk '{print ">"$1"\n"$10}')


--------------------------------------------------------------------------------
/pipeline/runBgzipTabix.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main -n 1
 4 | #SBATCH -t 02:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Compress a file with bgzip and index the compressed file with tabix.
 8 | 
 9 | ## stop on error
10 | set -e
11 | 
12 | # print the synopsis on stderr; the caller decides on the exit status
13 | usage() {
14 |     echo >&2 "usage: $0 [OPTIONS] input.gff
15 | 
16 | Compress a gff, vcf, bed, sam or psltab file with bgzip
17 | and index it with tabix.
18 | 
19 | OPTIONS:
20 | -h          Show this message and exit.
21 | -p FORMAT   Format of input. Valid arguments of FORMAT are
22 |             gff, vcf, bed, sam and psltab. Default is gff."
23 | }
24 | 
25 | # both tools are required
26 | for tool in bgzip tabix; do
27 |     if ! hash "$tool" 2>/dev/null; then
28 |         echo >&2 "ERROR: $tool not found in path"
29 |         exit 1
30 |     fi
31 | done
32 | 
33 | OPTIND=1
34 | format="gff"
35 | while getopts "hp:" opt; do
36 |     case "$opt" in
37 |         p) format=$OPTARG ;;
38 |         *) usage; exit 1 ;;   # -h and any unknown flag
39 |     esac
40 | done
41 | 
42 | shift $((OPTIND - 1))
43 | 
44 | if [ $# -lt 1 ]; then
45 |     echo >&2 "ERROR: input file missing"
46 |     usage
47 |     exit 1
48 | fi
49 | 
50 | if [ ! -f "$1" ]; then
51 |     echo >&2 "ERROR: could not find input file: '$1'"
52 |     usage
53 |     exit 1
54 | fi
55 | 
56 | # quoted so that a path holding blanks or glob characters stays one argument
57 | bgzip "$1"
58 | tabix -p "$format" "$1.gz"


--------------------------------------------------------------------------------
/src/R/deviseSequenceFromGFF.R:
--------------------------------------------------------------------------------
 1 | ## Build one sequence per "Parent" from the exon features of a GFF3 file and the genome fasta; libs
 2 | library(genomeIntervals)
 3 | library(Biostrings)
 4 | ## NOTE(review): mclapply is called below but parallel is not attached here - confirm a dependency provides it
 5 | ## working dir
 6 | setwd("/mnt/picea/storage/reference/Populus-trichocarpa/v3.0/")
 7 | 
 8 | ## read the gff and keep only the features of type exon
 9 | gff <- readGff3(file="gff/Ptrichocarpa_v3.0_210_synthetic-gene-models.gff3")
10 | gff <- gff[gff$type=="exon",]
11 | 
12 | ## read the genome
13 | fa <- readDNAStringSet("fasta/Ptrichocarpa_v3.0_210.fa")
14 | 
15 | ## create the gene-model sequences: one subsequence per exon, named after its Parent attribute
16 | seqs <- subseq(fa[seq_name(gff)],gff[,1],gff[,2])
17 | names(seqs) <- getGffAttribute(gff,"Parent")
18 | 
19 | ## concatenate the exons sharing a name; the Reduce takes for ever (the outer one), try to just use a List
20 | seqs <- Reduce(append,mclapply(unique(names(seqs)),function(nam,seqs){
21 |   DNAStringSet(Reduce("c",seqs[names(seqs) == nam]))
22 | },seqs,mc.cores=16))
23 | ## NOTE(review): seqs was just overwritten above, so names(seqs) below presumably no longer holds the exon names;
24 | ## i.e. this might be faster; try out - the two statements look like alternatives, confirm and run only one
25 | seqs <- DNAStringSet(mclapply(unique(names(seqs)),function(nam,seqs){
26 |   Reduce("c",seqs[names(seqs) == nam])
27 | },seqs,mc.cores=16))
28 | ## name by unique Parent, then reverse-complement the entries whose first matching exon is on the "-" strand
29 | names(seqs) <- unique(getGffAttribute(gff,"Parent"))
30 | m.sel <- gff[match(unique(names(seqs)),getGffAttribute(gff,"Parent"))]$strand=="-"
31 | seqs[m.sel] <- reverseComplement(seqs[m.sel])
32 | 
33 | ## export them as fasta
34 | writeXStringSet(seqs,"fasta/Ptrichocarpa_v3.0_210_synthetic-gene-models.fa")
35 | 


--------------------------------------------------------------------------------
/pipeline/runPLEK.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH --mail-type=ALL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # stop on error, be verbose and expand the commands
 7 | set -e -x
 8 | 
 9 | # source helpers
10 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
11 | 
12 | ## usage
13 | USAGETXT=\
14 | "
15 | 	Usage: runPLEK.sh <Trinity.fasta> <out dir>
16 | 	
17 | 	Options:
18 | 	            -thread       number of threads for running the PLEK program
19 | 	            -minlength    the minimum length of sequences
20 | 	            -isoutmsg     output messages to screen or not
21 | 	            -isrtempfile  remove temporary files or not
22 | "
23 | 
24 | # Check
25 | if [ $# -ne 2 ]; then
26 |     echo "This function needs 2 arguments"
27 |     usage
28 | fi
29 | 
30 | if [ ! -f $1 ]; then
31 |   abort "The first argument needs to be the trinity fasta filepath"
32 | fi
33 | 
34 | if [ ! -d $2 ]; then
35 |     abort "The second argument (output dir) needs to be an existing directory"
36 | fi
37 | 
38 | # run PLEK
39 | cd $2
40 | 
41 | # output filename
42 | fnam=$(basename ${1%.*})
43 | singularity exec --bind /mnt:/mnt \
44 | /mnt/picea/projects/singularity/delhomme-upscb-lncrna.simg PLEK.py \
45 | -minlength 200 -thread 12 -p 12 -isoutmsg 1 -fasta $1 -out $2/$fnam.txt
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/pipeline/runGENIE3.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 16
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | # Run GENIE3_simple.R on an expression matrix, then concatenate the
 7 | # tmp/file* pieces of the working directory into GENIE3-edgelist.tsv.
 8 | 
 9 | ## stop on error
10 | set -ex
11 | 
12 | ## we get input and output file as input
13 | usage(){
14 |   echo >&2 \
15 |   " Usage: $0 [options] <expression matrix file> <out file>
16 |     Options: -c set the number of CPUs
17 |   "
18 |   exit 1
19 | }
20 | 
21 | ## Set defaults
22 | CPU=16
23 | 
24 | ## get the options
25 | while getopts c: option
26 | do
27 |   case "$option" in
28 |     c) CPU=$OPTARG;;
29 |     \?) usage;;
30 |   esac
31 | done
32 | shift $((OPTIND - 1))
33 | 
34 | ## Process the args
35 | if [ "$#" -lt 2 ]; then
36 |   echo >&2 "This function requires 2 arguments"
37 |   usage
38 | fi
39 | 
40 | if [ ! -f "$1" ]; then
41 |   echo >&2 "The first argument needs to be an existing file"
42 |   usage
43 | fi
44 | 
45 | ## The R script now checks output permission so no
46 | ## need to do it here
47 | 
48 | ## check the environment; quoted and defaulted so that the test is well
49 | ## formed whether UPSCb is unset, empty or holds blanks
50 | if [ -z "${UPSCb:-}" ]; then
51 |   echo >&2 "You need to define the UPSCb env var to your local UPSCb git checkout dir"
52 |   usage
53 | fi
54 | 
55 | ## load the modules
56 | module load R
57 | 
58 | ## and run
59 | Rscript --vanilla "$UPSCb/src/R/GENIE3_simple.R" "$1" "$2" "$CPU"
60 | 
61 | ## cleanup
62 | cat tmp/file* > GENIE3-edgelist.tsv
63 | rm -rf tmp


--------------------------------------------------------------------------------
/pipeline/runTaxonomyUpdate.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # DO NOT RUN ME - RUN ~/Git/UPSCb/src/bash/updateTaxonomySqlite.sh instead
 3 | 
 4 | # Download the NCBI taxonomy dumps into a directory named after today's
 5 | # date and load them into taxonomy.sqlite.
 6 | set -ex
 7 | 
 8 | # the SQL and R helpers are looked up below $UPSCb
 9 | : "${UPSCb:?The UPSCb environment variable needs to be set}"
10 | 
11 | # go to the dir
12 | cd /mnt/picea/storage/reference/Taxonomy
13 | 
14 | # create the dir; two statements, because a failing mkdir inside
15 | # "mkdir && cd" would not stop the script despite set -e
16 | dat=$(date "+%Y%m%d")
17 | mkdir "$dat"
18 | cd "$dat"
19 | 
20 | # get the data
21 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
22 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
23 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz
24 | 
25 | # extract what we need
26 | tar -zxf taxdump.tar.gz nodes.dmp names.dmp
27 | 
28 | # and reformat: drop the tabs, the double quotes and the trailing pipe
29 | sed -i -e 's:\t::g' -e 's:"::g' -e 's:|$::g' nodes.dmp names.dmp
30 | 
31 | # extract the rest
32 | gunzip gi_taxid_nucl.dmp.gz
33 | gunzip gi_taxid_prot.dmp.gz
34 | 
35 | # load the data in SQL
36 | sqlite3 taxonomy.sqlite < "$UPSCb/src/sql/taxonomy-create-and-populate-table.sql"
37 | 
38 | # run R script to recreate the division
39 | Rscript "$UPSCb/src/R/updateTaxonomyDivisionTable.R"
40 | 
41 | # load the GI
42 | sqlite3 taxonomy.sqlite < "$UPSCb/src/sql/taxonomy-add-gi-relationship.sql"
43 | 
44 | # compress; nodes.dmp and names.dmp are the two files extracted from the tarball
45 | gzip gi_taxid_nucl.dmp gi_taxid_prot.dmp nodes.dmp names.dmp


--------------------------------------------------------------------------------
/pipeline/runNutil.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | set -ex
 7 | 
 8 | task=$1
 9 | 
10 | shift;
11 | 
12 | case "$task" in
13 |   aggr2rmt)
14 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -i $1 -g $2
15 |   ;;
16 |   aggregate)
17 |     gen=$1
18 |     shift
19 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -g $gen -i $@
20 |     # rename $@ to create the bin file
21 |   ;;
22 |   anova2el)
23 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -i $1 -g $2
24 |   ;;
25 |   ccm)
26 |     nrow=`wc -l $1`
27 |     ncol=`head -1 $1 | wc -w`
28 |     ncol=`expr $ncol - 1`
29 |     ~bastian/Git/RMTGeneNet/ccm $1 $nrow $ncol
30 |   ;;
31 |   el2bin)
32 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -i $1 -g $2 $3
33 |   ;;
34 |   lm2el)
35 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -i $1 -g $2
36 |   ;;
37 |   rmm)
38 |     ~bastian/Git/RMTGeneNet/rmm -b 1 -i $1
39 |   ;;
40 |   threshold)
41 |     ~bastian/Git/geneNetworkR/src/util/bin/threshold -i $1 -s 0.01 -H 0.99 -L 0.65 --trace
42 |   ;;
43 |   view)
44 |     ~bastian/Git/geneNetworkR/src/util/bin/nutil --task $task -i $1 -g $2
45 |   ;;
46 |   ?)
47 |     echo "No such task $task. Exiting."
48 |     exit 1;
49 |   ;;
50 | esac
51 | 
52 | 


--------------------------------------------------------------------------------
/pipeline/runJellyfishBc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 12
 3 | #SBATCH --mail-type=FAIL
 4 | #SBATCH -t 2-00:00:00
 5 | 
 6 | # Run "jellyfish bc" on the records of a bam file: every "samtools view"
 7 | # line is turned into a fasta record made of its 1st and 10th column.
 8 | 
 9 | set -eux
10 | 
11 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
12 | 
13 | # vars
14 | OPTARG=
15 | OPTIND=
16 | CANON="-C"
17 | OPTIONS=
18 | KMERSIZE=25
19 | HASHSIZE=100G
20 | CPUS=12
21 | 
22 | ## usage
23 | USAGETXT=\
24 | "
25 | 	Usage: runJellyfishBc.sh [options] <in.bam> <out.dir>
26 | 	
27 | 	Options:
28 | 	  -C turn off canonical mode (on by default)
29 | 	  -m kmer size (default $KMERSIZE)
30 | 	  -s hash size (defaut $HASHSIZE)
31 | 	  -t threads (default $CPUS)
32 | "
33 | 
34 | while getopts Cm:s:t: option
35 | do
36 |   case "$option" in
37 |     C) CANON="";;
38 |     m) KMERSIZE=$OPTARG;;
39 |     s) HASHSIZE=$OPTARG;;
40 |     t) CPUS=$OPTARG;;
41 |     \?) ## unknown flag
42 |       usage;;
43 |   esac
44 | done
45 | shift $((OPTIND - 1))
46 | 
47 | # sanity; the count is checked first so that a missing argument yields a
48 | # readable message rather than an "unbound variable" error from set -u
49 | [[ $# -ne 2 ]] && abort "This script expects 2 arguments"
50 | 
51 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing file"
52 | 
53 | [[ ! -d $2 ]] && abort "The second argument needs to be an existing directory"
54 | 
55 | isExec jellyfish
56 | isExec samtools
57 | 
58 | # CANON stays unquoted: when empty it must not turn into an empty argument
59 | jellyfish bc -m "$KMERSIZE" -s "$HASHSIZE" $CANON -o "$2/$(basename "${1/.bam/.bc}")" -t "$CPUS" <(samtools view "$1" | awk '{print ">"$1"\n"$10}')


--------------------------------------------------------------------------------
/pipeline/runUsearch.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | # Cluster a fasta file with "usearch -cluster_fast"; writes
 4 | # <output prefix>.uc and <output prefix>.fasta.
 5 | 
 6 | ## be verbose and stop on error
 7 | set -ex
 8 | 
 9 | ## usage
10 | usage(){
11 | echo >&2 \
12 | "
13 | Usage: $0 [options] <input fasta> <output prefix>
14 | 
15 | Options:
16 |    -c sequence identity threshold; default to 0.99
17 |    -T number of threads; default to 16
18 |    -i the idprefix size; default to 0
19 | Note:
20 |    You need to set the UPSCb env. variable to your UPSCb git checkout directory
21 | "
22 | exit 1
23 | }
24 | 
25 | ## defaults
26 | IDENT=0.99
27 | THREADS=16
28 | IDP=0
29 | 
30 | ## options
31 | while getopts c:T:i: option
32 | do
33 |   case "$option" in
34 |     c) IDENT=$OPTARG;;
35 |     T) THREADS=$OPTARG;;
36 |     i) IDP=$OPTARG;;
37 |     \?) usage;;
38 |   esac
39 | done
40 | shift $((OPTIND - 1))
41 | 
42 | ## arguments
43 | if [ $# != 2 ]; then
44 |   echo >&2 "This function takes 2 arguments: the input fasta file and the output filename"
45 |   usage
46 | fi
47 | 
48 | if [ ! -f "$1" ]; then
49 |   echo >&2 "The first argument must be a valid fasta file"
50 |   usage
51 | fi
52 | 
53 | ## quoted throughout so that a path holding blanks stays a single word
54 | if [ ! -d "$(dirname "$2")" ]; then
55 |   echo >&2 "The second argument must be an output filename which parent directory must exist"
56 |   usage
57 | fi
58 | 
59 | ## command
60 | usearch -cluster_fast "$1" -id "$IDENT" -uc "$2.uc" -idprefix "$IDP" --centroids "$2.fasta" -threads "$THREADS"


--------------------------------------------------------------------------------
/templates/bash/runTemplate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -t 48:00:00
 3 | #SBATCH -n 1
 4 | #SBATCH -A facility
 5 | #SBATCH -J CHANGEME
 6 | #SBATCH --mail-type=END,FAIL
 7 | 
 8 | # Template for a singularity based sbatch script: replace every CHANGEME.
 9 | 
10 | # sanity
11 | set -eu -o pipefail
12 | 
13 | # functions (abort and usage are expected to be defined there)
14 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
15 | 
16 | # usage
17 | USAGETXT=\
18 | "
19 |   Synopsis [options] $0 <ARGS>
20 |   
21 |   Options:
22 |     -c change me
23 | 
24 |   Note: some notes
25 | "
26 | 
27 | # vars
28 | key="value"
29 | 
30 | # options
31 | while getopts cC: option
32 | do
33 |   case "$option" in
34 |     c) key="changeme-the-key-too";;
35 |     C) key="changeme-the-key-too${OPTARG}";;
36 |     \?) usage;;
37 |   esac
38 | done
39 | shift $((OPTIND - 1))
40 | 
41 | # checks
42 | # the last two tests carry an explicit CHANGEME operand: a [[ ! -f ]] or
43 | # [[ ! -d ]] test without operand is not a valid conditional expression
44 | [[ $# -ne CHANGEME ]] && abort "This script expects CHANGEME arguments"
45 | [[ ! -f $1 ]] && abort "The first argument needs to be an existing file path to a singularity container"
46 | [[ -z ${SINGULARITY_BINDPATH:-} ]] && abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
47 | [[ ! -f CHANGEME ]] && abort "The CHANGEME argument needs to be an existing file path to CHANGEME"
48 | [[ ! -d CHANGEME ]] && abort "The CHANGEME argument needs to be an existing directory"
49 | 
50 | # run
51 | singularity exec "$1" \
52 | CHANGEMETOOL CHANGEMECMDLINE


--------------------------------------------------------------------------------
/templates/bash/template.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # failsafe
 4 | set -eu -o pipefail
 5 | #set -x
 6 | 
 7 | # variables
 8 | ARGs=0
 9 | DO=0
10 | 
11 | # usage
12 | export USAGETXT=\
13 | "
14 | Usage: $0 [options] <arguments>
15 | 
16 | Purpose: The script ...
17 | 
18 | Options:
19 |     -d do not just print, do
20 |     -h print this message
21 |     
22 | Note:
23 |     The script ...
24 | "
25 | 
26 | # helper function
27 | # shellcheck disable=SC1091
28 | source functions.sh
29 | 
30 | # handle the options
31 | while getopts dh option
32 | do
33 |         case "$option" in
34 |         d) DO=1;;
35 |         h) usage;;
36 | 		\?) ## unknown flag
37 | 		usage;;
38 |         esac
39 | done
40 | shift $((OPTIND - 1))
41 | 
42 | # setup
43 | 
44 | # sanity
45 | [[ ${ARGs} -gt 0 ]] && [[ $# -ne ${ARGs} ]] && abort "This script expects ${ARGs} arguments"
46 | 
47 | # cmds container
48 | cmds=()
49 | 
50 | # end of the boilerplate, logic goes below - instead of running cmds, add them to the cmds list
51 | # e.g. 
52 | cmds+=("echo Hello World
53 | ")
54 | 
55 | # end of logic, start of evalution. dry-run unless -d is provided on the cmdline
56 | 
57 | # shellcheck disable=SC2086
58 | if [ ${DO} -eq 1 ]; then
59 |     for j in $(seq 0 $((${#cmds[@]} - 1))); do
60 |         eval "${cmds[$j]}"
61 |     done
62 | else
63 |     echo "${cmds[@]}"
64 | fi
65 | 


--------------------------------------------------------------------------------
/pipeline/runCuffmerge.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -l
 2 | #SBATCH -p main -N 1
 3 | #SBATCH -t 0-02:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | # List the transcripts.gtf files found below <in dir> in
 7 | # <out dir>/cuffmergeMANIFEST and print the matching cuffmerge command line.
 8 | # NOTE(review): the cuffmerge command is only echoed, never executed -
 9 | # confirm whether that dry-run is intended.
10 | 
11 | ##
12 | set -e
13 | 
14 | ## 
15 | echo Loading
16 | export LD_LIBRARY_PATH=~delhomme/lib
17 | 
18 | ##
19 | echo Checking
20 | 
21 | ## three mandatory arguments, the fourth one is optional
22 | if [ $# -lt 3 ]; then
23 |     echo "This function takes two directories and two files as arguments, the second file being facultative"
24 |     echo "in which case cuffmerge is run without prior annotation knowledge."
25 |     echo "Usage: sbatch runCuffmerge.sh <in dir> <out dir> <genome fasta> [gene gff3]"
26 |     exit 1
27 | fi
28 | 
29 | ## an invalid argument is fatal
30 | if [ ! -d "$1" ]; then
31 |     echo "The first argument needs to be an existing directory"
32 |     exit 1
33 | fi
34 | 
35 | if [ ! -d "$2" ]; then
36 |     echo "The second argument needs to be an existing directory"
37 |     exit 1
38 | fi
39 | 
40 | if [ ! -f "$3" ]; then
41 |     echo "The third argument needs to be an existing file"
42 |     exit 1
43 | fi
44 | 
45 | ext=
46 | if [ -n "${4:-}" ]; then
47 |     if [ ! -f "$4" ]; then
48 |         echo "The forth argument needs to be an existing file"
49 |         exit 1
50 |     fi
51 |     ext="--ref-gtf $4"
52 | fi
53 | 
54 | ##
55 | echo Setting up
56 | 
57 | find "$1" -name "transcripts.gtf" > "$2/cuffmergeMANIFEST"
58 | 
59 | ##
60 | echo Starting
61 | 
62 | # ext stays unquoted: it is either empty or the two words "--ref-gtf <file>"
63 | echo cuffmerge -p 8 $ext -s "$3" -o "$2" "$2/cuffmergeMANIFEST"
64 | 
65 | ##
66 | echo Done


--------------------------------------------------------------------------------
/pipeline/runGATK_SplitNCigarReads.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH -p main
 5 | #SBATCH -n 1
 6 | #SBATCH --mem=32G
 7 | 
 8 | # Run GATK SplitNCigarReads on a bam file; the result is written to
 9 | # <output directory>/<bam file name without .bam>_split.bam
10 | 
11 | set -e
12 | 
13 | module load bioinfo-tools GATK
14 | module load java
15 | 
16 | if [ -z "${GATK_HOME:-}" ]; then
17 |     echo >&2 "Could not find GATK"
18 |     exit 1
19 | fi
20 | 
21 | GATK=$GATK_HOME/GenomeAnalysisTK.jar
22 | # java temporary directory: SNIC_TMP when it is a directory, /mnt/picea/tmp otherwise
23 | if [ -d "${SNIC_TMP:-}" ]; then
24 |     tmp=$SNIC_TMP
25 | else
26 |     tmp=/mnt/picea/tmp
27 | fi
28 | 
29 | if [ $# -ne 3 ]; then
30 |     echo "Usage: $0 <in.bam> <ref.fasta> <output directory>" 1>&2
31 |     exit 1
32 | fi
33 | 
34 | if [ ! -f "$1" ]; then
35 |     echo "Could not find BAM file '$1'" 1>&2
36 |     exit 1
37 | fi
38 | inbam=$1
39 | 
40 | if [ ! -f "$2" ]; then
41 |     echo "Could not find reference '$2'" 1>&2
42 |     exit 1
43 | fi
44 | ref=$2
45 | 
46 | if [ ! -d "$3" ]; then
47 |     echo "No such output directory" 1>&2
48 |     exit 1
49 | fi
50 | outdir=$3
51 | 
52 | bname=$(basename "$inbam")
53 | 
54 | # Run splitNCigarReads; the JVM options are given before -jar
55 | java -Xmx5G -Djava.io.tmpdir="$tmp" -jar "$GATK" -T SplitNCigarReads \
56 |     -R "$ref" \
57 |     -I "$inbam" \
58 |     -o "$outdir/${bname/.bam/}_split.bam" \
59 |     -rf ReassignOneMappingQuality \
60 |     -RMQF 255 \
61 |     -RMQT 60 \
62 |     -U ALLOW_N_CIGAR_READS


--------------------------------------------------------------------------------
/pipeline/runGROM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main
 3 | #SBACTH -n 16
 4 | 
 5 | set -ex
 6 | 
 7 | # helper
 8 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
 9 | 
10 | # usage
11 | USAGETXT=\
12 | "
13 | Usage: $0 <in bam> <fasta reference> <out dir>
14 | 
15 | Note: the GROM, htslib and vcftools modules need to be loaded
16 | "
17 | 
18 | # Checks
19 | isEnvVarSet("UPSCb")
20 | 
21 | isExec("GROM")
22 | 
23 | isExec("vcf-sort")
24 | 
25 | isExec("bgzip")
26 | 
27 | if [ $# != 3 ]; then
28 |   abort "This script expects 2 arguments"
29 | fi
30 | 
31 | if [ ! -f $1 ]; then
32 |   abort "The first argument needs to be an existing bam file"
33 | fi
34 | 
35 | if [ ! -f $1 ]; then
36 |   abort "The first argument needs to be an existing fasta file"
37 | fi
38 | 
39 | if [ ! -d $3 ]; then
40 |   abort "The second argument needs to be an existing directory"
41 | fi
42 | 
43 | # Setup
44 | GROM=$(which GROM)
45 | cd $out
46 | ln -s $GROM.
47 | 
48 | # Run (P is half of the cores used)
49 | outfile=$3/$(basename ${1/.bam/.vcf})
50 | $GROM -M -P 8 -i $1 -r $2 -o $outfile
51 | 
52 | # Sort, compress and index
53 | sorted=${outfile/.vcf/_sorted.vcf}
54 | cat $outfile | vcf-sort -c > $sorted
55 | 
56 | bgzip -f $sorted
57 | 
58 | tabix ${sorted}.gz -p vcf
59 |  
60 | # Cleanup
61 | find $out -type l -name "GROM" -delete
62 | rm $outfile
63 | 


--------------------------------------------------------------------------------
/src/R/createGeneAnnotation.R:
--------------------------------------------------------------------------------
 1 | ## Collapse the per-transcript annotation table into one row per gene
 2 | 
 3 | ## working dir
 4 | setwd("/mnt/picea/storage/reference/Populus-trichocarpa/v3.0/annotation")
 5 | 
 6 | ## read the annot; the first column becomes the row names
 7 | annotationtable<-read.delim(file="Ptrichocarpa_v3.0_210_annotation_info.txt",
 8 |                             header=FALSE,stringsAsFactors=FALSE,row.names=1)
 9 | 
10 | ## check that the 3rd column is useless and remove it
11 | all(annotationtable$V3 == annotationtable$V4)
12 | annotationtable$V3 <- NULL
13 | 
14 | ## check the number of gene and transcripts
15 | length(unique(annotationtable$V2))
16 | length(unique(annotationtable$V4))
17 | 
18 | ## set the colnames
19 | colnames(annotationtable) <- c("geneID","trxID","PFAM","PANTHER","KOG","EC","RefSeq","GO","At","Symbol","Description")
20 | 
21 | ## check that some isoforms have different annotations
22 | table(sapply(sapply(split(annotationtable$PFAM,annotationtable$geneID),unique),length))
23 | table(sapply(sapply(split(annotationtable$GO,annotationtable$geneID),unique),length))
24 | 
25 | ## combine these information into a single table: for every annotation column,
26 | ## paste the unique values of each gene together, separated by "|"
27 | perGene <- sapply(colnames(annotationtable)[-c(1:2)],function(co,tab){
28 |   sapply(sapply(split(tab[,co],tab$geneID),unique),paste,collapse="|")
29 |   },annotationtable)
30 | 
31 | ## split() orders the genes by sorted ID, which need not be their order of
32 | ## appearance in the file; take the IDs from the collapsed table itself so
33 | ## that every row keeps its own gene ID
34 | annot<-cbind(geneID=rownames(perGene),perGene)
35 | 
36 | ## write it out
37 | write.csv(annot,file="Ptrichocarpa_v3.0_210_gene-annotation.csv")


--------------------------------------------------------------------------------
/pipeline/runCleanTrinity.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 08:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Remove the intermediate files of a trinity run and gzip what is kept.
 8 | 
 9 | # print the help on stderr and stop with status 1
10 | usage(){
11 |   echo >&2 \
12 |   "
13 |   This script expects one argument: the directory to clean up.
14 |   
15 |   WARNING: this script REMOVES files considered unneeded from a trinity run.
16 |   DO NOT USE IT FOR ANYTHING ELSE!
17 |   "
18 |   exit 1
19 | }
20 | 
21 | if [ $# != 1 ]; then
22 |   echo "ERROR: This script expect one argument: the directory to clean"
23 |   usage
24 | fi
25 | 
26 | if [ ! -d "$1" ]; then
27 |   echo "ERROR: The provided directory does not exist"
28 |   usage
29 | fi
30 | 
31 | # never run the removal below from another directory than the requested one
32 | cd "$1" || exit 1
33 | 
34 | if [ ! -f Trinity.fasta ]; then
35 |   echo "This does not look like a completed trinity run or a trinity directory. Aborting"
36 |   exit 1
37 | fi
38 | 
39 | ## remove
40 | rm -rf \
41 |   chrysalis \
42 |   both.fa* \
43 |   bowtie.* \
44 |   FailedCommand \
45 |   inchworm.* \
46 |   iworm_scaffolds* \
47 |   jellyfish.* \
48 |   partitioned_reads* \
49 |   read_partitions \
50 |   recursive_trinity* \
51 |   scaffolding_entries.sam \
52 |   target* \
53 |   tmp.iworm.fa* \
54 |   norm_for_read_set*
55 | 
56 | ## compress diginorm; NUL separated names survive blanks in file names
57 | find insilico_read_normalization* -type f -name "*.fq" -print0 | xargs -0 -P 2 -I {} gzip {}
58 | 
59 | ## compress output
60 | gzip Trinity.fasta


--------------------------------------------------------------------------------
/pipeline/runGatkRealignerTargetCreator.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH -p main
 5 | #SBATCH -n 6
 6 | #SBATCH --mem 36G
 7 | 
 8 | # Run GATK RealignerTargetCreator on a bam file; the intervals are written
 9 | # to <output directory>/<bam file name with .bam replaced by .intervals>.
10 | # Arguments given after the first three are passed on to GATK.
11 | 
12 | set -e
13 | 
14 | #module load java
15 | #module load bioinfo-tools
16 | #module load GATK
17 | 
18 | # helper (abort and usage are expected to be defined there)
19 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
20 | 
21 | USAGETXT=\
22 | "
23 | Usage: $0 <BAM file> <fasta ref> <output directory>
24 | 
25 | Note: This script is not GATK v4 compatible. Load a GATK V3 module. More at https://software.broadinstitute.org/gatk/blog?id=7847
26 | "
27 | 
28 | # default
29 | Threads=6
30 | JavaThreadMem=6G
31 | 
32 | # GATK 3
33 | if [ -z "${GATK_HOME:-}" ]; then
34 |   usage
35 | fi
36 | GATK=$GATK_HOME/GenomeAnalysisTK.jar
37 | 
38 | if [ $# -lt 3 ]; then
39 |   usage
40 | fi
41 | 
42 | if [ ! -f "$1" ]; then
43 |   abort "Could not find BAM file '$1'"
44 | fi
45 | inbam=$1
46 | 
47 | if [ ! -f "$2" ]; then
48 |   abort "Could not find FASTA file '$2'"
49 | fi
50 | ref=$2
51 | 
52 | if [ ! -d "$3" ]; then
53 |   abort "Could not find directory '$3'"
54 | fi
55 | outdir=$3
56 | 
57 | # drop the three args, what remains is handed over to GATK
58 | shift 3
59 | 
60 | name_out=$(basename "${inbam/.bam/.intervals}")
61 | 
62 | # Run; "$@" keeps every extra argument as the single word it was given as
63 | java -Xmx${JavaThreadMem} -jar "$GATK" -nt "$Threads" -I "$inbam" -R "$ref" -T RealignerTargetCreator -o "$outdir/$name_out" "$@"


--------------------------------------------------------------------------------
/pipeline/runKallistoIndex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH --mem=8GB
 6 | 
 7 | ## Build a kallisto index from a (possibly gzipped) fasta file.
 8 | ## The index is written to <out dir>/<fasta basename without .f*a[.gz]>.inx
 9 | 
10 | set -eux
11 | 
12 | module load bioinfo-tools kallisto
13 | 
14 | ## a usage function
15 | usage(){
16 |     echo >&2 \
17 | "
18 |     Usage: $0 [options] <fasta file> <out dir>
19 |     Options:
20 |             -k the k-mer size (kallisto default if not set)
21 |     Note: The database filename defaults to the input basename
22 | " 
23 |     exit 1
24 | }
25 | 
26 | ## VARS
27 | ## the k-mer size; empty means: do not pass -k to kallisto
28 | KMER=""
29 | 
30 | 
31 | ## get the options
32 | while getopts k: option
33 | do
34 |     case "$option" in
35 |         k) KMER="$OPTARG";;
36 |         \?) ## unknown flag
37 |             usage;;
38 |   esac
39 | done
40 | shift $((OPTIND - 1))
41 | 
42 | ## we get one file and one dir as input 
43 | if [ $# != 2 ]; then
44 |     echo "This function takes one fasta file and one output dir as argument"
45 |     usage
46 | fi
47 | 
48 | if [ ! -f "$1" ]; then
49 |     echo "The first argument needs to be an existing fasta file"
50 |     usage
51 | fi
52 | 
53 | if [ ! -d "$2" ]; then
54 |     echo "The second argument needs to be an existing directory"
55 |     usage
56 | fi
57 | 
58 | # work on the file name only, so that a '.f...a' in a directory name
59 | # can never be mistaken for the fasta extension
60 | fasta=$(basename "$1")
61 | 
62 | # strip the extension (.fa, .fna, .fasta, ... optionally followed by .gz)
63 | if [ "${fasta##*.}" == "gz" ]; then
64 |   inxName="$2/${fasta%.f*a.gz}.inx"
65 | else
66 |   inxName="$2/${fasta%.f*a}.inx"
67 | fi
68 | 
69 | # running; -k is only added when a k-mer size was given
70 | kallisto index ${KMER:+-k "$KMER"} -i "$inxName" "$1"


--------------------------------------------------------------------------------
/pipeline/runPicardAddOrReplaceReadGroups.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH --mem=32GB
 6 | 
 7 | ## Clean a bam file with Picard CleanSam, add or replace its read groups and
 8 | ## index the result, which is written as <out dir>/<bam basename>.
 9 | ## Every argument after the out dir is given to AddOrReplaceReadGroups.
10 | 
11 | ## stop on error
12 | set -e
13 | 
14 | ## modules
15 | module load bioinfo-tools Picard-tools samtools
16 | 
17 | ## a usage function
18 | usage(){
19 |     echo >&2 \
20 | "
21 |     Usage: $0 <bam file> <out dir> <read groups>
22 | 
23 |     Note:
24 |          Read Groups should be provided as ID=value pairs space delimited
25 | " 
26 |     exit 1
27 | }
28 | 
29 | ## we need a bam file, an out dir and at least one read group specification:
30 | ## three arguments are valid, hence -lt (the former -le 3 rejected them)
31 | if [ $# -lt 3 ]; then
32 |     echo "This function takes at least a bam file, an out dir and one read group specification as argument"
33 |     usage
34 | fi
35 | 
36 | if [ ! -f "$1" ]; then
37 |     echo "The first argument needs to be an existing bam file"
38 |     usage
39 | fi
40 | in=$1
41 | shift
42 | 
43 | if [ ! -d "$1" ]; then
44 |     echo "The second argument needs to be an existing directory"
45 |     usage
46 | fi
47 | out=$1
48 | shift
49 | 
50 | # create the outfile
51 | out="$out/$(basename "$in")"
52 | 
53 | ## clean sam
54 | java -jar "$PICARD_TOOLS_DIR/picard.jar" CleanSam I="$in" O="$out.clean"
55 | 
56 | ## add the RG, the remaining arguments are the read group ID=value pairs
57 | java -jar "$PICARD_TOOLS_DIR/picard.jar" AddOrReplaceReadGroups I="$out.clean" O="$out" "$@"
58 | 
59 | # index the bam file
60 | samtools index "$out"
61 | 
62 | ## clean up
63 | rm "$out.clean"


--------------------------------------------------------------------------------
/pipeline/runTIGLM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 8
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## Run tiglm from within <out dir> and concatenate the MA* files found there
 7 | ## after the run into <out dir>/TIGLM-edgelist.tsv
 8 | ## NOTE(review): the default CPU (16) is larger than the 8 cores requested
 9 | ## from SLURM above - confirm which one is intended
10 | 
11 | ## stop on error
12 | set -ex
13 | 
14 | ## we get two files and one dir as input
15 | usage(){
16 |   echo >&2 \
17 |   " Usage: $0 [options] <expression matrix file> <gene names file> <out dir>
18 |     Options: -c set the number of CPUs
19 |   "
20 |   exit 1
21 | }
22 | 
23 | ## Set defaults
24 | CPU=16
25 | 
26 | ## get the options
27 | while getopts c: option
28 |   do
29 |     case "$option" in
30 |       c) CPU=$OPTARG;;
31 |       \?) usage;;
32 |     esac
33 | done
34 | shift $((OPTIND - 1))
35 | 
36 | ## Process the args
37 | if [ "$#" -lt 3 ]; then
38 |   echo "This function requires 3 arguments"
39 |   usage;
40 | fi
41 | 
42 | if [ ! -f "$1" ]; then
43 |   echo "The first argument needs to be an existing file"
44 |   usage;
45 | fi
46 | 
47 | if [ ! -f "$2" ]; then
48 |   echo "The second argument needs to be an existing file"
49 |   usage;
50 | fi
51 | 
52 | if [ ! -d "$3" ]; then
53 |   echo "The third argument needs to be an existing directory"
54 |   usage;
55 | fi
56 | 
57 | # the inputs may be relative paths: make them absolute before changing
58 | # directory, otherwise they would not be found from within the out dir
59 | exprs="$(cd "$(dirname "$1")" && pwd)/$(basename "$1")"
60 | genes="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")"
61 | 
62 | # set up (mkdir fails, and stops the script, if a tmp dir is already there)
63 | cd "$3"
64 | mkdir tmp
65 | 
66 | # run
67 | /mnt/picea/home/bastian/Git/geneNetworkR/src/tiglm/bin/tiglm -i "$exprs" -g "$genes" -p "$CPU"
68 | 
69 | # clean up
70 | mv MA* tmp
71 | 
72 | # and concatenate
73 | cat tmp/MA* > TIGLM-edgelist.tsv
74 | 
75 | # finally clean up
76 | rm -rf tmp/MA*
77 | rmdir tmp


--------------------------------------------------------------------------------
/pipeline/runAssemblathonStat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Run assemblathon_stats.pl (from the UPSCb checkout, \$UPSCb/src/perl) with
 9 | ## -csv on one genome fasta file
10 | 
11 | ## stop on error
12 | set -ex
13 | 
14 | ## load the modules
15 | module load perl
16 | 
17 | ## we get one file as input
18 | usage(){
19 |     echo >&2 \
20 |     " Usage: $0 [options] <genome fasta file>
21 |       
22 |       Options:
23 |         -n the maximum number of consecutive N characters allowed 
24 |            before scaffolds are split into contigs (default 25)
25 |         -g the estimated genome size
26 |         
27 |       Notes: The result will be written in the same dir as the input fasta file  
28 |     "
29 |     exit 1
30 | }
31 | 
32 | # the options are kept in an array, so that every value stays a single word
33 | OPTIONS=(-csv)
34 | # getting the options
35 | while getopts n:g: option
36 | do
37 |     case "$option" in
38 |         g) OPTIONS+=(-genome_size "$OPTARG");;
39 |         n) OPTIONS+=(-n "$OPTARG");;
40 |         \?) ## unknown flag
41 |             usage;;
42 |     esac
43 | done
44 | shift $((OPTIND - 1))
45 | 
46 | if [ "$#" -lt 1 ]; then
47 |   echo "This function requires 1 argument"
48 |   usage;
49 | fi
50 | 
51 | if [ ! -f "$1" ]; then
52 |     echo "The first argument needs to be an existing fasta file"    
53 |     usage;
54 | fi
55 | 
56 | # the perl script and its library are looked up in the UPSCb checkout
57 | if [ -z "$UPSCb" ]; then
58 |     echo "The UPSCb environment variable needs to be set."
59 |     usage;
60 | fi
61 | 
62 | ## get the assembly statistics
63 | perl -I "$UPSCb/src/perl" "$UPSCb/src/perl/assemblathon_stats.pl" "${OPTIONS[@]}" "$1"


--------------------------------------------------------------------------------
/pipeline/runBlastFormatDb.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 1-00:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## Format a fasta file as a (legacy) blast database with formatdb; the
 7 | ## database is written as <out dir>/<fasta basename>
 8 | 
 9 | set -ex
10 | 
11 | module load bioinfo-tools blast/2.2.26
12 | 
13 | ## a usage function
14 | usage(){
15 |     echo >&2 \
16 | "
17 |     Usage: $0 [options] <fasta file> <out dir>
18 |     Options:
19 |             -p the type of file T/F (default to F: nucleotide; use T for protein)
20 |             -t the db title
21 |     Note: The database filename defaults to the input basename
22 | " 
23 |     exit 1
24 | }
25 | 
26 | ## VARS
27 | ## an array, so that a title made of several words stays a single argument
28 | OPTIONS=()
29 | TYPE="F"
30 | 
31 | ## get the options
32 | while getopts p:t: option
33 | do
34 |     case "$option" in
35 |         t) OPTIONS+=(-t "$OPTARG");;
36 |         p) TYPE="$OPTARG";;
37 |         \?) ## unknown flag
38 |             usage;;
39 |   esac
40 | done
41 | shift $((OPTIND - 1))
42 | 
43 | ## extend the OPTIONS
44 | OPTIONS+=(-p "$TYPE")
45 | 
46 | ## we get one file and one dir as input 
47 | if [ $# != 2 ]; then
48 |     echo "This function takes one fasta file and one output dir as argument"
49 |     usage
50 | fi
51 | 
52 | if [ ! -f "$1" ]; then
53 |     echo "The first argument needs to be an existing fasta file"
54 |     usage
55 | fi
56 | 
57 | if [ ! -d "$2" ]; then
58 |     echo "The second argument needs to be an existing directory"
59 |     usage
60 | fi
61 | 
62 | 
63 | # running
64 | formatdb -i "$1" -o T -n "$2/$(basename "$1")" "${OPTIONS[@]}"


--------------------------------------------------------------------------------
/pipeline/runSeidrAggregate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A facility
 3 | #SBATCH -t 12:00:00
 4 | #SBATCH -p main -n 32
 5 | #SBATCH --mem=16GB
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | # Run 'seidr aggregate' from within <out dir> on the given sf files.
 9 | # NOTE(review): -k is passed to seidr by default and giving -k to this script
10 | # removes it, which reads opposite to the usage text - confirm the intent
11 | 
12 | set -ex
13 | DIRECTIONALITY="-k"
14 | FORCE=
15 | METHOD="irp"
16 | 
17 | # usage
18 | # shellcheck disable=SC2034
19 | USAGETXT=\
20 | "
21 |   Usage: $0 [options] <out dir> <sf file> [sf file] ... [sf file]
22 |   
23 |   Options:
24 |           -f  force overwrite output
25 |           -k  keep the directionality, default: true
26 |           -m  method, default: irp
27 |           
28 | "
29 | CPU=32
30 | 
31 | # isExec and abort are not defined here; presumably they come from this file
32 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
33 | 
34 | isExec seidr
35 | 
36 | # Get the options
37 | while getopts fkm: option
38 | do
39 |     case "$option" in
40 |         f) FORCE="-f";;
41 |         k) DIRECTIONALITY=;;
42 |         m) METHOD="$OPTARG";;
43 |         \?) ## unknown flag
44 |             abort;;
45 |     esac
46 | done
47 | shift $((OPTIND - 1))
48 | 
49 | # one array element per command line word; a single quoted string would
50 | # reach seidr as one (unknown) argument
51 | OPTIONS=()
52 | if [ -n "$FORCE" ]; then
53 |   OPTIONS+=("$FORCE")
54 | fi
55 | if [ -n "$DIRECTIONALITY" ]; then
56 |   OPTIONS+=("$DIRECTIONALITY")
57 | fi
58 | OPTIONS+=(-m "$METHOD")
59 | 
60 | if [ $# -lt 2 ]; then
61 |   abort "This script expects at least 2 arguments"
62 | fi
63 | 
64 | if [ ! -d "$1" ]; then
65 |   abort "The first argument needs to be an existing directory"
66 | fi
67 | 
68 | # run
69 | cd "$1"
70 | shift
71 | export OMP_NUM_THREADS=$CPU
72 | #rm -f aggregated.sf
73 | 
74 | # what is left of the arguments are the sf files
75 | seidr aggregate "${OPTIONS[@]}" -O "$CPU" "$@"


--------------------------------------------------------------------------------
/pipeline/runGATK_GenotypeGVCFs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 10:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | # Run gatk GenotypeGVCFs on one combined gvcf.
 8 | # Arguments: $1 reference fasta, $2 output vcf, $3 combined gvcf
 9 | 
10 | # check -u
11 | set -ex
12 | 
13 | # module load bioinfo-tools GATK
14 | 
15 | # helper (usage, abort and isExec are not defined here; presumably from this file)
16 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
17 | 
18 | PLOIDY=2
19 | 
20 | USAGETXT=\
21 | "
22 | Usage: $0 [options] <ref.fa> <out.vcf> <combined gvcf>
23 | 
24 | Notes:This script is GATK v4 compatible and GATK V3 incompatible. 
25 | This script was changed to use a CombinedGVCFs - check runGATK_CombineGVCFs.sh
26 | 
27 | Options: -p ploidy defaults to 2
28 | 
29 | "
30 | 
31 | ## get the options
32 | while getopts p: option
33 | do
34 |   case "$option" in
35 |     p) PLOIDY=$OPTARG;;
36 |     \?) ## unknown flag
37 |         usage;;
38 |   esac
39 | done
40 | shift $((OPTIND - 1))
41 | 
42 | # check
43 | isExec gatk
44 | 
45 | if [ "$#" -ne "3" ]; then
46 |     usage
47 | fi
48 | 
49 | if [ ! -f "$1" ]; then
50 |     abort "ERROR: could not find reference: '$1'"
51 | fi
52 | 
53 | ref=$1
54 | out=$2
55 | # the input is not tested for existence, it is handed to gatk as given
56 | in=$3
57 | shift 3
58 | 
59 | # variants=()
60 | # for gvcf in $@; do
61 | #     if [ ! -f "$gvcf" ]; then
62 | #         abort "ERROR: file not found: '$gvcf'"
63 | #     fi
64 | #     variants+=("--variant $gvcf")
65 | # done
66 | 
67 | # check the GVCF options
68 | gatk GenotypeGVCFs -R "$ref" -V "$in" -O "$out" --sample-ploidy "$PLOIDY"


--------------------------------------------------------------------------------
/pipeline/runNarromi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 16
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH --mem=128G
 6 | 
 7 | ## Run narromi from within <out dir> and concatenate columns 1, 2 and 4 of
 8 | ## the MA* files found there after the run into <out dir>/Narromi-edgelist.tsv
 9 | 
10 | ## stop on error
11 | set -ex
12 | 
13 | ## we get two files and one dir as input
14 | usage(){
15 |   echo >&2 \
16 |   " Usage: $0 [options] <expression matrix file> <gene names file> <out dir>
17 |     Options: -c set the number of CPUs
18 |   "
19 |   exit 1
20 | }
21 | 
22 | ## Set defaults
23 | CPU=16
24 | 
25 | ## get the options
26 | while getopts c: option
27 |   do
28 |     case "$option" in
29 |       c) CPU=$OPTARG;;
30 |       \?) usage;;
31 |     esac
32 | done
33 | shift $((OPTIND - 1))
34 | 
35 | ## Process the args
36 | if [ "$#" -lt 3 ]; then
37 |   echo "This function requires 3 arguments"
38 |   usage;
39 | fi
40 | 
41 | if [ ! -f "$1" ]; then
42 |   echo "The first argument needs to be an existing file"
43 |   usage;
44 | fi
45 | 
46 | if [ ! -f "$2" ]; then
47 |   echo "The second argument needs to be an existing file"
48 |   usage;
49 | fi
50 | 
51 | if [ ! -d "$3" ]; then
52 |   echo "The third argument needs to be an existing directory"
53 |   usage;
54 | fi
55 | 
56 | ## the inputs may be relative paths: make them absolute before changing
57 | ## directory, otherwise they would not be found from within the out dir
58 | exprs="$(cd "$(dirname "$1")" && pwd)/$(basename "$1")"
59 | genes="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")"
60 | 
61 | ## set up (mkdir fails, and stops the script, if a tmp dir is already there)
62 | cd "$3"
63 | mkdir tmp
64 | 
65 | ## run (the leading ~bastian is left unquoted for the tilde expansion)
66 | ~bastian/Git/geneNetworkR/src/narromi/bin/narromi -i "$exprs" -g "$genes" -p "$CPU"
67 | 
68 | # clean up
69 | mv MA* tmp
70 | 
71 | # and concatenate
72 | cat tmp/MA* | cut -f1,2,4 > Narromi-edgelist.tsv
73 | 
74 | # finally clean up
75 | rm -rf tmp/MA*
76 | rmdir tmp


--------------------------------------------------------------------------------
/pipeline/runGATK_CombineVariants.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH -t 1:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | # Combine several vcf files with GATK (v3) CombineVariants.
 9 | # Without SLURM the GATK jar and a tmp dir are the first two arguments; under
10 | # SLURM a fixed GATK 3.2.2 jar and \$SNIC_TMP are used instead.
11 | 
12 | if [ -z "$SLURM_SUBMIT_DIR" ]; then
13 |     ## Not using SLURM
14 |     if [ $# -lt 5 ]; then
15 |         echo >&2 "Usage: $0 <gatk> <tmp> <ref.fasta> <out.vcf> <in.vcf> <in.vcf> [<in.vcf> ...]"
16 |         exit 1
17 |     fi
18 |     GATK=$1
19 |     tmp=$2
20 |     shift 2
21 | else
22 |     if [ $# -lt 3 ]; then
23 |         echo "Usage: $0 <ref.fasta> <out.vcf> <in.vcf> <in.vcf> [<in.vcf> ...]"
24 |         exit 1
25 |     fi
26 |     GATK="/sw/apps/bioinfo/GATK/3.2.2/GenomeAnalysisTK.jar"
27 |     tmp=$SNIC_TMP
28 | fi
29 | 
30 | if [ ! -f "$GATK" ]; then
31 |     echo >&2 "ERROR: could not find GATK"
32 |     exit 1
33 | fi
34 | 
35 | # quoted on purpose: unquoted, an empty value turns the test into '[ ! -d ]',
36 | # which is false, and the check would be passed silently
37 | if [ ! -d "$tmp" ]; then
38 |     echo >&2 "ERROR: tmp is not a directory"
39 |     exit 1
40 | fi
41 | 
42 | if [ ! -f "$1" ]; then
43 |     echo >&2 "ERROR: could not find reference: '$1'"
44 |     exit 1
45 | fi
46 | ref=$1
47 | 
48 | if [ ! -d "$(dirname "$2")" ]; then
49 |     echo >&2 "ERROR: no such directory: '$2'"
50 |     exit 1
51 | fi
52 | out=$2
53 | 
54 | shift 2
55 | 
56 | # one '-V <file>' pair per remaining argument, each as two separate words
57 | variants=()
58 | for f in "$@"; do
59 |     variants+=(-V "$f")
60 | done
61 | 
62 | # the tmp dir is given to java, as runGATK_VariantFiltration.sh does
63 | java -Xmx4g -Djava.io.tmpdir="$tmp" -jar "$GATK" \
64 |     -T CombineVariants \
65 |     -R "$ref" \
66 |     --filteredAreUncalled \
67 |     "${variants[@]}" \
68 |     -o "$out"


--------------------------------------------------------------------------------
/src/R/enaCsvEdit.R:
--------------------------------------------------------------------------------
 1 | # Read a csv from the ENA submission and correct it
 2 | # a ; csv
 3 | # NOTE: the corrected table is written back to the same path as a comma
 4 | # separated file (write.csv), so a second run would not parse it as intended
 5 | ena <- "~/Git/UPSCb/projects/spruce-needles-drought-stress/doc/ENA/ENA_Submission_PA_Drougth_JH_1.csv"
 6 | dat <- read.csv2(ena,as.is=TRUE)
 7 | 
 8 | # order
 9 | dat <- dat[order(dat$SampleName),]
10 | 
11 | # look
12 | str(dat)
13 | length(unique(dat$SampleName))
14 | length(unique(dat$SequencingDate))
15 | 
16 | # date
17 | #dat$SequencingDate <- "2015-02-16T10:00:00"
18 | table(dat$SequencingDate)
19 | dat$SequencingDate <- ifelse(dat$SequencingDate=="2015_10_20","2015-10-20T10:00:00","2015-11-18T10:00:00")
20 | 
21 | # the replicate labels appended to both the sample name and description,
22 | # built once; they rely on the rows being ordered by SampleName (see above)
23 | replicates <- paste("biological replicate",rep(rep(1:3,each=4),6),"technical replicate",rep(rep(1:2,each=2),24))
24 | 
25 | # Sample Name
26 | table(dat$SampleDescription)
27 | table(dat$SampleName)
28 | # drop the trailing _1 / _2; [12] and not [1,2], which also matches a comma
29 | dat$SampleName <- sub("_[12]$","",dat$SampleName)
30 | length(unique(dat$SampleName))
31 | dat$SampleName <- paste(dat$SampleName,replicates)
32 | 
33 | # Description
34 | length(unique(dat$SampleDescription))
35 | length(unique(paste(dat$SampleDescription,replicates)))
36 | dat$SampleDescription <- paste(dat$SampleDescription,replicates)
37 | 
38 | # save
39 | write.csv(dat,file=ena,row.names=FALSE,quote=FALSE)


--------------------------------------------------------------------------------
/pipeline/runPlaac.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -t 48:00:00
 3 | #SBATCH -n 1
 4 | #SBATCH -A facility
 5 | #SBATCH -J plaac
 6 | #SBATCH --mail-type=END,FAIL
 7 | 
 8 | # Run plaac.sh from a singularity image on a protein fasta file and write
 9 | # what it prints to <output directory>/<fasta file name>.plaac.txt
10 | 
11 | # sanity
12 | set -eu
13 | 
14 | # functions (usage and abort are not defined here; presumably from this file)
15 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
16 | 
17 | # usage
18 | USAGETXT=\
19 | "
20 |   Synopsis [options] $0 <plaac singularity image> <input protein fasta file> <output directory>
21 |   
22 |   Options:
23 |     -p plot (returns per residue values)
24 | 
25 |   Note: the default returns per sequence value. If you use -p, provide a small fasta file, NOT a while proteome!
26 |         The result is written to <output directory>/<input file name>.plaac.txt
27 | "
28 | 
29 | # vars
30 | # value of the plaac -p option; empty means: -p is not passed
31 | plot=""
32 | 
33 | # options
34 | # the getopts variable must not be the one holding the result: at the end of
35 | # the options getopts sets its variable to '?', which was then passed to plaac
36 | while getopts p opt
37 | do
38 |     case "$opt" in
39 |     p) plot="all";;
40 |     \?) usage;;
41 |   esac
42 | done
43 | shift $((OPTIND - 1))
44 | 
45 | # checks
46 | [[ $# -eq 3 ]] || abort "This script expects three arguments"
47 | [[ -f $1 ]] || abort "The first argument needs to be an existing file path to a singularity container"
48 | [[ -n ${SINGULARITY_BINDPATH:-} ]] || abort "This function relies on singularity, set the SINGULARITY_BINDPATH environment variable"
49 | [[ -f $2 ]] || abort "The second argument needs to be an existing file path to the input protein fasta file"
50 | [[ -d $3 ]] || abort "The third argument needs to be an existing directory"
51 | 
52 | # run
53 | # $3 is a directory (checked above) and cannot be the target of a redirection
54 | # NOTE(review): the output file name is a choice made here - adapt if needed
55 | singularity exec "$1" \
56 | plaac.sh -i "$2" ${plot:+-p "$plot"} > "$3/$(basename "$2").plaac.txt"


--------------------------------------------------------------------------------
/pipeline/runSnpEff.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | #SBATCH -p main -n1
 3 | #SBATCH -t 6:00:00
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH -J snpEff-build
 6 | #SBATCH --mem=16G
 7 | 
 8 | # Annotate a vcf file with 'snpEff ann'; the annotated vcf (bgzip compressed
 9 | # and tabix indexed) and the html summary are written next to the input vcf.
10 | 
11 | # stop on error
12 | set -ex
13 | 
14 | # usage
15 | export USAGETXT=\
16 | "
17 | Usage: $0 <CONFIG-FILE> <GENOME-VERSION> <VCF>
18 |   CONFIG-FILE is the snpEff config file - the data.dir needs to be adapted and
19 |     an entry with the genome version needs to be present: e.g.
20 |     'sdl16.genome : Saccharomycodes_ludwigii V16'
21 |   GENOME-VERSION is the version referenced in the snpEff config file, e.g. sdl16
22 |   VCF is the vcf file to process
23 | 
24 |   The output will be written in the vcf input file directory. Best is to link the
25 |   input vcf file there
26 | "
27 | 
28 | # load functions (abort is not defined here; presumably it comes from this file)
29 | source "${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh"
30 | 
31 | # checks
32 | # $# is the number of arguments; the former '$@' expanded to the arguments
33 | # themselves, so that this test never caught a wrong number of arguments
34 | if [ $# -ne 3 ]; then
35 |   abort "This script expects 3 arguments."
36 | fi
37 | 
38 | if [ ! -f "$1" ]; then
39 |   abort "The config file does not exist"
40 | fi
41 | 
42 | if [ ! -f "$3" ]; then
43 |   abort "The vcf file does not exist"
44 | fi
45 | 
46 | # prep
47 | out=$(dirname "$3")
48 | # file name up to its first '.vcf': handles x.vcf as well as x.vcf.gz
49 | fnam=$(basename "$3")
50 | fnam=${fnam%%.vcf*}
51 | 
52 | # run
53 | java -Xms4g -Xmx16g -jar "$CLASSPATH/snpEff.jar" ann -c "$1" \
54 | -s "$out/${fnam}_snpEff_summary.html" "$2" "$3" > "$out/$fnam.ann.vcf"
55 | 
56 | # compress and index
57 | bgzip -f "$out/$fnam.ann.vcf"
58 | tabix -p vcf "$out/$fnam.ann.vcf.gz"


--------------------------------------------------------------------------------
/pipeline/runTrimmomaticSeStats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Single-end flavour of runTrimmomaticStats.sh: turns the 'Input' line of every
 4 | # *.err file of a directory into one row of a wiki table, written to
 5 | # <trimmomatic dir>/trimmomaticStats.txt
 6 | 
 7 | usage(){
 8 |     echo >&2 \
 9 | "
10 |     runTrimmomaticSeStats.sh <trimmomatic dir>
11 | 
12 |     Arguments:
13 |         trimmomatic dir - the directory containing the Trimmomatic error logs
14 | 
15 |     Note:
16 |         The UPSCb Environment Variable needs to be set to your
17 |         Git UPSCb checkout dir.
18 | 
19 |     Details:
20 |         It reads the Trimmomatic error log files and create a text file containing a 
21 |         table to be readily added to the wiki.
22 | "
23 | exit 1
24 | }
25 | 
26 | if [ $# -ne 1 ]; then
27 |     echo "This function takes one argument"
28 |     usage
29 | fi
30 | 
31 | if [ ! -d "$1" ]; then
32 |     echo "The first argument needs to be the directory containing the Trimmomatic reports"
33 |     usage
34 | fi
35 | 
36 | if [ -z "$UPSCb" ]; then
37 |     echo "The UPSCb environment variable needs to be set."
38 |     usage
39 | fi
40 | 
41 | if [ ! -f "$UPSCb/pipeline/runTrimmomaticStats.sh" ]; then
42 |     echo "Either your UPSC env. var. is not set correctly or your checkout is too old."
43 |     usage
44 | fi
45 | 
46 | # the table header; the separator row has as many columns (5) as the header
47 | echo > "$1/trimmomaticStats.txt" \
48 | "|Sample|Surviving|S%|Dropped|D%|
49 | |----|----|----|----|----|"
50 | # one row per log: brackets are removed, the sample name is the log file name
51 | # without its path and its '_trimmomatic.err:Input' ending
52 | grep "Input" "$1"/*.err | awk -F" " '{gsub(/\(/,"");gsub(/\)/,"");smpl=$1;sub(/^.*\//, "", smpl);sub("_trimmomatic.err:Input","",smpl);print "|"smpl"|"$5"|"$6"|"$8"|"$9"|"}' >> "$1/trimmomaticStats.txt"


--------------------------------------------------------------------------------
/pipeline/runBedToolsCoverage.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH --mail-type=ALL
 6 | ## -A and --mail-user set in the submit job
 7 | 
 8 | ## Run 'bedtools coverage' of the 'b' file over the 'a' file; the result goes
 9 | ## to <out dir>/<a name>-<b name>.<a extension>
10 | 
11 | ## stop on error
12 | set -ex
13 | 
14 | ## load the modules
15 | module load bioinfo-tools
16 | module load BEDTools
17 | 
18 | ## we get two files and one dir as input
19 | usage(){
20 |   echo >&2 \
21 |   " Usage: $0 <a file> <b file> <out dir> [bed coverage option] 
22 |     Note: 'a' is the 'subject' file, the file that contains the intervals
23 |           of interest (e.g. the genes)
24 |           'b' is the 'query' file, the file that contains the intervals to
25 |           be counted/summarized (e.g. the reads)
26 |   "
27 |   exit 1
28 | }
29 | 
30 | if [ "$#" -lt 3 ]; then
31 |   echo "This function requires 3 arguments"
32 |   usage;
33 | fi
34 | 
35 | if [ ! -f "$1" ]; then
36 |   echo "The first argument needs to be an existing file"    
37 |   usage;
38 | fi
39 | a=$1
40 | shift
41 | 
42 | if [ ! -f "$1" ]; then
43 |   echo "The second argument needs to be an existing file"
44 |   usage;
45 | fi
46 | b=$1
47 | shift
48 | 
49 | if [ ! -d "$1" ]; then
50 |   echo "The third argument needs to be an existing directory"
51 |   usage;
52 | fi
53 | dir=$1
54 | shift
55 | 
56 | # combine the filename for the output
57 | outfile="$dir/$(basename "${a%.*}")-$(basename "${b%.*}").${a##*.}"
58 | 
59 | ## get the coverage; what is left of the arguments are bedtools options
60 | bedtools coverage "$@" -a "$a" -b "$b" > "$outfile"


--------------------------------------------------------------------------------
/src/R/updateAspenVcfv1.0.R:
--------------------------------------------------------------------------------
 1 | # Rename the sequences of a vcf file: the IDs of the ##contig header lines and
 2 | # the first column of the records are replaced using a two column conversion
 3 | # table (column 1: current ID, column 2: new ID).
 4 | library(stringr)
 5 | args <- commandArgs(trailingOnly = TRUE)
 6 | 
 7 | if(length(args) != 3){
 8 |   cat("Usage: Rscript updateVcf.R <conversionTable> <infile> <outfile>\n")
 9 |   stop("Usage error")
10 | }
11 | 
12 | conv <- args[1]
13 | infile <- args[2]
14 | outfile <- args[3]
15 | cat("Reading vcf....\n")
16 | f <- readLines(infile)
17 | cat("Reading conversion table....\n")
18 | conv <- read.table(conv, header = FALSE, stringsAsFactors = FALSE)
19 | 
20 | # Look the IDs up in the conversion table. An ID absent from the table is kept
21 | # as it is (with a warning) instead of being written out as "NA".
22 | convert <- function(ids){
23 |   newIds <- conv[,2][match(ids,conv[,1])]
24 |   unknown <- is.na(newIds)
25 |   if(any(unknown)){
26 |     warning(sum(unknown)," ID(s) absent from the conversion table were left unchanged")
27 |     newIds[unknown] <- ids[unknown]
28 |   }
29 |   return(newIds)
30 | }
31 | 
32 | cat("Changing vcf header....\n")
33 | 
34 | # Replace the ID of the ##contig lines. The ID stops at the first ',' or '>',
35 | # so the other attributes of the line (length, assembly, ...) are kept.
36 | reheader <- function(f){
37 |   prefix <- "##contig=<ID="
38 |   contigIndex <- grepl("^##contig=<ID=[^,>]+",f)
39 |   if(any(contigIndex)){
40 |     old <- str_match(f[contigIndex],"^##contig=<ID=([^,>]+)")[,2]
41 |     rest <- substring(f[contigIndex],nchar(prefix)+nchar(old)+1)
42 |     f[contigIndex] <- paste0(prefix,convert(old),rest)
43 |   }
44 |   return(f)
45 | }
46 | 
47 | f <- reheader(f)
48 | 
49 | cat("Changing rest of vcf....\n")
50 | 
51 | # Replace the first (tab delimited) field of the records, i.e. of the non
52 | # empty lines that do not start with '#'
53 | retail <- function(f){
54 |   tailindex <- !grepl("^#",f) & nzchar(f)
55 |   if(any(tailindex)){
56 |     old <- sub("\t.*$","",f[tailindex])
57 |     rest <- substring(f[tailindex],nchar(old)+1)
58 |     f[tailindex] <- paste0(convert(old),rest)
59 |   }
60 |   return(f)
61 | }
62 | 
63 | f <- retail(f)
64 | 
65 | cat("writing to disk",outfile,"....\n")
66 | 
67 | write(x = f,file = outfile)


--------------------------------------------------------------------------------
/pipeline/runGeneNetworkRRun.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -c 1
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | # Run the geneNetworkR-run.R script shipped with the geneNetworkR R package
 7 | # (through knitr::spin) for one tool on a prepared geneNetworkR directory
 8 | 
 9 | set -ex
10 | 
11 | # usage
12 | usage(){
13 | echo >&2 \
14 | "
15 | 	Usage: $0 [options] <output dir> <tool>
16 | 
17 | 	Note: The tool is one of the following:
18 | 	 Anova
19 | 	 CLR
20 | 	 GeneNet
21 | 	 GENIE3
22 | 	 Narromi
23 | 	 Pearson
24 | 	 Spearman
25 | 	 TIGLM
26 | 	 TIGRESS
27 | 
28 | 	 Options:
29 |                 -c the number of cores to parallelise over
30 | "
31 | 	exit 1
32 | }
33 | 
34 | # define global vars
35 | CPU=1
36 | 
37 | # manage options
38 | while getopts c: option
39 | do
40 |   case "$option" in
41 |     c) CPU=$OPTARG;;
42 |     \?) ## unknown flag
43 |         usage;;
44 |   esac
45 | done
46 | shift $((OPTIND - 1))
47 | 
48 | # check
49 | if [ $# != 2 ]; then
50 |   echo "This script expects 2 arguments"
51 |   usage
52 | fi
53 | 
54 | if [ ! -d "$1" ]; then
55 |   echo "The first argument must be a directory"
56 |   usage
57 | fi
58 | 
59 | if [ ! -f "$1/Data/sexp.rda" ]; then
60 |   echo "The directory has to be a valid geneNetworkR directory."
61 |   echo "Run geneNetworkRPreparation.R first."
62 |   usage
63 | fi
64 | 
65 | # We do not check $2 (the tool) as this is done in the Rscript
66 | 
67 | # get the exec
68 | module load R
69 | exeR=$(Rscript -e 'cat(system.file("R","geneNetworkR-run.R",package="geneNetworkR"))')
70 | 
71 | # system.file returns an empty string when the file cannot be found
72 | if [ ! -f "$exeR" ]; then
73 |   echo "Could not locate geneNetworkR-run.R, is the geneNetworkR R package installed?"
74 |   exit 1
75 | fi
76 | 
77 | # run with knitr
78 | Rscript -e "library(knitr); spin('$exeR')" -c "$CPU" -f "$1" -t "$2"


--------------------------------------------------------------------------------
/pipeline/runGATK_VariantFiltration.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH -t 5:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | # Filter a vcf file with GATK (v3) VariantFiltration; the result is written to
 9 | # <output directory>/<vcf name up to .vcf>_filtered.vcf
10 | # Without SLURM the GATK jar and a tmp dir are the first two arguments; under
11 | # SLURM a fixed GATK 3.2.2 jar and \$SNIC_TMP are used instead.
12 | 
13 | if [ -z "$SLURM_SUBMIT_DIR" ]; then
14 |     # gatk, tmp dir, vcf, reference and output directory: five are needed
15 |     if [ $# -lt 5 ]; then
16 |         echo "Usage $0 </path/to/gatk> <tmp dir> <in.vcf> <ref.fasta> <output directory> [vf_args ...]" 1>&2
17 |         exit 1
18 |     fi
19 | 
20 |     GATK=$1
21 |     tmp=$2
22 |     shift 2
23 | else
24 |     if [ $# -lt 3 ]; then
25 |         echo "Usage: $0 <in.vcf> <ref.fasta> <output directory> [vf_args ...]" 1>&2
26 |         exit 1
27 |     fi
28 |     GATK=/sw/apps/bioinfo/GATK/3.2.2/GenomeAnalysisTK.jar
29 |     tmp=$SNIC_TMP
30 | fi
31 | 
32 | # same check as in runGATK_CombineVariants.sh
33 | if [ ! -f "$GATK" ]; then
34 |     echo "ERROR: could not find GATK" 1>&2
35 |     exit 1
36 | fi
37 | 
38 | # the values are quoted on purpose in the tests: unquoted, an empty one turns
39 | # the test into '[ ! -d ]', which is false, and the check is passed silently
40 | if [ ! -d "$tmp" ]; then
41 |     echo "tmp is not a directory" 1>&2
42 |     exit 1
43 | fi
44 | 
45 | if [ ! -f "$1" ]; then
46 |     echo "Could not find VCF file '$1'" 1>&2
47 |     exit 1
48 | fi
49 | invcf=$1
50 | 
51 | if [ ! -f "$2" ]; then
52 |     echo "Could not find reference '$2'" 1>&2
53 |     exit 1
54 | fi
55 | ref=$2
56 | 
57 | if [ ! -d "$3" ]; then
58 |     echo "No such output directory" 1>&2
59 |     exit 1
60 | fi
61 | outdir=$3
62 | 
63 | # what is left are the VariantFiltration arguments
64 | shift 3
65 | 
66 | bname=$(basename "$invcf")
67 | sname="${bname/.vcf*/}"
68 | outfile="$outdir/${sname}_filtered.vcf"
69 | 
70 | # Run VariantFiltration
71 | java -Xmx2G -Djava.io.tmpdir="$tmp" -jar "$GATK" -T VariantFiltration \
72 |     -V "$invcf" \
73 |     -R "$ref" \
74 |     -o "$outfile" \
75 |     "$@"


--------------------------------------------------------------------------------
/pipeline/run_psf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 1
 4 | #SBATCH -t 7-00:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## Thin SLURM wrapper: checks the number of arguments and hands all seven
 8 | ## of them, unchanged, to psf
 9 | 
10 | ## stop on error and be verbose in the output
11 | set -ex
12 | 
13 | # load the modules
14 | module load bioinfo-tools psf
15 | 
16 | # usage function
17 | usage(){
18 | echo >&2 \
19 | "
20 | 	Usage: psf <list(.cfg)> <seq_prot(.fa)> <-psf_cfg:psf.cfg> <-o:pmap.cfg> <-O:prot_mapM.CFG> <-search_here:seq_nuc.fa> <-no_echo> 
21 | 
22 | 		name of the executable file ./psf + ...
23 | 
24 | 		list(.cfg)	             - path to the list file
25 | 		seq_prot.fa              - path to the multiFASTA-file with protein sequences, without gaps. Headers can include additional information in Softberry 						   AbInitio or FGENESH++ format. Here IPI or NR database could be given on input.
26 | 		-psf_cfg:psf.cfg	       - path to the psf_cfg file (psf configuration file)
27 | 		-o:pmap.cfg              - path to configuration file with parameters of the general alignment algorithm
28 | 		-O:prot_mapM.CFG         - path to configuration file with the options of general protein-on-DNA mapping algorithm
29 | 		-search_here:seq_nuc.fa  - path nucleotide FASTA-file with a single genomic sequence (without gaps).
30 | 		-no_echo
31 | "
32 | 	exit 1
33 | }
34 | 
35 | # check the arguments
36 | 
37 | if [ $# != 7 ]; then
38 |     echo "This function requires 7 arguments."
39 |     usage
40 | fi
41 | 
42 | 
43 | # run the command; "$@" keeps every argument as one word, even with spaces
44 | psf "$@"


--------------------------------------------------------------------------------
/pipeline/runTrimmomaticStats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Turns the 'Input' line of every *.err file of a directory into one row of a
 4 | # wiki table, written to <trimmomatic dir>/trimmomaticStats.txt
 5 | 
 6 | usage(){
 7 |     echo >&2 \
 8 | "
 9 |     runTrimmomaticStats.sh <trimmomatic dir>
10 | 
11 |     Arguments:
12 |         trimmomatic dir - the directory containing the Trimmomatic error logs
13 | 
14 |     Note:
15 |         The UPSCb Environment Variable needs to be set to your
16 |         Git UPSCb checkout dir.
17 | 
18 |     Details:
19 |         It reads the Trimmomatic error log files and create a text file containing a 
20 |         table to be readily added to the wiki.
21 | "
22 | exit 1
23 | }
24 | 
25 | if [ $# -ne 1 ]; then
26 |     echo "This function takes one argument"
27 |     usage
28 | fi
29 | 
30 | # the expansions are quoted: the directory name may contain spaces
31 | if [ ! -d "$1" ]; then
32 |     echo "The first argument needs to be the directory containing the Trimmomatic reports"
33 |     usage
34 | fi
35 | 
36 | if [ -z "$UPSCb" ]; then
37 |     echo "The UPSCb environment variable needs to be set."
38 |     usage
39 | fi
40 | 
41 | if [ ! -f "$UPSCb/pipeline/runTrimmomaticStats.sh" ]; then
42 |     echo "Either your UPSC env. var. is not set correctly or your checkout is too old."
43 |     usage
44 | fi
45 | 
46 | # the table header
47 | echo > "$1/trimmomaticStats.txt" \
48 | "|Sample|Both|B%|Forward|F%|Reverse|R%|Dropped|D%|
49 | |----|----|----|----|----|----|----|----|----|"
50 | # one row per log: brackets are removed, the sample name is the log file name
51 | # without its path and its '_trimmomatic.err:Input' ending
52 | grep "Input" "$1"/*.err | awk -F" " '{gsub(/\(/,"");gsub(/\)/,"");smpl=$1;sub(/^.*\//, "", smpl);sub("_trimmomatic.err:Input","",smpl);print "|"smpl"|"$7"|"$8"|"$12"|"$13"|"$17"|"$18"|"$20"|"$21"|"}' >> "$1/trimmomaticStats.txt"


--------------------------------------------------------------------------------
/src/bash/seidr-aggregate-kebnekaise.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -A SNIC2021-5-200
 3 | #SBATCH -t 12:00:00
 4 | #SBATCH -n 28
 5 | #SBATCH --mail-type=ALL
 6 | #SBATCH --mail-user=nicolas.delhomme@slu.se
 7 | #SBATCH -o aggregate.out
 8 | #SBATCH -e aggregate.err
 9 | 
# Stop on the first error and trace every command.
set -ex

# Default options handed over to 'seidr aggregate'.
DIRECTIONALITY="-k"
FORCE=
METHOD="-m irp"

# usage
USAGETXT=\
"
  Usage: $0 [options] <out dir> <sf file> [sf file] ... [sf file]

  Options:
          -f  force overwrite output
          -k  keep the directionality, default: true
          -m  method, default: irp
"

# Number of threads (matches the '#SBATCH -n 28' request above).
CPU=28

# source (expected to provide the abort helper used below)
source functions.sh

# modules
EXEC=/pfs/proj/nobackup/fs/projnb10/snic2019-35-44/software/seidr/build
source "$EXEC/sourcefile"

# Get the options
# NOTE(review): -k is on by default and passing -k *clears* it, which does not
# read like the usage text above - confirm the intended semantics.
while getopts fkm: option
do
    case "$option" in
        f) FORCE="-f";;
        k) DIRECTIONALITY=;;
        # This arm used to be a second 'k)' and was therefore unreachable,
        # so -m <method> was silently ignored.
        m) METHOD="-m $OPTARG";;
        \?) ## unknown flag
            abort;;
    esac
done
shift $((OPTIND - 1))

# Kept as a plain string on purpose: it is expanded unquoted further down so
# that every flag becomes its own word.
OPTIONS="$FORCE $DIRECTIONALITY $METHOD"

if [ $# -lt 2 ]; then
  abort "This script expects at least 2 arguments"
fi

if [ ! -d "$1" ]; then
  abort "The first argument needs to be an existing directory"
fi

# run
cd "$1"
shift
export OMP_NUM_THREADS=$CPU
#rm -f aggregated.sf

# "$@" keeps every remaining sf file path intact, even with whitespace.
seidr aggregate $OPTIONS -O "$CPU" "$@"
65 | 


--------------------------------------------------------------------------------
/pipeline/runGatkFastaAlternateReferenceMaker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -t UNLIMITED
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH --mem 16G
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | set -e
 9 | 
10 | # source helpers
11 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
12 | 
13 | # default
14 | JavaThreadMem=16G
15 | 
16 | # usage
17 | export USAGETXT=\
18 | "
19 | Usage: $0 <VCF file> <interval file> <fasta ref> <output directory> [GATK additional options]
20 | 
21 | Note: several interval files can be provided, comma separated
22 | "
23 | 
24 | if [ $# -lt 4 ]; then
25 |   abort "This script expects 4 arguments"
26 | fi
27 | 
28 | if [ ! -f $1 ]; then
29 |     abort "Could not find VCF file '$1'"
30 | fi
31 | vcf=$1
32 | 
33 | # Check the second argument
34 | echo $2 | xargs -d, -I {} bash -c 'if [ ! -f $0 ]; then abort "INTERVAL file $0 does not exist"; fi' {}
35 | ivl=$2
36 | 
37 | if [ ! -f $3 ]; then
38 |     abort "Could not find FASTA file '$3'"
39 | fi
40 | ref=$3
41 | 
42 | if [ ! -d $4 ]; then
43 |     abort "Could not find directory '$4'"
44 | fi
45 | out=$4
46 | 
47 | # drop the four args
48 | shift
49 | shift
50 | shift
51 | shift
52 | 
53 | # run once for every interval file
54 | for i in `echo $ivl | tr ',' ' '`; do
55 | 
56 |   nam=$out/$(basename $i)"-"$(basename "${vcf/.vcf/.fasta}")
57 | 
58 |   # Run
59 |   java -Xmx${JavaThreadMem} -jar $GATK_HOME/GenomeAnalysisTK.jar \
60 |   -T FastaAlternateReferenceMaker -R $ref -L $i -V $vcf  -o $nam $@
61 | done
62 | 


--------------------------------------------------------------------------------
/pipeline/runJBrowse2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p nolimit -n 1
 3 | 
 4 | set -eu
 5 | 
 6 | # source functions
 7 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
 8 | 
 9 | # usage
10 | USAGETXT=\
11 | "
12 | 	Usage: runJBrowse2.sh <dir> <port>
13 | "
14 | 
15 | # usage when we singularity it
16 | # "
17 | # 	Usage: runJBrowse2.sh <singularity container> <dir> <port>
18 | # "
19 | 
20 | # safety
21 | # [[ $# -ne 3 ]] && abort "This script expects 3 arguments"
22 | # [[ ! -f $1 ]] && abort "The first argument needs to be an existing singularity file"
23 | # [[ ! -d $2 ]] && abort "The second argument needs to be an existing directory"
24 | # [[ ! -f $2/config.json ]] && abort "The second argument needs to be an initialized jbrowse2 directory"
25 | # [[ $3 -le 20000 ]] && abort "The port should be above or equal 20000"
26 | # [[ $3 -gt 30000 ]] && abort "The port should be below 30000"
27 | 
28 | # safety
29 | [[ $# -ne 2 ]] && abort "This script expects 2 arguments"
30 | [[ ! -d $1 ]] && abort "The first argument needs to be an existing directory"
31 | [[ ! -f $1/config.json ]] && abort "The first argument needs to be an initialized jbrowse2 directory"
32 | [[ $2 -le 20000 ]] && abort "The port should be above or equal 20000"
33 | [[ $2 -gt 30000 ]] && abort "The port should be below 30000"
34 | 
35 | # run
36 | #docker run -d --rm -p $3:3000 -v $2:/var/www/html/jbrowse2 delhomme/upscb-jbrowse2
37 | docker run -d --rm -p $2:3000 -v $1:/var/www/html/jbrowse2 delhomme/upscb-jbrowse2
38 | 


--------------------------------------------------------------------------------
/pipeline/runBamtoFastQ.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH -t 0-05:00:00
 6 | #SBATCH --mail-type=ALL
 7 | 
 8 | set -e -x
 9 | 
10 | ##check parameters
11 | ##argument number
12 | if [ $# != 2 ]; then
13 |         echo -e "\e[1;31mPlease supply three arguments for this script!\e[0m"
14 |         echo -e "\e[1;31mArgument 1 should be the input SAM or BAM file.\e[0m"
15 |         echo -e "\e[1;31mArguments should be the output directory.\e[0m"
16 |         exit 1
17 | fi
18 | 
19 | ## is the UPSCb env var set
20 | if [ -z $UPSCb ]; then
21 |     echo "You need to set the UPSCb environment variable"
22 | fi
23 | 
24 | ## load picard tools
25 | module load Picard-tools
26 | 
27 | ##are the files correct?
28 | if [ ! -f $1 ]; then
29 |         echo -e "\e[1;31mThe input file does not exist!\e[0m"
30 |         exit 1
31 | fi
32 | if [ ${1: -3} != "bam" ] ; then
33 |         echo -e "\e[1;31mThe filetype of your input file is not supported.\e[0m"
34 |         echo -e "\e[1;31mOnly .bam is supported (case sensitive filename extension). Your file is $1\e[0m"
35 |         exit 1
36 | fi
37 | if [ ! -d $2 ]; then
38 |         echo -e "\e[1;31mThe output directory does not exist!\e[0m"
39 | 	exit 1
40 | fi
41 | 
42 | ## clean files
43 | ## run executable with suggested memory of 2GB
44 | ## the field is 1 (to report all paired reads) 
45 | fnam=`basename ${1//.bam/}`
46 | java -Xmx2g -jar $PICARD_HOME/SamToFastq.jar INPUT=$1 FASTQ=$2/${fnam}_1.fq SECOND_END_FASTQ=$2/${fnam}_2.fq UNPAIRED_FASTQ=$2/${fnam}_singleton.fq
47 | 
48 | 


--------------------------------------------------------------------------------
/src/python/fastQCmultiviewer.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python
 2 | 
 3 | def keyCheck(key, dic):
 4 | 	try:
 5 | 		dic[key]
 6 | 		return True
 7 | 	except KeyError:
 8 | 		return False
 9 | 
def createIframe(html):
	"""Wrap the location `html` in a table cell holding a fixed-size iframe.

	`html` is inserted verbatim as the iframe `src`; the resulting
	`<td>...</td>` markup is returned as a string.
	"""
	pieces = (
		"<td><iframe src='",
		html,
		"' width='1500px' height='10000px' scrolling='no'></iframe></td>",
	)
	return "".join(pieces)
13 | 
def siteHtml(images):
	"""Build the HTML page that shows all cells side by side.

	`images` is an iterable of strings (the script passes the `<td>` cells
	produced by createIframe); they are concatenated without a separator
	and placed inside a single table row. The page is returned as a string.
	"""
	html = """
			<!DOCTYPE html>
				<head></head>

				<body>
				<table>
				<tr>
				"""+"".join(images)+"""
				</tr>
				</table>

				</body>

				</html>
	"""
	return html
31 | 
import sys
import os

# Command line handling: arguments come as "-<name> <value>" pairs. The names
# read below are out_file (required), in_dir, range and samples.
inputs = sys.argv

inputs.pop(0)
inDic = {}
for inp in range(len(inputs)-1):
	# startswith() also copes with an empty argument, where indexing [0]
	# would raise IndexError
	if inputs[inp].startswith("-"):
		inDic[inputs[inp][1:]] = inputs[inp+1]

# -range "<first>,<second>": only the second value is used, as the row limit.
limit = [0,10]
if keyCheck("range",inDic):
	limit = inDic["range"].strip().split(",")


table = []
if keyCheck("samples", inDic):
	# strip was referenced without being called, which raised AttributeError
	# as soon as -samples was given.
	# NOTE(review): the parsed sample list is not used any further here.
	samples = inDic["samples"].strip().split(",")

# The output file is created even without -in_dir (as before); the with-block
# makes sure it is flushed and closed.
with open(inDic['out_file'], "w") as outFile:
	if keyCheck("in_dir", inDic):
		rows = 0
		for root, subFolders, files in os.walk(inDic["in_dir"]):
			#print root
			# keep the last two path components; a single-component root
			# (e.g. -in_dir reports) no longer raises IndexError
			relative_folder = "/".join(root.split("/")[-2:])
			for f in files:
				if f[-4:] == "html":
					table.append(createIframe(relative_folder+"/"+f))
					# leaves the file loop of the current directory only
					if rows > int(limit[1]):
						break
					rows += 1
		table.sort()
		outFile.write(siteHtml(table))


--------------------------------------------------------------------------------
/pipeline/runSTARStats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | usage(){
 4 |     echo >&2 \
 5 | "
 6 |     runSTARStats.sh <STAR dir>
 7 | 
 8 |     Arguments:
 9 |         STAR dir - the directory containing the STAR logs directory
10 | 
11 |     Note:
12 |         The UPSCb Environment Variable needs to be set to your
13 |         Git UPSCb checkout dir.
14 | 
15 |     Details:
16 |         It reads the STAR 'Log.Final.out' log files and create a text file containing a 
17 |         table to be readily added to the wiki.
18 | "
19 | exit 1
20 | }
21 | 
# ---------------------------------------------------------------------------
# Validate the single <STAR dir> argument and the UPSCb checkout, then write
# <STAR dir>/STARStats.txt: a wiki table with one row per sample, assembled
# from the percentage lines of every */*Log.final.out file.
# All expansions are quoted so that directories containing whitespace work.
# ---------------------------------------------------------------------------
if [ $# -ne 1 ]; then
    echo "This function takes one argument"
    usage
fi

if [ ! -d "$1" ]; then
    echo "The first argument needs to be the directory containing the STAR reports"
    usage
fi

if [ -z "$UPSCb" ]; then
    echo "The UPSCb environment variable needs to be set."
    usage
fi

if [ ! -f "$UPSCb/pipeline/runSTARStats.sh" ]; then
    echo "Either your UPSC env. var. is not set correctly or your checkout is too old."
    usage
fi

# Header and separator row. The separator used to have 9 cells for a header
# of 10, which is not a well-formed table.
echo > "$1/STARStats.txt" \
"|Sample|uniquely mapped|mismatch rate|deletion rate|insert rate|multiple mapping|too many mapping|unmapped MM|unmapped short|unmapped other|
|----------|---|---|---|---|---|---|---|---|---|"
cd "$1" || exit 1

# -H forces the file name prefix even when a single log file matches; the awk
# program derives the sample name from that prefix.
# awk splits on "|": a line containing "Uniquely" starts a new row (sample
# name + first percentage), every other line appends its percentage. pct
# already ends with "%", so pct"%|" forms "%%", which printf prints as "%".
grep -H "%" */*Log.final.out | awk 'BEGIN{FS="|"}{pct=$2;gsub(" |\t","",pct);if (index($1,"Uniquely")>0){smpl=$1;sub("Log.final.out.*","",smpl);sub(".*/","",smpl);if (NR>1) {printf "\n"} printf"|"smpl"|"pct"%|"}else{printf pct"%|"}}' >> STARStats.txt
47 | 
48 | 


--------------------------------------------------------------------------------
/pipeline/runDemultiplex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p rbx -n 1
 3 | #SBATCH --mail-type=END,FAIL
 4 | #SBATCH -t 2:00:00
 5 | 
 6 | set -eu
 7 | 
 8 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
 9 | 
10 | USAGETXT=\
11 | "
12 | SYNOPSIS $0 [options] <out> <barcode> <fwd> [rev]
13 | 
14 | OPTIONS
15 |   -r barcode is in the read
16 |   -e end position of the barcode in the read
17 |   -s start position of the barcode in the read
18 | NOTE
19 |   -r requires -e or -s
20 | "
21 | 
22 | isExec demultiplex
23 | 
24 | START=
25 | READ=
26 | END=
27 | 
28 | while getopts e:rs: option
29 | do
30 |     case "$option" in
31 |     e) END="-e $OPTARG";;
32 |     r) READ="-r";;
33 |     s) START="-s $OPTARG";;
34 |     \?) usage;;
35 | 
36 |   esac
37 | done
38 | shift `expr $OPTIND - 1`
39 | 
40 | [[ ! -z $READ ]] && [[ -z $END ]] && [[ -z $START ]]  && abort "-r requires -e or -s"
41 | 
42 | [[ -z $READ ]] && [[ ! -z $END ]] && abort "-e requires -r"
43 | 
44 | [[ -z $READ ]] && [[ ! -z $START ]] && abort "-s requires -r"
45 | 
46 | [[ $# -lt 3 ]] || [[ $# -gt 4 ]] && abort "The script expects 3 or 4 arguments"
47 | 
48 | [[ ! -d $1 ]] && abort "The output directory does not exist"
49 | 
50 | [[ ! -f $2 ]] && abort "The barcode file does not exist"
51 | 
52 | [[ ! -f $3 ]] && abort "The forward file does not exist"
53 | 
54 | [[ $# -eq 4 ]] && [[ ! -f $4 ]] && abort "The reverse file does not exist"
55 | 
56 | [[ $# -eq 3 ]] && demultiplex demux $READ $END $START -p $1 $2 $3
57 | 
58 | [[ $# -eq 4 ]] && demultiplex demux $READ $END $START -p $1 $2 $3 $4
59 | 


--------------------------------------------------------------------------------
/pipeline/runUpdateNCBI.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -t 0-04:00:00
 4 | #SBATCH --mail-type=ALL
 5 | 
 6 | ## stop on error
 7 | set -e
 8 | 
 9 | module load bioinfo-tools
10 | module load blast/2.2.27+
11 | module load samtools/0.1.19
12 | 
13 | if [ $# != 2 ]; then
14 |     echo "The argument should be the database name to download and the output directory"
15 |     exit 1
16 | fi
17 | 
18 | db=
19 | case "$1" in
20 |     nt) db="nt";;
21 |     nr) db="nr";;
22 | esac
23 | 
24 | if [ -z $db ]; then
25 |     echo "The argument should be one of nt or nr"
26 |     exit 1
27 | fi  
28 | 
29 | if [ ! -d $2 ]; then
30 |     echo "The  directory name you provided does not exist"
31 |     exit 1
32 | fi
33 | 
34 | ##
35 | echo "Setting up"
36 | 
37 | ## copy the exec to proper dir
38 | cp $SLURM_SUBMIT_DIR/../../../src/perl/update_blastdb.pl $2
39 | 
40 | ## cd to that dir
41 | cd $2
42 | 
43 | ## download
44 | echo "Downloading"
45 | $2/update_blastdb.pl --passive --force $1
46 | 
47 | ## unpacking
48 | echo "Unpacking"
49 | find . -name "*.tar.gz" -exec tar -zxf "{}" \;
50 | ## TODO to go parallel
51 | ## find . -name "*.tar.gz" -print0 | xargs -P 16 -I {} -0  tar -zxf {}
52 | 
53 | ## cleaning
54 | echo "Cleaning"
55 | rm *.tar.gz
56 | 
57 | ## extracting the fasta seq
58 | echo "Extracting the fasta"
59 | blastdbcmd -db $1 -entry all -out $1.fa
60 | samtools faidx $1.fa
61 | awk 'BEGIN{FS=" "};{print $2,$1}' $1.fa.fai > $1.lengths.txt
62 | 
63 | ## add a flag
64 | echo "Flagging"
65 | touch ${1}-Updated-`date +%Y%m%d`.flag
66 | 
67 | echo "Done"
68 | 
69 | 


--------------------------------------------------------------------------------
/src/R/plotVCFQual.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(trailingOnly=T)
 2 | 
 3 | if (length(args) < 3) {
 4 |     cat('Usage: Rscript plotVCFQuals.R <input.vcf> <path_to_folder> <title>\n')
 5 |     stop('Too few arguments.')
 6 | }
 7 | 
 8 | vcf_file = args[1]
 9 | out_path = args[2]
10 | plot_title = args[3]
11 | 
12 | library(stringr)
13 | library(ggplot2)
14 | library(LSD)
15 | library(VariantAnnotation)
16 | 
17 | # Only load quality and depth
18 | vcfparam <- ScanVcfParam(fixed="QUAL", info="DP", geno=NA)
19 | vcf <- readVcf(vcf_file, 'Potri', vcfparam)
20 | 
21 | dfr <- data.frame(Quality=fixed(vcf)$QUAL, Depth=info(vcf)$DP)
22 | 
23 | summary(dfr$Quality)
24 | summary(dfr$Depth)
25 | 
26 | # Plot a histogram of the quality scores
27 | theme_set(theme_bw(base_size=12))
28 | cat('Plotting quality histogram\n')
29 | pdf(file.path(out_path, paste(plot_title, '.raw_snp_quals.pdf', sep='')),
30 |     height=4, width=6)
31 | ggplot(dfr, aes(x = Quality)) + geom_histogram(binwidth=0.01) +
32 |     scale_x_log10() + ggtitle(paste(plot_title, ': SNP quality score distribution',
33 |         sep=''))
34 | dev.off()
35 | 
36 | # Add pseudocounts
37 | dfr$Depth = dfr$Depth + 1
38 | dfr$Quality = dfr$Quality + 1
39 | 
40 | # Plot a comparison of quality scores vs sequencing depth
41 | cat('Plotting quality vs depth\n')
42 | pdf(file.path(out_path, paste(plot_title, '.raw_qual_vs_depth.pdf', sep='')),
43 |     height=4, width=6)
44 | ggplot(dfr, aes(x = Depth, y = Quality)) + stat_binhex() +
45 |     scale_x_log10() + scale_y_log10() +
46 |     xlab('Depth + 1') + ylab('Quality + 1')
47 | dev.off()
48 | 


--------------------------------------------------------------------------------
/pipeline/runSeidrRoc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A facility
 3 | #SBATCH -t 4:00:00
 4 | #SBATCH -p main -n 1
 5 | #SBATCH --mem=16GB
 6 | 
 7 | set -ex
 8 | 
 9 | # Options
10 | ALL="-a"
11 | INDEX=
12 | POINTS="-p 1000"
13 | 
14 | # usage
15 | USAGETXT=\
16 | "
17 |   Usage: $0 [options] <seidr file> <positive-gold-standard> <negative-gold-standard> <output filename>
18 |   
19 |   Options:
20 |         -a (default, set the flag to unset) process all algorithms
21 |         -i index of the algorith to proceed
22 |         -p the number of points to keep (default: $POINTS)
23 | "
24 | 
25 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
26 | 
27 | isExec seidr
28 | 
29 | # Get the options
30 | while getopts ai:p: option
31 | do
32 |     case "$option" in
33 |         a) ALL="";;
34 |         i) INDEX="-i $OPTARG";;
35 |         p) POINTS="-p $OPTARG";;
36 |         \?) ## unknown flag
37 | 		    abort;;
38 |     esac
39 | done
40 | shift `expr $OPTIND - 1`
41 | 
42 | OPTIONS="$ALL $INDEX $POINTS"
43 | 
44 | if [ $# -ne 4 ]; then
45 |   abort "This script expects 4 arguments"
46 | fi
47 | 
48 | if [ ! -f $1 ]; then
49 |   abort "The first argument needs to be an existing file"
50 | fi
51 | 
52 | if [ ! -f $2 ]; then
53 |   abort "The second argument needs to be an existing file"
54 | fi
55 | 
56 | if [ ! -f $3 ]; then
57 |   abort "The third argument needs to be an existing file"
58 | fi
59 | 
60 | if [ ! -d $(dirname $4) ]; then
61 |   abort "The fourth argument directory needs to exist"
62 | fi
63 | 
64 | # run
65 | seidr roc -n $1 -g $2 -x $3 $OPTIONS > $4
66 | 


--------------------------------------------------------------------------------
/templates/R/header.html:
--------------------------------------------------------------------------------
 1 | <a href="https://github.com/UPSCb/UPSCb-common" class="github-corner" aria-label="View source on GitHub">
 2 |   <svg width="80" height="80" viewBox="0 0 250 250" style="fill:#64CEAA; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true">
 3 |     <path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path>
 4 |     <path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path>
 5 |     <path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path>
 6 |   </svg>
 7 | </a>
<style>
/* Plays the octocat-wave keyframes on the .octo-arm path while .github-corner is hovered; on viewports up to 500px wide the hover animation is switched off and the arm is animated without hovering instead. */
.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}
</style>
11 | 


--------------------------------------------------------------------------------
/pipeline/runSamtoolsSort.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main
 3 | #SBATCH -n 16
 4 | #SBATCH -t 01:00:00
 5 | #SBATCH --mail-type=ALL
 6 | 
 7 | ## stop on error
 8 | set -ex
 9 | 
10 | ## modules
11 | module load bioinfo-tools
12 | module load samtools/0.1.19
13 | 
14 | # usage 
# Print the help text of the samtools sort wrapper on stderr (the script name
# is taken from $0) and terminate the script with exit status 1.
usage() {
	local help_text="
	Usage: $0 [option] <in.bam>
	Options:
                -p define the number of threads to use (16)
                -n sort by name instead of coordinates
                -i inplace sort - i.e. keep the file name unchanged
        Note: If sorting by coordinates, the extension is _byCoord.bam
              and for sorting by ID, the extenstion is _byReadID.bam
"
	printf '%s\n' "$help_text" >&2
	exit 1
}
28 | 
# define options
SORT=               # extra samtools flag; " -n " when sorting by read name
EXT="_byCoord"      # suffix appended to the output file name
INPLACE=0           # 1: move the sorted file over the input file
CPU=16              # number of threads handed to samtools sort -@
## get the options
while getopts p:ni option
do
        case "$option" in
            i) INPLACE=1;;
            n) SORT=" -n "
                EXT="_byReadID"
                ;;
            p) CPU=$OPTARG;;
            \?) ## unknown flag
                usage;;
        esac
done
shift $((OPTIND - 1))

## we get one file as input
if [ $# != 1 ]; then
    echo "This function takes one file as argument"
    usage
fi

## The message used to be printed without stopping, so samtools was started
## on a file that does not exist.
if [ ! -f "$1" ]; then
    echo "The first argument needs to be an existing bam file"
    usage
fi

## define the output file: <dir of input>/<input name without .bam><EXT>
## (samtools 0.1.19 takes an output prefix and appends .bam itself)
new="$(dirname "$1")/$(basename "${1//.bam/}")${EXT}"

## sort; $SORT stays unquoted so that an empty value vanishes
samtools sort -@ "$CPU" $SORT "$1" "$new"

## if inplace
if [ "$INPLACE" == 1 ]; then
    mv "$new.bam" "$1"
fi
69 | 
70 | 


--------------------------------------------------------------------------------
/pipeline/runShortstack.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main -n 16
 4 | #SBATCH --mem=128G
 5 | #SBATCH -t 4-00:00:00
 6 | 
 7 | # modules
 8 | #module load bioinfo-tools ShortStack
 9 | 
10 | # defaults
11 | CPU=16
12 | MEM=120G
13 | FORMAT="--readfile"
14 | # usage
# Print the help text of the ShortStack wrapper on stderr (the script name is
# taken from $0) and terminate the script with exit status 1.
usage() {
  local help_text="
    Usage: $0 [options] outdir genome file(s)
    
    Note: multiple files can be provided space separated
    
    Options:
      -b input is a bamfile
      -c the number of CPUs default to 16
      -m the memory for sorting default to 120G
  "
  printf '%s\n' "$help_text" >&2
  exit 1
}
29 | 
# process the options (CPU, MEM and FORMAT carry the defaults set above)
while getopts "bc:m:" opt; do
    case $opt in
      b) FORMAT="--bamfile";;
      c) CPU=$OPTARG;;
      m) MEM=$OPTARG;;
      \?) usage;;
    esac
done
shift $((OPTIND - 1))

# check arguments
if [ $# -lt 3 ]; then
  echo "This function expects at least 3 arguments"
  usage
fi

if [ ! -d "$1" ]; then
  echo "The first argument needs to be an existing directory"
  usage
fi
# results go into a sub-directory named after today's date (YYYYMMDD)
out="$1/$(date +%Y%m%d)"

if [ ! -f "$2" ]; then
  echo "The second argument needs to be an existing bowtie index"
  usage
fi
bwt=$2

shift 2

# every remaining argument has to be an existing read (or bam) file
for file in "$@"; do
  if [ ! -f "$file" ]; then
    echo "The read file $file does not exists"
    usage
  fi
done

# run; "$@" hands every input file over as one word, even with whitespace
ShortStack --bowtie_cores "$CPU" --sort_mem "$MEM" --dicermin 18 \
--mismatches 0 --nostitch --outdir "$out" --genomefile "$bwt" "$FORMAT" "$@"
71 | 


--------------------------------------------------------------------------------
/pipeline/runPicardMarkDuplicatesWithMateCigar.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -t 4:00:00
 4 | #SBATCH -p main
 5 | #SBATCH -n 1
 6 | 
 7 | set -e
 8 | 
 9 | module load java
10 | module load bioinfo-tools
11 | module load samtools
12 | #module load picard
13 | module load Picard-tools
14 | 
15 | THREADS=1
16 | 
17 | if [ -z $PICARD_HOME ]; then
18 |     echo >&2 "Could not find picard tools"
19 |     exit 1
20 | fi
21 | 
22 | # default
23 | JavaMem=6G
24 | MIN=-1
25 | 
26 | while getopts j:m: option
27 | do
28 |     case "$option" in
29 | 	    j) JavaMem=$OPTARG;;
30 |       m) MIN=$OPTARG;;
31 | 	    \?) ## unknown flag
32 | 		usage;;
33 |         esac
34 | done
35 | shift `expr $OPTIND - 1`
36 | 
37 | if [ $# -ne 2 ]; then
38 |     echo "Usage: $0 <BAM file> <output directory>" 1>&2
39 |     exit 1
40 | fi
41 | 
42 | if [ ! -f $1 ]; then
43 |     echo "Could not find BAM file '$1'" 1>&2
44 |     exit 1
45 | fi
46 | inbam=$1
47 | 
48 | if [ ! -d $2 ]; then
49 |     echo "Could not find directory '$2'" 1>&2
50 |     exit 1
51 | fi
52 | outdir=$2
53 | 
54 | sname=`basename "${inbam/_[st]*[st]*_STAR.bam/}"`
55 | name_out=`basename "${inbam/.bam/}"`
56 | 
57 | # Run MarkDuplicates
58 | java -Xmx${JavaMem} -XX:ParallelGCThreads=$THREADS -jar $PICARD_TOOLS_DIR/picard.jar MarkDuplicatesWithMateCigar \
59 |     ASSUME_SORTED=true \
60 |     INPUT=$inbam \
61 |     OUTPUT=$outdir/${name_out}_mkdup.bam \
62 |     METRICS_FILE=$outdir/${sname}_mkdup.metrics \
63 |     VALIDATION_STRINGENCY=LENIENT \
64 |     MINIMUM_DISTANCE=$MIN 
65 |     #\
66 |     #CREATE_INDEX=true
67 | 
68 | samtools index $outdir/${name_out}_mkdup.bam
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/pipeline/runSamtools_split_primary.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --mail-type=all
 3 | #SBATCH -p main -n 4
 4 | #SBATCH -t 1-00:00:00
 5 | #SBATCH --mem=32GB
 6 | 
 7 | usage(){
 8 |   echo >&2 \
 9 |   "
10 |   Usage $0 <bam file> <out dir>
11 |   Note: samtools >= v1.3 is expected
12 |   "
13 |   exit 1
14 | }
15 | 
16 | ## check if a tool is  present and is executable
# Check whether a tool is present on the PATH and is an executable file.
# Arguments: $1 - name of the tool
# Outputs:   prints 0 when the tool resolves to an executable regular file,
#            1 otherwise (the result is reported on stdout, not as status)
toolCheck() {
    local tool
    # 'command -v' is a shell builtin, unlike 'which', which is not guaranteed
    # to be installed; its status is deliberately ignored, an empty result
    # simply means "not found".
    tool=$(command -v "$1" 2>/dev/null) || true
    if [ -n "$tool" ] && [ -f "$tool" ] && [ -x "$tool" ]; then
        echo 0
    else
        echo 1
    fi
}
25 | 
26 | ## version
27 | #MAIN=1
28 | #MAJOR=3
29 | #MINOR=1
30 | #versionCheck(){
31 | #	echo $1
32 | #  if [ $(echo $1 | awk -F. '{$1}') -ge $MAIN ] && [ $(echo $1 | awk -F. '{$2}') -ge $MAJOR ] && [ $(echo $1 | awk -F. '{$3}') -ge $MINOR ]; then
33 | #  	echo 0
34 | #  else
35 | #  	echo 1
36 | #  fi
37 | #}
38 | 
if [ "$(toolCheck samtools)" -eq 1 ]; then
  echo "samtools is not available"
  usage
fi

# Exactly two arguments are needed. With unquoted tests a call without any
# argument used to slip through both checks below.
if [ $# -ne 2 ]; then
  echo "This function takes two arguments"
  usage
fi

if [ ! -f "$1" ]; then
  echo "The first argument needs to be a file"
  usage
fi

if [ ! -d "$2" ]; then
  echo "The second argument needs to be a dir"
  usage
fi

#if [ $(versionCheck $(samtools 2>&1 > /dev/null | grep Version | awk '{print $2}')) -eq 1 ]; then
#  echo "The expected version of samtools is at least: $MAIN.$MAJOR.$MINOR"
#  usage
#fi

# Make the bam path absolute before changing directory; a relative path would
# no longer resolve once we are inside <out dir>.
case "$1" in
  /*) bam=$1 ;;
  *)  bam="$(pwd)/$1" ;;
esac

# -F 1797 drops reads flagged paired (1), unmapped (4), secondary (256),
# QC-failed (512) or duplicate (1024); what remains is piped into
# 'samtools split', which writes its output files (named after the "%!.%."
# format) into the current directory, i.e. <out dir>.
cd "$2" || exit 1
samtools view -h -b -F 1797 "$bam" | samtools split -f "%!.%." -
63 | 


--------------------------------------------------------------------------------
/pipeline/runGATK_IndelRealigner.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p main
 4 | #SBATCH -n 1
 5 | #SBATCH -t 1-00:00:00
 6 | #SBATCH --mem 6G
 7 | 
 8 | set -e
 9 | 
10 | #module load bioinfo-tools GATK
11 | #module load java
12 | 
13 | # helper
14 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
15 | 
16 | # Defaults
17 | JavaMem=6G
18 | USAGETXT=\
19 | "
20 | Usage: $0 <BAM file> <reference fasta> <target interval> <output directory>
21 | 
22 | Note: This script is not GATK v4 compatible. Load a GATK V3 module. More at https://software.broadinstitute.org/gatk/blog?id=7847
23 | "
24 | 
25 | # GATK 3
26 | if [ -z $GATK_HOME ]; then
27 |   usage
28 | fi
29 | GATK=$GATK_HOME/GenomeAnalysisTK.jar
30 | 
31 | if [ -d "$SNIC_TMP" ]; then
32 |     tmp=$SNIC_TMP
33 | else
34 |     tmp=/mnt/picea/tmp
35 | fi
36 | 
37 | if [ $# -lt 4 ]; then
38 |   usage
39 | fi
40 | 
41 | if [ ! -f $1 ]; then
42 |     abort "Could not find BAM file '$1'"
43 | fi
44 | inbam=$1
45 | 
46 | if [ ! -f $2 ]; then
47 |     abort "Could not find reference '$2'"
48 | fi
49 | ref=$2
50 | 
51 | if [ ! -f $3 ]; then
52 |     abort "Could not find interval file '$3'"
53 | fi
54 | interval=$3
55 | 
56 | 
57 | if [ ! -d $4 ]; then
58 |     abort "Could not find directory '$4'"
59 | fi
60 | outdir=$4
61 | 
62 | # drop all four args 
63 | shift
64 | shift
65 | shift
66 | shift
67 | 
68 | inname=${inbam##*/}
69 | outname=${inname%.bam}
70 | 
71 | # Perform local realignment
72 | java -Xmx${JavaMem} -jar -Djava.io.tmpdir=$tmp $GATK \
73 |     -T IndelRealigner \
74 |     -I $inbam \
75 |     -R $ref \
76 |     --targetIntervals $interval \
77 |     -o $outdir/${outname}_realigned.bam $@
78 | 
79 | 


--------------------------------------------------------------------------------
/pipeline/runSwestoreSync.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -p main -n 1
 3 | #SBATCH -A snic2018-13-9
 4 | #SBATCH --mail-type=ALL
 5 | #SBATCH --mail-user=nicolas.delhomme@umu.se
 6 | #SBATCH -t 3-00:00:00
 7 | 
 8 | # source helpers
 9 | source ${SLURM_SUBMIT_DIR:-$(pwd)}/../UPSCb-common/src/bash/functions.sh
10 | 
11 | # failsafe
12 | set -ex
13 | 
14 | USAGETXT=\
15 | "
16 | 	Usage: runSwestoreSync.sh <parent dir> <archive dir> <swestore project>
17 | 	
18 | 	Note: <archive dir> is relative to the parent dir!
19 | "
20 | 
21 | # check arguments
22 | if [ $# -ne 3 ]; then
23 |     echo "This function needs 3 argument"
24 |     usage
25 | fi
26 | 
27 | if [ ! -d $1 ]; then
28 |   abort "The first argument needs to be an existing directory"
29 | fi
30 | 
31 | if [ ! -d $1/$2 ]; then
32 |    abort "The second argument needs to be an existing directory within the first argument directory"
33 | fi
34 | 
35 | # create a proxy
36 | arcproxy -c validityPeriod=96H -p key=file:~delhomme/.globus/key.txt
37 | 
38 | # create the directory in swestore
39 | dir=$(echo $2 | sed "s:^./::")
40 | #arcmkdir $3/$dir
41 | 
42 | # go to the parent dir
43 | cd $1
44 | 
45 | # create an archive - TODO check for special chars - checked manually, there are none
46 | arx=$(echo $2 | sed 's:^./::' | sed 's:-:_:gm' | sed 's:/:-:').tar
47 | tar -cf $arx $1/$dir
48 | 
49 | # sync the archive
50 | arccp $arx $3/$dir/
51 | 
52 | # check
53 | lsize=$(ls -l $arx | awk '{print $5}')
54 | rsize=$(arcls -l $3/$dir/$arx | awk '{if(NR>1)print $3}')
55 | 
56 | if [ $lsize -ne $rsize ]; then
57 |   abort "The file do not have the same size!"
58 | else
59 |   # remove the local structure and the archive
60 |   rm $arx
61 |   rm -rf $dir
62 | fi
63 | 


--------------------------------------------------------------------------------