├── docs
│   ├── Makefile
│   ├── issues.adoc
│   ├── description.adoc
│   ├── index.adoc
│   ├── install.adoc
│   ├── results.adoc
│   ├── options.adoc
│   ├── extra.adoc
│   ├── run.adoc
│   └── examples.adoc
├── bin
│   ├── len.py
│   ├── get_sizes.sh
│   ├── get_readqual.sh
│   ├── get_readstats.sh
│   ├── custom_uniprot_hits.R
│   ├── addAnnotation.py
│   ├── get_busco_val.sh
│   ├── heatmap_busco.R
│   ├── GO_plots.R
│   ├── SOS_busco.py
│   ├── busco_comparison.R
│   └── TransPi_Report_Ind.Rmd
├── .gitignore
├── remove_failed.sh
├── Dockerfile
├── conf
│   ├── test.config
│   └── busV4list.txt
├── LICENSE
├── README.md
├── template.nextflow.config
└── precheck_TransPi.sh
/docs/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | /opt/homebrew/bin/asciidoctor -D . index.adoc
3 |
--------------------------------------------------------------------------------
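The Makefile hardcodes the Homebrew path to `asciidoctor`, so it only works on a macOS machine with that exact install. A minimal sketch of building the manual on any system where `asciidoctor` is on the `PATH` (the `gem install` line assumes a working Ruby setup):

```bash
# Install asciidoctor if it is not already available (assumes Ruby/gem)
gem install asciidoctor

# Build the manual into the current directory, as the Makefile does
cd docs
asciidoctor -D . index.adoc
```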
/bin/len.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from Bio import SeqIO
4 | import sys
5 |
6 | # FASTA file to measure is the first command-line argument
7 | filename = sys.argv[1]
8 |
9 | # print one "<id><TAB><length>" line per record
10 | for record in SeqIO.parse(filename, "fasta"):
11 |     print(record.id, len(record.seq), sep="\t")
--------------------------------------------------------------------------------
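A minimal usage sketch for `len.py` (the FASTA filename is hypothetical); it requires Biopython and writes one tab-separated `id length` line per record to stdout:

```bash
# Assumes Biopython is installed in the active environment
python3 bin/len.py transcriptome.fasta >transcript_lengths.txt
```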
/.gitignore:
--------------------------------------------------------------------------------
1 | busco_db/
2 | diamonddb/
3 | hmmerdb/
4 | nextflow
5 | pipeline_info/
6 | reads/
7 | uniprot_db/
8 | work/
9 | cbs-dtu-tools/
10 | diamonddb_custom/
11 | diamonddb_swiss/
12 | nextflow.config
13 | results/
14 | sqlite_db/
15 | .varfile.sh
16 | .nextflow*
17 | .DS_Store
18 | cbs-dtu-tools.tar.gz
19 | Singularity
20 | Dockerfile
21 | evigene/
22 |
--------------------------------------------------------------------------------
/bin/get_sizes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # keep entries <2500bp; summarize the rest as "data N" counter rows
3 | filename="$1"
4 | cut -f 1 -d "," "$filename" | awk '{print $3}' >nam
5 | cut -f 2 -d "," "$filename" | awk '{print $2}' >len
6 | paste nam len | awk '$2<2500 {print $0}' >lt_2500
7 | he=$( paste nam len | awk '$2>=2500 {print $0}' | wc -l )
8 | for x in $( seq 1 "$he" );do
9 |     echo data >>temp_1
10 |     echo "$x" >>temp_2
11 | done
12 | paste temp_1 temp_2 >he_2500
13 | cat lt_2500 he_2500 >final_sizes.txt
14 | rm nam len temp_1 temp_2 lt_2500 he_2500 "$filename"
--------------------------------------------------------------------------------
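A hedged invocation sketch for `get_sizes.sh` (the input filename is hypothetical). The script expects a comma-separated names/lengths file as its only argument and writes `final_sizes.txt`; note that it deletes the input file when it finishes:

```bash
# WARNING: the script removes its input file at the end
bash bin/get_sizes.sh SampleA_sizes.csv
head final_sizes.txt
```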
/remove_failed.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #Script to remove directories of FAILED and ABORTED processes in a nextflow pipeline
3 | #INPUT = filename_trace.txt
4 | file=$1
5 | if [ "$file" == "" ];then
6 | echo -e "\n\t Provide a trace file as input (e.g. filename_trace.txt)"
7 | echo -e "\n\t Usage: bash remove_failed.sh filename_trace.txt\n"
8 | exit 0
9 | else
10 | grep "ABORTED" "$file" >.erase.txt
11 | grep "FAILED" "$file" >>.erase.txt
12 | while read line;do
13 | a=$( echo $line | awk '{print $2}' )
14 | echo "$a"
15 | for d in work*/${a}*;do # glob loop handles multiple matching dirs
16 | [ -d "$d" ] && rm -rf "$d"
17 | done
18 | done <.erase.txt
19 | rm .erase.txt
20 | fi
21 |
--------------------------------------------------------------------------------
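Usage follows the script's own help message; run it from the pipeline launch directory so the `work*/` globs resolve (the trace filename is hypothetical):

```bash
# Remove work directories of FAILED/ABORTED tasks listed in the trace file
bash remove_failed.sh SampleA_trace.txt
```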
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/miniconda3
2 |
3 | LABEL authors="Ramon Rivera-Vicens" \
4 | description="Docker image containing all requirements for TransPi pipeline" \
5 | version="1.0dev"
6 |
7 | RUN apt update; apt install -y gcc bc procps
8 |
9 | COPY transpi_env.yml /
10 | RUN conda env create -f /transpi_env.yml && conda clean -a
11 |
12 | ENV PATH /opt/conda/envs/TransPi/bin:$PATH
13 |
14 | RUN sed -i 's/base/TransPi/g' ~/.bashrc
15 |
16 | RUN wget http://arthropods.eugenes.org/EvidentialGene/other/evigene_older/evigene19may14.tar
17 | RUN tar -xf evigene19may14.tar && rm evigene19may14.tar
18 | ENV PATH /evigene/scripts/prot/:$PATH
19 |
20 | RUN mkdir -p /opt/conda/envs/TransPi/lib/python3.6/site-packages/bin && cp /opt/conda/envs/TransPi/bin/skip*.awk /opt/conda/envs/TransPi/lib/python3.6/site-packages/bin/
21 |
--------------------------------------------------------------------------------
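A minimal sketch for building and testing the image locally. The tag is arbitrary; the build context must contain `transpi_env.yml` (not shown in the tree above), since the Dockerfile COPYs it:

```bash
# Build from a directory containing the Dockerfile and transpi_env.yml
docker build -t transpi:1.0dev .

# Open a shell in the image to check the tools on the PATH
docker run --rm -it transpi:1.0dev bash
```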
/conf/test.config:
--------------------------------------------------------------------------------
1 | /*
2 | ========================================================================================
3 | Test Config File TransPi
4 | ========================================================================================
5 | Transcriptome Analysis Pipeline
6 | Author: Ramón E. Rivera-Vicéns
7 | ----------------------------------------------------------------------------------------
8 | */
9 |
10 | params {
11 | readsTest = [
12 | ['Sponge_sample', ['https://github.com/rivera10/test_dataset/raw/master/RNA_data/Tethya_wilhelma_R1.fastq.gz'], ['https://github.com/rivera10/test_dataset/raw/master/RNA_data/Tethya_wilhelma_R2.fastq.gz']]
13 | ]
14 | k="25,53"
15 | maxReadLen=100
16 | shortTransdecoder = true
17 | }
18 |
--------------------------------------------------------------------------------
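A hedged launch sketch for this config: the `test` profile supplies `readsTest`, `k`, and `maxReadLen`, so the command stays short (profiles are comma separated, as described in `docs/run.adoc`; exact flags may vary):

```bash
# Run the bundled sponge test dataset with conda
./nextflow run TransPi.nf --all -profile test,conda
```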
/docs/issues.adoc:
--------------------------------------------------------------------------------
1 | We tested TransPi using the following deployment methods:
2 |
3 | - conda = individual conda environments per process
4 |
5 | - docker = using TransPi container (i.e. -profile docker,TransPiContainer)
6 |
7 | - singularity = using TransPi container (i.e. -profile singularity,TransPiContainer)
8 |
9 |
10 | [NOTE]
11 | Using individual containers per process works for the majority of processes. However, we found a couple of issues with some containers (e.g. transabyss). We are working on a solution for these issues.
12 |
13 |
14 | = Reporting an issue
15 |
16 | If you find a problem or get an error please let us know by opening an issue in the repository.
17 |
18 |
19 | = Test dataset
20 |
21 | We include a `test` profile to try TransPi using a small dataset. However, this can create issues in some of the processes (e.g. contamination removal by psytrans).
22 |
--------------------------------------------------------------------------------
/bin/get_readqual.sh:
--------------------------------------------------------------------------------
1 | jfile="$1"
2 | sampleid=$( echo $jfile | cut -f 1 -d "." )
3 | r1bn=$( jq '.read1_before_filtering.quality_curves.mean' ${jfile} | grep -c "[0-9]" )
4 | r1bq=$( jq '.read1_before_filtering.quality_curves.mean' ${jfile} | grep "[0-9]" | tr -d "\n" | tr -d " " )
5 | r2bn=$( jq '.read2_before_filtering.quality_curves.mean' ${jfile} | grep -c "[0-9]" )
6 | r2bq=$( jq '.read2_before_filtering.quality_curves.mean' ${jfile} | grep "[0-9]" | tr -d "\n" | tr -d " " )
7 | r1an=$( jq '.read1_after_filtering.quality_curves.mean' ${jfile} | grep -c "[0-9]" )
8 | r1aq=$( jq '.read1_after_filtering.quality_curves.mean' ${jfile} | grep "[0-9]" | tr -d "\n" | tr -d " " )
9 | r2an=$( jq '.read2_after_filtering.quality_curves.mean' ${jfile} | grep -c "[0-9]" )
10 | r2aq=$( jq '.read2_after_filtering.quality_curves.mean' ${jfile} | grep "[0-9]" | tr -d "\n" | tr -d " " )
11 | echo -e "${r1bn}\n${r1bq}\n${r2bn}\n${r2bq}\n${r1an}\n${r1aq}\n${r2an}\n${r2aq}" >${sampleid}_reads_qual.csv
12 |
--------------------------------------------------------------------------------
/docs/description.adoc:
--------------------------------------------------------------------------------
1 | *TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly*
2 |
3 | TransPi provides a useful resource for the generation of de novo transcriptome assemblies,
4 | with minimal user input but without losing the power of a thorough analysis.
5 |
6 | * For more info see the https://doi.org/10.1101/2021.02.18.431773[Preprint]
7 |
8 | * Code available at https://www.github.com/palmuc/TransPi[GitHub]
9 |
10 | * Author: Ramón Rivera-Vicéns
11 | ** https://twitter.com/rerv787[Twitter]
12 |
13 |
14 | = Programs used
15 |
16 | * List of programs used by TransPi:
17 | ** FastQC
18 | ** fastp
19 | ** sortmerna
20 | ** rnaSPADES
21 | ** SOAP
22 | ** Trinity
23 | ** Velvet
24 | ** Oases
25 | ** TransABySS
26 | ** rnaQUAST
27 | ** EvidentialGene
28 | ** CD-Hit
29 | ** Exonerate
30 | ** Blast
31 | ** BUSCO
32 | ** Psytrans
33 | ** TransDecoder
34 | ** Trinotate
35 | ** Diamond
36 | ** Hmmer
37 | ** Bowtie2
38 | ** rnammer
39 | ** tmhmm
40 | ** signalP
41 | ** iPath
42 | ** SQLite
43 | ** R
44 | ** Python
45 |
46 | * Databases used by TransPi:
47 | ** Swissprot
48 | ** Uniprot custom database (e.g. all metazoan proteins)
49 | ** Pfam
50 |
--------------------------------------------------------------------------------
/bin/get_readstats.sh:
--------------------------------------------------------------------------------
1 | jfile="$1"
2 | sampleid=$( echo $jfile | cut -f 1 -d "." )
3 | tb=$( jq '.summary.before_filtering.total_reads' $jfile )
4 | r1b=$( jq '.read1_before_filtering.total_reads' $jfile )
5 | r1bl=$( jq '.summary.before_filtering.read1_mean_length' $jfile )
6 | r2b=$( jq '.read2_before_filtering.total_reads' $jfile )
7 | r2bl=$( jq '.summary.before_filtering.read2_mean_length' $jfile )
8 | ta=$( jq '.summary.after_filtering.total_reads' $jfile )
9 | r1a=$( jq '.read1_after_filtering.total_reads' $jfile )
10 | r1al=$( jq '.summary.after_filtering.read1_mean_length' $jfile )
11 | r2a=$( jq '.read2_after_filtering.total_reads' $jfile )
12 | r2al=$( jq '.summary.after_filtering.read2_mean_length' $jfile )
13 | loss=$( echo "${tb}-${ta}" | bc )
14 | gcb=$( jq '.summary.before_filtering.gc_content' $jfile )
15 | gca=$( jq '.summary.after_filtering.gc_content' $jfile )
16 | echo "Sample_name,Total_before,R1_before,R1_before_length,R2_before,R2_before_length,GC_before,Total_after,R1_after,R1_after_length,R2_after,R2_after_length,GC_after,Reads_discarded" >${sampleid}_reads_stats.csv
17 | echo "${sampleid},${tb},${r1b},${r1bl},${r2b},${r2bl},${gcb},${ta},${r1a},${r1al},${r2a},${r2al},${gca},${loss}" >>${sampleid}_reads_stats.csv
18 |
--------------------------------------------------------------------------------
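Both `get_readqual.sh` and `get_readstats.sh` take a fastp JSON report as their only argument and need `jq` on the `PATH`; the sample id is taken from the filename up to the first dot. A usage sketch with a hypothetical report name:

```bash
# Produces SampleA_reads_stats.csv and SampleA_reads_qual.csv
bash bin/get_readstats.sh SampleA.json
bash bin/get_readqual.sh SampleA.json
```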
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Molecular Geobiology and Paleobiology Lab
4 | Department of Earth and Environmental Sciences, Palaeontology & Geobiology, Ludwig-Maximilians-Universität München (LMU)
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/bin/custom_uniprot_hits.R:
--------------------------------------------------------------------------------
1 | args = commandArgs(trailingOnly=TRUE)
2 | sample_name=args[1]
3 |
4 | library(ggthemes)
5 | library(ggplot2)
6 |
7 | data=read.csv(paste(sample_name,"_custom_uniprot_hits.txt",sep=""),header=F)
8 |
9 | nlim=round((head(data$V1,n = 1)+450),digits = -2)
10 | p1<-ggplot(data=data, aes(x=reorder(data$V2,data$V1), y=data$V1))+
11 | geom_bar(stat="identity", fill="dark blue", width=.5)+
12 | coord_flip()+labs(x="UniProt Species",y="Number of Hits")+
13 | geom_text(aes(label=data$V1), position=position_dodge(width=0.3), vjust=0.25, hjust=-.10)+
14 | theme(axis.text=element_text(size=12))+ylim(0,nlim)+theme(axis.text.x=element_text(size=12,angle=0))+
15 | theme(axis.title=element_text(size=15,face="bold"))+ggtitle(paste(sample_name,"UniProt hits",sep=" "))+
16 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.5)*1, face = "bold"))
17 |
18 |
19 | # not working in docker
20 | #ggsave(filename = paste(sample_name,"_custom_uniprot_hits.svg",sep=""),width = 15 ,height = 7)
21 | #ggsave(filename = paste(sample_name,"_custom_uniprot_hits.pdf",sep=""),width = 15 ,height = 7)
22 | pdf(paste(sample_name,"_custom_uniprot_hits.pdf",sep=""),width = 15 ,height = 7)
23 | print(p1)
24 | dev.off()
25 | svg(paste(sample_name,"_custom_uniprot_hits.svg",sep=""),width = 15 ,height = 7)
26 | print(p1)
27 | dev.off()
28 |
--------------------------------------------------------------------------------
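An invocation sketch: the script takes only the sample name (hypothetical here) and expects `<sample>_custom_uniprot_hits.txt` in the working directory, writing a PDF and an SVG next to it:

```bash
# Needs ggplot2 and ggthemes in the R library path
Rscript bin/custom_uniprot_hits.R SampleA
```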
/docs/index.adoc:
--------------------------------------------------------------------------------
1 | = TransPi Manual
2 | Ramón Rivera-Vicéns
3 | v1.1.0-rc, 2021-05-25
4 | :docinfo:
5 | :keywords: TransPi, transcriptome, assembly, annotation, Nextflow, pipeline
6 | :description: TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly
7 | :icons: font
8 | :toclevels: 3
9 | :imagesdir: img
10 | :toc: left
11 | :toc-title: TransPi Manual
12 | :source-highlighter: coderay
13 | :coderay-linenums-mode: table
14 | :sectnums:
15 | :sectlinks:
16 |
17 | How to use TransPi
18 |
19 | == Description
20 | :leveloffset: +2
21 |
22 | include::description.adoc[]
23 |
24 | :leveloffset: -2
25 |
26 | == Installing TransPi
27 | :leveloffset: +2
28 |
29 | include::install.adoc[]
30 |
31 | :leveloffset: -2
32 |
33 | == Running TransPi
34 | :leveloffset: +2
35 |
36 | include::run.adoc[]
37 |
38 | :leveloffset: -2
39 |
40 | == Results
41 | :leveloffset: +2
42 |
43 | include::results.adoc[]
44 |
45 | :leveloffset: -2
46 |
47 | == Additional options
48 | :leveloffset: +2
49 |
50 | include::options.adoc[]
51 |
52 | :leveloffset: -2
53 |
54 | == Examples
55 |
56 | :leveloffset: +2
57 |
58 | include::examples.adoc[]
59 |
60 | :leveloffset: -2
61 |
62 | == Extra information
63 | :leveloffset: +2
64 |
65 | include::extra.adoc[]
66 |
67 | :leveloffset: -2
68 |
69 | == Issues
70 | :leveloffset: +2
71 |
72 | include::issues.adoc[]
73 |
74 | :leveloffset: -2
75 |
--------------------------------------------------------------------------------
/docs/install.adoc:
--------------------------------------------------------------------------------
1 | = Requirements
2 |
3 | - System: Linux OS
4 |
5 | - Data type: Paired-end reads
6 |
7 | Example:
8 | IndA_R1.fastq.gz, IndA_R2.fastq.gz
9 |
10 | [NOTE]
11 | Make sure reads end with `_R1.fastq.gz` and `_R2.fastq.gz`.
12 | Multiple individuals can be run at the same time.
13 |
14 |
15 | = Downloading TransPi
16 |
17 | 1- Clone the repository
18 |
19 | [source,bash]
20 | ----
21 |
22 | git clone https://github.com/palmuc/TransPi.git
23 |
24 | ----
25 |
26 | 2- Move to the TransPi directory
27 |
28 | [source,bash]
29 | ----
30 |
31 | cd TransPi
32 |
33 | ----
34 |
35 | = Configuration
36 |
37 | TransPi requires various databases to run. The precheck script will install the databases and software, if necessary, to run the tool.
38 | The precheck run needs a `PATH` as an argument for installing (locally) all the databases the pipeline needs.
39 |
40 | [source,bash]
41 | ----
42 | bash precheck_TransPi.sh /YOUR/PATH/HERE/
43 | ----
44 |
45 |
46 | [NOTE]
47 | This process may take a while depending on the options you select. The step that takes longest is downloading, if desired, the entire set of metazoan proteins from UniProt (6GB).
48 | Other processes and databases are relatively fast, depending on your internet connection.
49 |
50 | Once the precheck run is done it will create a file named `nextflow.config` that contains the various `PATH` for the databases.
51 | If selected, it will also have the local conda environment `PATH`.
52 |
53 | The `nextflow.config` file also has other important parameters for pipeline execution that will be discussed further
54 | in the following sections.
55 |
--------------------------------------------------------------------------------
/bin/addAnnotation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import glob
5 | import sys
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser(usage='addAnnotation.py -trinotateFile FILENAME', description='')
9 | parser.add_argument('-trinotateFile', dest='trinotate_file', required=True)
10 | parser.add_argument('-db', dest='db_name', required=False, default="swissprot", help="DB to use for header: uniprot or swissprot")
11 | parser.add_argument('-type', dest='db_type', required=False, default="prot", help="Type of DB to use: nucl or prot")
12 | parser.add_argument('-combine', dest='db_combine', required=False, default="false", help='Use two DBs in headers')
13 | args = parser.parse_args()
14 |
15 | swissProtCount=0
16 | uniProtCount=0
17 | for line in open(args.trinotate_file, 'r'):
18 | line = line.strip()
19 | lineSplit = line.split("\t")
20 | if args.db_name == "swissprot" and args.db_type == "nucl":
21 | if lineSplit[2] != ".":
22 | print(">" + lineSplit[0] + " SwissProt_Blastx:" + lineSplit[2].split("^")[0])
23 | uniProtCount += 1
24 | else:
25 | print(">" + lineSplit[0] + " SwissProt_Blastx:" + "noHit")
26 | uniProtCount += 1
27 | elif args.db_name == "swissprot" and args.db_type == "prot":
28 | if lineSplit[6] != ".":
29 | print(">" + lineSplit[0] + " SwissProt_Blastp:" + lineSplit[6].split("^")[0])
30 | swissProtCount += 1
31 | else:
32 | print(">" + lineSplit[0] + " SwissProt_Blastp:" + "noHit")
33 | swissProtCount += 1
34 | elif args.db_name == "uniprot" and args.db_type == "nucl":
35 | if lineSplit[7] != ".":
36 | print(">" + lineSplit[0] + " UniProt_Blastx:" + lineSplit[7].split("^")[0])
37 | uniProtCount += 1
38 | else:
39 | print(">" + lineSplit[0] + " UniProt_Blastx:" + "noHit")
40 | uniProtCount += 1
41 | elif args.db_name == "uniprot" and args.db_type == "prot":
42 | if lineSplit[8] != ".":
43 | print(">" + lineSplit[0] + " UniProt_Blastp:" + lineSplit[8].split("^")[0])
44 | uniProtCount += 1
45 | else:
46 | print(">" + lineSplit[0] + " UniProt_Blastp:" + "noHit")
47 | uniProtCount += 1
48 |
--------------------------------------------------------------------------------
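A hedged usage sketch (the Trinotate report filename is hypothetical). The script prints the new FASTA-style headers to stdout, so redirect it to a file:

```bash
# Annotate headers with the SwissProt blastp hit column of the report
python3 bin/addAnnotation.py -trinotateFile SampleA.trinotate_annotation_report.xls \
    -db swissprot -type prot >SampleA_annotated_headers.txt
```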
/bin/get_busco_val.sh:
--------------------------------------------------------------------------------
1 | name_tri=$1
2 | name_transpi=$2
3 | version=$3
4 | a=$4
5 | # v3 and v4 BUSCO short summaries are parsed identically
6 | if [ "$version" == "v3" ] || [ "$version" == "v4" ];then
7 | #trinity
8 | for x in $name_tri;do
9 | echo "'${a}','${a}','${a}','${a}'," >>tspec.txt
10 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," )
11 | echo "${b}," >>tnum.txt
12 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d "[A-Z]" | tr -d ":" )
13 | echo "${c}," >>tperc.txt
14 | done
15 | #transpi
16 | for x in $name_transpi;do
17 | echo "'${a}_TP','${a}_TP','${a}_TP','${a}_TP'" >>pspec.txt
18 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," )
19 | echo "${b}" >>pnum.txt
20 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d "[A-Z]" | tr -d ":" )
21 | echo "${c}" >>pperc.txt
22 | done
23 | cat tspec.txt pspec.txt | tr "\t" "\n" | tr -d "\n" >final_spec
24 | cat tnum.txt pnum.txt | tr "\t" "\n" | tr -d "\n" >final_num
25 | cat tperc.txt pperc.txt | tr "\t" "\n" | tr -d "\n" >final_perc
26 | rm tnum.txt tperc.txt tspec.txt
27 | rm pnum.txt pperc.txt pspec.txt
28 | fi
29 |
--------------------------------------------------------------------------------
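An invocation sketch with hypothetical BUSCO short-summary filenames; the positional arguments are the Trinity summaries, the TransPi summaries, the BUSCO version (`v3` or `v4`), and a sample label. It writes `final_spec`, `final_num`, and `final_perc` for the comparison plot:

```bash
bash bin/get_busco_val.sh "short_summary_SampleA.Trinity.txt" \
    "short_summary_SampleA.TransPi.txt" v4 SampleA
```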
/docs/results.adoc:
--------------------------------------------------------------------------------
1 | = Directories
2 |
3 | == `results`
4 | After a successful run of TransPi the results are saved in a directory called `results`. This directory is divided into multiple directories for each major step of the pipeline.
5 |
6 | [NOTE]
7 | Directories will be created based on the options selected in the pipeline execution
8 |
9 | [horizontal]
10 | fastqc:: Fastqc html files
11 | filter:: Filter step html files
12 | rRNA_reads:: Info and reads of rRNA removal process
13 | normalization:: Normalized reads files
14 | saveReads:: Folder with reads saved from the filter and normalization processes
15 | assemblies:: All individual assemblies
16 | evigene:: Non-redundant final transcriptome (ends with name `.combined.okay.fa`)
17 | rnaQuast:: rnaQUAST output
18 | mapping:: Mapping results
19 | busco4:: BUSCO V4 results
20 | transdecoder:: Transdecoder results
21 | trinotate:: Annotation results
22 | report:: Interactive report of TransPi
23 | figures:: Figures created by TransPi (BUSCO comparison, Annotation, GO, etc)
24 | stats:: Basic stats from all steps of TransPi
25 | pipeline_info:: Nextflow report, trace file and others
26 | RUN_INFO.txt:: File with all versions of the tools used by TransPi. Also info from the run like command and PATH
27 |
28 | .NOTES
29 |
30 | ****
31 |
32 | - Name of output directory can be changed by using the `--outdir` parameter when executing the pipeline. Example `--outdir Results_SampleA`.
33 | - If multiple samples are run, each directory will have all files together but each one with a unique sample name.
34 |
35 |
36 | ****
37 |
38 | == `work`
39 |
40 | A directory called `work` is also created when running TransPi. It contains all the Nextflow working files, TransPi results and intermediate files.
41 |
42 | [NOTE]
43 | Directory `work` can be removed after the pipeline is done since all important files are stored in the `results` directory.
44 |
45 |
46 | = Figures
47 |
48 | TransPi produces multiple figures that are stored in the results directory.
49 |
50 | Example:
51 |
52 | image:https://sync.palmuc.org/index.php/s/kxetdGiNiSyHzrg/preview[UniProt,800,400,float="center", role="Uniprot"]
53 |
54 |
55 | = Report
56 |
57 | TransPi creates an interactive custom HTML report for easy data exploration.
58 |
59 | Report https://sync.palmuc.org/index.php/s/XCxeCNwAfParBHX[Sponge transcriptome]
60 |
61 | .NOTE
62 | ****
63 | - The example report here is a PDF file and not an HTML file. However, the original HTML file with interactive visualizations (i.e. as generated by TransPi) can be downloaded https://sync.palmuc.org/index.php/s/nP3TKPawmoX4xqL[here]
64 | ****
65 |
--------------------------------------------------------------------------------
/bin/heatmap_busco.R:
--------------------------------------------------------------------------------
1 | args = commandArgs(trailingOnly=TRUE)
2 | sample_name=args[1]
3 | comp_table=args[2]
4 | transpi_table=args[3]
5 |
6 | library(plotly)
7 | library(reshape2)
8 |
9 | # comparison table
10 | csv=read.csv(comp_table, header=TRUE, sep="\t")
11 |
12 | csv <- data.frame(lapply(csv, function(x) {gsub("Complete", "3", x)}))
13 | csv <- data.frame(lapply(csv, function(x) {gsub("Duplicated", "2", x)}))
14 | csv <- data.frame(lapply(csv, function(x) {gsub("Fragmented", "1", x)}))
15 | csv <- data.frame(lapply(csv, function(x) {gsub("Missing", "0", x)}))
16 | csv
17 | c=melt(csv,id.vars = 'Busco.ID')
18 | dec=c(0,.25,.25,.50,.50,.75,.75,1)
19 | my_colors <- c("#081D58","#081D58", "#2280B8","#2280B8", "#99D6B9", "#99D6B9","#f8f9fc","#f8f9fc")
20 | colz <- setNames(data.frame(dec, my_colors), NULL)
21 | fig <- plot_ly(c,x=~variable, y=~Busco.ID, z=~value, colorscale=colz, reversescale=T, type = "heatmap",
22 | colorbar=list(tickmode='array', tickvals=c(.35,1.1,1.87,2.60), thickness=30,
23 | ticktext= c("Missing","Fragmented","Duplicated","Complete"), len=0.4))
24 | fig <- fig %>% layout(xaxis=list(title="", showline = TRUE, mirror = TRUE),
25 | yaxis=list(title="BUSCO ID", tickmode="auto", nticks=length(csv$Busco.ID),
26 | tickfont=list(size=8), showline = TRUE, mirror = TRUE))
27 |
28 | orca(fig, paste(sample_name,"_all_missing_BUSCO.png",sep=""))
29 | orca(fig, paste(sample_name,"_all_missing_BUSCO.pdf",sep=""))
30 |
31 | # TransPi table
32 | csv=read.csv(transpi_table, header=TRUE, sep="\t")
33 |
34 | csv <- data.frame(lapply(csv, function(x) {gsub("Complete", "3", x)}))
35 | csv <- data.frame(lapply(csv, function(x) {gsub("Duplicated", "2", x)}))
36 | csv <- data.frame(lapply(csv, function(x) {gsub("Fragmented", "1", x)}))
37 | csv <- data.frame(lapply(csv, function(x) {gsub("Missing", "0", x)}))
38 | csv
39 | c=melt(csv,id.vars = 'Busco.ID')
40 | dec=c(0,.25,.25,.50,.50,.75,.75,1)
41 | my_colors <- c("#081D58","#081D58", "#2280B8","#2280B8", "#99D6B9", "#99D6B9","#f8f9fc","#f8f9fc")
42 | colz <- setNames(data.frame(dec, my_colors), NULL)
43 | fig <- plot_ly(c,x=~variable, y=~Busco.ID, z=~value, colorscale=colz, reversescale=T, type = "heatmap",
44 | colorbar=list(tickmode='array', tickvals=c(.35,1.1,1.87,2.60), thickness=30,
45 | ticktext= c("Missing","Fragmented","Duplicated","Complete"), len=0.4))
46 | fig <- fig %>% layout(xaxis=list(title="", showline = TRUE, mirror = TRUE),
47 | yaxis=list(title="BUSCO ID", tickmode="auto", nticks=length(csv$Busco.ID),
48 | tickfont=list(size=8), showline = TRUE, mirror = TRUE))
49 |
50 | orca(fig, paste(sample_name,"_TransPi_missing_BUSCO.png",sep=""))
51 | orca(fig, paste(sample_name,"_TransPi_missing_BUSCO.pdf",sep=""))
52 |
--------------------------------------------------------------------------------
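An invocation sketch: the two table arguments match the files written by `bin/SOS_busco.py` (`Complete_comparison_table` and `TransPi_comparison_table`); exporting the plotly heatmaps with `orca()` additionally requires the `orca` binary to be installed:

```bash
Rscript bin/heatmap_busco.R SampleA Complete_comparison_table TransPi_comparison_table
```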
/docs/options.adoc:
--------------------------------------------------------------------------------
1 | There are other parameters that can be changed when executing TransPi.
2 |
3 | = Output options
4 |
5 | [horizontal]
6 | `--outdir`::
7 | name of output directory. Example: `--outdir Sponges_150`.
8 | Default "results"
9 |
10 | `-w, -work`::
11 | name of working directory. Example: `-work Sponges_work`. Only one dash is needed for `-work` since it is a Nextflow option, not a TransPi parameter.
12 |
13 | `--tracedir`::
14 | Name for directory to save pipeline trace files.
15 | Default "pipeline_info"
16 |
17 | = Additional analyses
18 |
19 | [horizontal]
20 | `--rRNAfilter`:: Remove rRNA from sequences. Requires option --rRNAdb
21 |
22 | `--rRNAdb`:: PATH to database of rRNA sequences to use for filtering of rRNA. Default ""
23 |
24 | `--filterSpecies`::
25 | Perform psytrans filtering of transcriptome. Requires options `--host` and `--symbiont`
26 |
27 | `--host`:: Host (or similar) protein file.
28 |
29 | `--symbiont`:: Symbiont (or similar) protein file.
30 |
31 | `--psyval`:: Psytrans value to train model. Default "160"
32 |
33 | `--allBuscos`:: Run BUSCO analysis in all assemblies
34 |
35 | `--rescueBusco`:: Generate BUSCO distribution analysis
36 |
37 | `--minPerc`::
38 | Minimum percentage of assemblers required for the BUSCO distribution.
39 | Default ".70"
40 |
41 | `--shortTransdecoder`:: Run Transdecoder without the homology searches
42 |
43 | `--withSignalP`::
44 | Include SignalP for the annotation. Needs manual installation of CBS-DTU tools.
45 | Default "false"
46 |
47 | `--signalp`:: PATH to SignalP software. Default ""
48 |
49 | `--withTMHMM`::
50 | Include TMHMM for the annotation. Needs manual installation of CBS-DTU tools.
51 | Default "false"
52 |
53 | `--tmhmm`:: PATH to TMHMM software. Default ""
54 |
55 | `--withRnammer`::
56 | Include Rnammer for the annotation. Needs manual installation of CBS-DTU tools.
57 | Default "false"
58 |
59 | `--rnam`:: PATH to Rnammer software. Default ""
60 |
61 | = Skip options
62 |
63 | [horizontal]
64 | `--skipEvi`:: Skip EvidentialGene run in --onlyAsm option. Default "false"
65 |
66 | `--skipQC`:: Skip FastQC step. Default "false"
67 |
68 | `--skipFilter`:: Skip fastp filtering step. Default "false"
69 |
70 | `--skipKegg`:: Skip kegg analysis. Default "false"
71 |
72 | `--skipReport`:: Skip generation of final TransPi report. Default "false"
73 |
74 | = Other parameters
75 |
76 | [horizontal]
77 | `--minQual`:: Minimum quality score for fastp filtering. Default "25"
78 |
79 | `--pipeInstall`:: PATH to TransPi directory. Default "". If precheck is used this will be added to the nextflow.config automatically.
80 |
81 | `--envCacheDir`:: PATH for environment cache directory (either conda or containers). Default "Launch directory of pipeline"
82 |
--------------------------------------------------------------------------------
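A hedged example combining several of the options above in a single run (paths are placeholders):

```bash
nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \
    --k 25,41,53 --maxReadLen 75 -profile conda \
    --rRNAfilter --rRNAdb /YOUR/PATH/HERE/rRNA_database.fasta \
    --allBuscos --rescueBusco --minPerc .70 --outdir Results_SampleA
```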
/docs/extra.adoc:
--------------------------------------------------------------------------------
1 | Here are some notes that can help in the execution of TransPi, as well as some important considerations based on Nextflow settings.
2 | For more in detail information visit the https://www.nextflow.io/docs/latest/index.html[Nextflow documentation]
3 |
4 | = `-resume`
5 | If an error occurs and you need to resume the pipeline just include the `-resume` option when calling the pipeline.
6 |
7 | [source,bash]
8 | ----
9 | ./nextflow run TransPi.nf --onlyAnn -profile conda -resume
10 | ----
11 |
12 | = `template.nextflow.config`
13 |
14 | == Resources
15 | The `template.nextflow.config` file has different configurations for each program of the pipeline
16 | (e.g. some with many CPUs, others with few). You can modify this depending on the resources you have in your system.
17 |
18 | Example:
19 | [source,bash]
20 | ----
21 | process {
22 |     withLabel: big_cpus {
23 |         cpus='30'
24 |         memory='15 GB'
25 |     }
26 | }
27 | ----
28 | In this case, all the processes using the label `big_cpus` will use 30 CPUs. If your system only has 20 please modify these values accordingly to avoid errors.
29 |
30 | [NOTE]
31 | Setting the correct CPUs and RAM of your system is important because `nextflow` will start as many jobs as possible if the resources are available.
32 | If you are on a VM with 120 CPUs, `nextflow` will be able to start four processes with the label `big_cpus` at once.
33 |
34 | == Data
35 |
36 | The precheck is designed to create a new `nextflow.config`, containing the `PATH` to the databases, every time it is run.
37 | Values that do not change between analyses can be edited directly in the `template.nextflow.config`. This way you avoid making the same changes to the `nextflow.config` after every precheck run.
38 |
39 | Example: Modify the `template.nextflow.config` with your cluster info to avoid repeating these in the future.
40 |
41 | = Custom profiles
42 |
43 | We are using https://slurm.schedmd.com/documentation.html[SLURM] as our workload manager in our server.
44 | Thus we have custom profiles for the submission of jobs. For example our `nextflow.config` has the following lines in the `profiles` section.
45 |
46 |
47 | [source,text]
48 | ----
49 | profiles {
50 |     palmuc {
51 |         process {
52 |             executor='slurm'
53 |             clusterOptions='--clusters=inter --partition=bigNode --qos=low'
54 |         }
55 |     }
56 | }
57 | ----
58 | You can add your custom profiles depending on the settings of your system and the workload manager you use (e.g. SGE, PBS, etc).
59 |
60 | The line `clusterOptions` can be used to add any other option that you will usually use for your job submission.
61 |
62 | = Local nextflow
63 |
64 | To avoid calling the pipeline using `./nextflow ...` you can make the `nextflow` launcher executable (e.g. `chmod 777 nextflow`) and place it somewhere in your `PATH`. To run the pipeline you then just need:
65 |
66 | [source,bash]
67 | ----
68 |
69 | nextflow run TransPi.nf ...
70 |
71 | ----
72 |
73 | = Real Time Monitoring
74 | To monitor your pipeline remotely without connecting to the server via ssh use https://tower.nf/login[Nextflow Tower].
75 | Make an account with your email and follow their instructions. After this, you can run the pipeline with the `-with-tower` option and follow the execution
76 | of the processes live.
77 |
78 | [source,bash]
79 | ----
80 |
81 | nextflow run TransPi.nf --all -with-tower -profile conda
82 |
83 | ----
84 |
--------------------------------------------------------------------------------
/docs/run.adoc:
--------------------------------------------------------------------------------
1 | = Full analysis (`--all`)
2 |
3 | After the successful run of the precheck script you are set to run TransPi.
4 |
5 | We recommend running TransPi with the `--all` option, which performs the complete analysis, from raw read filtering to annotation.
6 | Other options are described below.
7 |
8 | To run the complete pipeline:
9 | [source,bash]
10 | ----
11 | nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \
12 | --k 25,41,53 --maxReadLen 75 -profile conda
13 |
14 | ----
15 |
16 | Argument explanations:
17 | [source,text]
18 | ----
19 | --all Run full TransPi analysis
20 | --reads PATH to the paired-end reads
21 | --k kmers list to use for the assemblies
22 | --maxReadLen Max read length in the library
23 | -profile       Deployment method to use (conda in this example)
24 | ----
25 |
26 | [CAUTION]
27 | --
28 | If you combine multiple libraries from the same individual to create a reference transcriptome, which will later be used in downstream analyses (e.g. Differential Expression),
29 | make sure the kmer list is based on the length of the shortest read library and the `maxReadLen` on the longest read length.
30 |
31 | Example: Combining reads of 100bp with 125bp
32 | [source,text]
33 | ****
34 | --k 25,41,53,61 --maxReadLen 125
35 | ****
36 | --
37 |
38 | [NOTE]
39 | --
40 | You can run multiple samples at the same time
41 | --
42 |
43 | = Other options
44 |
45 | == `--onlyAsm`
46 |
47 | Run only the Assemblies and EvidentialGene analysis.
48 |
49 | Example for `--onlyAsm`:
50 | [source,bash]
51 | ----
52 | nextflow run TransPi.nf --onlyAsm --reads '/home/rrivera/TransPi/reads/*_R[1,2].fastq.gz' \
53 | --k 25,41,53 --maxReadLen 75 -profile conda
54 |
55 | ----
56 |
57 | [NOTE]
58 | You can run multiple samples at the same time
59 |
60 | == `--onlyEvi`
61 |
62 | Run only the EvidentialGene analysis
63 |
64 | Example for `--onlyEvi`:
65 | [source,bash]
66 | ----
67 | nextflow run TransPi.nf --onlyEvi -profile conda
68 | ----
69 |
70 |
71 | [IMPORTANT]
72 | TransPi looks for a directory named `onlyEvi`. It expects one file per sample to perform the reduction. The file should have all the assemblies concatenated into one.
73 | [NOTE]
74 | You can run multiple samples at the same time
75 |
76 | == `--onlyAnn`
77 |
78 | Run only the Annotation analysis (starting from a final assembly)
79 |
80 | Example for `--onlyAnn`:
81 | [source,bash]
82 | ----
83 | nextflow run TransPi.nf --onlyAnn -profile conda
84 | ----
85 |
86 | [IMPORTANT]
87 | TransPi looks for a directory named `onlyAnn`. It expects one file per sample to perform the annotation.
88 | [NOTE]
89 | You can run multiple samples at the same time
90 |
91 | = Using `-profile`
92 |
93 | TransPi can also use docker, singularity, and individual conda installations (i.e. per process) to deploy the pipeline.
94 |
95 | [source,text]
96 | ----
97 | test Run TransPi with a test dataset
98 | conda               Run TransPi with conda (individual environment per process)
99 | docker              Run TransPi with individual docker containers per process
100 | singularity         Run TransPi with individual singularity containers per process
101 | TransPiContainer    Run TransPi with a single container with all the necessary tools
102 | ----
103 |
104 | [NOTE]
105 | --
106 | Multiple profiles can be specified (comma separated)
107 |
108 | [source,text]
109 | ****
110 | Example: `-profile test,singularity`
111 | ****
112 | --
113 |
114 | Refer to *Section 6* of this manual for further details on deployment of TransPi using other profiles.
115 |
--------------------------------------------------------------------------------
/bin/GO_plots.R:
--------------------------------------------------------------------------------
1 | args = commandArgs(trailingOnly=TRUE)
2 | sample_name=args[1]
3 |
4 | library(ggthemes)
5 | library(ggplot2)
6 |
7 | dataCC=read.delim("GO_cellular.txt", header = F, sep = "\t")
8 | dataMF=read.delim("GO_molecular.txt", header = F, sep = "\t")
9 | dataBP=read.delim("GO_biological.txt", header = F, sep = "\t")
10 |
11 | #CC
12 | nlim=round((head(dataCC$V1,n = 1)+150),digits = -2)
13 | p1<-ggplot(data=dataCC, aes(x=reorder(dataCC$V2,dataCC$V1), y=dataCC$V1))+
14 | geom_bar(stat="identity", fill="green", width=.5)+
15 | coord_flip()+labs(x="Classification",y="Number of Sequences")+
16 | geom_text(aes(label=dataCC$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+
17 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+
18 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+
19 | ggtitle(paste(sample_name,"Cellular Component GOs",sep=" "))+
20 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold"))
21 |
22 | #ggsave(filename = paste(sample_name,"_Cellular_Component.svg",sep=""),width = 15 ,height = 7)
23 | #ggsave(filename = paste(sample_name,"_Cellular_Component.pdf",sep=""),width = 15 ,height = 7)
24 | pdf(paste(sample_name,"_Cellular_Component.pdf",sep=""),width = 15 ,height = 7)
25 | print(p1)
26 | dev.off()
27 | svg(paste(sample_name,"_Cellular_Component.svg",sep=""),width = 15 ,height = 7)
28 | print(p1)
29 | dev.off()
30 |
31 | #MF
32 | nlim=round((head(dataMF$V1,n = 1)+150),digits = -2)
33 | p2 <-ggplot(data=dataMF, aes(x=reorder(dataMF$V2,dataMF$V1), y=dataMF$V1))+
34 | geom_bar(stat="identity", fill="blue", width=.5)+
35 | coord_flip()+labs(x="Classification",y="Number of Sequences")+
36 | geom_text(aes(label=dataMF$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+
37 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+
38 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+
39 | ggtitle(paste(sample_name,"Molecular Function GOs",sep=" "))+
40 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold"))
41 |
42 | #ggsave(filename = paste(sample_name,"_Molecular_Function.svg",sep=""),width = 15 ,height = 7)
43 | #ggsave(filename = paste(sample_name,"_Molecular_Function.pdf",sep=""),width = 15 ,height = 7)
44 | pdf(paste(sample_name,"_Molecular_Function.pdf",sep=""),width = 15 ,height = 7)
45 | print(p2)
46 | dev.off()
47 | svg(paste(sample_name,"_Molecular_Function.svg",sep=""),width = 15 ,height = 7)
48 | print(p2)
49 | dev.off()
50 |
51 | #BP
52 | nlim=round((head(dataBP$V1,n = 1)+150),digits = -2)
53 | p3<-ggplot(data=dataBP, aes(x=reorder(dataBP$V2,dataBP$V1), y=dataBP$V1))+
54 | geom_bar(stat="identity", fill="red", width=.5)+
55 | coord_flip()+labs(x="Classification",y="Number of Sequences")+
56 | geom_text(aes(label=dataBP$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+
57 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+
58 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+
59 | ggtitle(paste(sample_name,"Biological Processes GOs",sep=" "))+
60 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold"))
61 |
62 | #ggsave(filename = paste(sample_name,"_Biological_Processes.svg",sep=""),width = 15 ,height = 7)
63 | #ggsave(filename = paste(sample_name,"_Biological_Processes.pdf",sep=""),width = 15 ,height = 7)
64 | pdf(paste(sample_name,"_Biological_Processes.pdf",sep=""),width = 15 ,height = 7)
65 | print(p3)
66 | dev.off()
67 | svg(paste(sample_name,"_Biological_Processes.svg",sep=""),width = 15 ,height = 7)
68 | print(p3)
69 | dev.off()
70 |
--------------------------------------------------------------------------------
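An invocation sketch: the script takes only the sample name (hypothetical here) and expects `GO_cellular.txt`, `GO_molecular.txt`, and `GO_biological.txt` in the working directory, producing a PDF and an SVG per GO category:

```bash
# Needs ggplot2 and ggthemes; writes e.g. SampleA_Cellular_Component.pdf
Rscript bin/GO_plots.R SampleA
```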
/bin/SOS_busco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import math
5 | import sys
6 | from collections import Counter
7 | from functools import reduce
8 | import numpy as np
9 | import pandas as pd
10 | from Bio import SeqIO
11 |
12 |
13 | parser = argparse.ArgumentParser(usage='', description='')
14 | parser.add_argument('-input_file_busco', dest='input_file_busco',required=True)
15 | parser.add_argument('-input_file_fasta', dest='input_file_fasta',required=True)
16 | parser.add_argument('-min', dest='min_num_assembler', type=float, required=True)
17 | parser.add_argument('-kmers',dest='kmers',required=True)
18 |
19 | args = parser.parse_args()
20 |
21 | assemblers_names = ['SOAP','SPADES','TransABySS','Velvet']
22 |
23 | all_missing_list = []
24 | list_of_databases = []
25 | final_list = []
26 |
27 | Busco_to_save = []
28 |
29 | with open(args.input_file_busco) as input_busco_file:
30 |
31 | kmers_list = args.kmers.strip().split(',')
32 | nr_of_kmers = (len(kmers_list)*4+2)
33 | column_names = [(assembler + '_' + kmer) for assembler in assemblers_names for kmer in kmers_list]
34 | column_names.insert(3*len(kmers_list) ,'Trinity')
35 | column_names.insert(len(column_names),'TransPi')
36 | column_names.insert(0,'Busco ID')
37 |
38 | busco_df = pd.read_csv(input_busco_file, sep=',',header=0,names=['Busco_id','Status','Sequence','Score','Length'])
39 | busco_unique = busco_df.groupby((busco_df['Busco_id'] !=busco_df['Busco_id'].shift()).cumsum().values).first()
40 |
41 | busco_tables = np.array_split(busco_unique, nr_of_kmers)
42 | transpi_table = busco_tables[nr_of_kmers-1]
43 |
44 | for table in busco_tables:
45 | busco_missing = table[table.Status.eq('Missing')].iloc[:,0].tolist()
46 | all_missing_list.extend(busco_missing)
47 | missing_Busco = list(dict.fromkeys(all_missing_list))
48 |
49 | for table in busco_tables:
50 | final_df = table[table['Busco_id'].isin(missing_Busco)].iloc[:, 0:2]
51 | final_list.append(final_df)
52 |
53 | comparison_table = reduce(lambda left,right: pd.merge(left,right,on='Busco_id'), final_list)
54 | comparison_table.columns = column_names
55 | transpi_table = comparison_table[(comparison_table['TransPi'] == 'Missing')]
56 |
57 | comparison_table.to_csv('Complete_comparison_table',sep='\t',index=False)
58 | transpi_table.to_csv('TransPi_comparison_table',sep='\t',index=False)
59 |
60 | BUSCO_to_rescue = transpi_table[(transpi_table == 'Complete').any(axis=1)].iloc[:,0].tolist()
61 |
62 | if len(BUSCO_to_rescue) == 0:
63 | sys.exit(0)
64 | elif len(BUSCO_to_rescue) != 0:
65 | for table in busco_tables[:-1]:
66 | for i in BUSCO_to_rescue:
67 | seqs = (i,table['Sequence'].loc[table['Busco_id'] == i].values[0],table['Score'].loc[table['Busco_id'] == i].values[0])
68 | Busco_to_save.append(seqs)
69 |
70 | potential_seqs = [t for t in Busco_to_save if not any(isinstance(n, float) and math.isnan(n) for n in t)]
71 | flat_list = [i[0] for i in potential_seqs]
72 | busco_count = Counter(flat_list)
73 |
74 | min_number = nr_of_kmers * args.min_num_assembler
75 | busco_to_save = [k for k, v in busco_count.items() if v >= min_number]
76 |
77 | seqs_to_save = [item for item in potential_seqs if item[0] in busco_to_save]
78 |
79 | seqs_to_save.sort(key= lambda x: x[2], reverse=True)
80 |
81 | checked = set()
82 | unique_seqs_list = []
83 |
84 | for busco_id, sequence, score in seqs_to_save:
85 |     if busco_id not in checked:
86 | checked.add(busco_id)
87 | unique_seqs_list.append((busco_id,sequence))
88 |
89 | #The fasta file is parsed with Biopython SeqIO.parse. And target sequences are extracted.
90 | sequences_IDs_to_rescue = [ x[1] for x in unique_seqs_list]
91 | fasta_to_extract = []
92 |
93 | for seqrecord in SeqIO.parse(args.input_file_fasta, 'fasta'):
94 | if seqrecord.id in sequences_IDs_to_rescue:
95 | fasta_to_extract.append(seqrecord)
96 |
97 | #Output files are written.
98 | with open('sequences_to_add.fasta','w') as outputh:
99 | SeqIO.write(fasta_to_extract,outputh,'fasta')
100 |
--------------------------------------------------------------------------------
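A hedged invocation sketch (input filenames are hypothetical). `-input_file_busco` is the concatenated BUSCO full tables for all assemblies plus TransPi, `-input_file_fasta` the pooled assemblies to rescue sequences from, `-min` the fraction of assemblies that must recover a BUSCO (presumably the same value as `--minPerc`), and `-kmers` the kmer list of the run:

```bash
python3 bin/SOS_busco.py -input_file_busco all_busco_tables.csv \
    -input_file_fasta all_assemblies.fasta -min .70 -kmers 25,41,53
# Writes Complete_comparison_table, TransPi_comparison_table,
# and sequences_to_add.fasta
```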
/README.md:
--------------------------------------------------------------------------------
1 | # TransPi - TRanscriptome ANalysiS PIpeline
2 |
3 | ```text
4 | _______ _____ _
5 | |__ __| | __ \ (_)
6 | | | _ __ __ _ _ __ ___ | |__) | _
7 | | | | __| / _ | | _ \ / __| | ___/ | |
8 | | | | | | (_| | | | | | \__ \ | | | |
9 | |_| |_| \__,_| |_| |_| |___/ |_| |_|
10 | ```
11 |
12 | [](https://doi.org/10.1101/2021.02.18.431773)[**Preprint**](https://doi.org/10.1101/2021.02.18.431773) [](https://gitter.im/PalMuc/TransPi) [](https://docs.conda.io/en/latest/) [](https://www.docker.com/) [](https://sylabs.io/docs/)
13 | [](https://github.com/PalMuc/TransPi/releases/latest)
14 |
15 | # Table of contents
16 |
17 | * [General info](#General-info)
18 | * [Pipeline processes](#Pipeline-processes)
19 | * [Manual](#Manual)
20 | * [Publication](#Publication)
21 | * [Citation](#Citation)
22 | * [Funding](#Funding)
23 | * [Future work](#Future-work)
24 | * [Issues](#Issues)
25 | * [Chat](#Chat)
26 |
27 | # General info
28 |
29 | TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly
30 |
31 | TransPi is based on the scientific workflow manager [Nextflow](https://www.nextflow.io). It is designed to help researchers get the best reference transcriptome assembly for their organisms of interest. It performs multiple assemblies with different parameters to then get a non-redundant consensus assembly. It also performs other valuable analyses such as quality assessment of the assembly, BUSCO scores, Transdecoder (ORFs), and gene ontologies (Trinotate), etc. All these with minimum input from the user but without losing the potential of a comprehensive analysis.
32 |
33 | ## Pipeline processes
34 |
35 | 
36 |
37 | **Figure 1.** TransPi v1.0.0 flowchart showing the various steps and analyses it can perform. For simplicity, this diagram does not show all the connections between the processes. Also, it omits other additional options like the BUSCO distribution and transcriptome filtering with psytrans (see Section 2.6). ORFs=Open Reading Frames; HTML=Hypertext Markup Language.
38 |
39 | ## Manual
40 |
41 | TransPi documentation and examples can be found [here](https://palmuc.github.io/TransPi/)
42 |
43 | # Publication
44 |
45 | The TransPi preprint, including kmer, read length, and read quantity tests, can be found [here](https://doi.org/10.1101/2021.02.18.431773). We also tested the pipeline with over 45 samples from different phyla.
46 |
47 | TransPi has been peer-reviewed and recommended by Peer Community In Genomics
48 | (https://doi.org/10.24072/pci.genomics.100009)
49 |
50 | ## Citation
51 |
52 | If you use TransPi please cite the peer-reviewed publication:
53 |
54 | Rivera-Vicéns, R.E., García-Escudero, CA., Conci, N., Eitel, M., and Wörheide, G. (2021). TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly. bioRxiv 2021.02.18.431773; doi: https://doi.org/10.1101/2021.02.18.431773
55 |
56 | # Funding
57 |
58 | - European Union’s Horizon 2020 research and innovation programme under the Marie Skłodowska-Curie grant agreement No 764840 (ITN IGNITE).
59 |
60 | - Advanced Human Capital Program of the National Commission for Scientific and Technological Research (CONICYT)
61 |
62 | - Lehre@LMU (project number: W19 F1; Studi forscht@GEO)
63 |
64 | - LMU Munich’s Institutional Strategy LMUexcellent within the framework of the German Excellence Initiative
65 |
66 | # Future work
67 |
68 | - Cloud deployment of the tool
69 |
70 | # Issues
71 |
72 | We tested TransPi using conda, singularity and docker. However, if you find a problem or get an error please let us know by opening an issue.
73 |
74 | ## Chat
75 |
76 | If you have further questions and need help with TransPi you can chat with us in the [TransPi Gitter chat](https://gitter.im/PalMuc/TransPi)
77 |
--------------------------------------------------------------------------------
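A hedged quickstart, condensed from the manual in `docs/` (reads path and kmer list are placeholders):

```bash
git clone https://github.com/palmuc/TransPi.git && cd TransPi
bash precheck_TransPi.sh /YOUR/PATH/HERE/
nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \
    --k 25,41,53 --maxReadLen 75 -profile conda
```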
/bin/busco_comparison.R:
--------------------------------------------------------------------------------
1 | ######################################
2 |
3 | # Edited from the original BUSCO plot script
4 |
5 | ######################################
6 | #
7 | # BUSCO summary figure
8 | # @version 3.0.0
9 | # @since BUSCO 2.0.0
10 | #
11 | # Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org)
12 | # Licensed under the MIT license. See LICENSE.md file.
13 | #
14 | ######################################
15 | args = commandArgs(trailingOnly=TRUE)
16 | sample_name=args[1]
17 | options(warn=-1)
18 | # Load the required libraries
19 | library(ggplot2)
20 | library(grid)
21 | # !!! CONFIGURE YOUR PLOT HERE !!!
22 | # Output
23 | #my_output <- paste("./","combined_busco_figure.png",sep="/")
24 | #my_width <- 20
25 | #my_height <- 15
26 | #my_unit <- "cm"
27 | # Colors
28 | #my_colors <- c("#56B4E9", "#3492C7", "#F0E442", "#F04442")
29 | #cata
30 | my_colors <- c("#0e9aa7", "#96ceba", "#ffcc5c", "#ff6f69")
31 | # Bar height ratio
32 | my_bar_height <- 0.55
33 | # Legend
34 | my_title <- "BUSCO Assessment Results - Trinity vs TransPi"
35 | # Font
36 | my_family <- "sans"
37 | my_size_ratio <- 1
38 | # !!! SEE YOUR DATA HERE !!!
39 | # Your data as generated by python, remove or add more
40 | my_species <- c(MYSPEC)
41 | my_species <- factor(my_species)
42 | my_species <- factor(my_species,levels(my_species)[c(length(levels(my_species)):1)]) # reorder your species here just by changing the values in the vector :
43 | my_percentage <- c(MYPERC)
44 | my_values <- c(MYVAL)
45 |
46 | ######################################
47 | ######################################
48 | # Code to produce the graph
49 | labsize = 1
50 | if (length(levels(my_species)) > 10){
51 | labsize = 0.66
52 | }
53 | print("Plotting the figure ...")
54 | category <- c(rep(c("S","D","F","M"),c(1)))
55 | category <-factor(category)
56 | #category = factor(category,levels(category)[c(4,1,2,3)])
57 | category = factor(category,levels(category)[(c(4,1,2,3))])
58 | df = data.frame(my_species,my_percentage,my_values,category)
59 | figure <- ggplot() +
60 | geom_bar(aes(y = my_percentage, x = my_species, fill = category), data = df, stat="identity", width=my_bar_height,position = position_stack(reverse=TRUE)) +
61 | coord_flip() +
62 | theme_gray(base_size = 8) +
63 | scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100)) +
64 | scale_fill_manual(values = my_colors,labels =c(" Complete (C) and single-copy (S) ",
65 | " Complete (C) and duplicated (D)",
66 | " Fragmented (F) ",
67 | " Missing (M)")) +
68 | ggtitle(my_title) +
69 | xlab("") +
70 | ylab("\n%BUSCOs") +
71 | theme(plot.title = element_text(family=my_family, colour = "black", size = rel(2.2)*my_size_ratio, face = "bold")) +
72 | theme(legend.position="top",legend.title = element_blank()) +
73 | theme(legend.text = element_text(family=my_family, size = rel(1.2)*my_size_ratio)) +
74 | theme(panel.background = element_rect(color="#FFFFFF", fill="white")) +
75 | theme(panel.grid.minor = element_blank()) +
76 | theme(panel.grid.major = element_blank()) +
77 | theme(axis.text.y = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio,face="italic")) +
78 | theme(axis.text.x = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) +
79 | theme(axis.line = element_line(size=1*my_size_ratio, colour = "black")) +
80 | theme(axis.ticks.length = unit(.85, "cm")) +
81 | theme(axis.ticks.y = element_line(colour="white", size = 0)) +
82 | theme(axis.ticks.x = element_line(colour="#222222")) +
83 | theme(axis.ticks.length = unit(0.4, "cm")) +
84 | theme(axis.title.x = element_text(family=my_family, size=rel(1.2)*my_size_ratio)) +
85 | guides(fill = guide_legend(override.aes = list(colour = NULL))) +
86 | guides(fill=guide_legend(nrow=2,byrow=TRUE))
87 | for(i in rev(c(1:length(levels(my_species))))){
88 | detailed_values <- my_values[my_species==my_species[my_species==levels(my_species)[i]]]
89 | total_buscos <- sum(detailed_values)
90 | figure <- figure +
91 | annotate("text", label=paste("C:", detailed_values[1] + detailed_values[2], " [S:", detailed_values[1], ", D:", detailed_values[2], "], F:", detailed_values[3], ", M:", detailed_values[4], ", n:", total_buscos, sep="\t"),
92 | y=3, x = i, size = labsize*4*my_size_ratio, colour = "black", hjust=0, family=my_family)
93 | }
94 |
95 | #ggsave not working in docker
96 | #ggsave(filename = paste(sample_name,"_BUSCO_comparison.svg",sep=""),width = 15 ,height = 7)
97 | #ggsave(filename = paste(sample_name,"_BUSCO_comparison.pdf",sep=""),width = 15 ,height = 7)
98 | pdf(paste(sample_name,"_BUSCO_comparison.pdf",sep=""),width = 15 ,height = 7)
99 | print(figure)
100 | dev.off()
101 | svg(paste(sample_name,"_BUSCO_comparison.svg",sep=""),width = 15 ,height = 7)
102 | print(figure)
103 | dev.off()
104 | print("Done")
105 |
--------------------------------------------------------------------------------
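`MYSPEC`, `MYPERC`, and `MYVAL` are placeholders that must be substituted before the script can run. A hypothetical sketch of filling them with the `final_spec`, `final_perc`, and `final_num` files produced by `bin/get_busco_val.sh` (TransPi's actual wiring may differ):

```bash
# Substitute the placeholders, then render the comparison figure
sed -e "s/MYSPEC/$( cat final_spec )/" \
    -e "s/MYPERC/$( cat final_perc )/" \
    -e "s/MYVAL/$( cat final_num )/" \
    bin/busco_comparison.R >SampleA_busco_comparison.R
Rscript SampleA_busco_comparison.R SampleA
```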
/docs/examples.adoc:
--------------------------------------------------------------------------------
1 | Here are some examples on how to deploy TransPi depending on the method to use (e.g. conda) and the analyses to be performed.
2 |
3 | = Profiles
4 | You can use TransPi either with:
5 | - a local conda environment (from precheck);
6 | - individual conda environments per process;
7 | - docker or singularity
8 |
9 | == Conda
10 | This way of executing TransPi assumes that you installed conda locally.
11 | All of this is done automatically for you, if desired, by the precheck script.
12 |
13 | *Example:*
14 | [source,bash]
15 | ----
16 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
17 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
18 | -profile conda
19 | ----
20 |
21 | [NOTE]
22 | `-profile conda` tells TransPi to use conda. An individual environment is used per process.
23 |
24 | == Containers
25 | Docker or singularity can also be used for deploying TransPi. You can either use individual containers for each process or a TransPi container with all the tools.
26 |
27 | === Individual
28 | To use individual containers:
29 |
30 | *Example for docker:*
31 | [source,bash]
32 | ----
33 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
34 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
35 | -profile docker
36 | ----
37 |
38 | *Example for singularity:*
39 | [source,bash]
40 | ----
41 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
42 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
43 | -profile singularity
44 | ----
45 |
46 | [NOTE]
47 | Some individual containers can create problems. We are working on solving these issues. In the meantime you can use the TransPi container (see below).
48 |
49 | === TransPi container
50 | To use the TransPi container with all the tools you need to use the profile `TransPiContainer`.
51 |
52 | *Example for docker:*
53 | [source,bash]
54 | ----
55 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
56 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
57 | -profile docker,TransPiContainer
58 | ----
59 |
60 | *Example for singularity:*
61 | [source,bash]
62 | ----
63 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
64 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
65 | -profile singularity,TransPiContainer
66 | ----
67 |
68 |
69 | = Other examples
70 |
71 | [NOTE]
72 | The order of the options is not important.
73 |
74 | == Filtering
75 |
76 | *Scenario:*
77 | [horizontal]
78 | Sample:: Coral sample
79 | Read length:: 150bp
80 | TransPi mode:: --all
81 | Kmers:: 25,35,55,75,85
82 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
83 | Output directory:: Results_Acropora
84 | Work directory:: work_acropora
85 | Engine:: conda
86 | Filter species:: on
87 | Host:: scleractinian proteins
88 | Symbiont:: symbiodinium proteins
89 |
90 | *Command:*
91 | [source,bash]
92 | ----
93 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
94 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
95 | -w work_acropora -profile conda --filterSpecies \
96 | --host /YOUR/PATH/HERE/uniprot-Scleractinia.fasta \
97 | --symbiont /YOUR/PATH/HERE/uniprot-symbiodinium.fasta
98 | ----
99 |
100 |
101 | == BUSCO distribution
102 |
103 | *Scenario:*
104 | [horizontal]
105 | Sample:: SampleA
106 | Read length:: 100bp
107 | TransPi mode:: --all
108 | Kmers:: 25,41,57,67
109 | Reads PATH:: /YOUR/PATH/HERE/SampleA/*_R[1,2].fastq.gz
110 | Output directory:: Results_SampleA
111 | Engine:: conda
112 | All BUSCOs:: on
113 | BUSCO distribution:: on
114 |
115 | *Command:*
116 | [source,bash]
117 | ----
118 | nextflow run TransPi.nf --all --maxReadLen 100 --k 25,41,57,67 \
119 | --outdir Results_SampleA --reads '/YOUR/PATH/HERE/SampleA/*_R[1,2].fastq.gz' \
120 | -profile conda --allBuscos --buscoDist
121 | ----
122 |
123 | == `--onlyEvi`
124 |
125 | *Scenario:*
126 | [horizontal]
127 | Sample:: Assemblies from multiple assemblers and kmers
128 | Read length:: 50bp
129 | TransPi mode:: --onlyEvi
130 | Kmers:: 25,33,37
131 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
132 | Output directory:: Reduction_results
133 | Engine:: conda
134 |
135 | *Command:*
136 | [source,bash]
137 | ----
138 | nextflow run TransPi.nf --onlyEvi --outdir Reduction_results \
139 | -profile conda
140 | ----
141 |
142 | .NOTES
143 | ****
144 | - A directory named `onlyEvi` containing the transcriptome(s) to reduce is needed for this option (see the preparation sketch after these notes).
145 |
146 | TIP: You can process multiple transcriptomes at the same time. Each file should have a unique name.
147 |
148 | - There is no need to specify the reads PATH, read length, cutoff, or kmers when using `--onlyEvi`.
149 |
150 | ****
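 | 
 | A minimal preparation sketch (the assembly file names are hypothetical):
 | 
 | [source,bash]
 | ----
 | # place the transcriptomes to reduce in a directory named onlyEvi
 | mkdir -p onlyEvi
 | cp /YOUR/PATH/HERE/SampleA_assembly1.fasta onlyEvi/
 | cp /YOUR/PATH/HERE/SampleA_assembly2.fasta onlyEvi/
 | # then run the reduction
 | nextflow run TransPi.nf --onlyEvi --outdir Reduction_results -profile conda
 | ----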
151 |
152 | == `--onlyAnn`
153 |
154 | *Scenario:*
155 | [horizontal]
156 | Sample:: Transcriptome missing annotation
157 | Read length:: 100bp
158 | TransPi mode:: --onlyAnn
159 | Kmers:: 25,41,57,67
160 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
161 | Output directory:: Annotation_results
162 | Engine:: singularity
163 | Container:: TransPi container
164 |
165 | *Command:*
166 | [source,bash]
167 | ----
168 | nextflow run TransPi.nf --onlyAnn --outdir Annotation_results \
169 | -profile singularity,TransPiContainer
170 | ----
171 |
172 | .NOTES
173 | ****
174 | - A directory named `onlyAnn` containing the transcriptome(s) to annotate is needed for this option (see the preparation sketch after these notes).
175 |
176 | TIP: You can process multiple transcriptomes (i.e. samples) at the same time. Each file should have a unique name.
177 |
178 | - There is no need to specify the reads PATH, read length, cutoff, or kmers when using `--onlyAnn`.
179 |
180 | ****
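 | 
 | A minimal preparation sketch (the transcriptome file name is hypothetical):
 | 
 | [source,bash]
 | ----
 | # place the transcriptome(s) to annotate in a directory named onlyAnn
 | mkdir -p onlyAnn
 | cp /YOUR/PATH/HERE/SampleA_transcriptome.fasta onlyAnn/
 | # then run the annotation
 | nextflow run TransPi.nf --onlyAnn --outdir Annotation_results \
 |     -profile singularity,TransPiContainer
 | ----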
181 |
182 | == Skip options
183 |
184 | *Scenario:*
185 | [horizontal]
186 | Sample:: Coral sample
187 | Read length:: 150bp
188 | TransPi mode:: --all
189 | Kmers:: 25,35,55,75,85
190 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
191 | Output directory:: Results_Acropora
192 | Work directory:: work_acropora
193 | Engine:: docker
194 | Container:: Individual containers
195 | Skip QC:: on
196 | Skip Filter:: on
197 |
198 | *Command:*
199 | [source,bash]
200 | ----
201 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
202 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \
203 | -w work_acropora -profile docker \
204 | --skipQC --skipFilter
205 | ----
206 |
207 | == Extra annotation steps
208 |
209 | *Scenario:*
210 | [horizontal]
211 | Sample:: Mollusc sample
212 | Read length:: 150bp
213 | TransPi mode:: --all
214 | Kmers:: 25,35,55,75,85
215 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
216 | Output directory:: Results
217 | Engine:: conda
218 | Skip QC:: on
219 | SignalP:: on
220 | TMHMM:: on
221 | RNAmmer:: on
222 |
223 |
224 | *Command:*
225 | [source,bash]
226 | ----
227 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
228 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results \
229 | -profile conda --skipQC --withSignalP --withTMHMM --withRnammer
230 | ----
231 |
232 | .NOTE
233 | ****
234 | - This option requires manual installation of the CBS-DTU tools: SignalP, TMHMM, and RNAmmer.
235 |
236 | - For more info visit https://services.healthtech.dtu.dk/software.php[CBS-DTU tools]
237 |
238 | - It also assumes that the `PATH` of each tool is set in the `nextflow.config` file (the `rnam`, `tmhmm`, and `signalp` parameters).
239 |
240 | ****
241 |
242 |
243 | == Full run and extra annotation
244 |
245 | *Scenario:*
246 | [horizontal]
247 | Sample:: Coral sample
248 | Read length:: 150bp
249 | TransPi mode:: --all
250 | Kmers:: 25,35,55,75,85
251 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz
252 | Output directory:: Results
253 | Engine:: conda
254 | Skip QC:: on
255 | SignalP:: on
256 | TMHMM:: on
257 | RNAmmer:: on
258 | Filter species:: on
259 | Host:: scleractinian proteins
260 | Symbiont:: symbiodinium proteins
261 | All BUSCOs:: on
262 | BUSCO distribution:: on
263 | Remove rRNA:: on
264 | rRNA database:: /YOUR/PATH/HERE/silva_rRNA_file.fasta
265 |
266 | *Command:*
267 | [source,bash]
268 | ----
269 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
270 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results \
271 | -profile conda --skipQC --withSignalP --withTMHMM --withRnammer \
272 | --host /YOUR/PATH/HERE/uniprot-Scleractinia.fasta \
273 | --symbiont /YOUR/PATH/HERE/uniprot-symbiodinium.fasta \
274 | --allBuscos --buscoDist --rRNAfilter \
275 | --rRNAdb "/YOUR/PATH/HERE/silva_rRNA_file.fasta"
276 | ----
277 |
--------------------------------------------------------------------------------
/template.nextflow.config:
--------------------------------------------------------------------------------
1 | /*
2 | ================================================================================================
3 | Config File TransPi
4 | ================================================================================================
5 | Transcriptome Analysis Pipeline
6 | Author: Ramón E. Rivera-Vicéns
7 | GitHub: rivera10
8 | ----------------------------------------------------------------------------------------
9 | */
10 |
11 | params {
12 |
13 | // ------------------------- EDIT below variables (mandatory) ------------------------- //
14 | // --------------------- Can also be specified in the command line ---------------------- //
15 |
16 | // Modify this accordingly (if needed)
17 | // kmers list (depends on read length!)
18 | k=""
19 |
20 | // SOAP config file generator
21 | //#maximal read length
22 | maxReadLen=""
23 | //[LIB]
24 | //#maximal read length in this lib
25 | rd_len_cutof="${params.maxReadLen}"
26 |
27 | // Other options if needed. Leave defaults if unsure.
28 | //#average insert size
29 | //avg_ins="200"
30 | //#if sequence needs to be reversed
31 | reverse_seq="0"
32 | //#in which part(s) the reads are used
33 | asm_flags="3"
34 | //#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
35 | map_len="32"
36 |
37 | // -------------------------- EDIT below variables if needed -------------------------- //
38 |
39 | // Directory for results
40 | outdir="results"
41 |
42 | // Directory for trace files
43 | tracedir="pipeline_info"
44 |
45 | // PATH for rnammer, tmhmm, signalp programs. Requires licenses. See CBS-DTU tools for information.
46 | // RNAmmer
47 | rnam = ""
48 | // Tmhmm
49 | tmhmm = ""
50 | // SignalP
51 | signalp = ""
52 |
53 | /*
54 | // ------------------------------------------------ STOP ------------------------------------------------ //
55 |
56 | Most of the values below are filled in by the precheck script (e.g. PATHs to databases or the conda installation).
57 | However, if you run the precheck for a container, you will not have all these PATHs assigned (e.g. the conda PATH).
58 | If that is the case, run the precheck again and select conda instead of containers.
59 |
60 |
61 | For other options (e.g. filtering, buscoDist, etc.), it is recommended to set them on the command line.
62 |
63 |
64 | Proceed to the end of this config file to adjust the CPUs and RAM of the processes to the specs of your system.
65 | Also modify the profiles there if you use a scheduler such as SLURM or PBS.
66 |
67 |
68 | More info at the TransPi repository (https://github.com/PalMuc/TransPi) and
69 | manual (https://palmuc.github.io/TransPi/).
70 |
71 |
72 | // -------------------------------------------------------------------------------------------------------------- //
73 | */
74 |
75 | // PATH to TransPi DBs installation
76 | pipeInstall
77 |
78 | // Uniprot database PATH
79 | uniprot
80 | uniname
81 |
82 | //BUSCO database
83 | busco4db
84 |
85 | //PFAM file location
86 | pfloc
87 |
88 | //name of pfam file
89 | pfname
90 |
91 | //Trinotate sqlite created when installing Trinotate
92 | Tsql
93 |
94 | // Directory for reads
95 | reads=""
96 |
97 | // Pipeline options
98 | help = false
99 | fullHelp = false
100 |
101 | // Full analysis
102 | all = false
103 |
104 | // Only Evidential Gene run (one sample per run)
105 | onlyEvi = false
106 |
107 | // Only annotation analysis
108 | onlyAnn = false
109 |
110 | // Only Assemblies and Evidential Gene
111 | onlyAsm = false
112 |
113 | // Skip quality control
114 | skipQC = false
115 |
116 | // Skip fastp quality filter step
117 | skipFilter = false
118 | // Minimum reads quality for filtering in fastp
119 | minQual="5"
120 |
121 | // Filter rRNA
122 | rRNAfilter = false
123 | // rRNA database
124 | rRNAdb = ""
125 |
126 | // Skip normalization of reads
127 | skipNormalization = false
128 | // Normalization parameters
129 | normMaxCov=100
130 | normMinCov=1
131 |
132 | // Save reads from filtering and normalization
133 | saveReads = false
134 |
135 | // Save bam file from mapping step
136 | saveBam = false
137 |
138 | // Filter Species using psytrans
139 | filterSpecies = false
140 | // Psytrans value to train model
141 | psyval=160
142 | // Host Sequence
143 | host=""
144 | // Symbiont Sequence
145 | symbiont=""
146 |
147 | // Run BUSCO in all assemblies
148 | allBuscos = false
149 |
150 | // BUSCO distribution analysis (this option needs to be run together with the allBuscos option)
151 | // Generate the analysis
152 | buscoDist = false
153 | // Minimum percentage of assemblers required to rescue a BUSCO sequence
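 | // e.g. with minPerc=0.7, a BUSCO is rescued if it is present in at least 70% of the assemblies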
154 | minPerc="0.7"
155 |
156 | //short Transdecoder run, no homology search (PFAM and UniProt)
157 | shortTransdecoder = false
158 | //Transdecoder genetic code
159 | genCode="Universal"
160 |
161 | // Annotation options
162 | // SignalP
163 | withSignalP = false
164 | // tmHMM
165 | withTMHMM = false
166 | // rnammer
167 | withRnammer = false
168 | // Add annotation to file
169 | addAnnotation = false
170 |
171 | //Test data
172 | readsTest = false
173 |
174 | // Skip Evidential Gene for onlyAsm option
175 | skipEvi = false
176 |
177 | // Kegg pathway search
178 | withKegg = false
179 |
180 | // Skip Report
181 | skipReport = false
182 |
183 | // These options will change how the profiles work.
184 | // Run with conda installed by the precheck
185 | //next 2 parameters are outdated
186 | myConda = false
187 | myCondaInstall=""
188 |
189 | condaActivate = false
190 |
191 | // TransPi container with all programs
192 | oneContainer = false
193 |
194 | // Cache directory for conda and singularity files. Leave blank if unsure.
195 | envCacheDir = ""
196 |
197 | // Singularity
198 | // When true, use a singularity image created by pulling from docker instead of the ready-to-use image from the Galaxy depot.
199 | singularity_pull_docker_container = false
200 |
201 | // Get software versions - only works with local conda installation and TransPi container.
202 | skipGetRunInfo = false
203 | }
204 |
205 | /*
206 | // ------------------------------------------------ NOTE ------------------------------------------------ //
207 |
208 |
209 | Proceed to adjust the CPUs and RAM of the processes below to the specs of your system.
210 | Also modify the profiles if you use a scheduler such as SLURM or PBS.
211 |
212 |
213 | More info at the TransPi repository (https://github.com/PalMuc/TransPi) and
214 | manual (https://palmuc.github.io/TransPi/).
215 |
216 | Also see Nextflow documentation (https://www.nextflow.io/docs/latest/index.html).
217 |
218 |
219 | // -------------------------------------------------------------------------------------------------------------- //
220 | */
221 |
222 | process {
223 | cpus='1'
224 | memory='5 GB'
225 | withLabel: big_cpus {
226 | cpus='20'
227 | memory='15 GB'
228 | }
229 | withLabel: med_cpus {
230 | cpus='8'
231 | memory='15 GB'
232 | }
233 | withLabel: low_cpus {
234 | cpus='4'
235 | memory='15 GB'
236 | }
237 | withLabel: exlow_cpus {
238 | cpus='1'
239 | memory='2 GB'
240 | }
241 | withLabel: big_mem {
242 | cpus='20'
243 | memory='350 GB'
244 | }
245 | withLabel: med_mem {
246 | cpus='15'
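 | // dynamic memory: 150 GB on the first attempt, plus 50 GB for each retry
 | // exit codes 137-140 usually mean the task was killed (e.g. out of memory), so it is retried with more memory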
247 | memory={ 100.GB + (task.attempt * 50.GB) }
248 | errorStrategy={ task.exitStatus in 137..140 ? 'retry' : 'finish' }
249 | maxRetries = 2
250 | }
251 | withLabel: low_mem {
252 | cpus='20'
253 | memory='80 GB'
254 | }
255 | errorStrategy='finish'
256 | }
257 |
258 | // env Evidential Gene variable (only for nextflow)
259 | env.evi="${projectDir}/scripts/evigene"
260 |
261 | // Get PATH for cache environments
262 | params.localCacheDir = (params.envCacheDir ? "${params.envCacheDir}" : "${launchDir}")
263 |
264 | profiles {
265 | conda {
266 | params.condaActivate = true
267 | params.localConda="${params.myCondaInstall}"
268 | // cache for condaEnv created individually
269 | conda.cacheDir = "${params.localCacheDir}/condaEnv/"
270 | }
271 | docker {
272 | docker.enabled = true
273 | docker.runOptions = "-u \$(id -u):\$(id -g) -v ${params.pipeInstall}:${params.pipeInstall}"
274 | // --mount type=bind,src=${params.pipeInstall},dst=/dockerDB"
275 | }
276 | singularity {
277 | singularity.enabled = true
278 | singularity.autoMounts = true
279 | // cache for images from docker pull
280 | singularity.cacheDir="${params.localCacheDir}/singularityCache/"
281 | }
282 | test {
283 | includeConfig 'conf/test.config'
284 | }
285 | TransPiContainer {
286 | process {
287 | params.oneContainer = true
288 | params.TPcontainer="rerv/transpi:v1.0.0"
289 | }
290 | }
291 | palmuc {
292 | process {
293 | executor='slurm'
294 | clusterOptions='-p lemmium --qos=low'
295 | }
296 | }
297 | }
298 |
299 | executor {
300 | $slurm {
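 | // maximum number of tasks handled in parallel (i.e. jobs queued to SLURM at once)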
301 | queueSize=100
302 | }
303 | }
304 |
305 | timeline {
306 | enabled = true
307 | file = "${params.outdir}/${params.tracedir}/transpi_timeline.html"
308 | }
309 | report {
310 | enabled = true
311 | file = "${params.outdir}/${params.tracedir}/transpi_report.html"
312 | }
313 | trace {
314 | enabled = true
315 | file = "${params.outdir}/${params.tracedir}/transpi_trace.txt"
316 | }
317 | dag {
318 | enabled = true
319 | file = "${params.outdir}/${params.tracedir}/transpi_dag.html"
320 | }
321 |
322 | manifest {
323 | name = 'TransPi'
324 | author = 'Ramón E. Rivera-Vicéns'
325 | description = 'Transcriptome Analysis Pipeline'
326 | mainScript = 'TransPi.nf'
327 | nextflowVersion = '>=21.04.1'
328 | version = '1.3.0-rc'
329 | }
330 |
--------------------------------------------------------------------------------
/bin/TransPi_Report_Ind.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "TransPi Report: `r commandArgs(trailingOnly=T)[1]`"
3 | output:
4 | html_document: default
5 | pdf_document: default
6 | date: "Generated on: `r Sys.time()`"
7 | params:
8 | interactive: yes
9 | sample_id: !r commandArgs(trailingOnly=T)[1]
10 | skipFilter: !r commandArgs(trailingOnly=T)[2]
11 | skipNormalization: !r commandArgs(trailingOnly=T)[3]
12 | rRNAfilter: !r commandArgs(trailingOnly=T)[4]
13 | buscoDist: !r commandArgs(trailingOnly=T)[5]
14 | allBuscos: !r commandArgs(trailingOnly=T)[6]
15 | withKegg: !r commandArgs(trailingOnly=T)[7]
16 | ---
17 |
18 |
25 |
26 | ```{r setup, include=FALSE}
27 | knitr::opts_chunk$set(echo = TRUE,
28 | message = FALSE,
29 | warning = FALSE,
30 | out.width="105%"
31 | )
32 | ```
33 |
34 | ```{r load_libraries, include=FALSE}
35 | library(ggthemes)
36 | library(ggplot2)
37 | library(reshape2)
38 | library(grid)
39 | library(plotly)
40 | library(knitr)
41 | library(kableExtra) #install.packages("kableExtra")
42 | library(rmarkdown)
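 | # color palette used for the report figures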
43 | mycol=c('#088da5','#73cdc8','#ff6f61','#7cb8df','#88b04b','#00a199','#6B5B95','#92A8D1','#b0e0e6','#ff7f50','#088d9b','#E15D44','#e19336')
44 | ```
45 |
46 |
47 |
48 |