├── docs ├── Makefile ├── issues.adoc ├── description.adoc ├── index.adoc ├── install.adoc ├── results.adoc ├── options.adoc ├── extra.adoc ├── run.adoc └── examples.adoc ├── bin ├── len.py ├── get_sizes.sh ├── get_readqual.sh ├── get_readstats.sh ├── custom_uniprot_hits.R ├── addAnnotation.py ├── get_busco_val.sh ├── heatmap_busco.R ├── GO_plots.R ├── SOS_busco.py ├── busco_comparison.R └── TransPi_Report_Ind.Rmd ├── .gitignore ├── remove_failed.sh ├── Dockerfile ├── conf ├── test.config └── busV4list.txt ├── LICENSE ├── README.md ├── template.nextflow.config └── precheck_TransPi.sh /docs/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | /opt/homebrew/bin/asciidoctor -D . index.adoc 3 | -------------------------------------------------------------------------------- /bin/len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from Bio import SeqIO 4 | import sys 5 | 6 | arg1=sys.argv[1] 7 | 8 | filename=arg1 9 | for record in SeqIO.parse(filename, "fasta"): 10 | print(record.id,"\t", len(record.seq)+1) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | busco_db/ 2 | diamonddb/ 3 | hmmerdb/ 4 | nextflow 5 | pipeline_info/ 6 | reads/ 7 | uniprot_db/ 8 | work/ 9 | cbs-dtu-tools/ 10 | diamonddb_custom/ 11 | diamonddb_swiss/ 12 | nextflow.config 13 | results/ 14 | sqlite_db/ 15 | .varfile.sh 16 | .nextflow* 17 | .DS_Store 18 | cbs-dtu-tools.tar.gz 19 | Singularity 20 | Dockerfile 21 | evigene/ 22 | -------------------------------------------------------------------------------- /bin/get_sizes.sh: -------------------------------------------------------------------------------- 1 | filename="$1" 2 | cat $filename | cut -f 1 -d "," | awk '{print $3}' >nam 3 | cat $filename | cut -f 2 -d "," | awk '{print $2}' >len 4 | paste nam len | awk '$2<2500 {print $0}' >lt_2500 5 | he=$( paste nam len | awk '$2>=2500 {print $0}' | wc -l ) 6 | for x in `seq 1 $he`;do 7 | echo data >>temp_1 8 | echo $x >>temp_2 9 | done 10 | paste temp_1 temp_2 >he_2500 11 | cat lt_2500 he_2500 >final_sizes.txt 12 | rm nam len temp_1 temp_2 lt_2500 he_2500 $filename 13 | -------------------------------------------------------------------------------- /remove_failed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Script to remove directories of FAILED and ABORTED processes in a nextflow pipeline 3 | #INPUT = filename_trace.txt 4 | file=$1 5 | if [ "$file" == "" ];then 6 | echo -e "\n\t Provide a trace file as input (e.g. 
filename_trace.txt)" 7 | echo -e "\n\t Usage: bash remove_failed.sh filename_trace.txt\n" 8 | exit 0 9 | else 10 | cat $file | grep "ABORTED" >.erase.txt 11 | cat $file | grep "FAILED" >>.erase.txt 12 | while read line;do 13 | a=$( echo $line | awk '{print $2}' ) 14 | echo $a 15 | if [ -d work*/${a}* ];then 16 | rm -rf work*/${a}* 17 | fi 18 | done <.erase.txt 19 | rm .erase.txt 20 | fi 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | LABEL authors="Ramon Rivera-Vicens" \ 4 | description="Docker image containing all requirements for TransPi pipeline" \ 5 | version="1.0dev" 6 | 7 | RUN apt update; apt install -y gcc bc procps 8 | 9 | COPY transpi_env.yml / 10 | RUN conda env create -f /transpi_env.yml && conda clean -a 11 | 12 | ENV PATH /opt/conda/envs/TransPi/bin:$PATH 13 | 14 | RUN sed -i 's/base/TransPi/g' ~/.bashrc 15 | 16 | RUN wget http://arthropods.eugenes.org/EvidentialGene/other/evigene_older/evigene19may14.tar 17 | RUN tar -xf evigene19may14.tar && rm evigene19may14.tar 18 | ENV PATH /evigene/scripts/prot/:$PATH 19 | 20 | RUN mkdir -p /opt/conda/envs/TransPi/lib/python3.6/site-packages/bin && cp /opt/conda/envs/TransPi/bin/skip*.awk /opt/conda/envs/TransPi/lib/python3.6/site-packages/bin/ 21 | -------------------------------------------------------------------------------- /conf/test.config: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | Test Config File TransPi 4 | ======================================================================================== 5 | Transcriptome Analysis Pipeline 6 | Author: Ramón E. Rivera-Vicéns 7 | ---------------------------------------------------------------------------------------- 8 | */ 9 | 10 | params { 11 | readsTest = [ 12 | ['Sponge_sample', ['https://github.com/rivera10/test_dataset/raw/master/RNA_data/Tethya_wilhelma_R1.fastq.gz'], ['https://github.com/rivera10/test_dataset/raw/master/RNA_data/Tethya_wilhelma_R2.fastq.gz']] 13 | ] 14 | k="25,53" 15 | maxReadLen=100 16 | shortTransdecoder = true 17 | } 18 | -------------------------------------------------------------------------------- /docs/issues.adoc: -------------------------------------------------------------------------------- 1 | We tested TransPi using the followings deployment methods: 2 | 3 | - conda = individual conda environments per process 4 | 5 | - docker = using TransPi container (i.e. -profile docker,TransPiContainer) 6 | 7 | - singularity = using TransPi container (i.e. -profile singularity,TransPiContainer) 8 | 9 | 10 | [NOTE] 11 | Using individual container per process is working for the majority of processes. However, we found a couple os issues with some containers (e.g. transabyss). We are working to find a solution for these issues. 12 | 13 | 14 | = Reporting an issue 15 | 16 | If you find a problem or get an error please let us know by opening an issue in the repository. 17 | 18 | 19 | = Test dataset 20 | 21 | We include a `test` profile to try TransPi using a small dataset. However, this can create issues in some of the process (e.g. contamination removal by psytrans). 
22 | -------------------------------------------------------------------------------- /bin/get_readqual.sh: -------------------------------------------------------------------------------- 1 | jfile="$1" 2 | sampleid=$( echo $jfile | cut -f 1 -d "." ) 3 | r1bn=$( jq '.read1_before_filtering.quality_curves.mean' ${jfile} | grep -c [0-9] ) 4 | r1bq=$( jq '.read1_before_filtering.quality_curves.mean' ${jfile} | grep [0-9] | tr -d "\n" | tr -d " " ) 5 | r2bn=$( jq '.read2_before_filtering.quality_curves.mean' ${jfile} | grep -c [0-9] ) 6 | r2bq=$( jq '.read2_before_filtering.quality_curves.mean' ${jfile} | grep [0-9] | tr -d "\n" | tr -d " " ) 7 | r1an=$( jq '.read1_after_filtering.quality_curves.mean' ${jfile} | grep -c [0-9] ) 8 | r1aq=$( jq '.read1_after_filtering.quality_curves.mean' ${jfile} | grep [0-9] | tr -d "\n" | tr -d " " ) 9 | r2an=$( jq '.read2_after_filtering.quality_curves.mean' ${jfile} | grep -c [0-9] ) 10 | r2aq=$( jq '.read2_after_filtering.quality_curves.mean' ${jfile} | grep [0-9] | tr -d "\n" | tr -d " " ) 11 | echo -e "${r1bn}\n${r1bq}\n${r2bn}\n${r2bq}\n${r1an}\n${r1aq}\n${r2an}\n${r2aq}" >${sampleid}_reads_qual.csv 12 | -------------------------------------------------------------------------------- /docs/description.adoc: -------------------------------------------------------------------------------- 1 | *TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly* 2 | 3 | TransPi provides a useful resource for the generation of de novo transcriptome assemblies, 4 | with minimum user input but without losing the ability to perform a thorough analysis. 5 | 6 | * For more info see the https://doi.org/10.1101/2021.02.18.431773[Preprint] 7 | 8 | * Code available at https://www.github.com/palmuc/TransPi[GitHub] 9 | 10 | * Author: Ramón Rivera-Vicéns 11 | ** https://twitter.com/rerv787[Twitter] 12 | 13 | 14 | = Programs used 15 | 16 | * List of programs used by TransPi: 17 | ** FastQC 18 | ** fastp 19 | ** sortmerna 20 | ** rnaSPADES 21 | ** SOAP 22 | ** Trinity 23 | ** Velvet 24 | ** Oases 25 | ** TransABySS 26 | ** rnaQUAST 27 | ** EvidentialGene 28 | ** CD-Hit 29 | ** Exonerate 30 | ** Blast 31 | ** BUSCO 32 | ** Psytrans 33 | ** TransDecoder 34 | ** Trinotate 35 | ** Diamond 36 | ** Hmmer 37 | ** Bowtie2 38 | ** rnammer 39 | ** tmhmm 40 | ** signalP 41 | ** iPath 42 | ** SQLite 43 | ** R 44 | ** Python 45 | 46 | * Databases used by TransPi: 47 | ** Swissprot 48 | ** Uniprot custom database (e.g. all metazoan proteins) 49 | ** Pfam 50 | -------------------------------------------------------------------------------- /bin/get_readstats.sh: -------------------------------------------------------------------------------- 1 | jfile="$1" 2 | sampleid=$( echo $jfile | cut -f 1 -d "."
) 3 | tb=$( jq '.summary.before_filtering.total_reads' $jfile ) 4 | r1b=$( jq '.read1_before_filtering.total_reads' $jfile ) 5 | r1bl=$( jq '.summary.before_filtering.read1_mean_length' $jfile ) 6 | r2b=$( jq '.read2_before_filtering.total_reads' $jfile ) 7 | r2bl=$( jq '.summary.before_filtering.read2_mean_length' $jfile ) 8 | ta=$( jq '.summary.after_filtering.total_reads' $jfile ) 9 | r1a=$( jq '.read1_after_filtering.total_reads' $jfile ) 10 | r1al=$( jq '.summary.after_filtering.read1_mean_length' $jfile ) 11 | r2a=$( jq '.read2_after_filtering.total_reads' $jfile ) 12 | r2al=$( jq '.summary.after_filtering.read2_mean_length' $jfile ) 13 | loss=$( echo "${tb}-${ta}" | bc ) 14 | gcb=$( jq '.summary.before_filtering.gc_content' $jfile ) 15 | gca=$( jq '.summary.after_filtering.gc_content' $jfile ) 16 | echo "Sample_name,Total_before,R1_before,R1_before_length,R2_before,R2_before_length,GC_before,Total_after,R1_after,R1_after_length,R2_after,R2_after_length,GC_after,Reads_discarded" >${sampleid}_reads_stats.csv 17 | echo "${sampleid},${tb},${r1b},${r1bl},${r2b},${r2bl},${gcb},${ta},${r1a},${r1al},${r2a},${r2al},${gca},${loss}" >>${sampleid}_reads_stats.csv 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Molecular Geobiology and Paleobiology Lab 4 | Department of Earth and Environmental Sciences, Palaeontology & Geobiology, Ludwig-Maximilians-Universität München (LMU) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /bin/custom_uniprot_hits.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly=TRUE) 2 | sample_name=args[1] 3 | 4 | library(ggthemes) 5 | library(ggplot2) 6 | 7 | data=read.csv(paste(sample_name,"_custom_uniprot_hits.txt",sep=""),header=F) 8 | 9 | nlim=round((head(data$V1,n = 1)+450),digits = -2) 10 | p1<-ggplot(data=data, aes(x=reorder(data$V2,data$V1), y=data$V1))+ 11 | geom_bar(stat="identity", fill="dark blue", width=.5)+ 12 | coord_flip()+labs(x="UniProt Species",y="Number of Hits")+ 13 | geom_text(aes(label=data$V1), position=position_dodge(width=0.3), vjust=0.25, hjust=-.10)+ 14 | theme(axis.text=element_text(size=12))+ylim(0,nlim)+theme(axis.text.x=element_text(size=12,angle=0))+ 15 | theme(axis.title=element_text(size=15,face="bold"))+ggtitle(paste(sample_name,"UniProt hits",sep=" "))+ 16 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.5)*1, face = "bold")) 17 | 18 | 19 | # not working in docker 20 | #ggsave(filename = paste(sample_name,"_custom_uniprot_hits.svg",sep=""),width = 15 ,height = 7) 21 | #ggsave(filename = paste(sample_name,"_custom_uniprot_hits.pdf",sep=""),width = 15 ,height = 7) 22 | pdf(paste(sample_name,"_custom_uniprot_hits.pdf",sep=""),width = 15 ,height = 7) 23 | print(p1) 24 | dev.off() 25 | svg(paste(sample_name,"_custom_uniprot_hits.svg",sep=""),width = 15 ,height = 7) 26 | print(p1) 27 | dev.off() 28 | -------------------------------------------------------------------------------- /docs/index.adoc: -------------------------------------------------------------------------------- 1 | = TransPi Manual 2 | Ramón Rivera-Vicéns 3 | v1.1.0-rc, 2021-05-25 4 | :docinfo: 5 | :keywords: TransPi, transcriptome, assembly, annotation, Nextflow, pipeline 6 | :description: TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly 7 | :icons: font 8 | :toclevels: 3 9 | :imagesdir: img 10 | :toc: left 11 | :toc-title: TransPi Manual 12 | :source-highlighter: coderay 13 | :coderay-linenums-mode: table 14 | :sectnums: 15 | :sectlinks: 16 | 17 | How to use TransPi 18 | 19 | == Description 20 | :leveloffset: +2 21 | 22 | include::description.adoc[] 23 | 24 | :leveloffset: -2 25 | 26 | == Installing TransPi 27 | :leveloffset: +2 28 | 29 | include::install.adoc[] 30 | 31 | :leveloffset: -2 32 | 33 | == Running TransPi 34 | :leveloffset: +2 35 | 36 | include::run.adoc[] 37 | 38 | :leveloffset: -2 39 | 40 | == Results 41 | :leveloffset: +2 42 | 43 | include::results.adoc[] 44 | 45 | :leveloffset: -2 46 | 47 | == Additional options 48 | :leveloffset: +2 49 | 50 | include::options.adoc[] 51 | 52 | :leveloffset: -2 53 | 54 | == Examples 55 | 56 | :leveloffset: +2 57 | 58 | include::examples.adoc[] 59 | 60 | :leveloffset: -2 61 | 62 | == Extra information 63 | :leveloffset: +2 64 | 65 | include::extra.adoc[] 66 | 67 | :leveloffset: -2 68 | 69 | == Issues 70 | :leveloffset: +2 71 | 72 | include::issues.adoc[] 73 | 74 | :leveloffset: -2 75 | -------------------------------------------------------------------------------- /docs/install.adoc: -------------------------------------------------------------------------------- 1 | = Requirements 2 | 3 | - System: Linux OS 4 | 5 | - Data type: Paired-end reads 6 | 7 | Example: 8 | IndA_R1.fastq.gz, IndA_R2.fastq.gz 9 | 10 | [NOTE] 11 | Make sure reads end with `_R1.fastq.gz` and `_R2.fastq.gz`. 
12 | Multiple individuals can be run at the same time. 13 | 14 | 15 | = Downloading TransPi 16 | 17 | 1- Clone the repository 18 | 19 | [source,bash] 20 | ---- 21 | 22 | git clone https://github.com/palmuc/TransPi.git 23 | 24 | ---- 25 | 26 | 2- Move to the TransPi directory 27 | 28 | [source,bash] 29 | ---- 30 | 31 | cd TransPi 32 | 33 | ---- 34 | 35 | = Configuration 36 | 37 | TransPi requires various databases to run. The precheck script will install the databases and software, if necessary, to run the tool. 38 | The precheck run needs a `PATH` as an argument for installing (locally) all the databases the pipeline needs. 39 | 40 | ``` 41 | 42 | bash precheck_TransPi.sh /YOUR/PATH/HERE/ 43 | 44 | ``` 45 | 46 | [NOTE] 47 | This process may take a while depending on the options you select. The step that takes longest is downloading, if desired, the entire set of metazoan proteins from UniProt (6 GB). 48 | Other processes and databases are relatively fast, depending on internet connection. 49 | 50 | Once the precheck run is done, it will create a file named `nextflow.config` that contains the various `PATH` for the databases. 51 | If selected, it will also have the local conda environment `PATH`. 52 | 53 | The `nextflow.config` file also has other important parameters for pipeline execution that will be discussed further 54 | in the following sections. -------------------------------------------------------------------------------- /bin/addAnnotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import glob 5 | import sys 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser(usage='addAnnotation.py -trinotateFile FILENAME', description='') 9 | parser.add_argument('-trinotateFile', dest='trinotate_file', required=True) 10 | parser.add_argument('-db', dest='db_name', required=False, default="swissprot", help="DB to use for header: uniprot or swissprot") 11 | parser.add_argument('-type', dest='db_type', required=False, default="prot", help="Type of DB to use: nucl or prot") 12 | parser.add_argument('-combine', dest='db_combine', required=False, default="false", help='Use two DBs in headers') 13 | args = parser.parse_args() 14 | 15 | swissProtCount=0 16 | uniProtCount=0 17 | for line in open(args.trinotate_file, 'r'): 18 | line = line.strip() 19 | lineSplit = line.split("\t") 20 | if args.db_name == "swissprot" and args.db_type == "nucl": 21 | if lineSplit[2] != ".": 22 | print(">" + lineSplit[0] + " SwissProt_Blastx:" + lineSplit[2].split("^")[0]) 23 | swissProtCount += 1 24 | else: 25 | print(">" + lineSplit[0] + " SwissProt_Blastx:" + "noHit") 26 | swissProtCount += 1 27 | elif args.db_name == "swissprot" and args.db_type == "prot": 28 | if lineSplit[6] != ".": 29 | print(">" + lineSplit[0] + " SwissProt_Blastp:" + lineSplit[6].split("^")[0]) 30 | swissProtCount += 1 31 | else: 32 | print(">" + lineSplit[0] + " SwissProt_Blastp:" + "noHit") 33 | swissProtCount += 1 34 | elif args.db_name == "uniprot" and args.db_type == "nucl": 35 | if lineSplit[7] != ".": 36 | print(">" + lineSplit[0] + " UniProt_Blastx:" + lineSplit[7].split("^")[0]) 37 | uniProtCount += 1 38 | else: 39 | print(">" + lineSplit[0] + " UniProt_Blastx:" + "noHit") 40 | uniProtCount += 1 41 | elif args.db_name == "uniprot" and args.db_type == "prot": 42 | if lineSplit[8] != ".": 43 | print(">" + lineSplit[0] + " UniProt_Blastp:" + lineSplit[8].split("^")[0]) 44 | uniProtCount += 1 45 | else: 46 | print(">" + lineSplit[0] + " UniProt_Blastp:" + "noHit")
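# Note: swissProtCount and uniProtCount only tally the headers emitted above;
# they are never printed or written out, so they serve at most as debugging aids.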
47 | uniProtCount += 1 48 | -------------------------------------------------------------------------------- /bin/get_busco_val.sh: -------------------------------------------------------------------------------- 1 | name_tri=$1 2 | name_transpi=$2 3 | version=$3 4 | a=$4 5 | if [ "$version" == "v3" ];then 6 | #trinity 7 | for x in $name_tri;do 8 | echo "'${a}','${a}','${a}','${a}'," >>tspec.txt 9 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," ) 10 | echo "${b}," >>tnum.txt 11 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d [A-Z] | tr -d ":" ) 12 | echo "${c}," >>tperc.txt 13 | done 14 | #transpi 15 | for x in $name_transpi;do 16 | echo "'${a}_TP','${a}_TP','${a}_TP','${a}_TP'" >>pspec.txt 17 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," ) 18 | echo "${b}" >>pnum.txt 19 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d [A-Z] | tr -d ":" ) 20 | echo "${c}" >>pperc.txt 21 | done 22 | cat tspec.txt pspec.txt | tr "\t" "\n" | tr -d "\n" >final_spec 23 | cat tnum.txt pnum.txt | tr "\t" "\n" | tr -d "\n" >final_num 24 | cat tperc.txt pperc.txt | tr "\t" "\n" | tr -d "\n" >final_perc 25 | rm tnum.txt tperc.txt tspec.txt 26 | rm pnum.txt pperc.txt pspec.txt 27 | elif [ "$version" == "v4" ];then 28 | #trinity 29 | for x in $name_tri;do 30 | echo "'${a}','${a}','${a}','${a}'," >>tspec.txt 31 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," ) 32 | echo "${b}," >>tnum.txt 33 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d [A-Z] | tr -d ":" ) 34 | echo "${c}," >>tperc.txt 35 | done 36 | #transpi 37 | for x in $name_transpi;do 38 | echo "'${a}_TP','${a}_TP','${a}_TP','${a}_TP'" >>pspec.txt 39 | b=$( cat $x | grep "(C)" -A5 | awk '{print $1}' | awk -v RS= -v OFS=, '{$1 = $1} 1' | cut -f 2,3,4,5 -d "," ) 40 | echo "${b}" >>pnum.txt 41 | c=$( cat $x | grep "C:" | cut -f 2 -d "[" | cut -f 1,2,3,4 -d "," | tr -d "%" | tr -d "]" | tr -d [A-Z] | tr -d ":" ) 42 | echo "${c}" >>pperc.txt 43 | done 44 | cat tspec.txt pspec.txt | tr "\t" "\n" | tr -d "\n" >final_spec 45 | cat tnum.txt pnum.txt | tr "\t" "\n" | tr -d "\n" >final_num 46 | cat tperc.txt pperc.txt | tr "\t" "\n" | tr -d "\n" >final_perc 47 | rm tnum.txt tperc.txt tspec.txt 48 | rm pnum.txt pperc.txt pspec.txt 49 | fi 50 | -------------------------------------------------------------------------------- /docs/results.adoc: -------------------------------------------------------------------------------- 1 | = Directories 2 | 3 | == `results` 4 | After a successful run of TransPi the results are saved in a directory called `results`. This directory is divided into multiple directories for each major step of the pipeline. 
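For a quick sanity check of the main output you can count the transcripts in the final non-redundant assembly produced by EvidentialGene (a sketch, assuming the default `results` directory and the `.combined.okay.fa` naming described below):

[source,bash]
----
grep -c ">" results/evigene/*.combined.okay.fa
----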
5 | 6 | [NOTE] 7 | Directories will be created based on the options selected in the pipeline execution 8 | 9 | [horizontal] 10 | fastqc:: FastQC HTML files 11 | filter:: Filter step HTML files 12 | rRNA_reads:: Info and reads of rRNA removal process 13 | normalization:: Normalized reads files 14 | saveReads:: Folder with reads saved from the filter and normalization processes 15 | assemblies:: All individual assemblies 16 | evigene:: Non-redundant final transcriptome (ends with name `.combined.okay.fa`) 17 | rnaQuast:: rnaQUAST output 18 | mapping:: Mapping results 19 | busco4:: BUSCO V4 results 20 | transdecoder:: Transdecoder results 21 | trinotate:: Annotation results 22 | report:: Interactive report of TransPi 23 | figures:: Figures created by TransPi (BUSCO comparison, Annotation, GO, etc) 24 | stats:: Basic stats from all steps of TransPi 25 | pipeline_info:: Nextflow report, trace file and others 26 | RUN_INFO.txt:: File with all versions of the tools used by TransPi, plus info from the run like the command used and the PATH 27 | 28 | .NOTES 29 | 30 | **** 31 | 32 | - The name of the output directory can be changed by using the `--outdir` parameter when executing the pipeline. Example `--outdir Results_SampleA`. 33 | - If multiple samples are run, each directory will have all files together but each one with a unique sample name. 34 | 35 | 36 | **** 37 | 38 | == `work` 39 | 40 | A directory called `work` is also created when running TransPi. It contains all the Nextflow working files, TransPi results and intermediate files. 41 | 42 | [NOTE] 43 | Directory `work` can be removed after the pipeline is done since all important files are stored in the `results` directory. 44 | 45 | 46 | = Figures 47 | 48 | TransPi produces multiple figures that are stored in the results directory. 49 | 50 | Example: 51 | 52 | image:https://sync.palmuc.org/index.php/s/kxetdGiNiSyHzrg/preview[UniProt,800,400,float="center", role="Uniprot"] 53 | 54 | 55 | = Report 56 | 57 | TransPi creates an interactive custom HTML report to ease data exploration. 58 | 59 | Report https://sync.palmuc.org/index.php/s/XCxeCNwAfParBHX[Sponge transcriptome] 60 | 61 | .NOTE 62 | **** 63 | - The example report here is a PDF file and not an HTML file. However, the original HTML file with interactive visualization (i.e.
as generated in TransPi) can be downloaded https://sync.palmuc.org/index.php/s/nP3TKPawmoX4xqL[here] 64 | **** 65 | -------------------------------------------------------------------------------- /bin/heatmap_busco.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly=TRUE) 2 | sample_name=args[1] 3 | comp_table=args[2] 4 | transpi_table=args[3] 5 | 6 | library(plotly) 7 | library(reshape2) 8 | 9 | # comparison table 10 | csv=read.csv(comp_table, header=TRUE, sep="\t") 11 | 12 | csv <- data.frame(lapply(csv, function(x) {gsub("Complete", "3", x)})) 13 | csv <- data.frame(lapply(csv, function(x) {gsub("Duplicated", "2", x)})) 14 | csv <- data.frame(lapply(csv, function(x) {gsub("Fragmented", "1", x)})) 15 | csv <- data.frame(lapply(csv, function(x) {gsub("Missing", "0", x)})) 16 | csv 17 | c=melt(csv,id.vars = 'Busco.ID') 18 | dec=c(0,.25,.25,.50,.50,.75,.75,1) 19 | my_colors <- c("#081D58","#081D58", "#2280B8","#2280B8", "#99D6B9", "#99D6B9","#f8f9fc","#f8f9fc") 20 | colz <- setNames(data.frame(dec, my_colors), NULL) 21 | fig <- plot_ly(c,x=~variable, y=~Busco.ID, z=~value, colorscale=colz, reversescale=T, type = "heatmap", 22 | colorbar=list(tickmode='array', tickvals=c(.35,1.1,1.87,2.60), thickness=30, 23 | ticktext= c("Missing","Fragmented","Duplicated","Complete"), len=0.4)) 24 | fig <- fig %>% layout(xaxis=list(title="", showline = TRUE, mirror = TRUE), 25 | yaxis=list(title="BUSCO ID", tickmode="auto", nticks=length(csv$Busco.ID), 26 | tickfont=list(size=8), showline = TRUE, mirror = TRUE)) 27 | 28 | orca(fig, paste(sample_name,"_all_missing_BUSCO.png",sep="")) 29 | orca(fig, paste(sample_name,"_all_missing_BUSCO.pdf",sep="")) 30 | 31 | # TransPi table 32 | csv=read.csv(transpi_table, header=TRUE, sep="\t") 33 | 34 | csv <- data.frame(lapply(csv, function(x) {gsub("Complete", "3", x)})) 35 | csv <- data.frame(lapply(csv, function(x) {gsub("Duplicated", "2", x)})) 36 | csv <- data.frame(lapply(csv, function(x) {gsub("Fragmented", "1", x)})) 37 | csv <- data.frame(lapply(csv, function(x) {gsub("Missing", "0", x)})) 38 | csv 39 | c=melt(csv,id.vars = 'Busco.ID') 40 | dec=c(0,.25,.25,.50,.50,.75,.75,1) 41 | my_colors <- c("#081D58","#081D58", "#2280B8","#2280B8", "#99D6B9", "#99D6B9","#f8f9fc","#f8f9fc") 42 | colz <- setNames(data.frame(dec, my_colors), NULL) 43 | fig <- plot_ly(c,x=~variable, y=~Busco.ID, z=~value, colorscale=colz, reversescale=T, type = "heatmap", 44 | colorbar=list(tickmode='array', tickvals=c(.35,1.1,1.87,2.60), thickness=30, 45 | ticktext= c("Missing","Fragmented","Duplicated","Complete"), len=0.4)) 46 | fig <- fig %>% layout(xaxis=list(title="", showline = TRUE, mirror = TRUE), 47 | yaxis=list(title="BUSCO ID", tickmode="auto", nticks=length(csv$Busco.ID), 48 | tickfont=list(size=8), showline = TRUE, mirror = TRUE)) 49 | 50 | orca(fig, paste(sample_name,"_TransPi_missing_BUSCO.png",sep="")) 51 | orca(fig, paste(sample_name,"_TransPi_missing_BUSCO.pdf",sep="")) 52 | -------------------------------------------------------------------------------- /docs/options.adoc: -------------------------------------------------------------------------------- 1 | There are other parameters that can be changed when executing TransPi. 2 | 3 | = Output options 4 | 5 | [horizontal] 6 | `--outdir`:: 7 | name of output directory. Example: `--outdir Sponges_150`. 8 | Default "results" 9 | 10 | `-w, -work`:: 11 | name of working directory. Example: `-work Sponges_work`. 
Only one dash is needed for `-work` since it is a nextflow function. 12 | 13 | `--tracedir`:: 14 | Name for directory to save pipeline trace files. 15 | Default "pipeline_info" 16 | 17 | = Additional analyses 18 | 19 | [horizontal] 20 | `--rRNAfilter`:: Remove rRNA from sequences. Requires option --rRNAdb 21 | 22 | `--rRNAdb`:: PATH to database of rRNA sequences to use for filtering of rRNA. Default "" 23 | 24 | `--filterSpecies`:: 25 | Perform psytrans filtering of transcriptome. Requires options `--host` and `--symbiont` 26 | 27 | `--host`:: Host (or similar) protein file. 28 | 29 | `--symbiont`:: Symbiont (or similar) protein files 30 | 31 | `--psyval`:: Psytrans value to train model. Default "160" 32 | 33 | `--allBuscos`:: Run BUSCO analysis on all assemblies 34 | 35 | `--rescueBusco`:: Generate BUSCO distribution analysis 36 | 37 | `--minPerc`:: 38 | Minimum percentage of assemblers required for the BUSCO distribution. 39 | Default ".70" 40 | 41 | `--shortTransdecoder`:: Run Transdecoder without the homology searches 42 | 43 | `--withSignalP`:: 44 | Include SignalP for the annotation. Needs manual installation of CBS-DTU tools. 45 | Default "false" 46 | 47 | `--signalp`:: PATH to SignalP software. Default "" 48 | 49 | `--withTMHMM`:: 50 | Include TMHMM for the annotation. Needs manual installation of CBS-DTU tools. 51 | Default "false" 52 | 53 | `--tmhmm`:: PATH to TMHMM software. Default "" 54 | 55 | `--withRnammer`:: 56 | Include Rnammer for the annotation. Needs manual installation of CBS-DTU tools. 57 | Default "false" 58 | 59 | `--rnam`:: PATH to Rnammer software. Default "" 60 | 61 | = Skip options 62 | 63 | [horizontal] 64 | `--skipEvi`:: Skip EvidentialGene run in --onlyAsm option. Default "false" 65 | 66 | `--skipQC`:: Skip FastQC step. Default "false" 67 | 68 | `--skipFilter`:: Skip fastp filtering step. Default "false" 69 | 70 | `--skipKegg`:: Skip KEGG analysis. Default "false" 71 | 72 | `--skipReport`:: Skip generation of final TransPi report. Default "false" 73 | 74 | = Other parameters 75 | 76 | [horizontal] 77 | `--minQual`:: Minimum quality score for fastp filtering. Default "25" 78 | 79 | `--pipeInstall`:: PATH to TransPi directory. Default "". If precheck is used this will be added to the nextflow.config automatically. 80 | 81 | `--envCacheDir`:: PATH for environment cache directory (either conda or containers). Default "Launch directory of pipeline" 82 | -------------------------------------------------------------------------------- /docs/extra.adoc: -------------------------------------------------------------------------------- 1 | Here are some notes that can help in the execution of TransPi, as well as some important considerations based on Nextflow settings. 2 | For more detailed information visit the https://www.nextflow.io/docs/latest/index.html[Nextflow documentation] 3 | 4 | = `-resume` 5 | If an error occurs and you need to resume the pipeline just include the `-resume` option when calling the pipeline. 6 | 7 | [source,bash] 8 | ---- 9 | ./nextflow run TransPi.nf --onlyAnn -profile conda -resume 10 | ---- 11 | 12 | = `template.nextflow.config` 13 | 14 | == Resources 15 | The `template.nextflow.config` file has different configurations for each program of the pipeline 16 | (e.g. some with many CPUs, others with few). You can modify this depending on the resources you have in your system.
17 | 18 | Example: 19 | [source,bash] 20 | **** 21 | process { 22 | withLabel: big_cpus { 23 | cpus='30'; memory='15 GB' 24 | } 25 | } 26 | **** 27 | 28 | In this case, all the processes using the label `big_cpus` will use 30 CPUs. If your system only has 20, please modify these values accordingly to avoid errors. 29 | 30 | [NOTE] 31 | Setting the correct CPUs and RAM of your system is important because `nextflow` will start as many jobs as possible if the resources are available. 32 | If you are in a VM with 120 CPUs, `nextflow` will be able to start four processes with the label `big_cpus`. 33 | 34 | == Data 35 | 36 | The precheck is designed to create a new `nextflow.config` every time it is run, with the `PATH` to the databases. 37 | You can make any changes that do not vary between analyses directly in the `template.nextflow.config`. This way you avoid applying the same changes to the `nextflow.config` after every precheck run. 38 | 39 | Example: Modify the `template.nextflow.config` with your cluster info to avoid repeating this in the future. 40 | 41 | = Custom profiles 42 | 43 | We are using https://slurm.schedmd.com/documentation.html[SLURM] as the workload manager on our server. 44 | Thus we have custom profiles for the submission of jobs. For example, our `nextflow.config` has the following lines in the `profiles` section: 45 | 46 | 47 | [source,text] 48 | profiles { 49 | palmuc { 50 | process { 51 | executor='slurm' 52 | clusterOptions='--clusters=inter --partition=bigNode --qos=low' 53 | } 54 | } 55 | } 56 | 57 | 58 | You can add your custom profiles depending on the settings of your system and the workload manager you use (e.g. SGE, PBS, etc). 59 | 60 | The `clusterOptions` line can be used to add any other options you usually use for job submission. 61 | 62 | = Local nextflow 63 | 64 | To avoid calling the pipeline using `./nextflow ...` you can make the `nextflow` binary executable (`chmod +x nextflow`) and add it to your `$PATH`. For running the pipeline you then just need to use: 65 | 66 | [source,bash] 67 | **** 68 | 69 | nextflow run TransPi.nf ... 70 | 71 | **** 72 | 73 | = Real Time Monitoring 74 | To monitor your pipeline remotely without connecting to the server via ssh use https://tower.nf/login[Nextflow Tower]. 75 | Make an account with your email and follow their instructions. After this, you can run the pipeline adding the `-with-tower` option and follow the execution 76 | of the processes live. 77 | 78 | [source,bash] 79 | **** 80 | 81 | nextflow run TransPi.nf --all -with-tower -profile conda 82 | 83 | **** 84 | -------------------------------------------------------------------------------- /docs/run.adoc: -------------------------------------------------------------------------------- 1 | = Full analysis (`--all`) 2 | 3 | After the successful run of the precheck script you are set to run TransPi. 4 | 5 | We recommend running TransPi with the option `--all`, which performs the complete analysis, from raw read filtering to annotation. 6 | Other options are described below. 7 | 8 | To run the complete pipeline:
9 | [source,bash] 10 | ---- 11 | nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \ 12 | --k 25,41,53 --maxReadLen 75 -profile conda 13 | 14 | ---- 15 | 16 | Argument explanations: 17 | [source,text] 18 | ---- 19 | --all Run full TransPi analysis 20 | --reads PATH to the paired-end reads 21 | --k kmer list to use for the assemblies 22 | --maxReadLen Max read length in the library 23 | -profile Deployment profile to use (e.g. conda) 24 | ---- 25 | 26 | [CAUTION] 27 | -- 28 | If you combine multiple libraries of the same individual to create a reference transcriptome, which will later be used in downstream analyses (e.g. Differential Expression), 29 | make sure the kmer list is based on the length of the shortest read library and the `maxReadLen` on the longest read length. 30 | 31 | Example: Combining reads of 100bp with 125bp 32 | [source,text] 33 | **** 34 | --k 25,41,53,61 --maxReadLen 125 35 | **** 36 | -- 37 | 38 | [NOTE] 39 | -- 40 | You can run multiple samples at the same time 41 | -- 42 | 43 | = Other options 44 | 45 | == `--onlyAsm` 46 | 47 | Run only the assemblies and the EvidentialGene analysis. 48 | 49 | Example for `--onlyAsm`: 50 | [source,bash] 51 | ---- 52 | nextflow run TransPi.nf --onlyAsm --reads '/home/rrivera/TransPi/reads/*_R[1,2].fastq.gz' \ 53 | --k 25,41,53 --maxReadLen 75 -profile conda 54 | 55 | ---- 56 | 57 | [NOTE] 58 | You can run multiple samples at the same time 59 | 60 | == `--onlyEvi` 61 | 62 | Run only the EvidentialGene analysis 63 | 64 | Example for `--onlyEvi`: 65 | [source,bash] 66 | ---- 67 | nextflow run TransPi.nf --onlyEvi -profile conda 68 | ---- 69 | 70 | 71 | [IMPORTANT] 72 | TransPi looks for a directory named `onlyEvi`. It expects one file per sample to perform the reduction. The file should have all the assemblies concatenated into one. 73 | [NOTE] 74 | You can run multiple samples at the same time 75 | 76 | == `--onlyAnn` 77 | 78 | Run only the Annotation analysis (starting from a final assembly) 79 | 80 | Example for `--onlyAnn`: 81 | [source,bash] 82 | ---- 83 | nextflow run TransPi.nf --onlyAnn -profile conda 84 | ---- 85 | 86 | [IMPORTANT] 87 | TransPi looks for a directory named `onlyAnn`. It expects one file per sample to perform the annotation. 88 | [NOTE] 89 | You can run multiple samples at the same time 90 | 91 | = Using `-profiles` 92 | 93 | TransPi can also use docker, singularity, and individual conda installations (i.e. per process) to deploy the pipeline. 94 | 95 | [source,text] 96 | ---- 97 | test Run TransPi with a test dataset 98 | conda Run TransPi with conda 99 | docker Run TransPi with docker containers (one per process) 100 | singularity Run TransPi with singularity containers (one per process) 101 | TransPiContainer Run TransPi with a single container with all tools 102 | ---- 103 | 104 | [NOTE] 105 | -- 106 | Multiple profiles can be specified (comma separated) 107 | 108 | [source,text] 109 | **** 110 | Example: `-profile test,singularity` 111 | **** 112 | -- 113 | 114 | Refer to *Section 6* of this manual for further details on deployment of TransPi using other profiles.
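For instance, the full-analysis command shown at the top of this page can be deployed with the single TransPi container instead of conda (a sketch; adapt the reads path, kmer list, and read length to your data):

[source,bash]
----
nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \
    --k 25,41,53 --maxReadLen 75 -profile singularity,TransPiContainer
----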
115 | -------------------------------------------------------------------------------- /bin/GO_plots.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly=TRUE) 2 | sample_name=args[1] 3 | 4 | library(ggthemes) 5 | library(ggplot2) 6 | 7 | dataCC=read.delim("GO_cellular.txt", header = F, sep = "\t") 8 | dataMF=read.delim("GO_molecular.txt", header = F, sep = "\t") 9 | dataBP=read.delim("GO_biological.txt", header = F, sep = "\t") 10 | 11 | # Cellular Component (CC) barplot 12 | nlim=round((head(dataCC$V1,n = 1)+150),digits = -2) 13 | p1<-ggplot(data=dataCC, aes(x=reorder(dataCC$V2,dataCC$V1), y=dataCC$V1))+ 14 | geom_bar(stat="identity", fill="green", width=.5)+ 15 | coord_flip()+labs(x="Classification",y="Number of Sequences")+ 16 | geom_text(aes(label=dataCC$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+ 17 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+ 18 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+ 19 | ggtitle(paste(sample_name,"Cellular Component GOs",sep=" "))+ 20 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold")) 21 | 22 | #ggsave(filename = paste(sample_name,"_Cellular_Component.svg",sep=""),width = 15 ,height = 7) 23 | #ggsave(filename = paste(sample_name,"_Cellular_Component.pdf",sep=""),width = 15 ,height = 7) 24 | pdf(paste(sample_name,"_Cellular_Component.pdf",sep=""),width = 15 ,height = 7) 25 | print(p1) 26 | dev.off() 27 | svg(paste(sample_name,"_Cellular_Component.svg",sep=""),width = 15 ,height = 7) 28 | print(p1) 29 | dev.off() 30 | 31 | # Molecular Function (MF) barplot 32 | nlim=round((head(dataMF$V1,n = 1)+150),digits = -2) 33 | p2 <-ggplot(data=dataMF, aes(x=reorder(dataMF$V2,dataMF$V1), y=dataMF$V1))+ 34 | geom_bar(stat="identity", fill="blue", width=.5)+ 35 | coord_flip()+labs(x="Classification",y="Number of Sequences")+ 36 | geom_text(aes(label=dataMF$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+ 37 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+ 38 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+ 39 | ggtitle(paste(sample_name,"Molecular Function GOs",sep=" "))+ 40 | theme(plot.title = element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold")) 41 | 42 | #ggsave(filename = paste(sample_name,"_Molecular_Function.svg",sep=""),width = 15 ,height = 7) 43 | #ggsave(filename = paste(sample_name,"_Molecular_Function.pdf",sep=""),width = 15 ,height = 7) 44 | pdf(paste(sample_name,"_Molecular_Function.pdf",sep=""),width = 15 ,height = 7) 45 | print(p2) 46 | dev.off() 47 | svg(paste(sample_name,"_Molecular_Function.svg",sep=""),width = 15 ,height = 7) 48 | print(p2) 49 | dev.off() 50 | 51 | # Biological Process (BP) barplot 52 | nlim=round((head(dataBP$V1,n = 1)+150),digits = -2) 53 | p3<-ggplot(data=dataBP, aes(x=reorder(dataBP$V2,dataBP$V1), y=dataBP$V1))+ 54 | geom_bar(stat="identity", fill="red", width=.5)+ 55 | coord_flip()+labs(x="Classification",y="Number of Sequences")+ 56 | geom_text(aes(label=dataBP$V1), position=position_dodge(width=0.7), vjust=-0.0005, hjust=-.15)+ 57 | theme(axis.text=element_text(size=10))+ylim(0,nlim)+theme(text = element_text(size = 15))+ 58 | theme(axis.text.x=element_text(size=12,angle=0))+theme(axis.title=element_text(size=15,face="bold"))+ 59 | ggtitle(paste(sample_name,"Biological Processes GOs",sep=" "))+ 60 | theme(plot.title =
element_text(family="sans", colour = "black", size = rel(1.1)*1, face = "bold")) 61 | 62 | #ggsave(filename = paste(sample_name,"_Biological_Processes.svg",sep=""),width = 15 ,height = 7) 63 | #ggsave(filename = paste(sample_name,"_Biological_Processes.pdf",sep=""),width = 15 ,height = 7) 64 | pdf(paste(sample_name,"_Biological_Processes.pdf",sep=""),width = 15 ,height = 7) 65 | print(p3) 66 | dev.off() 67 | svg(paste(sample_name,"_Biological_Processes.svg",sep=""),width = 15 ,height = 7) 68 | print(p3) 69 | dev.off() 70 | -------------------------------------------------------------------------------- /bin/SOS_busco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import Bio 5 | from Bio import SeqIO 6 | import argparse 7 | from functools import reduce 8 | import numpy as np 9 | import math 10 | import sys 11 | from collections import Counter 12 | 13 | parser = argparse.ArgumentParser(usage='', description='') 14 | parser.add_argument('-input_file_busco', dest='input_file_busco',required=True) 15 | parser.add_argument('-input_file_fasta', dest='input_file_fasta',required=True) 16 | parser.add_argument('-min', dest='min_num_assembler', type=float, required=True) 17 | parser.add_argument('-kmers',dest='kmers',required=True) 18 | 19 | args = parser.parse_args() 20 | 21 | assemblers_names = ['SOAP','SPADES','TransABySS','Velvet'] 22 | 23 | all_missing_list = [] 24 | list_of_databases = [] 25 | final_list = [] 26 | 27 | Busco_to_save = [] 28 | 29 | with open(args.input_file_busco) as input_busco_file: 30 | 31 | kmers_list = args.kmers.strip().split(',') 32 | nr_of_kmers = (len(kmers_list)*4+2) 33 | column_names = [(assembler + '_' + kmer) for assembler in assemblers_names for kmer in kmers_list] 34 | column_names.insert(3*len(kmers_list) ,'Trinity') 35 | column_names.insert(len(column_names),'TransPi') 36 | column_names.insert(0,'Busco ID') 37 | 38 | busco_df = pd.read_csv(input_busco_file, sep=',',header=0,names=['Busco_id','Status','Sequence','Score','Length']) 39 | busco_unique = busco_df.groupby((busco_df['Busco_id'] !=busco_df['Busco_id'].shift()).cumsum().values).first() 40 | 41 | busco_tables = np.array_split(busco_unique, nr_of_kmers) 42 | transpi_table = busco_tables[nr_of_kmers-1] 43 | 44 | for table in busco_tables: 45 | busco_missing = table[table.Status.eq('Missing')].iloc[:,0].tolist() 46 | all_missing_list.extend(busco_missing) 47 | missing_Busco = list(dict.fromkeys(all_missing_list)) 48 | 49 | for table in busco_tables: 50 | final_df = table[table['Busco_id'].isin(missing_Busco)].iloc[:, 0:2] 51 | final_list.append(final_df) 52 | 53 | comparison_table = reduce(lambda left,right: pd.merge(left,right,on='Busco_id'), final_list) 54 | comparison_table.columns = column_names 55 | transpi_table = comparison_table[(comparison_table['TransPi'] == 'Missing')] 56 | 57 | comparison_table.to_csv('Complete_comparison_table',sep='\t',index=False) 58 | transpi_table.to_csv('TransPi_comparison_table',sep='\t',index=False) 59 | 60 | BUSCO_to_rescue = transpi_table[(transpi_table == 'Complete').any(axis=1)].iloc[:,0].tolist() 61 | 62 | if len(BUSCO_to_rescue) == 0: 63 | sys.exit(0) 64 | elif len(BUSCO_to_rescue) != 0: 65 | for table in busco_tables[:-1]: 66 | for i in BUSCO_to_rescue: 67 | seqs = (i,table['Sequence'].loc[table['Busco_id'] == i].values[0],table['Score'].loc[table['Busco_id'] == i].values[0]) 68 | Busco_to_save.append(seqs) 69 | 70 | potential_seqs = [t for t in Busco_to_save if not 
any(isinstance(n, float) and math.isnan(n) for n in t)] 71 | flat_list = [i[0] for i in potential_seqs] 72 | busco_count = Counter(flat_list) 73 | 74 | min_number = nr_of_kmers * args.min_num_assembler 75 | busco_to_save = [k for k, v in busco_count.items() if v >= min_number] 76 | 77 | seqs_to_save = [item for item in potential_seqs if item[0] in busco_to_save] 78 | 79 | seqs_to_save.sort(key= lambda x: x[2], reverse=True) 80 | 81 | checked = set() 82 | unique_seqs_list = [] 83 | 84 | for busco_id, sequence, score in seqs_to_save: 85 | if busco_id not in checked: 86 | checked.add(busco_id) 87 | unique_seqs_list.append((busco_id,sequence)) 88 | 89 | # The fasta file is parsed with Biopython SeqIO.parse and the target sequences are extracted. 90 | sequences_IDs_to_rescue = [ x[1] for x in unique_seqs_list] 91 | fasta_to_extract = [] 92 | 93 | for seqrecord in SeqIO.parse(args.input_file_fasta, 'fasta'): 94 | if seqrecord.id in sequences_IDs_to_rescue: 95 | fasta_to_extract.append(seqrecord) 96 | 97 | # Output files are written. 98 | with open('sequences_to_add.fasta','w') as outputh: 99 | SeqIO.write(fasta_to_extract,outputh,'fasta') 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TransPi - TRanscriptome ANalysiS PIpeline 2 | 3 | ```text 4 | _______ _____ _ 5 | |__ __| | __ \ (_) 6 | | | _ __ __ _ _ __ ___ | |__) | _ 7 | | | | __| / _ | | _ \ / __| | ___/ | | 8 | | | | | | (_| | | | | | \__ \ | | | | 9 | |_| |_| \__,_| |_| |_| |___/ |_| |_| 10 | ``` 11 | 12 | [![Preprint](http://d2538ggaoe6cji.cloudfront.net/sites/default/files/images/favicon.ico)](https://doi.org/10.1101/2021.02.18.431773)[**Preprint**](https://doi.org/10.1101/2021.02.18.431773)&nbsp; [![Chat on Gitter](https://img.shields.io/gitter/room/PalMuc/TransPi.svg?colorB=26af64&style=popout)](https://gitter.im/PalMuc/TransPi)&nbsp; [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) 13 | [![release](https://img.shields.io/github/v/release/PalMuc/TransPi?label=release&logo=github)](https://github.com/PalMuc/TransPi/releases/latest) 14 | 15 | # Table of contents 16 | 17 | * [General info](#General-info) 18 | * [Pipeline processes](#Pipeline-processes) 19 | * [Manual](#Manual) 20 | * [Publication](#Publication) 21 | * [Citation](#Citation) 22 | * [Funding](#Funding) 23 | * [Future work](#Future-work) 24 | * [Issues](#Issues) 25 | * [Chat](#Chat) 26 | 27 | # General info 28 | 29 | TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly 30 | 31 | TransPi is based on the scientific workflow manager [Nextflow](https://www.nextflow.io). It is designed to help researchers get the best reference transcriptome assembly for their organisms of interest. It performs multiple assemblies with different parameters and then derives a non-redundant consensus assembly. It also performs other valuable analyses such as quality assessment of the assembly, BUSCO scores, Transdecoder (ORFs), and gene ontologies (Trinotate), etc. All this with minimum input from the user but without losing the potential of a comprehensive analysis.
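A minimal quickstart sketch (paths, kmer list, and read length are placeholders to adapt; see the [manual](https://palmuc.github.io/TransPi/) for details):

```bash
# Get TransPi
git clone https://github.com/palmuc/TransPi.git
cd TransPi

# Install the required databases (and, if desired, a local conda environment)
bash precheck_TransPi.sh /YOUR/PATH/HERE/

# Full analysis: raw read filtering through assembly, reduction, and annotation
nextflow run TransPi.nf --all --reads '/YOUR/READS/PATH/HERE/*_R[1,2].fastq.gz' \
    --k 25,41,53 --maxReadLen 75 -profile conda
```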
32 | 33 | ## Pipeline processes 34 | 35 | ![TransPi flowchart](https://sync.palmuc.org/index.php/s/nrd3KPnfnz7AipF/preview) 36 | 37 | **Figure 1.** TransPi v1.0.0 flowchart showing the various steps and analyses it can perform. For simplicity, this diagram does not show all the connections between the processes. Also, it omits other additional options like the BUSCO distribution and transcriptome filtering with psytrans (see Section 2.6). ORFs=Open Reading Frames; HTML=Hypertext Markup Language. 38 | 39 | ## Manual 40 | 41 | TransPi documentation and examples can be found [here](https://palmuc.github.io/TransPi/) 42 | 43 | # Publication 44 | 45 | The TransPi preprint, including kmer, read length, and read quantity tests, can be found [here](https://doi.org/10.1101/2021.02.18.431773). We also tested the pipeline with over 45 samples from different phyla. 46 | 47 | TransPi has been peer-reviewed and recommended by Peer Community In Genomics 48 | (https://doi.org/10.24072/pci.genomics.100009) 49 | 50 | ## Citation 51 | 52 | If you use TransPi please cite the peer-reviewed publication: 53 | 54 | Rivera-Vicéns, R.E., García-Escudero, C.A., Conci, N., Eitel, M., and Wörheide, G. (2021). TransPi – a comprehensive TRanscriptome ANalysiS PIpeline for de novo transcriptome assembly. bioRxiv 2021.02.18.431773; doi: https://doi.org/10.1101/2021.02.18.431773 55 | 56 | # Funding 57 | 58 | - European Union’s Horizon 2020 research and innovation programme under the Marie Skłodowska-Curie grant agreement No 764840 (ITN IGNITE). 59 | 60 | - Advanced Human Capital Program of the National Commission for Scientific and Technological Research (CONICYT) 61 | 62 | - Lehre@LMU (project number: W19 F1; Studi forscht@GEO) 63 | 64 | - LMU Munich’s Institutional Strategy LMUexcellent within the framework of the German Excellence Initiative 65 | 66 | # Future work 67 | 68 | - Cloud deployment of the tool 69 | 70 | # Issues 71 | 72 | We tested TransPi using conda, singularity and docker. However, if you find a problem or get an error please let us know by opening an issue. 73 | 74 | ## Chat 75 | 76 | If you have further questions and need help with TransPi you can chat with us in the [TransPi Gitter chat](https://gitter.im/PalMuc/TransPi) 77 | -------------------------------------------------------------------------------- /bin/busco_comparison.R: -------------------------------------------------------------------------------- 1 | ###################################### 2 | 3 | # Edited from the original BUSCO plot script 4 | 5 | ###################################### 6 | # 7 | # BUSCO summary figure 8 | # @version 3.0.0 9 | # @since BUSCO 2.0.0 10 | # 11 | # Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org) 12 | # Licensed under the MIT license. See LICENSE.md file. 13 | # 14 | ###################################### 15 | args = commandArgs(trailingOnly=TRUE) 16 | sample_name=args[1] 17 | options(warn=-1) 18 | # Load the required libraries 19 | library(ggplot2) 20 | library(grid) 21 | # !!! CONFIGURE YOUR PLOT HERE !!! 22 | # Output 23 | #my_output <- paste("./","combined_busco_figure.png",sep="/") 24 | #my_width <- 20 25 | #my_height <- 15 26 | #my_unit <- "cm" 27 | # Colors 28 | #my_colors <- c("#56B4E9", "#3492C7", "#F0E442", "#F04442") 29 | #cata 30 | my_colors <- c("#0e9aa7", "#96ceba", "#ffcc5c", "#ff6f69") 31 | # Bar height ratio 32 | my_bar_height <- 0.55 33 | # Legend 34 | my_title <- "BUSCO Assessment Results - Trinity vs TransPi" 35 | # Font 36 | my_family <- "sans" 37 | my_size_ratio <- 1 38 | # !!!
SEE YOUR DATA HERE !!! 39 | # Your data as generated by python, remove or add more 40 | my_species <- c(MYSPEC) 41 | my_species <- factor(my_species) 42 | my_species <- factor(my_species,levels(my_species)[c(length(levels(my_species)):1)]) # reorder your species here just by changing the values in the vector : 43 | my_percentage <- c(MYPERC) 44 | my_values <- c(MYVAL) 45 | 46 | ###################################### 47 | ###################################### 48 | # Code to produce the graph 49 | labsize = 1 50 | if (length(levels(my_species)) > 10){ 51 | labsize = 0.66 52 | } 53 | print("Plotting the figure ...") 54 | category <- c(rep(c("S","D","F","M"),c(1))) 55 | category <-factor(category) 56 | #category = factor(category,levels(category)[c(4,1,2,3)]) 57 | category = factor(category,levels(category)[(c(4,1,2,3))]) 58 | df = data.frame(my_species,my_percentage,my_values,category) 59 | figure <- ggplot() + 60 | geom_bar(aes(y = my_percentage, x = my_species, fill = category), data = df, stat="identity", width=my_bar_height,position = position_stack(reverse=TRUE)) + 61 | coord_flip() + 62 | theme_gray(base_size = 8) + 63 | scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100)) + 64 | scale_fill_manual(values = my_colors,labels =c(" Complete (C) and single-copy (S) ", 65 | " Complete (C) and duplicated (D)", 66 | " Fragmented (F) ", 67 | " Missing (M)")) + 68 | ggtitle(my_title) + 69 | xlab("") + 70 | ylab("\n%BUSCOs") + 71 | theme(plot.title = element_text(family=my_family, colour = "black", size = rel(2.2)*my_size_ratio, face = "bold")) + 72 | theme(legend.position="top",legend.title = element_blank()) + 73 | theme(legend.text = element_text(family=my_family, size = rel(1.2)*my_size_ratio)) + 74 | theme(panel.background = element_rect(color="#FFFFFF", fill="white")) + 75 | theme(panel.grid.minor = element_blank()) + 76 | theme(panel.grid.major = element_blank()) + 77 | theme(axis.text.y = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio,face="italic")) + 78 | theme(axis.text.x = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + 79 | theme(axis.line = element_line(size=1*my_size_ratio, colour = "black")) + 80 | theme(axis.ticks.length = unit(.85, "cm")) + 81 | theme(axis.ticks.y = element_line(colour="white", size = 0)) + 82 | theme(axis.ticks.x = element_line(colour="#222222")) + 83 | theme(axis.ticks.length = unit(0.4, "cm")) + 84 | theme(axis.title.x = element_text(family=my_family, size=rel(1.2)*my_size_ratio)) + 85 | guides(fill = guide_legend(override.aes = list(colour = NULL))) + 86 | guides(fill=guide_legend(nrow=2,byrow=TRUE)) 87 | for(i in rev(c(1:length(levels(my_species))))){ 88 | detailed_values <- my_values[my_species==my_species[my_species==levels(my_species)[i]]] 89 | total_buscos <- sum(detailed_values) 90 | figure <- figure + 91 | annotate("text", label=paste("C:", detailed_values[1] + detailed_values[2], " [S:", detailed_values[1], ", D:", detailed_values[2], "], F:", detailed_values[3], ", M:", detailed_values[4], ", n:", total_buscos, sep="\t"), 92 | y=3, x = i, size = labsize*4*my_size_ratio, colour = "black", hjust=0, family=my_family) 93 | } 94 | 95 | #ggsave not working in docker 96 | #ggsave(filename = paste(sample_name,"_BUSCO_comparison.svg",sep=""),width = 15 ,height = 7) 97 | #ggsave(filename = paste(sample_name,"_BUSCO_comparison.pdf",sep=""),width = 15 ,height = 7) 98 | pdf(paste(sample_name,"_BUSCO_comparison.pdf",sep=""),width = 15 ,height = 7) 99 | 
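# ggsave does not work in the docker image (see the commented-out calls above), so
# the figure is written by print()-ing into the pdf()/svg() graphics devices instead.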
print(figure) 100 | dev.off() 101 | svg(paste(sample_name,"_BUSCO_comparison.svg",sep=""),width = 15 ,height = 7) 102 | print(figure) 103 | dev.off() 104 | print("Done") 105 | -------------------------------------------------------------------------------- /docs/examples.adoc: -------------------------------------------------------------------------------- 1 | Here are some examples of how to deploy TransPi depending on the method to use (e.g. conda) and the analyses to be performed. 2 | 3 | = Profiles 4 | You can use TransPi either with: 5 | - a local conda environment (from precheck); 6 | - individual conda environments per process; 7 | - docker or singularity 8 | 9 | == Conda 10 | This way of executing TransPi assumes that you installed conda locally. 11 | All of this is done automatically for you, if desired, by the precheck script. 12 | 13 | *Example:* 14 | [source,bash] 15 | ---- 16 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 17 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 18 | -profile conda 19 | ---- 20 | 21 | [NOTE] 22 | `-profile conda` tells TransPi to use conda. An individual environment is used per process. 23 | 24 | == Containers 25 | Docker or singularity can also be used for deploying TransPi. You can either use individual containers for each process or a TransPi container with all the tools. 26 | 27 | === Individual 28 | To use individual containers: 29 | 30 | *Example for docker:* 31 | [source,bash] 32 | ---- 33 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 34 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 35 | -profile docker 36 | ---- 37 | 38 | *Example for singularity:* 39 | [source,bash] 40 | ---- 41 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 42 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 43 | -profile singularity 44 | ---- 45 | 46 | [NOTE] 47 | Some individual containers can create problems. We are working on solving these issues. In the meantime you can use the TransPi container (see below). 48 | 49 | === TransPi container 50 | To use the TransPi container with all the tools, you need to use the profile `TransPiContainer`. 51 | 52 | *Example for docker:* 53 | [source,bash] 54 | ---- 55 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 56 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 57 | -profile docker,TransPiContainer 58 | ---- 59 | 60 | *Example for singularity:* 61 | [source,bash] 62 | ---- 63 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 64 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 65 | -profile singularity,TransPiContainer 66 | ---- 67 | 68 | 69 | = Other examples 70 | 71 | [NOTE] 72 | The order of the options is not important, as the example below shows.
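For example, these two invocations are equivalent:

[source,bash]
----
nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \
    --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' -profile conda

# same run, options given in a different order
nextflow run TransPi.nf --all -profile conda \
    --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' \
    --k 25,35,55,75,85 --maxReadLen 150
----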
73 | 74 | == Filtering 75 | 76 | *Scenario:* 77 | [horizontal] 78 | Sample:: Coral sample 79 | Read length:: 150bp 80 | TransPi mode:: --all 81 | Kmers:: 25,35,55,75,85 82 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 83 | Output directory:: Results_Acropora 84 | Work directory:: work_acropora 85 | Engine:: conda 86 | Filter species:: on 87 | host:: scleractinian proteins 88 | symbiont:: symbiodinium proteins 89 | 90 | *Command:* 91 | [source,bash] 92 | ---- 93 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 94 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 95 | -w work_acropora -profile conda --filterSpecies \ 96 | --host /YOUR/PATH/HERE/uniprot-Scleractinia.fasta \ 97 | --symbiont /YOUR/PATH/HERE/uniprot-symbiodinium.fasta 98 | ---- 99 | 100 | 101 | == BUSCO distribution 102 | 103 | *Scenario:* 104 | [horizontal] 105 | Sample:: SampleA 106 | Read length:: 100bp 107 | TransPi mode:: --all 108 | Kmers:: 25,41,57,67 109 | Reads PATH:: /YOUR/PATH/HERE/SampleA/*_R[1,2].fastq.gz 110 | Output directory:: Results_SampleA 111 | Engine:: conda 112 | All BUSCOs:: on 113 | BUSCO distribution:: on 114 | 115 | *Command:* 116 | [source,bash] 117 | ---- 118 | nextflow run TransPi.nf --all --maxReadLen 100 --k 25,41,57,67 \ 119 | --outdir Results_SampleA --reads '/YOUR/PATH/HERE/SampleA/*_R[1,2].fastq.gz' \ 120 | -profile conda --allBuscos --buscoDist 121 | ---- 122 | 123 | == `--onlyEvi` 124 | 125 | *Scenario:* 126 | [horizontal] 127 | Sample:: Assemblies from multiple assemblers and kmers 128 | Read length:: 50bp 129 | TransPi mode:: --onlyEvi 130 | Kmers:: 25,33,37 131 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 132 | Output directory:: Reduction_results 133 | Engine:: conda 134 | 135 | *Command:* 136 | [source,bash] 137 | ---- 138 | nextflow run TransPi.nf --onlyEvi --outdir Reduction_results \ 139 | -profile conda 140 | ---- 141 | 142 | .NOTES 143 | **** 144 | - A directory named `onlyEvi`, containing the transcriptome(s) for the reduction, is needed for this option. 145 | 146 | TIP: You can do multiple transcriptomes at the same time. Each file should have a unique name. 147 | 148 | - No need to specify reads PATH, length, cutoff, and kmers when using `--onlyEvi`. 149 | 150 | **** 151 | 152 | == `--onlyAnn` 153 | 154 | *Scenario:* 155 | [horizontal] 156 | Sample:: Transcriptome missing annotation 157 | Read length:: 100bp 158 | TransPi mode:: --onlyAnn 159 | Kmers:: 25,41,57,67 160 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 161 | Output directory:: Annotation_results 162 | Engine:: singularity 163 | Container:: TransPi container 164 | 165 | *Command:* 166 | [source,bash] 167 | ---- 168 | nextflow run TransPi.nf --onlyAnn --outdir Annotation_results \ 169 | -profile singularity,TransPiContainer 170 | ---- 171 | 172 | .NOTES 173 | **** 174 | - A directory named `onlyAnn`, containing the transcriptome(s) to annotate, is needed for this option. 175 | 176 | TIP: You can do multiple transcriptomes (i.e. samples) at the same time. Each file should have a unique name. 177 | 178 | - No need to specify reads PATH, length, cutoff, and kmers when using `--onlyAnn`.
179 | 180 | **** 181 | 182 | == Skip options 183 | 184 | *Scenario:* 185 | [horizontal] 186 | Sample:: Coral sample 187 | Read length:: 150bp 188 | TransPi mode:: --all 189 | Kmers:: 25,35,55,75,85 190 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 191 | Output directory:: Results_Acropora 192 | Work directory:: work_acropora 193 | Engine:: docker 194 | Container:: Individual containers 195 | Skip QC:: on 196 | Skip Filter:: on 197 | 198 | *Command:* 199 | [source,bash] 200 | ---- 201 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 202 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results_Acropora \ 203 | -w work_acropora -profile docker \ 204 | --skipQC --skipFilter 205 | ---- 206 | 207 | == Extra annotation steps 208 | 209 | *Scenario:* 210 | [horizontal] 211 | Sample:: Mollusc sample 212 | Read length:: 150bp 213 | TransPi mode:: --all 214 | Kmers:: 25,35,55,75,85 215 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 216 | Output directory:: Results 217 | Engine:: conda 218 | Skip QC:: on 219 | SignalP:: on 220 | TMHMM:: on 221 | RNAmmer:: on 222 | 223 | 224 | *Command:* 225 | [source,bash] 226 | ---- 227 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 228 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results \ 229 | -profile conda --skipQC --withSignalP --withTMHMM --withRnammer 230 | ---- 231 | 232 | .NOTE 233 | **** 234 | - This option requires manual installation of the CBS-DTU tools: signalP, tmhmm, and rnammer. 235 | 236 | - For more info visit https://services.healthtech.dtu.dk/software.php[CBS-DTU tools] 237 | 238 | - It also assumes that the `PATH` for each of these tools is set in the `nextflow.config` file. 239 | 240 | **** 241 | 242 | 243 | == Full run and extra annotation 244 | 245 | *Scenario:* 246 | [horizontal] 247 | Sample:: Coral sample 248 | Read length:: 150bp 249 | TransPi mode:: --all 250 | Kmers:: 25,35,55,75,85 251 | Reads PATH:: /YOUR/PATH/HERE/*_R[1,2].fastq.gz 252 | Output directory:: Results 253 | Engine:: conda 254 | Skip QC:: on 255 | SignalP:: on 256 | TMHMM:: on 257 | RNAmmer:: on 258 | Filter species:: on 259 | host:: scleractinian proteins 260 | symbiont:: symbiodinium proteins 261 | All BUSCOs:: on 262 | BUSCO distribution:: on 263 | Remove rRNA:: on 264 | rRNA database:: /YOUR/PATH/HERE/silva_rRNA_file.fasta 265 | 266 | *Command:* 267 | [source,bash] 268 | ---- 269 | nextflow run TransPi.nf --all --maxReadLen 150 --k 25,35,55,75,85 \ 270 | --reads '/YOUR/PATH/HERE/*_R[1,2].fastq.gz' --outdir Results \ 271 | -profile conda --skipQC --withSignalP --withTMHMM --withRnammer \ 272 | --host /YOUR/PATH/HERE/uniprot-Scleractinia.fasta \ 273 | --symbiont /YOUR/PATH/HERE/uniprot-symbiodinium.fasta \ 274 | --allBuscos --buscoDist --rRNAfilter \ 275 | --rRNAdb "/YOUR/PATH/HERE/silva_rRNA_file.fasta" 276 | ---- 277 | -------------------------------------------------------------------------------- /template.nextflow.config: -------------------------------------------------------------------------------- 1 | /* 2 | ================================================================================================ 3 | Config File TransPi 4 | ================================================================================================ 5 | Transcriptome Analysis Pipeline 6 | Author: Ramón E.
Rivera-Vicéns 7 | GitHub: rivera10 8 | ---------------------------------------------------------------------------------------- 9 | */ 10 | 11 | params { 12 | 13 | // ------------------------- EDIT below variables (mandatory) ------------------------- // 14 | // --------------------- Can also be specified in the command line ---------------------- // 15 | 16 | // Modify this accordingly (if needed) 17 | // kmers list (depends on read length!) 18 | k="" 19 | 20 | // SOAP config file generator 21 | //#maximal read length 22 | maxReadLen="" 23 | //[LIB] 24 | //#maximal read length in this lib 25 | rd_len_cutof="${params.maxReadLen}" 26 | 27 | // Other options if needed. Leave defaults if unsure. 28 | //#average insert size 29 | //avg_ins="200" 30 | //#if sequence needs to be reversed 31 | reverse_seq="0" 32 | //#in which part(s) the reads are used 33 | asm_flags="3" 34 | //#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) 35 | map_len="32" 36 | 37 | // -------------------------- EDIT below variables if needed -------------------------- // 38 | 39 | // Directory for results 40 | outdir="results" 41 | 42 | // Directory for trace files 43 | tracedir="pipeline_info" 44 | 45 | // PATH for rnammer, tmhmm, signalp programs. Requires licenses. See CBS-DTU tools for information. 46 | // RNAmmer 47 | rnam = "" 48 | // Tmhmm 49 | tmhmm = "" 50 | // SignalP 51 | signalp = "" 52 | 53 | /* 54 | // ------------------------------------------------ STOP ------------------------------------------------ // 55 | 56 | Most of the values below are filled in by the precheck script (e.g. PATH to databases or conda installation). 57 | However, if you run the precheck for a container you will not have all these PATHs assigned (e.g. conda PATH). 58 | Run the precheck again, selecting conda instead of containers, if that is the case. 59 | 60 | 61 | For other options (e.g. filtering, buscoDist, etc.) it is recommended to set them from the command line. 62 | 63 | 64 | Proceed to the end of this config file to adjust the CPUs and RAM of the processes to the specs of your system, 65 | and to modify the profiles if you use a scheduler like SLURM or PBS. 66 | 67 | 68 | More info at the TransPi repository (https://github.com/PalMuc/TransPi) and 69 | manual (https://palmuc.github.io/TransPi/).
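As a reference, after a successful precheck run the database and installation PATHs below are
written automatically and end up looking similar to this (illustrative values only; the
precheck writes the real locations for your system):

    pipeInstall="/home/username/TransPi"
    busco4db="/home/username/TransPi/DBs/busco_db/metazoa_odb10"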
70 | 71 | 72 | // -------------------------------------------------------------------------------------------------------------- // 73 | */ 74 | 75 | // PATH to TransPi DBs installation 76 | pipeInstall 77 | 78 | // Uniprot database PATH 79 | uniprot 80 | uniname 81 | 82 | //BUSCO database 83 | busco4db 84 | 85 | //PFAM file location 86 | pfloc 87 | 88 | //name of pfam file 89 | pfname 90 | 91 | //Trinotate sqlite created when installing Trinotate 92 | Tsql 93 | 94 | // Directory for reads 95 | reads="" 96 | 97 | // Pipeline options 98 | help = false 99 | fullHelp = false 100 | 101 | // Full analysis 102 | all = false 103 | 104 | // Only Evidential Gene run (one sample per run) 105 | onlyEvi = false 106 | 107 | // Only annotation analysis 108 | onlyAnn = false 109 | 110 | // Only Assemblies and Evidential Gene 111 | onlyAsm = false 112 | 113 | // Skip quality control 114 | skipQC = false 115 | 116 | // Skip fastp quality filter step 117 | skipFilter = false 118 | // Minimum read quality for filtering in fastp 119 | minQual="5" 120 | 121 | // Filter rRNA 122 | rRNAfilter = false 123 | // rRNA database 124 | rRNAdb = "" 125 | 126 | // Skip normalization of reads 127 | skipNormalization = false 128 | // Normalization parameters 129 | normMaxCov=100 130 | normMinCov=1 131 | 132 | // Save reads from filtering and normalization 133 | saveReads = false 134 | 135 | // Save bam file from mapping step 136 | saveBam = false 137 | 138 | // Filter Species using psytrans 139 | filterSpecies = false 140 | // Psytrans value to train model 141 | psyval=160 142 | // Host Sequence 143 | host="" 144 | // Symbiont Sequence 145 | symbiont="" 146 | 147 | // Run BUSCO on all assemblies 148 | allBuscos = false 149 | 150 | // BUSCO distribution analysis (this option needs to be run together with the allBuscos option) 151 | // Generate the analysis 152 | buscoDist = false 153 | // Minimum percentage of assemblers required to rescue a BUSCO sequence 154 | minPerc="0.7" 155 | 156 | //short Transdecoder run, no homology search (PFAM and UniProt) 157 | shortTransdecoder = false 158 | //Transdecoder genetic code 159 | genCode="Universal" 160 | 161 | // Annotation options 162 | // SignalP 163 | withSignalP = false 164 | // tmHMM 165 | withTMHMM = false 166 | // rnammer 167 | withRnammer = false 168 | // Add annotation to file 169 | addAnnotation = false 170 | 171 | //Test data 172 | readsTest = false 173 | 174 | // Skip Evidential Gene for onlyAsm option 175 | skipEvi = false 176 | 177 | // Kegg pathway search 178 | withKegg = false 179 | 180 | // Skip Report 181 | skipReport = false 182 | 183 | // These options will change how the profiles work. 184 | // Run with conda installed by the precheck 185 | //next 2 parameters are outdated 186 | myConda = false 187 | myCondaInstall="" 188 | 189 | condaActivate = false 190 | 191 | // TransPi container with all programs 192 | oneContainer = false 193 | 194 | // Cache directory for conda and singularity files. Leave blank if not sure. 195 | envCacheDir = "" 196 | 197 | // Singularity 198 | // Use the singularity image created after pulling from docker and not the one from the Galaxy depot (singularity image ready to use). 199 | singularity_pull_docker_container = false 200 | 201 | // Get software versions - only works with local conda installation and TransPi container.
202 | skipGetRunInfo = false 203 | } 204 | 205 | /* 206 | // ------------------------------------------------ NOTE ------------------------------------------------ // 207 | 208 | 209 | Proceed to modify the processes CPUs and RAM with the specs of your system. 210 | Also to modify the profiles if you use a scheduler manager like SLURM or PBS. 211 | 212 | 213 | More info at the TransPi repository (https://github.com/PalMuc/TransPi) and 214 | manual (https://palmuc.github.io/TransPi/). 215 | 216 | Also see Nextflow documentation (https://www.nextflow.io/docs/latest/index.html). 217 | 218 | 219 | // -------------------------------------------------------------------------------------------------------------- // 220 | */ 221 | 222 | process { 223 | cpus='1' 224 | memory='5 GB' 225 | withLabel: big_cpus { 226 | cpus='20' 227 | memory='15 GB' 228 | } 229 | withLabel: med_cpus { 230 | cpus='8' 231 | memory='15 GB' 232 | } 233 | withLabel: low_cpus { 234 | cpus='4' 235 | memory='15 GB' 236 | } 237 | withLabel: exlow_cpus { 238 | cpus='1' 239 | memory='2 GB' 240 | } 241 | withLabel: big_mem { 242 | cpus='20' 243 | memory='350 GB' 244 | } 245 | withLabel: med_mem { 246 | cpus='15' 247 | memory={ 100.Gb + (task.attempt * 50.Gb)} 248 | errorStrategy={ task.exitStatus in 137..140 ? 'retry' : 'finish' } 249 | maxRetries = 2 250 | } 251 | withLabel: low_mem { 252 | cpus='20' 253 | memory='80 GB' 254 | } 255 | errorStrategy='finish' 256 | } 257 | 258 | // env Evidential Gene variable (only for nextflow) 259 | env.evi="${projectDir}/scripts/evigene" 260 | 261 | // Get PATH for cache environments 262 | params.localCacheDir = (params.envCacheDir ? "${params.envCacheDir}" : "${launchDir}") 263 | 264 | profiles { 265 | conda { 266 | params.condaActivate = true 267 | params.localConda="${params.myCondaInstall}" 268 | // cache for condaEnv created individually 269 | conda.cacheDir = "${params.localCacheDir}/condaEnv/" 270 | } 271 | docker { 272 | docker.enabled = true 273 | docker.runOptions = "-u \$(id -u):\$(id -g) -v ${params.pipeInstall}:${params.pipeInstall}" 274 | // --mount type=bind,src=${params.pipeInstall},dst=/dockerDB" 275 | } 276 | singularity { 277 | singularity.enabled = true 278 | singularity.autoMounts = true 279 | // cache for images from docker pull 280 | singularity.cacheDir="${params.localCacheDir}/singularityCache/" 281 | } 282 | test { 283 | includeConfig 'conf/test.config' 284 | } 285 | TransPiContainer { 286 | process { 287 | params.oneContainer = true 288 | params.TPcontainer="rerv/transpi:v1.0.0" 289 | } 290 | } 291 | palmuc { 292 | process { 293 | executor='slurm' 294 | clusterOptions='-p lemmium --qos=low' 295 | } 296 | } 297 | } 298 | 299 | executor { 300 | $slurm { 301 | queueSize=100 302 | } 303 | } 304 | 305 | timeline { 306 | enabled = true 307 | file = "${params.outdir}/${params.tracedir}/transpi_timeline.html" 308 | } 309 | report { 310 | enabled = true 311 | file = "${params.outdir}/${params.tracedir}/transpi_report.html" 312 | } 313 | trace { 314 | enabled = true 315 | file = "${params.outdir}/${params.tracedir}/transpi_trace.txt" 316 | } 317 | dag { 318 | enabled = true 319 | file = "${params.outdir}/${params.tracedir}/transpi_dag.html" 320 | } 321 | 322 | manifest { 323 | name = 'TransPi' 324 | author = 'Ramón E. 
Rivera-Vicéns' 325 | description = 'Transcriptome Analysis Pipeline' 326 | mainScript = 'TransPi.nf' 327 | nextflowVersion = '>=21.04.1' 328 | version = '1.3.0-rc' 329 | } 330 | -------------------------------------------------------------------------------- /bin/TransPi_Report_Ind.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "TransPi Report: `r commandArgs(trailingOnly=T)[1]`" 3 | output: 4 | html_document: default 5 | pdf_document: default 6 | date: "Generated on: `r Sys.time()`" 7 | params: 8 | interactive: yes 9 | sample_id: !r commandArgs(trailingOnly=T)[1] 10 | skipFilter: !r commandArgs(trailingOnly=T)[2] 11 | skipNormalization: !r commandArgs(trailingOnly=T)[3] 12 | rRNAfilter: !r commandArgs(trailingOnly=T)[4] 13 | buscoDist: !r commandArgs(trailingOnly=T)[5] 14 | allBuscos: !r commandArgs(trailingOnly=T)[6] 15 | withKegg: !r commandArgs(trailingOnly=T)[7] 16 | --- 17 | 18 | 25 | 26 | ```{r setup, include=FALSE} 27 | knitr::opts_chunk$set(echo = TRUE, 28 | message = FALSE, 29 | warning = FALSE, 30 | out.width="105%" 31 | ) 32 | ``` 33 | 34 | ```{r load_libraries, include=FALSE} 35 | library(ggthemes) 36 | library(ggplot2) 37 | library(reshape2) 38 | library(grid) 39 | library(plotly) 40 | library(knitr) 41 | library(kableExtra) #install.packages("kableExtra") 42 | library(rmarkdown) 43 | mycol=c('#088da5','#73cdc8','#ff6f61','#7cb8df','#88b04b','#00a199','#6B5B95','#92A8D1','#b0e0e6','#ff7f50','#088d9b','#E15D44','#e19336') 44 | ``` 45 | 46 |
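<!-- The seven trailing command-line arguments are read positionally into the params above
(sample ID first, then the skipFilter, skipNormalization, rRNAfilter, buscoDist, allBuscos,
and withKegg flags as "true"/"false" strings). A hypothetical render call, for orientation
only (TransPi assembles the real invocation internally):

Rscript -e 'rmarkdown::render("TransPi_Report_Ind.Rmd")' \
    SampleA false false true true true false
-->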
47 |
48 |

  Reads Stats

49 |
50 | **Input reads and filtering** 51 | ```{r readstats_table, echo=FALSE} 52 | if (params$skipFilter == "true") { 53 | writeLines("\n--------------------------------------------------------------\n") 54 | cat(readLines(list.files(pattern="filter_reads.txt")) , sep = '\n') 55 | writeLines("\n--------------------------------------------------------------\n") 56 | } else { 57 | reads_stats=read.csv(paste(params$sample_id,sep="","_reads_stats.csv")) 58 | paged_table(reads_stats,options = list(rows.print = 10)) 59 | } 60 | ``` 61 |
62 |
63 |
64 | **Reads mean quality before filtering** 65 | ```{r qual_plot, echo=FALSE} 66 | if (params$skipFilter == "true") { 67 | writeLines("\n--------------------------------------------------------------\n") 68 | cat(readLines(list.files(pattern="filter_reads.txt")) , sep = '\n') 69 | writeLines("\n--------------------------------------------------------------\n") 70 | } else { 71 | rqual=read.csv(paste(params$sample_id,sep="","_reads_qual.csv"),header=FALSE) 72 | qp <- plot_ly(x=c(1:rqual[1,1]),y=as.numeric(rqual[2,]),type="scatter", mode = "lines", name="Read1", opacity = .8, line=list(color='#088da5')) 73 | qp <- qp %>% add_trace(x=c(1:rqual[3,1]), y=as.numeric(rqual[4,]), name="Read2", opacity = .8, line=list(color='#e19336')) 74 | qp <- qp %>% layout(xaxis=list(title="Base position")) 75 | qp <- qp %>% layout(yaxis=list(title="Mean quality", range = c(0,41))) 76 | qp <- qp %>% layout(legend = list(x=10,y=.5), hovermode = "x unified") 77 | qp <- qp %>% config(toImageButtonOptions=list(format='svg',filename='readsQC_before', height= 500, width= 800, scale= 1), displaylogo = FALSE) 78 | qp 79 | } 80 | ``` 81 |
82 |
83 |
84 |
85 | **Reads mean quality after filtering** 86 | ```{r qual_plot2, echo=FALSE} 87 | if (params$skipFilter == "true") { 88 | writeLines("\n--------------------------------------------------------------\n") 89 | cat(readLines(list.files(pattern="filter_reads.txt")) , sep = '\n') 90 | writeLines("\n--------------------------------------------------------------\n") 91 | } else { 92 | qp2 <- plot_ly(x=c(1:rqual[5,1]),y=as.numeric(rqual[6,]),type="scatter", mode = "lines", name="Read1", opacity = .8, line=list(color='#088da5')) 93 | qp2 <- qp2 %>% add_trace(x=c(1:rqual[7,1]),y=as.numeric(rqual[8,]), name="Read2", opacity = .8, line=list(color='#e19336')) 94 | qp2 <- qp2 %>% layout(xaxis=list(title="Base position")) 95 | qp2 <- qp2 %>% layout(yaxis=list(title="Mean quality", range = c(0,41))) 96 | qp2 <- qp2 %>% layout(legend = list(x=10,y=.5), hovermode = "x unified") 97 | qp2 <- qp2 %>% config(toImageButtonOptions=list(format='svg',filename='readsQC_after', height= 500, width= 800, scale= 1), displaylogo = FALSE) 98 | qp2 99 | } 100 | ``` 101 |
102 |
103 |
104 |
105 | **rRNA removal** 106 | ```{r rrna_remove, echo=FALSE} 107 | if (params$rRNAfilter == "true") { 108 | cat(readLines(list.files(pattern="*_remove_rRNA.log")) , sep = '\n') 109 | } else { 110 | writeLines("\n--------------------------------------------------------------\n") 111 | cat(readLines(list.files(pattern="rrna_removal.txt")) , sep = '\n') 112 | writeLines("\n--------------------------------------------------------------\n") 113 | } 114 | ``` 115 |
116 |
117 |
118 |
119 | **Normalization** 120 | ```{r norm_plot, echo=FALSE} 121 | if (params$skipNormalization == "true") { 122 | writeLines("\n--------------------------------------------------------------\n") 123 | cat(readLines(list.files(pattern="norm_reads.txt")) , sep = '\n') 124 | writeLines("\n--------------------------------------------------------------\n") 125 | } else { 126 | cat(readLines(list.files(pattern="*_normStats.txt")) , sep = '\n') 127 | } 128 | ``` 129 | 130 |
131 |
132 |
133 | 134 |

  Assemblies Stats

135 | 136 |
137 | 138 | **Number of transcripts before Evidential Genes** 139 | ```{r pre_EG_table, echo=FALSE} 140 | trans_preEG=read.csv(paste(params$sample_id,sep="","_sum_preEG.csv")) 141 | paged_table(trans_preEG,options = list(rows.print = 10)) 142 | ``` 143 | 144 |
145 |
146 |
147 | 148 | **Number of transcripts after Evidential Genes** 149 | ```{r EG_table, echo=FALSE, results='asis'} 150 | trans_EG=read.csv(paste(params$sample_id,sep="","_sum_EG.csv")) 151 | paged_table(trans_EG,options = list(rows.print = 10)) 152 | ``` 153 | 154 |
155 |
156 |
157 | 158 | **Plot before and after Evidential Gene** 159 | ```{bash EG_final, include=FALSE} 160 | cat *_preEG.csv >preEG.csv 161 | cat *_EG.csv >EG.csv 162 | head -n1 EG.csv | tr "," " " >headers.tmp 163 | tail -n1 preEG.csv | tr "," " " >preEG.tmp 164 | tail -n1 EG.csv | tr "," " " >EG.tmp 165 | cat headers.tmp preEG.tmp EG.tmp >EG_plot 166 | awk '{ 167 | for (f = 1; f <= NF; f++) { a[NR, f] = $f } 168 | } 169 | NF > nf { nf = NF } 170 | END { 171 | for (f = 1; f <= nf; f++) { 172 | for (r = 1; r <= NR; r++) { 173 | printf a[r, f] (r==NR ? RS : FS) 174 | } 175 | } 176 | }' EG_plot | tr " " "," >EG_plot.csv 177 | rm *.tmp 178 | ``` 179 | ```{r EG_plot, echo=FALSE, results='asis'} 180 | eg=read.csv("EG_plot.csv",header=FALSE) 181 | peg <- plot_ly(eg, x = ~V1, y = ~V2, type = 'bar', name = 'Before', marker=list(color='#088da5', line=list(color = 'black', width = 1)), opacity=0.8, 182 | hovertemplate = paste('Program: %{x}','
<br>Number of transcripts: %{y}','<extra></extra>')) 183 | peg <- peg %>% add_trace(y = ~V3, name = 'After', marker=list(color='#e19336'), opacity = 0.8) 184 | peg <- peg %>% layout(yaxis = list(title = 'Number of transcripts'), barmode = 'group') 185 | peg <- peg %>% layout(xaxis = list(title = '',categoryorder = "array",categoryarray = ~V1)) 186 | peg <- peg %>% layout(legend = list(x=10,y=.5)) 187 | peg <- peg %>% config(toImageButtonOptions=list(format='svg',filename='evigene_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 188 | peg 189 | ``` 190 | 191 |
192 |
193 |
194 |
195 | 196 | **Transcript length distribution** 197 | ```{r trans_len_plot, echo=FALSE, results='asis'} 198 | tlen=read.delim(paste(params$sample_id,sep="","_sizes.txt"), sep = "\t", header = FALSE) 199 | 200 | gg=ggplot(tlen,aes(x=tlen$V2))+ 201 | coord_flip()+theme_bw()+ 202 | geom_histogram(aes(y = ..count..), binwidth = 100, colour = "#1F3552", fill = "#96ceba")+ 203 | #stat_bin(geom = "text", aes(label=..count..),binwidth = 100, hjust=-.8)+ 204 | scale_x_continuous(name = "Transcript sizes (100bp increments)",breaks = seq(0, 2500, 100),limits=c(150, 2550))+ 205 | scale_y_continuous(name = "Number of transcripts") # static ggplot version; the interactive plotly histogram below is the one rendered 206 | 207 | tlp <- plot_ly(tlen, x=~V2, type="histogram", xbins=list(start='200',end='2500', size= '100'), marker=list(color='#088da5', line=list(color = 'black', 208 | width = 1)), hovertemplate = paste('Size range: %{x}','
<br>Number of transcripts: %{y}','<extra></extra>'), opacity = 0.7) 209 | tlp <- tlp %>% layout(yaxis = list(title = "Number of transcripts")) 210 | tlp <- tlp %>% layout(xaxis = list(title = "Transcript sizes (100bp increments)")) 211 | tlp <- tlp %>% config(toImageButtonOptions=list(format='svg',filename='transcript_distribution', height= 500, width= 800, scale= 1), displaylogo = FALSE) 212 | tlp 213 | ```
215 |
216 |
217 | 218 | **rnaQUAST** 219 | ```{r rna_quast, echo=FALSE, results='asis'} 220 | paged_table(read.csv(paste(params$sample_id,sep="","_rnaQUAST.csv")),options = list(rows.print = 10)) 221 | ``` 222 | 223 |
224 |
225 |
226 | 227 | **Mapping reads to EviGene results** 228 | 229 | ```{bash mapping_stats_evi, echo=FALSE, results='asis'} 230 | cat *.combined.okay.fa.txt | grep "overall alignment rate" 231 | ``` 232 | 233 |
234 |
235 |
236 | 237 | **Mapping reads to Trinity results** 238 | 239 | ```{bash mapping_stats_tri, echo=FALSE, results='asis'} 240 | cat *.Trinity.fa.txt | grep "overall alignment rate" 241 | ``` 242 | 243 |
244 |
245 |
246 |
247 |
248 | 249 |

  BUSCO

250 | 251 |
252 | 253 | ### - Using BUSCO V4 254 | 255 |
256 |
257 | 258 | ```{r busco4_plot, echo=FALSE} 259 | bus=read.csv(paste(params$sample_id,sep="","_busco4.csv"),header=FALSE) 260 | MYSPEC=as.character(t(bus[1,])) 261 | MYPERC=as.numeric(t(bus[2,])) 262 | MYVAL=as.numeric(t(bus[3,])) 263 | ###################################### 264 | 265 | # Edit from the orginal BUSCO plot script 266 | 267 | ###################################### 268 | # 269 | # BUSCO summary figure 270 | # @version 3.0.0 271 | # @since BUSCO 2.0.0 272 | # 273 | # Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org) 274 | # Licensed under the MIT license. See LICENSE.md file. 275 | # 276 | ###################################### 277 | my_colors <- c("#0e9aa7", "#96ceba", "#ffcc5c", "#ff6f69") 278 | # Bar height ratio 279 | my_bar_height <- 0.55 280 | # Legend 281 | my_title <- "BUSCO Assessment Results - TransPi vs Trinity" 282 | # Font 283 | my_family <- "sans" 284 | my_size_ratio <- 1 285 | species <- c(MYSPEC) 286 | species <- factor(species) 287 | species <- factor(species,levels(species)[c(length(levels(species)):1)]) # reorder your species here just by changing the values in the vector : 288 | percentage <- c(MYPERC) 289 | values <- c(MYVAL) 290 | ###################################### 291 | # Code to produce the graph 292 | labsize = 1 293 | if (length(levels(species)) > 10){ 294 | labsize = 0.66 295 | } 296 | category <- c(rep(c("Single","Duplicated","Fragmented","Missing"),c(1))) 297 | category <-factor(category) 298 | #category = factor(category,levels(category)[c(4,1,2,3)]) 299 | category = factor(category,levels(category)[(c(4,1,2,3))]) 300 | df = data.frame(species,percentage,values,category) 301 | figure <- ggplot()+ 302 | geom_bar(aes(y = percentage, x = species, fill = category), data = df, stat="identity", width=my_bar_height,position = position_stack(reverse=TRUE)) + 303 | coord_flip() + theme_gray(base_size = 8) + scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100))+ 304 | theme_gray(base_size = 8) + scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100)) + 305 | scale_fill_manual(values = my_colors,labels =c(" Complete (C) and single-copy (S) ", " Complete (C) and duplicated (D)", 306 | " Fragmented (F) ", " Missing (M)")) + 307 | xlab("") + ylab("\n%BUSCOs") + 308 | theme(plot.title = element_text(family=my_family, colour = "black", size = rel(2.2)*my_size_ratio, face = "bold")) + 309 | theme(legend.position="top",legend.title = element_blank()) + 310 | theme(legend.text = element_text(family=my_family, size = rel(1.2)*my_size_ratio)) + 311 | theme(panel.background = element_rect(color="#FFFFFF", fill="white")) + 312 | theme(panel.grid.minor = element_blank()) + 313 | theme(panel.grid.major = element_blank()) + 314 | theme(axis.text.y = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio,face="italic")) + 315 | theme(axis.text.x = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + 316 | theme(axis.line = element_line(size=1*my_size_ratio, colour = "black")) + 317 | theme(axis.ticks.length = unit(.85, "cm")) + 318 | theme(axis.ticks.y = element_line(colour="white", size = 0)) + 319 | theme(axis.ticks.x = element_line(colour="#222222")) + 320 | theme(axis.ticks.length = unit(0.4, "cm")) + 321 | theme(axis.title.x = element_text(family=my_family, size=rel(1.2)*my_size_ratio)) + 322 | guides(fill = guide_legend(override.aes = list(colour = NULL))) + 323 | guides(fill=guide_legend(nrow=2,byrow=TRUE)) 324 | for(i in 
rev(c(1:length(levels(species))))){ 325 | detailed_values <- values[species==species[species==levels(species)[i]]] 326 | total_buscos <- sum(detailed_values) 327 | figure <- figure + 328 | annotate("text", label=paste("C:", detailed_values[1] + detailed_values[2], " [S:", detailed_values[1], ", D:", detailed_values[2], "], F:", detailed_values[3], ", M:", detailed_values[4], ", n:", total_buscos, sep=" "), 329 | y=30, x = i, size = labsize*3*my_size_ratio, colour = "black", hjust=0, family=my_family) 330 | } 331 | bp <-ggplotly(figure) 332 | bp <- bp %>% layout(title = "BUSCO Results - TransPi vs Trinity") 333 | bp <- bp %>% layout(legend = list(x=10,y=.5)) 334 | bp <- bp %>% config(toImageButtonOptions=list(format='svg',filename='busco4_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 335 | bp 336 | ``` 337 | 338 |
339 |
340 |
341 |
342 | 343 | ### - BUSCO Distribution 344 | 345 |
346 | 347 | #### BUSCO V4 348 | 349 |
350 | 351 | **Missing BUSCO distribution** 352 | ```{r busco4_dist_mis, echo=FALSE} 353 | if (params$buscoDist == "true" && params$allBuscos == "true") { 354 | system("cat *_missing_BUSCO4_table.tsv | sed -e 's/Missing/0/g' -e 's/Fragmented/1/g' -e 's/Duplicated/2/g' -e 's/Complete/3/g' >BUSCO4_missing_table.tsv") 355 | writeLines("\t\t\t\tHEATMAP") 356 | csv=read.csv("BUSCO4_missing_table.tsv", header=TRUE, sep="\t") 357 | c=melt(csv,id.vars = 'Busco.ID') 358 | dec=c(0,.25,.25,.50,.50,.75,.75,1) 359 | my_colors <- c("#081D58","#081D58", "#2280B8","#2280B8", "#99D6B9", "#99D6B9","#f8f9fc","#f8f9fc") 360 | colz <- setNames(data.frame(dec, my_colors), NULL) 361 | fig <- plot_ly(c,x=~variable, y=~Busco.ID, z=~value, colorscale=colz, reversescale=T, type = "heatmap", 362 | colorbar=list(tickmode='array', tickvals=c(.35,1.1,1.87,2.60), thickness=30, 363 | ticktext= c("Missing","Fragmented","Duplicated","Complete"), len=0.4)) 364 | fig <- fig %>% layout(xaxis=list(title="", showline = TRUE, mirror = TRUE, tickfont=list(size=8)), 365 | yaxis=list(title="BUSCO ID", tickmode="auto", nticks=length(csv$Busco.ID), 366 | tickfont=list(size=6), showline = TRUE, mirror = TRUE)) 367 | fig <- fig %>% config(toImageButtonOptions=list(format='svg',filename='busco4_TransPi_missing_distribution_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 368 | fig 369 | } else { 370 | writeLines("\n--------------------------------------------------------------\n") 371 | cat(readLines(list.files(pattern="busco4_dist.txt")) , sep = '\n') 372 | writeLines("\n--------------------------------------------------------------\n") 373 | } 374 | ``` 375 | 376 |
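The duplicated stops in `colz` above are what turn plotly's continuous colorscale into four flat
bands, one per BUSCO status (0-3 after the `sed` recoding). A minimal, self-contained sketch of
the same trick (toy data, not pipeline output):

```r
library(plotly)
# toy 5x5 matrix of statuses: 0=Missing, 1=Fragmented, 2=Duplicated, 3=Complete
m <- matrix(sample(0:3, 25, replace = TRUE), nrow = 5)
# listing each color twice, at the start and end of its band, yields discrete blocks
stops <- c(0, .25, .25, .50, .50, .75, .75, 1)
bands <- rep(c("#081D58", "#2280B8", "#99D6B9", "#f8f9fc"), each = 2)
plot_ly(z = m, type = "heatmap", colorscale = setNames(data.frame(stops, bands), NULL))
```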
377 |
378 |
379 |
380 |
381 | 382 |

  ORFs

383 | 384 |
385 | 386 | **Summary of the Transdecoder run** 387 | 388 | ```{r trandecoder_table, echo=FALSE, results='asis'} 389 | paged_table(read.csv(paste(params$sample_id,sep="","_transdecoder.csv")),options = list(rows.print = 10)) 390 | ``` 391 | 392 |
393 |
394 |
395 |
396 |
397 |
398 | 399 |

  Gene Ontologies

400 | 401 |
402 |
403 | 404 | ```{r go_plots1, echo=FALSE} 405 | #go_sample_name=params$data 406 | dataCC=read.csv(paste(params$sample_id,sep="","_GO_cellular.csv"), header = F) 407 | pcel <- plot_ly(dataCC,x=~V1,y=~V2, text = ~V1, textposition = 'outside', marker = list(color = '#ff8f66',line = list(color = '#08306B', width = 1.5)), opacity = 0.8, 408 | hovertemplate = paste('Category: %{y}','
<br>Number of GOs: %{x}','<extra></extra>')) 409 | pcel <- pcel %>% layout(yaxis=list(showticklabels=TRUE)) 410 | pcel <- pcel %>% layout(yaxis=list(autorange="reversed")) 411 | pcel <- pcel %>% layout(yaxis = list(title = '',categoryorder = "array",categoryarray = ~V2)) 412 | pcel <- pcel %>% layout(xaxis = list(title = 'Number of sequences')) 413 | pcel <- pcel %>% layout(font = list(size=10)) 414 | pcel <- pcel %>% layout(title = list(text='Cellular Component GOs',x=.7)) 415 | pcel <- pcel %>% config(toImageButtonOptions=list(format='svg',filename='GO_cellular_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 416 | pcel 417 | ```
419 |
420 |
421 |
422 | 423 | ```{r go_plots2, echo=FALSE} 424 | dataMF=read.csv(paste(params$sample_id,sep="","_GO_molecular.csv"), header = F) 425 | pmol <- plot_ly(dataMF,x=~V1,y=~V2, text = ~V1, textposition = 'outside', marker = list(color = '#b0e0e6',line = list(color = '#08306B', width = 1.5)), opacity = 0.8, 426 | hovertemplate = paste('Category: %{y}','
<br>Number of GOs: %{x}','<extra></extra>')) 427 | pmol <- pmol %>% layout(yaxis=list(showticklabels=TRUE)) 428 | pmol <- pmol %>% layout(yaxis=list(autorange="reversed")) 429 | pmol <- pmol %>% layout(yaxis = list(title = '',categoryorder = "array",categoryarray = ~V2)) 430 | pmol <- pmol %>% layout(xaxis = list(title = 'Number of sequences')) 431 | pmol <- pmol %>% layout(font = list(size=10)) 432 | pmol <- pmol %>% layout(title = list(text='Molecular Function GOs',x=.7)) 433 | pmol <- pmol %>% config(toImageButtonOptions=list(format='svg',filename='GO_molecular_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 434 | pmol 435 | ```
437 |
438 |
439 |
440 | 441 | ```{r go_plots3, echo=FALSE} 442 | dataBP=read.csv(paste(params$sample_id,sep="","_GO_biological.csv"), header = F) 443 | pbio <- plot_ly(dataBP,x=~V1,y=~V2, text = ~V1, textposition = 'outside', marker = list(color = '#88B04B',line = list(color = '#08306B', width = 1.5)), opacity = 0.8, 444 | hovertemplate = paste('Category: %{y}','
<br>Number of GOs: %{x}','<extra></extra>')) 445 | pbio <- pbio %>% layout(yaxis=list(showticklabels=TRUE)) 446 | pbio <- pbio %>% layout(yaxis=list(autorange="reversed")) 447 | pbio <- pbio %>% layout(yaxis = list(title = '',categoryorder = "array",categoryarray = ~V2)) 448 | pbio <- pbio %>% layout(xaxis = list(title = 'Number of sequences')) 449 | pbio <- pbio %>% layout(font = list(size=10)) 450 | pbio <- pbio %>% layout(title = list(text='Biological Processes GOs',x=.7)) 451 | pbio <- pbio %>% config(toImageButtonOptions=list(format='svg',filename='GO_biological_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 452 | pbio 453 | ```
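The three GO chunks above differ only in the input CSV, the bar color, and the title. A sketch
of a helper that could generate all three plots (an illustrative refactor, not part of the
pipeline):

```r
# hypothetical helper consolidating the three GO bar-plot chunks
go_barplot <- function(csv_file, bar_color, plot_title, svg_name) {
  d <- read.csv(csv_file, header = FALSE)  # V1 = count, V2 = GO category
  plot_ly(d, x = ~V1, y = ~V2, text = ~V1, textposition = 'outside', opacity = 0.8,
          marker = list(color = bar_color, line = list(color = '#08306B', width = 1.5))) %>%
    layout(yaxis = list(title = '', autorange = "reversed",
                        categoryorder = "array", categoryarray = ~V2),
           xaxis = list(title = 'Number of sequences'),
           font = list(size = 10), title = list(text = plot_title, x = .7)) %>%
    config(toImageButtonOptions = list(format = 'svg', filename = svg_name,
                                       height = 500, width = 800, scale = 1),
           displaylogo = FALSE)
}
# e.g. go_barplot(paste0(params$sample_id, "_GO_biological.csv"),
#                 '#88B04B', 'Biological Processes GOs', 'GO_biological_plot')
```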
455 |
456 |
457 |
458 | 459 |

  UniProt

460 | 461 |
462 |
463 | 464 | ```{r custom_uniprot_plot, echo=FALSE} 465 | dataUni=read.csv(paste(params$sample_id,sep="","_custom_uniprot_hits.csv"), header=F) 466 | p3 <- plot_ly(dataUni,x=~V1, y=~V2, text = ~V1, textposition = 'outside', marker = list(color = '#0e9aa7',line = list(color = '#08306B', width = 1.5)), opacity = 0.8, 467 | hovertemplate = paste('Species: %{y}','
<br>Number of hits: %{x}','<extra></extra>')) 468 | p3 <- p3 %>% layout(yaxis=list(showticklabels=TRUE)) 469 | p3 <- p3 %>% layout(yaxis=list(autorange="reversed")) 470 | p3 <- p3 %>% layout(yaxis = list(title = '',categoryorder = "array",categoryarray = ~V2)) 471 | p3 <- p3 %>% layout(xaxis = list(title = 'Number of sequences')) 472 | p3 <- p3 %>% layout(font = list(size=10)) 473 | p3 <- p3 %>% layout(title = list(text='UniProt Species Hits',x=.7)) 474 | p3 <- p3 %>% config(toImageButtonOptions=list(format='svg',filename='custom_uniprot_plot', height= 500, width= 800, scale= 1), displaylogo = FALSE) 475 | p3 476 | ```
479 |
480 |
481 |
482 | 483 |

  KEGG Pathways

484 | 485 |
486 |
487 | 488 | ```{r pathways_plot, echo=FALSE} 489 | if (params$withKegg == "false") { 490 | writeLines("\n--------------------------------------------------------------\n") 491 | writeLines("\t\t\t\tKEGG analysis was skipped") 492 | writeLines("\n--------------------------------------------------------------\n") 493 | } else { 494 | writeLines("\t\t\t\tPathways from annotation and iPATH") 495 | knitr::include_graphics(paste(params$sample_id,sep="","_kegg.svg")) 496 | } 497 | ``` 498 | 499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 | -------------------------------------------------------------------------------- /conf/busV4list.txt: -------------------------------------------------------------------------------- 1 | ###BACTERIA 2 | ## 3 | Bacteria_(Kingdom) 4 | ## 5 | Acidobacteria_(Phylum) 6 | Actinobacteria_(Phylum) 7 | Bacteroidetes_(Phylum) 8 | Chlamydiae_(Phylum) 9 | Chlorobi_(Phylum) 10 | Chloroflexi_(Phylum) 11 | Cyanobacteria_(Phylum) 12 | Firmicutes_(Phylum) 13 | Fusobacteria_(Phylum) 14 | Planctomycetes_(Phylum) 15 | Proteobacteria_(Phylum) 16 | Spirochaetes_(Phylum) 17 | Synergistetes_(Phylum) 18 | Tenericutes_(Phylum) 19 | Thermotogae_(Phylum) 20 | Verrucomicrobia_(Phylum) 21 | ## 22 | Actinobacteria_(Class) 23 | Alphaproteobacteria_(Class) 24 | Aquificae_(Class) 25 | Bacilli_(Class) 26 | Bacteroidia_(Class) 27 | Betaproteobacteria_(Class) 28 | Clostridia_(Class) 29 | Coriobacteriia_(Class) 30 | Cytophagia_(Class) 31 | Deltaproteobacteria_(Class) 32 | Epsilonproteobacteria_(Class) 33 | Flavobacteriia_(Class) 34 | Gammaproteobacteria_(Class) 35 | Mollicutes_(Class) 36 | Sphingobacteriia_(Class) 37 | Spirochaetia_(Class) 38 | Tissierellia_(Class) 39 | ## 40 | Alteromonadales_(Order) 41 | Bacillales_(Order) 42 | Bacteroidales_(Order) 43 | Burkholderiales_(Order) 44 | Campylobacterales_(Order) 45 | Cellvibrionales_(Order) 46 | Chromatiales_(Order) 47 | Chroococcales_(Order) 48 | Clostridiales_(Order) 49 | Coriobacteriales_(Order) 50 | Corynebacteriales_(Order) 51 | Cytophagales_(Order) 52 | Desulfobacterales_(Order) 53 | Desulfovibrionales_(Order) 54 | Desulfuromonadales_(Order) 55 | Enterobacterales_(Order) 56 | Entomoplasmatales_(Order) 57 | Flavobacteriales_(Order) 58 | Fusobacteriales_(Order) 59 | Lactobacillales_(Order) 60 | Legionellales_(Order) 61 | Micrococcales_(Order) 62 | Mycoplasmatales_(Order) 63 | Neisseriales_(Order) 64 | Nitrosomonadales_(Order) 65 | Nostocales_(Order) 66 | Oceanospirillales_(Order) 67 | Oscillatoriales_(Order) 68 | Pasteurellales_(Order) 69 | Propionibacteriales_(Order) 70 | Pseudomonadales_(Order) 71 | Rhizobiales_(Order) 72 | Rhodobacterales_(Order) 73 | Rhodospirillales_(Order) 74 | Rickettsiales_(Order) 75 | Selenomonadales_(Order) 76 | Sphingomonadales_(Order) 77 | Spirochaetales_(Order) 78 | Streptomycetales_(Order) 79 | Streptosporangiales_(Order) 80 | Synechococcales_(Order) 81 | Thermoanaerobacterales_(Order) 82 | Thiotrichales_(Order) 83 | Tissierellales_(Order) 84 | Vibrionales_(Order) 85 | Xanthomonadales_(Order) 86 | ## 87 | Bacteroidetes-Chlorobi_group_(Other) 88 | Rhizobium-Agrobacterium_group_(Other) 89 | delta-epsilon-subdivisions_(Other) 90 | #MAIN_MENU 91 | ###EUKARYOTA 92 | Eukaryota_(Superkingdom) 93 | ## 94 | Arthropoda_(Phylum) 95 | Fungi_(Kingdom) 96 | Plants_(Kingdom) 97 | Protists_(Clade) 98 | Vertebrata_(Sub_phylum) 99 | ## 100 | Metazoa_(Other) 101 | Mollusca_(Other) 102 | Nematoda_(Other) 103 | #MAIN_MENU 104 | ###ARCHAEA 105 | Archaea_(Kingdom) 106 | ## 107 | Euryarchaeota_(Phylum) 108 | Thaumarchaeota_(Phylum) 109 | ## 110 | Halobacteria_(Class) 111 | Methanobacteria_(Class) 112 | Methanomicrobia_(Class) 113 | Thermoplasmata_(Class) 114 | Thermoprotei_(Class) 115 | ## 116 | Desulfurococcales_(Order) 117 | Halobacteriales_(Order) 118 | Haloferacales_(Order) 119 | Methanococcales_(Order) 120 | Methanomicrobiales_(Order) 121 | Natrialbales_(Order) 122 | Sulfolobales_(Order) 123 | Thermoproteales_(Order) 124 | #MAIN_MENU 125 | ##ARTHROPODA 126 | Arthropoda_(Phylum) 127 | ## 128 | Arachnida_(Class) 129 | Insecta_(Class) 130 | ## 131 | 
Diptera_(Order) 132 | Hemiptera_(Order) 133 | Hymenoptera_(Order) 134 | Lepidoptera_(Order) 135 | ## 136 | Endopterygota_(Other) 137 | #MAIN_MENU 138 | ##FUNGI 139 | Fungi_(Kingdom) 140 | ## 141 | Ascomycota_(Phylum) 142 | Basidiomycota_(Phylum) 143 | Microsporidia_(Phylum) 144 | Mucoromycota_(Phylum) 145 | ## 146 | Agaricomycetes_(Class) 147 | Dothideomycetes_(Class) 148 | Eurotiomycetes_(Class) 149 | Leotiomycetes_(Class) 150 | Saccharomycetes_(Class) 151 | Sordariomycetes_(Class) 152 | Tremellomycetes_(Class) 153 | ## 154 | Agaricales_(Order) 155 | Boletales_(Order) 156 | Capnodiales_(Order) 157 | Chaetothyriales_(Order) 158 | Eurotiales_(Order) 159 | Glomerellales_(Order) 160 | Helotiales_(Order) 161 | Hypocreales_(Order) 162 | Mucorales_(Order) 163 | Onygenales_(Order) 164 | Pleosporales_(Order) 165 | Polyporales_(Order) 166 | ## 167 | #MAIN_MENU 168 | ##PLANTS 169 | Viridiplantae_(Kingdom) 170 | ## 171 | Chlorophyta_(Phylum) 172 | ## 173 | Liliopsida_(Class) 174 | ## 175 | Brassicales_(Order) 176 | Eudicots_(Order) 177 | Solanales_(Order) 178 | Poales_(Order) 179 | ## 180 | Embryophyta_(other) 181 | Fabales_(Other) 182 | #MAIN_MENU 183 | ##PROTIST 184 | Alveolata_(Sub_clade) 185 | Apicomplexa_(Sub_clade) 186 | Euglenozoa_(Sub_clade) 187 | Stramenopiles_(Sub_clade) 188 | Aconoidasida_(Sub_clade) 189 | Coccidia_(Sub_clade) 190 | Plasmodium_(Sub_clade) 191 | #MAIN_MENU 192 | ##VERTEBRATA 193 | Vertebrata_(Sub_phylum) 194 | ## 195 | Actinopterygii_(Superclass_and_Class) 196 | Aves_(Superclass_and_Class) 197 | Mammalia_(Superclass_and_Class) 198 | Tetrapoda_(Superclass_and_Class) 199 | ## 200 | Carnivora_(Superorder_and_Order) 201 | Cyprinodontiformes_(Superorder_and_Order) 202 | Euarchontoglires_(Superorder_and_Order) 203 | Laurasiatheria_(Superorder_and_Order) 204 | Passeriformes_(Superorder_and_Order) 205 | Primates_(Superorder_and_Order) 206 | ## 207 | Cetartiodactyla_(Other) 208 | Eutheria_(Other) 209 | Glires_(Other) 210 | Sauropsida_(Other) 211 | #MAIN_MENU 212 | ###EXIT 213 | -- 214 | Archaea;https://busco-data.ezlab.org/v4/data/lineages/archaea_odb10.2020-03-06.tar.gz 215 | Euryarchaeota;https://busco-data.ezlab.org/v4/data/lineages/euryarchaeota_odb10.2020-03-06.tar.gz 216 | Thermoplasmata;https://busco-data.ezlab.org/v4/data/lineages/thermoplasmata_odb10.2020-03-06.tar.gz 217 | Thermoproteales;https://busco-data.ezlab.org/v4/data/lineages/thermoproteales_odb10.2020-03-06.tar.gz 218 | Thaumarchaeota;https://busco-data.ezlab.org/v4/data/lineages/thaumarchaeota_odb10.2020-03-06.tar.gz 219 | Halobacteria;https://busco-data.ezlab.org/v4/data/lineages/halobacteria_odb10.2020-03-06.tar.gz 220 | Sulfolobales;https://busco-data.ezlab.org/v4/data/lineages/sulfolobales_odb10.2020-03-06.tar.gz 221 | Methanobacteria;https://busco-data.ezlab.org/v4/data/lineages/methanobacteria_odb10.2020-03-06.tar.gz 222 | Desulfurococcales;https://busco-data.ezlab.org/v4/data/lineages/desulfurococcales_odb10.2020-03-06.tar.gz 223 | Methanomicrobia;https://busco-data.ezlab.org/v4/data/lineages/methanomicrobia_odb10.2020-03-06.tar.gz 224 | Methanococcales;https://busco-data.ezlab.org/v4/data/lineages/methanococcales_odb10.2020-03-06.tar.gz 225 | Thermoprotei;https://busco-data.ezlab.org/v4/data/lineages/thermoprotei_odb10.2020-03-06.tar.gz 226 | Methanomicrobiales;https://busco-data.ezlab.org/v4/data/lineages/methanomicrobiales_odb10.2020-03-06.tar.gz 227 | Halobacteriales;https://busco-data.ezlab.org/v4/data/lineages/halobacteriales_odb10.2020-03-06.tar.gz 228 | 
Natrialbales;https://busco-data.ezlab.org/v4/data/lineages/natrialbales_odb10.2020-03-06.tar.gz 229 | Haloferacales;https://busco-data.ezlab.org/v4/data/lineages/haloferacales_odb10.2020-03-06.tar.gz 230 | -- 231 | Arthropoda;https://busco-data.ezlab.org/v4/data/lineages/arthropoda_odb10.2020-09-10.tar.gz 232 | Insecta;https://busco-data.ezlab.org/v4/data/lineages/insecta_odb10.2020-09-10.tar.gz 233 | Diptera;https://busco-data.ezlab.org/v4/data/lineages/diptera_odb10.2020-08-05.tar.gz 234 | Endopterygota;https://busco-data.ezlab.org/v4/data/lineages/endopterygota_odb10.2020-09-10.tar.gz 235 | Arachnida;https://busco-data.ezlab.org/v4/data/lineages/arachnida_odb10.2020-08-05.tar.gz 236 | Hymenoptera;https://busco-data.ezlab.org/v4/data/lineages/hymenoptera_odb10.2020-08-05.tar.gz 237 | Lepidoptera;https://busco-data.ezlab.org/v4/data/lineages/lepidoptera_odb10.2020-08-05.tar.gz 238 | Hemiptera;https://busco-data.ezlab.org/v4/data/lineages/hemiptera_odb10.2020-08-05.tar.gz 239 | -- 240 | Bacteria;https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2020-03-06.tar.gz 241 | Acidobacteria;https://busco-data.ezlab.org/v4/data/lineages/acidobacteria_odb10.2020-03-06.tar.gz 242 | Actinobacteria;https://busco-data.ezlab.org/v4/data/lineages/actinobacteria_class_odb10.2020-03-06.tar.gz 243 | Alteromonadales;https://busco-data.ezlab.org/v4/data/lineages/alteromonadales_odb10.2020-03-06.tar.gz 244 | Bacteroidetes-Chlorobi_group;https://busco-data.ezlab.org/v4/data/lineages/bacteroidetes-chlorobi_group_odb10.2020-03-06.tar.gz 245 | Actinobacteria;https://busco-data.ezlab.org/v4/data/lineages/actinobacteria_phylum_odb10.2020-03-06.tar.gz 246 | Alphaproteobacteria;https://busco-data.ezlab.org/v4/data/lineages/alphaproteobacteria_odb10.2020-03-06.tar.gz 247 | Bacillales;https://busco-data.ezlab.org/v4/data/lineages/bacillales_odb10.2020-03-06.tar.gz 248 | Rhizobium-Agrobacterium_group;https://busco-data.ezlab.org/v4/data/lineages/rhizobium-agrobacterium_group_odb10.2020-03-06.tar.gz 249 | Bacteroidetes;https://busco-data.ezlab.org/v4/data/lineages/bacteroidetes_odb10.2020-03-06.tar.gz 250 | Aquificae;https://busco-data.ezlab.org/v4/data/lineages/aquificae_odb10.2020-03-06.tar.gz 251 | Bacteroidales;https://busco-data.ezlab.org/v4/data/lineages/bacteroidales_odb10.2020-03-06.tar.gz 252 | delta-epsilon-subdivisions;https://busco-data.ezlab.org/v4/data/lineages/delta-epsilon-subdivisions_odb10.2020-03-06.tar.gz 253 | Chlamydiae;https://busco-data.ezlab.org/v4/data/lineages/chlamydiae_odb10.2020-03-06.tar.gz 254 | Bacilli;https://busco-data.ezlab.org/v4/data/lineages/bacilli_odb10.2020-03-06.tar.gz 255 | Burkholderiales;https://busco-data.ezlab.org/v4/data/lineages/burkholderiales_odb10.2020-03-06.tar.gz 256 | Chlorobi;https://busco-data.ezlab.org/v4/data/lineages/chlorobi_odb10.2020-03-06.tar.gz 257 | Bacteroidia;https://busco-data.ezlab.org/v4/data/lineages/bacteroidia_odb10.2020-03-06.tar.gz 258 | Campylobacterales;https://busco-data.ezlab.org/v4/data/lineages/campylobacterales_odb10.2020-03-06.tar.gz 259 | Chloroflexi;https://busco-data.ezlab.org/v4/data/lineages/chloroflexi_odb10.2020-03-06.tar.gz 260 | Betaproteobacteria;https://busco-data.ezlab.org/v4/data/lineages/betaproteobacteria_odb10.2020-03-06.tar.gz 261 | Cellvibrionales;https://busco-data.ezlab.org/v4/data/lineages/cellvibrionales_odb10.2020-03-06.tar.gz 262 | Cyanobacteria;https://busco-data.ezlab.org/v4/data/lineages/cyanobacteria_odb10.2020-03-06.tar.gz 263 | 
Clostridia;https://busco-data.ezlab.org/v4/data/lineages/clostridia_odb10.2020-03-06.tar.gz 264 | Chromatiales;https://busco-data.ezlab.org/v4/data/lineages/chromatiales_odb10.2020-03-06.tar.gz 265 | Firmicutes;https://busco-data.ezlab.org/v4/data/lineages/firmicutes_odb10.2020-03-06.tar.gz 266 | Coriobacteriia;https://busco-data.ezlab.org/v4/data/lineages/coriobacteriia_odb10.2020-03-06.tar.gz 267 | Chroococcales;https://busco-data.ezlab.org/v4/data/lineages/chroococcales_odb10.2020-03-06.tar.gz 268 | Fusobacteria;https://busco-data.ezlab.org/v4/data/lineages/fusobacteria_odb10.2020-03-06.tar.gz 269 | Cytophagia;https://busco-data.ezlab.org/v4/data/lineages/cytophagia_odb10.2020-03-06.tar.gz 270 | Clostridiales;https://busco-data.ezlab.org/v4/data/lineages/clostridiales_odb10.2020-03-06.tar.gz 271 | Planctomycetes;https://busco-data.ezlab.org/v4/data/lineages/planctomycetes_odb10.2020-03-06.tar.gz 272 | Deltaproteobacteria;https://busco-data.ezlab.org/v4/data/lineages/deltaproteobacteria_odb10.2020-03-06.tar.gz 273 | Coriobacteriales;https://busco-data.ezlab.org/v4/data/lineages/coriobacteriales_odb10.2020-03-06.tar.gz 274 | Proteobacteria;https://busco-data.ezlab.org/v4/data/lineages/proteobacteria_odb10.2020-03-06.tar.gz 275 | Epsilonproteobacteria;https://busco-data.ezlab.org/v4/data/lineages/epsilonproteobacteria_odb10.2020-03-06.tar.gz 276 | Corynebacteriales;https://busco-data.ezlab.org/v4/data/lineages/corynebacteriales_odb10.2020-03-06.tar.gz 277 | Spirochaetes;https://busco-data.ezlab.org/v4/data/lineages/spirochaetes_odb10.2020-03-06.tar.gz 278 | Flavobacteriia;https://busco-data.ezlab.org/v4/data/lineages/flavobacteriia_odb10.2020-03-06.tar.gz 279 | Cytophagales;https://busco-data.ezlab.org/v4/data/lineages/cytophagales_odb10.2020-03-06.tar.gz 280 | Synergistetes;https://busco-data.ezlab.org/v4/data/lineages/synergistetes_odb10.2020-03-06.tar.gz 281 | Gammaproteobacteria;https://busco-data.ezlab.org/v4/data/lineages/gammaproteobacteria_odb10.2020-03-06.tar.gz 282 | Desulfobacterales;https://busco-data.ezlab.org/v4/data/lineages/desulfobacterales_odb10.2020-03-06.tar.gz 283 | Tenericutes;https://busco-data.ezlab.org/v4/data/lineages/tenericutes_odb10.2020-03-06.tar.gz 284 | Mollicutes;https://busco-data.ezlab.org/v4/data/lineages/mollicutes_odb10.2020-03-06.tar.gz 285 | Desulfovibrionales;https://busco-data.ezlab.org/v4/data/lineages/desulfovibrionales_odb10.2020-03-06.tar.gz 286 | Thermotogae;https://busco-data.ezlab.org/v4/data/lineages/thermotogae_odb10.2020-03-06.tar.gz 287 | Sphingobacteriia;https://busco-data.ezlab.org/v4/data/lineages/sphingobacteriia_odb10.2020-03-06.tar.gz 288 | Desulfuromonadales;https://busco-data.ezlab.org/v4/data/lineages/desulfuromonadales_odb10.2020-03-06.tar.gz 289 | Verrucomicrobia;https://busco-data.ezlab.org/v4/data/lineages/verrucomicrobia_odb10.2020-03-06.tar.gz 290 | Spirochaetia;https://busco-data.ezlab.org/v4/data/lineages/spirochaetia_odb10.2020-03-06.tar.gz 291 | Enterobacterales;https://busco-data.ezlab.org/v4/data/lineages/enterobacterales_odb10.2020-03-06.tar.gz 292 | Tissierellia;https://busco-data.ezlab.org/v4/data/lineages/tissierellia_odb10.2020-03-06.tar.gz 293 | Entomoplasmatales;https://busco-data.ezlab.org/v4/data/lineages/entomoplasmatales_odb10.2020-03-06.tar.gz 294 | Flavobacteriales;https://busco-data.ezlab.org/v4/data/lineages/flavobacteriales_odb10.2020-03-06.tar.gz 295 | Fusobacteriales;https://busco-data.ezlab.org/v4/data/lineages/fusobacteriales_odb10.2020-03-06.tar.gz 296 | 
Lactobacillales;https://busco-data.ezlab.org/v4/data/lineages/lactobacillales_odb10.2020-03-06.tar.gz 297 | Legionellales;https://busco-data.ezlab.org/v4/data/lineages/legionellales_odb10.2020-03-06.tar.gz 298 | Micrococcales;https://busco-data.ezlab.org/v4/data/lineages/micrococcales_odb10.2020-03-06.tar.gz 299 | Mycoplasmatales;https://busco-data.ezlab.org/v4/data/lineages/mycoplasmatales_odb10.2020-03-06.tar.gz 300 | Neisseriales;https://busco-data.ezlab.org/v4/data/lineages/neisseriales_odb10.2020-03-06.tar.gz 301 | Nitrosomonadales;https://busco-data.ezlab.org/v4/data/lineages/nitrosomonadales_odb10.2020-03-06.tar.gz 302 | Nostocales;https://busco-data.ezlab.org/v4/data/lineages/nostocales_odb10.2020-03-06.tar.gz 303 | Oceanospirillales;https://busco-data.ezlab.org/v4/data/lineages/oceanospirillales_odb10.2020-03-06.tar.gz 304 | Oscillatoriales;https://busco-data.ezlab.org/v4/data/lineages/oscillatoriales_odb10.2020-03-06.tar.gz 305 | Pasteurellales;https://busco-data.ezlab.org/v4/data/lineages/pasteurellales_odb10.2020-03-06.tar.gz 306 | Propionibacteriales;https://busco-data.ezlab.org/v4/data/lineages/propionibacteriales_odb10.2020-03-06.tar.gz 307 | Pseudomonadales;https://busco-data.ezlab.org/v4/data/lineages/pseudomonadales_odb10.2020-03-06.tar.gz 308 | Rhizobiales;https://busco-data.ezlab.org/v4/data/lineages/rhizobiales_odb10.2020-03-06.tar.gz 309 | Rhodobacterales;https://busco-data.ezlab.org/v4/data/lineages/rhodobacterales_odb10.2020-03-06.tar.gz 310 | Rhodospirillales;https://busco-data.ezlab.org/v4/data/lineages/rhodospirillales_odb10.2020-03-06.tar.gz 311 | Rickettsiales;https://busco-data.ezlab.org/v4/data/lineages/rickettsiales_odb10.2020-03-06.tar.gz 312 | Selenomonadales;https://busco-data.ezlab.org/v4/data/lineages/selenomonadales_odb10.2020-03-06.tar.gz 313 | Sphingomonadales;https://busco-data.ezlab.org/v4/data/lineages/sphingomonadales_odb10.2020-03-06.tar.gz 314 | Spirochaetales;https://busco-data.ezlab.org/v4/data/lineages/spirochaetales_odb10.2020-03-06.tar.gz 315 | Streptomycetales;https://busco-data.ezlab.org/v4/data/lineages/streptomycetales_odb10.2020-03-06.tar.gz 316 | Streptosporangiales;https://busco-data.ezlab.org/v4/data/lineages/streptosporangiales_odb10.2020-03-06.tar.gz 317 | Synechococcales;https://busco-data.ezlab.org/v4/data/lineages/synechococcales_odb10.2020-03-06.tar.gz 318 | Thermoanaerobacterales;https://busco-data.ezlab.org/v4/data/lineages/thermoanaerobacterales_odb10.2020-03-06.tar.gz 319 | Thiotrichales;https://busco-data.ezlab.org/v4/data/lineages/thiotrichales_odb10.2020-03-06.tar.gz 320 | Tissierellales;https://busco-data.ezlab.org/v4/data/lineages/tissierellales_odb10.2020-03-06.tar.gz 321 | Vibrionales;https://busco-data.ezlab.org/v4/data/lineages/vibrionales_odb10.2020-03-06.tar.gz 322 | Xanthomonadales;https://busco-data.ezlab.org/v4/data/lineages/xanthomonadales_odb10.2020-03-06.tar.gz 323 | -- 324 | Eukaryota;https://busco-data.ezlab.org/v4/data/lineages/eukaryota_odb10.2020-09-10.tar.gz 325 | Metazoa;https://busco-data.ezlab.org/v4/data/lineages/metazoa_odb10.2020-09-10.tar.gz 326 | Mollusca;https://busco-data.ezlab.org/v4/data/lineages/mollusca_odb10.2020-08-05.tar.gz 327 | Nematoda;https://busco-data.ezlab.org/v4/data/lineages/nematoda_odb10.2020-08-05.tar.gz 328 | -- 329 | Fungi;https://busco-data.ezlab.org/v4/data/lineages/fungi_odb10.2020-09-10.tar.gz 330 | Ascomycota;https://busco-data.ezlab.org/v4/data/lineages/ascomycota_odb10.2020-09-10.tar.gz 331 | 
Agaricomycetes;https://busco-data.ezlab.org/v4/data/lineages/agaricomycetes_odb10.2020-08-05.tar.gz 332 | Agaricales;https://busco-data.ezlab.org/v4/data/lineages/agaricales_odb10.2020-08-05.tar.gz 333 | Basidiomycota;https://busco-data.ezlab.org/v4/data/lineages/basidiomycota_odb10.2020-09-10.tar.gz 334 | Dothideomycetes;https://busco-data.ezlab.org/v4/data/lineages/dothideomycetes_odb10.2020-08-05.tar.gz 335 | Boletales;https://busco-data.ezlab.org/v4/data/lineages/boletales_odb10.2020-08-05.tar.gz 336 | Microsporidia;https://busco-data.ezlab.org/v4/data/lineages/microsporidia_odb10.2020-08-05.tar.gz 337 | Eurotiomycetes;https://busco-data.ezlab.org/v4/data/lineages/eurotiomycetes_odb10.2020-08-05.tar.gz 338 | Capnodiales;https://busco-data.ezlab.org/v4/data/lineages/capnodiales_odb10.2020-08-05.tar.gz 339 | Mucoromycota;https://busco-data.ezlab.org/v4/data/lineages/mucoromycota_odb10.2020-08-05.tar.gz 340 | Leotiomycetes;https://busco-data.ezlab.org/v4/data/lineages/leotiomycetes_odb10.2020-08-05.tar.gz 341 | Chaetothyriales;https://busco-data.ezlab.org/v4/data/lineages/chaetothyriales_odb10.2020-08-05.tar.gz 342 | Saccharomycetes;https://busco-data.ezlab.org/v4/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz 343 | Eurotiales;https://busco-data.ezlab.org/v4/data/lineages/eurotiales_odb10.2020-08-05.tar.gz 344 | Sordariomycetes;https://busco-data.ezlab.org/v4/data/lineages/sordariomycetes_odb10.2020-08-05.tar.gz 345 | Glomerellales;https://busco-data.ezlab.org/v4/data/lineages/glomerellales_odb10.2020-08-05.tar.gz 346 | Tremellomycetes;https://busco-data.ezlab.org/v4/data/lineages/tremellomycetes_odb10.2020-08-05.tar.gz 347 | Helotiales;https://busco-data.ezlab.org/v4/data/lineages/helotiales_odb10.2020-08-05.tar.gz 348 | Hypocreales;https://busco-data.ezlab.org/v4/data/lineages/hypocreales_odb10.2020-08-05.tar.gz 349 | Mucorales;https://busco-data.ezlab.org/v4/data/lineages/mucorales_odb10.2020-08-05.tar.gz 350 | Onygenales;https://busco-data.ezlab.org/v4/data/lineages/onygenales_odb10.2020-08-05.tar.gz 351 | Pleosporales;https://busco-data.ezlab.org/v4/data/lineages/pleosporales_odb10.2020-08-05.tar.gz 352 | Polyporales;https://busco-data.ezlab.org/v4/data/lineages/polyporales_odb10.2020-08-05.tar.gz 353 | -- 354 | Viridiplantae;https://busco-data.ezlab.org/v4/data/lineages/viridiplantae_odb10.2020-09-10.tar.gz 355 | Chlorophyta;https://busco-data.ezlab.org/v4/data/lineages/chlorophyta_odb10.2020-08-05.tar.gz 356 | Liliopsida;https://busco-data.ezlab.org/v4/data/lineages/liliopsida_odb10.2020-09-10.tar.gz 357 | Brassicales;https://busco-data.ezlab.org/v4/data/lineages/brassicales_odb10.2020-08-05.tar.gz 358 | Embryophyta;https://busco-data.ezlab.org/v4/data/lineages/embryophyta_odb10.2020-09-10.tar.gz 359 | Eudicots;https://busco-data.ezlab.org/v4/data/lineages/eudicots_odb10.2020-09-10.tar.gz 360 | Fabales;https://busco-data.ezlab.org/v4/data/lineages/fabales_odb10.2020-08-05.tar.gz 361 | Solanales;https://busco-data.ezlab.org/v4/data/lineages/solanales_odb10.2020-08-05.tar.gz 362 | Poales;https://busco-data.ezlab.org/v4/data/lineages/poales_odb10.2020-08-05.tar.gz 363 | -- 364 | Alveolata;https://busco-data.ezlab.org/v4/data/lineages/alveolata_odb10.2020-09-10.tar.gz 365 | Aconoidasida;https://busco-data.ezlab.org/v4/data/lineages/aconoidasida_odb10.2020-08-05.tar.gz 366 | Apicomplexa;https://busco-data.ezlab.org/v4/data/lineages/apicomplexa_odb10.2020-09-10.tar.gz 367 | Coccidia;https://busco-data.ezlab.org/v4/data/lineages/coccidia_odb10.2020-08-05.tar.gz 368 |
Euglenozoa;https://busco-data.ezlab.org/v4/data/lineages/euglenozoa_odb10.2020-08-05.tar.gz 369 | Plasmodium;https://busco-data.ezlab.org/v4/data/lineages/plasmodium_odb10.2020-08-05.tar.gz 370 | Stramenopiles;https://busco-data.ezlab.org/v4/data/lineages/stramenopiles_odb10.2020-08-05.tar.gz 371 | -- 372 | Vertebrata;https://busco-data.ezlab.org/v4/data/lineages/vertebrata_odb10.2020-09-10.tar.gz 373 | Tetrapoda;https://busco-data.ezlab.org/v4/data/lineages/tetrapoda_odb10.2020-09-10.tar.gz 374 | Euarchontoglires;https://busco-data.ezlab.org/v4/data/lineages/euarchontoglires_odb10.2020-08-05.tar.gz 375 | Cetartiodactyla;https://busco-data.ezlab.org/v4/data/lineages/cetartiodactyla_odb10.2020-08-05.tar.gz 376 | Actinopterygii;https://busco-data.ezlab.org/v4/data/lineages/actinopterygii_odb10.2020-08-05.tar.gz 377 | Laurasiatheria;https://busco-data.ezlab.org/v4/data/lineages/laurasiatheria_odb10.2020-09-10.tar.gz 378 | Eutheria;https://busco-data.ezlab.org/v4/data/lineages/eutheria_odb10.2020-09-10.tar.gz 379 | Mammalia;https://busco-data.ezlab.org/v4/data/lineages/mammalia_odb10.2020-09-10.tar.gz 380 | Carnivora;https://busco-data.ezlab.org/v4/data/lineages/carnivora_odb10.2020-08-05.tar.gz 381 | Glires;https://busco-data.ezlab.org/v4/data/lineages/glires_odb10.2020-08-05.tar.gz 382 | Aves;https://busco-data.ezlab.org/v4/data/lineages/aves_odb10.2020-09-10.tar.gz 383 | Cyprinodontiformes;https://busco-data.ezlab.org/v4/data/lineages/cyprinodontiformes_odb10.2020-08-05.tar.gz 384 | Sauropsida;https://busco-data.ezlab.org/v4/data/lineages/sauropsida_odb10.2020-09-10.tar.gz 385 | Passeriformes;https://busco-data.ezlab.org/v4/data/lineages/passeriformes_odb10.2020-08-05.tar.gz 386 | Primates;https://busco-data.ezlab.org/v4/data/lineages/primates_odb10.2020-08-05.tar.gz 387 | -------------------------------------------------------------------------------- /precheck_TransPi.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export mypwd="$1" 3 | os_c() { #Check the OS and download the Anaconda3 installer (Linux only) 4 | OS="$(uname)" 5 | if [ "$OS" == "Linux" ]; then 6 | echo -e "\n\t -- Downloading Linux Anaconda3 installation -- \n" 7 | curl -o Anaconda3-2020.11-Linux-x86_64.sh https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh 8 | else 9 | echo -e "\n\t\e[31m -- ERROR: Are you in a Linux system? Please check requirements and rerun the pre-check --\e[39m\n" 10 | exit 0 11 | fi 12 | } 13 | source_c() { #Reload ~/.bashrc if present 14 | if [ -f ~/.bashrc ];then 15 | source ~/.bashrc 16 | fi 17 | } 18 | cleanConda () { 19 | cd $mypwd 20 | echo -e "\n\t -- Cleaning conda environment -- \n" 21 | conda clean -a -y 22 | echo -e "\n\t -- Done cleaning conda environment -- \n" 23 | } 24 | conda_only() { 25 | source_c 26 | #Check conda and environment 27 | check_conda=$( command -v conda ) 28 | if [ "$check_conda" != "" ];then #&& [ "$ver" -gt "45" ];then 29 | echo -e "\n\t -- Conda seems to be installed in your system --\n" 30 | ver=$( conda -V | awk '{print $2}' | cut -f 1,2 -d "." | tr -d "." 
) 31 | vern=48 32 | if [ $( echo "$ver >= $vern" | bc -l ) -eq 1 ];then 33 | echo -e "\n\t -- Conda is installed (v4.8 or higher) --\n" 34 | #cleanConda 35 | fi 36 | else 37 | echo -e "\n\t -- Conda is not installed --\n" 38 | os_c 39 | echo -e "\n\t -- Starting Anaconda installation -- \n" 40 | bash Anaconda3-20*.sh 41 | echo -e "\n\t -- Installation done -- \n" 42 | rm Anaconda3-20*.sh 43 | source_c 44 | fi 45 | } 46 | conda_c() { 47 | source_c 48 | #Check conda and environment 49 | check_conda=$( command -v conda ) 50 | if [ "$check_conda" != "" ];then #&& [ "$ver" -gt "45" ];then 51 | echo -e "\n\t -- Conda seems to be installed in your system --\n" 52 | ver=$( conda -V | awk '{print $2}' | cut -f 1,2 -d "." | tr -d "." ) 53 | vern=48 54 | if [ $( echo "$ver >= $vern" | bc -l ) -eq 1 ];then 55 | echo -e "\n\t -- Conda is installed (v4.8 or higher). Checking environment... --\n" 56 | #Check environment 57 | check_env=$( conda info -e | awk '$1 == "TransPi" {print $2}' | wc -l ) 58 | if [ "$check_env" -eq 0 ];then 59 | echo -e "\n\t -- TransPi environment has not been created. Checking environment file... --\n" 60 | if [ -f ${confDir}/transpi_env.yml ];then 61 | echo -e "\n\t -- TransPi environment file found. Creating environment... --\n" 62 | conda env create -f ${confDir}/transpi_env.yml 63 | else 64 | echo -e "\n\t\e[31m -- ERROR: TransPi environment file not found (transpi_env.yml). Please run the precheck in the TransPi directory. See manual for more info --\e[39m\n" 65 | exit 0 66 | fi 67 | elif [ "$check_env" -eq 1 ];then 68 | echo -e "\n\t -- TransPi environment is installed and ready to be used --\n" 69 | fi 70 | fi 71 | else 72 | echo -e "\n\t -- Conda is not installed --\n" 73 | os_c 74 | echo -e "\n\t -- Starting Anaconda installation -- \n" 75 | bash Anaconda3-20*.sh 76 | echo -e "\n\t -- Installation done -- \n" 77 | rm Anaconda3-20*.sh 78 | source_c 79 | if [ -f ${confDir}/transpi_env.yml ];then 80 | echo -e "\n\t -- TransPi environment file found. Creating environment... --\n" 81 | conda env create -f ${confDir}/transpi_env.yml 82 | else 83 | echo -e "\n\t\e[31m -- ERROR: TransPi environment file not found (transpi_env.yml). Please run the precheck in the TransPi directory. See manual for more info --\e[39m\n" 84 | exit 0 85 | fi 86 | fi 87 | } 88 | dir_c () { #Create the scripts/ and DBs/ directories in the work PATH 89 | cd $mypwd 90 | if [ ! -d scripts/ ];then 91 | mkdir scripts 92 | fi 93 | if [ ! -d DBs ];then 94 | mkdir DBs 95 | fi 96 | } 97 | bus_dow () { #Download and extract the selected BUSCO V4 lineage database 98 | name=$1 99 | cd $mypwd 100 | if [ ! -d DBs/busco_db/ ];then 101 | echo -e "\n\t -- Creating directory for the BUSCO V4 database --\n" 102 | mkdir -p DBs/busco_db 103 | cd DBs/busco_db 104 | bname=$( echo $name | cut -f 1 -d "_" ) 105 | if [ `cat ${confDir}/conf/busV4list.txt | grep "${bname};" | wc -l` -eq 1 ];then 106 | echo -e "\n\t -- Downloading BUSCO V4 \"$name\" database --\n";wait 107 | wname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 2 -d ";" ) 108 | wget --no-check-certificate $wname 109 | echo -e "\n\t -- Preparing files ... 
--\n";wait 110 | tname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 1 -d ";" | tr [A-Z] [a-z] ) 111 | tar -xf ${tname}*.tar.gz 112 | rm ${tname}*.tar.gz 113 | echo -e "\n\t -- DONE with BUSCO V4 database --\n";wait 114 | fi 115 | dname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 1 -d ";" | tr [A-Z] [a-z] ) 116 | if [ -d ${dname}_odb10 ];then 117 | export busna=${dname}_odb10 118 | fi 119 | elif [ -d DBs/busco_db/ ];then 120 | cd DBs/busco_db 121 | bname=$( echo $name | cut -f 1 -d "_" ) 122 | dname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 1 -d ";" | tr [A-Z] [a-z] ) 123 | if [ -d ${dname}_odb10 ];then 124 | echo -e "\n\t -- BUSCO V4 \"$name\" database found -- \n" 125 | export busna=${dname}_odb10 126 | else 127 | bname=$( echo $name | cut -f 1 -d "_" ) 128 | if [ `cat ${confDir}/conf/busV4list.txt | grep "${bname};" | wc -l` -eq 1 ];then 129 | echo -e "\n\t -- Downloading BUSCO V4 \"$name\" database --\n";wait 130 | wname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 2 -d ";" ) 131 | wget --no-check-certificate $wname 132 | echo -e "\n\t -- Preparing files ... --\n";wait 133 | tname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 1 -d ";" | tr [A-Z] [a-z] ) 134 | tar -xvf ${tname}*.tar.gz 135 | rm ${tname}*.tar.gz 136 | echo -e "\n\t -- DONE with BUSCO V4 database --\n";wait 137 | fi 138 | dname=$( cat ${confDir}/conf/busV4list.txt | grep "${bname};" | cut -f 1 -d ";" | tr [A-Z] [a-z] ) 139 | if [ -d ${dname}_odb10 ];then 140 | export busna=${dname}_odb10 141 | fi 142 | fi 143 | fi 144 | } 145 | bus_c () { 146 | cd $mypwd 147 | echo -e "\n\t -- Selecting BUSCO V4 database -- \n" 148 | PS3=" 149 | Please select one (1-5): " 150 | if [ -f ${confDir}/conf/busV4list.txt ];then 151 | select var in `cat ${confDir}/conf/busV4list.txt | grep "###" | tr -d "#"`;do 152 | case $var in 153 | BACTERIA) 154 | echo -e "\n\t You selected BACTERIA. Which specific database? \n" 155 | PS3=" 156 | Please select database: " 157 | select var1 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##BACTERIA/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 158 | case $var1 in 159 | MAIN_MENU) 160 | bus_c 161 | ;; 162 | *) 163 | if [ "$var1" != "" ];then 164 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var1"` -ge 1 ];then 165 | bus_dow $var1 166 | fi 167 | else 168 | echo -e "\n\t Wrong option. Try again \n" 169 | bus_c 170 | fi 171 | ;; 172 | esac 173 | break 174 | done 175 | ;; 176 | EUKARYOTA) 177 | echo -e "\n\tYou selected EUKARYOTA. Which specific database? \n" 178 | PS3=" 179 | Please select database: " 180 | select var1 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##EUKARYOTA/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 181 | case $var1 in 182 | MAIN_MENU) 183 | bus_c 184 | ;; 185 | Arthropoda_\(Phylum\)) 186 | select var2 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##ARTHROPODA/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 187 | case $var2 in 188 | MAIN_MENU) 189 | bus_c 190 | ;; 191 | *) 192 | if [ "$var2" != "" ];then 193 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var2"` -ge 1 ];then 194 | bus_dow $var2 195 | fi 196 | else 197 | echo -e "\n\t Wrong option. 
Try again \n" 198 | bus_c 199 | fi 200 | esac 201 | break 202 | done 203 | ;; 204 | Fungi_\(Kingdom\)) 205 | select var2 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##FUNGI/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 206 | case $var2 in 207 | MAIN_MENU) 208 | bus_c 209 | ;; 210 | *) 211 | if [ "$var2" != "" ];then 212 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var2"` -ge 1 ];then 213 | bus_dow $var2 214 | fi 215 | else 216 | echo -e "\n\t Wrong option. Try again \n" 217 | bus_c 218 | fi 219 | esac 220 | break 221 | done 222 | ;; 223 | Plants_\(Kingdom\)) 224 | select var2 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##PLANTS/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 225 | case $var2 in 226 | MAIN_MENU) 227 | bus_c 228 | ;; 229 | *) 230 | if [ "$var2" != "" ];then 231 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var2"` -ge 1 ];then 232 | bus_dow $var2 233 | fi 234 | else 235 | echo -e "\n\t Wrong option. Try again \n" 236 | bus_c 237 | fi 238 | esac 239 | break 240 | done 241 | ;; 242 | Protists_\(Clade\)) 243 | select var2 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##PROTIST/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 244 | case $var2 in 245 | MAIN_MENU) 246 | bus_c 247 | ;; 248 | *) 249 | if [ "$var2" != "" ];then 250 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var2"` -ge 1 ];then 251 | bus_dow $var2 252 | fi 253 | else 254 | echo -e "\n\t Wrong option. Try again \n" 255 | bus_c 256 | fi 257 | esac 258 | break 259 | done 260 | ;; 261 | Vertebrata_\(Sub_phylum\)) 262 | select var2 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##VERTEBRATA/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 263 | case $var2 in 264 | MAIN_MENU) 265 | bus_c 266 | ;; 267 | *) 268 | if [ "$var2" != "" ];then 269 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var2"` -ge 1 ];then 270 | bus_dow $var2 271 | fi 272 | else 273 | echo -e "\n\t Wrong option. Try again \n" 274 | bus_c 275 | fi 276 | esac 277 | break 278 | done 279 | ;; 280 | *) 281 | if [ "$var1" != "" ];then 282 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var1"` -ge 1 ];then 283 | bus_dow $var1 284 | fi 285 | else 286 | echo -e "\n\t Wrong option. Try again \n" 287 | bus_c 288 | fi 289 | ;; 290 | esac 291 | break 292 | done 293 | ;; 294 | ARCHAEA) 295 | echo -e "\n\tYou selected ARCHAEA. Which specific database? \n" 296 | PS3=" 297 | Please select database: " 298 | select var1 in `cat ${confDir}/conf/busV4list.txt | sed -n "/##ARCHAEA/,/#MAIN/p" | grep -v "##" | tr -d "#"`;do 299 | case $var1 in 300 | MAIN_MENU) 301 | bus_c 302 | ;; 303 | *) 304 | if [ "$var1" != "" ];then 305 | if [ `cat ${confDir}/conf/busV4list.txt | grep -c "$var1"` -ge 1 ];then 306 | bus_dow $var1 307 | fi 308 | else 309 | echo -e "\n\t Wrong option. Try again \n" 310 | bus_c 311 | fi 312 | ;; 313 | esac 314 | break 315 | done 316 | ;; 317 | EXIT) 318 | echo -e "\n\t Exiting \n" 319 | exit 0 320 | ;; 321 | *) 322 | echo -e "\n\t Wrong option. Try again \n" 323 | bus_c 324 | ;; 325 | esac 326 | break 327 | done 328 | else 329 | echo -e "\n\t\e[31m -- ERROR: Please make sure that file \"busV4list.txt\" is available. Please run the precheck in the TransPi directory. See manual for more info --\e[39m\n\n" 330 | exit 0 331 | fi 332 | } 333 | uni_c () { 334 | PS3=" 335 | Please select UNIPROT database to use: " 336 | select var in `ls *`;do 337 | if [ "$var" != "" ];then 338 | if [ `echo $var | grep ".gz" | wc -l` -eq 1 ];then 339 | echo -e "\n\n\t -- File is compressed -- \n" 340 | echo -e "\n\n\t -- Uncompressing file ... 
-- \n" 341 | gunzip $var 342 | echo -e "\n\t -- UNIPROT database selected: \"${var%.gz}\" --\n" 343 | export unina=${var%.gz} 344 | else 345 | echo -e "\n\t -- UNIPROT database selected: \"$var\" --\n" 346 | export unina=${var} 347 | fi 348 | else 349 | echo -e "\n\t Wrong option. Try again \n" 350 | uni_c 351 | fi 352 | break 353 | done 354 | } 355 | unicomp_c () { 356 | echo -e -n "\n\t Do you want to uncompress the file(s)? (y,n,exit): " 357 | read ans 358 | case $ans in 359 | [yY] | [yY][eE][sS]) 360 | echo -e "\n\n\t -- Uncompressing file(s) ... -- \n" 361 | gunzip *.gz 362 | ;; 363 | [nN] | [nN][oO]) 364 | echo -e "\n\n\t\e[31m -- ERROR: Please uncompress the file(s) and rerun the pre-check --\e[39m\n" 365 | exit 0 366 | ;; 367 | exit) 368 | echo -e "\n\t -- Exiting -- \n" 369 | exit 0 370 | ;; 371 | *) 372 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 373 | unicomp_c 374 | ;; 375 | esac 376 | } 377 | uniprot_user_DB(){ 378 | echo -e -n "\n\t -- Provide the PATH where to locate your proteins file: " 379 | read -e ans 380 | if [ -d ${ans} ];then 381 | echo -e "\n\t -- Directory ${ans} found -- \n" 382 | cd ${ans} 383 | uni_c 384 | elif [ -d $( dirname ${ans} ) ];then 385 | echo -e "\n\t -- Directory "$( dirname ${ans} )" found -- \n" 386 | cd $( dirname ${ans} ) 387 | uni_c 388 | else 389 | echo -e "\n\t\e[31m -- Directory ${ans} not found --\e[39m\n" 390 | uniprot_meta 391 | fi 392 | } 393 | uniprot_taxon_DB(){ 394 | echo -e "\n\t -- Input the Taxon ID (Taxonomy ID, NCBI txid) of your interest. TransPi will download the proteins from UNIPROT --" 395 | echo -e "\t Example: metazoan TaxID = 33208 -- \n" 396 | echo -e -n "\n\t -- Your Taxon ID (only the numbers): " 397 | read ans 398 | echo -e "\n\t -- Downloading UNIPROT proteins from Taxon ID: $ans -- \n" 399 | curl -o uniprot_${ans}.fasta.gz "https://www.uniprot.org/uniprot/?query=taxonomy:${ans}&format=fasta&compress=yes&include=no" 400 | gunzip uniprot_${ans}.fasta.gz 401 | date -u >.lastrun.txt 402 | uni_c 403 | } 404 | uniprot_meta () { 405 | myuni=$( pwd ) 406 | echo -e "\n\t -- TransPi uses a custom protein database (one of many) from UNIPROT for the annotation -- \n" 407 | echo " 408 | Options available: 409 | 410 | 1- Download metazoan proteins from UNIPROT 411 | 412 | 2- Provide the PATH of my DB 413 | 414 | 3- Provide UNIPROT Taxon ID 415 | 416 | 4- Skip for now 417 | 418 | " 419 | echo -e -n "\t Which option you want? " 420 | read ans 421 | case $ans in 422 | 1) 423 | echo -e "\n\n\t -- Downloading current metazoan protein dataset from UNIPROT -- \n" 424 | echo -e "\n\t -- This could take a couple of minutes depending on connection. Please wait -- \n" 425 | curl -o uniprot_metazoa_33208.fasta.gz "https://www.uniprot.org/uniprot/?query=taxonomy:33208&format=fasta&compress=yes&include=no" 426 | echo -e "\n\t -- Uncompressing uniprot_metazoa_33208.fasta.gz ... -- \n" 427 | gunzip uniprot_metazoa_33208.fasta.gz 428 | date -u >.lastrun.txt 429 | uni_c 430 | ;; 431 | 2) 432 | uniprot_user_DB 433 | ;; 434 | 3) 435 | uniprot_taxon_DB 436 | ;; 437 | 4) 438 | echo -e "\n\t -- Skipping UNIPROT DB -- \n" 439 | ;; 440 | *) 441 | echo -e "\n\t\e[31m -- Wrong option. Try again --\e[39m\n" 442 | uniprot_meta 443 | ;; 444 | esac 445 | } 446 | uniprot_c () { 447 | #Check UNIPROT 448 | cd $mypwd 449 | if [ ! 
-d DBs/uniprot_db/ ];then 450 | echo -e "\n\t -- Creating directory for the UNIPROT database --\n" 451 | mkdir -p DBs/uniprot_db/ 452 | cd DBs/uniprot_db/ 453 | uniprot_meta 454 | elif [ -d DBs/uniprot_db/ ];then 455 | cd DBs/uniprot_db/ 456 | myuni=$( pwd ) 457 | echo -e "\n\t -- UNIPROT database directory found at: $myuni -- \n" 458 | myfasta=$( ls -1 | grep -v ".gz" | egrep ".fasta|.fa" | wc -l ) 459 | myfastagz=$( ls -1 | egrep ".fasta.gz|.fa.gz" | wc -l ) 460 | if [ $myfasta -eq 0 ] && [ $myfastagz -eq 0 ];then 461 | echo -e "\n\t -- Directory \"$myuni\" is empty --\n" 462 | uniprot_meta 463 | else 464 | echo -e "\n\t -- Here is the list of UNIPROT files found at: $myuni -- \n" 465 | uni_c 466 | fi 467 | fi 468 | } 469 | java_c () { 470 | export NXF_VER=21.04.1 && curl -s https://get.nextflow.io | bash 2>.error_nextflow 471 | check_err=$( head -n 1 .error_nextflow | grep -c "java: command not found" ) 472 | if [ $check_err -eq 1 ];then 473 | echo -e "\n\t\e[31m -- ERROR: Please install Java 1.8 (or later). Requirement for Nextflow --\e[39m\n" 474 | exit 0 475 | fi 476 | rm .error_nextflow 477 | } 478 | nextflow_c () { 479 | #Check Nextflow 480 | cd $mypwd 481 | check_next=$( command -v nextflow | wc -l ) 482 | if [ $check_next -eq 1 ];then 483 | echo -e "\n\t -- Nextflow is installed -- \n" 484 | elif [ $check_next -eq 0 ];then 485 | check_next=$( ls -1 | grep -v "nextflow.config" | grep -c "nextflow" ) 486 | if [ $check_next -eq 1 ];then 487 | echo -e "\n\t -- Nextflow is installed -- \n" 488 | else 489 | echo -e -n "\n\t Do you want to install Nextflow? (y or n): " 490 | read ans 491 | case $ans in 492 | [yY] | [yY][eE][sS]) 493 | echo -e "\n\t -- Downloading Nextflow ... -- \n" 494 | java_c 495 | echo -e "\n\t -- Nextflow is now installed on $mypwd (local installation) -- \n" 496 | ;; 497 | [nN] | [nN][oO]) 498 | echo -e "\n\n\t\e[31m -- ERROR: Download and Install Nextflow. Then rerun the pre-check --\e[39m\n" 499 | exit 0 500 | ;; 501 | *) 502 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 503 | nextflow_c 504 | ;; 505 | esac 506 | fi 507 | fi 508 | } 509 | evi_c () { 510 | cd ${confDir} 511 | check_evi=$( command -v tr2aacds.pl | wc -l ) 512 | if [ $check_evi -eq 0 ];then 513 | if [ ! -d ${confDir}/scripts/evigene/ ];then 514 | echo -e "\n\t -- EvidentialGene is not installed -- \n" 515 | mkdir -p ${confDir}/scripts && cd ${confDir}/scripts 516 | echo -e "\n\t -- Downloading EvidentialGene -- \n" 517 | git clone https://github.com/rivera10/TP-evigene.git 518 | mv TP-evigene/evigene . 
&& rm -rf TP-evigene/ 519 | echo -e "\n\t -- Done with EvidentialGene -- \n" 520 | else 521 | echo -e "\n\t -- EvidentialGene directory was found at ${confDir}/scripts (local installation) -- \n" 522 | fi 523 | elif [ $check_evi -eq 1 ];then 524 | echo -e "\n\t -- EvidentialGene is already installed and in the PATH -- \n" 525 | fi 526 | } 527 | buildsql_c () { 528 | cd ${mypwd} 529 | if [ -d DBs/sqlite_db/ ];then 530 | cd DBs/sqlite_db/ 531 | else 532 | mkdir -p DBs/sqlite_db/ 533 | cd DBs/sqlite_db/ 534 | fi 535 | } 536 | condaTrinotate () { 537 | echo -e "\n\t -- Creating Trinotate conda environment -- \n" 538 | conda create --mkdir --yes --quiet -n TPtrinotate -c conda-forge bioconda::trinotate=3.2.2=pl5262hdfd78af_0 539 | source ~/.bashrc 540 | conda activate TPtrinotate 541 | echo -e "\n\t -- Done with Trinotate conda environment -- \n" 542 | } 543 | condaTrinotateEnd () { 544 | conda deactivate 545 | conda remove -n TPtrinotate --all -y 546 | } 547 | trisql_container () { 548 | if [ ! -e *.sqlite ];then 549 | echo -e "\n\n\t -- Custom sqlite database for Trinotate is not installed -- \n" 550 | echo -e "\n\t -- This could take a couple of minutes depending on connection. Please wait -- \n" 551 | rm -rf * 552 | wget https://github.com/Trinotate/Trinotate/archive/Trinotate-v3.2.2.tar.gz 553 | tar -xf Trinotate-v3.2.2.tar.gz 554 | mv Trinotate-Trinotate-v3.2.2/ Trinotate_build_scripts/ 555 | ./Trinotate_build_scripts/admin/Build_Trinotate_Boilerplate_SQLite_db.pl Trinotate 556 | rm uniprot_sprot.dat.gz Pfam-A.hmm.gz 557 | date -u >.lastrun.txt 558 | elif [ -e *.sqlite ];then 559 | echo -e "\n\t -- Custom sqlite database for Trinotate found at "${mypwd}/DBs/sqlite_db" -- \n" 560 | DB=$( if [ -f ${mypwd}/DBs/sqlite_db/.lastrun.txt ];then cat .lastrun.txt;else echo "N/A";fi ) 561 | echo -e "\n\t -- Databases (PFAM,SwissProt,EggNOG,GO) last update: ${DB} --\n " 562 | fi 563 | } 564 | trisql_c () { 565 | source ~/.bashrc 566 | check_conda=$( command -v conda ) 567 | if [ "$check_conda" == "" ];then 568 | echo -e "\n\t\e[31m -- Looks like conda is not installed--\e[39m\n" 569 | exit 0 570 | fi 571 | if [ ! -e *.sqlite ];then 572 | echo -e "\n\t -- Custom sqlite database for Trinotate is not installed -- \n" 573 | echo -e "\n\t -- This could take a couple of minutes depending on connection. Please wait -- \n" 574 | condaRoot=$( conda info --json | grep "CONDA_ROOT" | cut -f 2 -d ":" | tr -d "," | tr -d " " | tr -d "\"" ) 575 | if [ -f ${condaRoot}/etc/profile.d/conda.sh ];then 576 | source ${condaRoot}/etc/profile.d/conda.sh 577 | condaTrinotate 578 | check_sql=$( command -v Build_Trinotate_Boilerplate_SQLite_db.pl | wc -l ) 579 | if [ $check_sql -eq 0 ];then 580 | echo -e "\n\t -- Script \"Build_Trinotate_Boilerplate_SQLite_db.pl\" from Trinotate cannot be found -- \n" 581 | echo -e "\n\t\e[31m -- Verify your conda installation --\e[39m\n" 582 | exit 0 583 | elif [ $check_sql -eq 1 ];then 584 | Build_Trinotate_Boilerplate_SQLite_db.pl Trinotate 585 | rm uniprot_sprot.dat.gz Pfam-A.hmm.gz 586 | date -u >.lastrun.txt 587 | condaTrinotateEnd 588 | fi 589 | fi 590 | elif [ -e *.sqlite ];then 591 | echo -e "\n\t -- Custom sqlite database for Trinotate found at "${mypwd}/DBs/sqlite_db" -- \n" 592 | DB=$( if [ -f ${mypwd}/DBs/sqlite_db/.lastrun.txt ];then cat .lastrun.txt;else echo "N/A";fi ) 593 | echo -e "\n\t -- Databases (PFAM,SwissProt,EggNOG,GO) last update: ${DB} --\n " 594 | fi 595 | } 596 | pfam_c() { 597 | #Check PFAM files 598 | cd $mypwd 599 | if [ ! 
-d DBs/hmmerdb/ ];then 600 | echo -e "\n\t -- Creating directory for the HMMER database --\n" 601 | mkdir -p DBs/hmmerdb/ 602 | cd DBs/hmmerdb/ 603 | echo -e "-- Downloading Pfam-A files ... --\n" 604 | wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz 605 | echo -e "-- Preparing Pfam-A files ... --\n" 606 | gunzip Pfam-A.hmm.gz 607 | date -u >.lastrun.txt 608 | elif [ -d DBs/hmmerdb/ ];then 609 | echo -e "\n\t -- Directory for the HMMER database is present --\n" 610 | cd DBs/hmmerdb/ 611 | if [ -f Pfam-A.hmm ];then 612 | echo -e "\n\t -- Pfam file is present and ready to be used --\n" 613 | DB=$( if [ -f ${mypwd}/DBs/hmmerdb/.lastrun.txt ];then cat .lastrun.txt;else echo "N/A";fi ) 614 | echo -e "\n\t -- Pfam last update: ${DB} --\n" 615 | else 616 | echo -e "-- Downloading Pfam-A files ... --\n" 617 | wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz 618 | echo -e "-- Preparing Pfam-A files ... --\n" 619 | gunzip Pfam-A.hmm.gz 620 | date -u >.lastrun.txt 621 | fi 622 | fi 623 | } 624 | pfam_u() { #Force an update of the Pfam-A files (used by the update option) 625 | cd $mypwd 626 | if [ ! -d DBs/hmmerdb/ ];then 627 | echo -e "\n\t -- Creating directory for the HMMER database --\n" 628 | mkdir -p DBs/hmmerdb/ 629 | cd DBs/hmmerdb/ 630 | echo -e "-- Downloading Pfam-A files ... --\n" 631 | wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz 632 | echo -e "-- Preparing Pfam-A files ... --\n" 633 | gunzip Pfam-A.hmm.gz 634 | date -u >.lastrun.txt 635 | elif [ -d DBs/hmmerdb/ ];then 636 | echo -e "\n\t -- Directory for the HMMER database is present --\n" 637 | cd DBs/hmmerdb/ 638 | rm -rf * 639 | echo -e "-- Downloading Pfam-A files ... --\n" 640 | wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz 641 | echo -e "-- Preparing Pfam-A files ... --\n" 642 | gunzip Pfam-A.hmm.gz 643 | date -u >.lastrun.txt 644 | fi 645 | } 646 | sqld(){ 647 | rm -rf * 648 | source ~/.bashrc 649 | check_conda=$( command -v conda ) 650 | if [ "$check_conda" == "" ];then 651 | echo -e "\n\t\e[31m -- Looks like conda is not installed --\e[39m\n" 652 | echo -e "\n\t\e[31m -- Install conda and rerun this script --\e[39m\n" 653 | exit 0 654 | fi 655 | if [ ! -e *.sqlite ];then 656 | echo -e "\n\t -- Custom sqlite database for Trinotate is not installed -- \n" 657 | echo -e -n "\n\t Do you want to install the custom sqlite database? (y or n): " 658 | read ans 659 | case $ans in 660 | [yY] | [yY][eE][sS]) 661 | condaRoot=$( conda info --json | grep "CONDA_ROOT" | cut -f 2 -d ":" | tr -d "," | tr -d " " | tr -d "\"" ) 662 | if [ -f ${condaRoot}/etc/profile.d/conda.sh ];then 663 | source ${condaRoot}/etc/profile.d/conda.sh 664 | conda activate TransPi 665 | check_sql=$( command -v Build_Trinotate_Boilerplate_SQLite_db.pl | wc -l ) 666 | if [ $check_sql -eq 0 ];then 667 | echo -e "\n\t -- Script \"Build_Trinotate_Boilerplate_SQLite_db.pl\" from Trinotate cannot be found -- \n" 668 | echo -e "\n\t\e[31m -- Verify your conda installation --\e[39m\n" 669 | exit 0 670 | elif [ $check_sql -eq 1 ];then 671 | echo -e "\n\t -- This could take a couple of minutes depending on connection. Please wait -- \n" 672 | Build_Trinotate_Boilerplate_SQLite_db.pl Trinotate 673 | rm uniprot_sprot.dat.gz Pfam-A.hmm.gz 674 | date -u >.lastrun.txt 675 | fi 676 | fi 677 | ;; 678 | [nN] | [nN][oO]) 679 | echo -e "\n\n\t\e[31m -- ERROR: Generate the custom Trinotate sqlite database at "${mypwd}/DBs/sqlite_db". 
Then rerun the pre-check --\e[39m\n" 680 | exit 0 681 | ;; 682 | *) 683 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 684 | sqld 685 | ;; 686 | esac 687 | elif [ -e *.sqlite ];then 688 | echo -e "\n\t -- Custom sqlite database for Trinotate found at "${mypwd}/DBs/sqlite_db" -- \n" 689 | DB=$( if [ -f ${mypwd}/DBs/sqlite_db/.lastrun.txt ];then cat .lastrun.txt;else echo "N/A";fi ) 690 | echo -e "\n\t -- Databases (PFAM,SwissProt,EggNOG,GO) last update: ${DB} --\n " 691 | fi 692 | pfam_u 693 | } 694 | ddate() { 695 | if [ ! -e .lastrun.txt ];then 696 | echo -e "\n\t -- No info about when the database was created -- \n" 697 | echo -e -n "\n\t -- Do you want to rerun the script and update the databases? (y or n): " 698 | read ans 699 | case $ans in 700 | [yY] | [yY][eE][sS]) 701 | sqld 702 | ;; 703 | [nN] | [nN][oO]) 704 | echo -e "\n\n\t -- Exiting program -- \n" 705 | exit 0 706 | ;; 707 | *) 708 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 709 | ddate 710 | ;; 711 | esac 712 | elif [ -e .lastrun.txt ];then 713 | a=$( cat .lastrun.txt ) 714 | echo -e "\n\t -- Database was created on \e[32m${a}\e[39m -- \n" 715 | echo -e -n "\n\t -- Do you want to rerun the script and update the databases? (y or n): " 716 | read ans 717 | case $ans in 718 | [yY] | [yY][eE][sS]) 719 | sqld 720 | ;; 721 | [nN] | [nN][oO]) 722 | echo -e "\n\n\t -- Exiting program -- \n" 723 | exit 0 724 | ;; 725 | *) 726 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 727 | ddate 728 | ;; 729 | esac 730 | fi 731 | } 732 | downd() { 733 | cd $mypwd 734 | if [ ! -d DBs/sqlite_db/ ];then 735 | echo -e "\n\t -- SQLite directory not found at ${mypwd}/DBs -- \n" 736 | echo -e "\n\t -- Creating ${mypwd}/DBs -- \n" 737 | mkdir -p $mypwd/DBs/sqlite_db/ 738 | downd 739 | elif [ -d DBs/sqlite_db/ ];then 740 | echo -e "\n\t -- SQLite directory found at ${mypwd}/DBs -- \n" 741 | cd DBs/sqlite_db/ 742 | if [ ! 
-e *.sqlite ];then 743 | sqld 744 | elif [ -e *.sqlite ];then 745 | echo -e "\n\t -- Custom sqlite database for Trinotate is installed -- \n" 746 | echo -e "\n\t -- Verifying when the script was last run -- \n" 747 | ddate 748 | fi 749 | fi 750 | } 751 | get_var_container () { 752 | cd $mypwd 753 | echo "busco4db=$mypwd/DBs/busco_db/$busna" >>${mypwd}/.varfile.sh 754 | echo "uniname=$unina" >>${mypwd}/.varfile.sh 755 | echo "uniprot=$mypwd/DBs/uniprot_db/$unina" >>${mypwd}/.varfile.sh 756 | echo "pfloc=$mypwd/DBs/hmmerdb/Pfam-A.hmm" >>${mypwd}/.varfile.sh 757 | echo "pfname=Pfam-A.hmm" >>${mypwd}/.varfile.sh 758 | echo "nextflow=$mypwd/nextflow" >>${mypwd}/.varfile.sh 759 | echo "Tsql=$mypwd/DBs/sqlite_db/*.sqlite" >>${mypwd}/.varfile.sh 760 | echo "unpdate=\"$( if [ -f ${mypwd}/DBs/uniprot_db/.lastrun.txt ];then cat ${mypwd}/DBs/uniprot_db/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 761 | echo "pfdate=\"$( if [ -f ${mypwd}/DBs/hmmerdb/.lastrun.txt ];then cat ${mypwd}/DBs/hmmerdb/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 762 | echo "dbdate=\"$( if [ -f ${mypwd}/DBs/sqlite_db/.lastrun.txt ];then cat ${mypwd}/DBs/sqlite_db/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 763 | vpwd=$mypwd 764 | echo "mypwd=$mypwd" >>${vpwd}/.varfile.sh 765 | source .varfile.sh 766 | echo -e "\n\t -- INFO to use in TransPi --\n" 767 | echo -e "\t Installation PATH:\t $mypwd" 768 | echo -e "\t BUSCO V4 database:\t $busco4db" 769 | echo -e "\t UNIPROT database:\t $uniprot" 770 | echo -e "\t UNIPROT last update:\t $unpdate" 771 | echo -e "\t PFAM files:\t\t $pfloc" 772 | echo -e "\t PFAM last update:\t $pfdate" 773 | echo -e "\t SQL DB last update: \t $dbdate" 774 | echo -e "\t NEXTFLOW:\t\t $nextflow \n\n" 775 | cat ${confDir}/template.nextflow.config | sed -e "s|pipeInstall|pipeInstall=\"${mypwd}\"|" -e "s|busco4db|busco4db=\"${busco4db}\"|" -e "s|uniprot|uniprot=\"${uniprot}\"|" \ 776 | -e "s|uniname|uniname=\"${uniname}\"|" -e "s|pfloc|pfloc=\"${pfloc}\"|" -e "s|pfname|pfname=\"${pfname}\"|" -e "s|Tsql|Tsql=\"${Tsql}\"|" >nextflow.config 777 | rm .varfile.sh 778 | } 779 | get_var () { 780 | cd $mypwd 781 | echo "busco4db=$mypwd/DBs/busco_db/$busna" >>${mypwd}/.varfile.sh 782 | echo "uniname=$unina" >>${mypwd}/.varfile.sh 783 | echo "uniprot=$mypwd/DBs/uniprot_db/$unina" >>${mypwd}/.varfile.sh 784 | echo "pfloc=$mypwd/DBs/hmmerdb/Pfam-A.hmm" >>${mypwd}/.varfile.sh 785 | echo "pfname=Pfam-A.hmm" >>${mypwd}/.varfile.sh 786 | echo "nextflow=$mypwd/nextflow" >>${mypwd}/.varfile.sh 787 | echo "Tsql=$mypwd/DBs/sqlite_db/*.sqlite" >>${mypwd}/.varfile.sh 788 | echo "unpdate=\"$( if [ -f ${mypwd}/DBs/uniprot_db/.lastrun.txt ];then cat ${mypwd}/DBs/uniprot_db/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 789 | echo "pfdate=\"$( if [ -f ${mypwd}/DBs/hmmerdb/.lastrun.txt ];then cat ${mypwd}/DBs/hmmerdb/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 790 | echo "dbdate=\"$( if [ -f ${mypwd}/DBs/sqlite_db/.lastrun.txt ];then cat ${mypwd}/DBs/sqlite_db/.lastrun.txt;else echo "N/A";fi )\"" >>${mypwd}/.varfile.sh 791 | #echo "tenv=$( conda info --json | sed -n '/\"envs\":/,/\],/p' | grep -w "TransPi\"" | tr -d "," | tr -d " " )" >>${mypwd}/.varfile.sh 792 | #echo "cenv=$( conda info --json | sed -n '/\"envs\":/,/\],/p' | grep "busco4" | tr -d "," | tr -d " " )" >>${mypwd}/.varfile.sh 793 | vpwd=$mypwd 794 | echo "mypwd=$mypwd" >>${vpwd}/.varfile.sh 795 | source .varfile.sh 796 | echo -e "\n\t -- INFO to use in TransPi --\n" 797 | echo -e "\t Installation 
PATH:\t $mypwd" 798 | echo -e "\t BUSCO V4 database:\t $busco4db" 799 | echo -e "\t UNIPROT database:\t $uniprot" 800 | echo -e "\t UNIPROT last update:\t $unpdate" 801 | echo -e "\t PFAM files:\t\t $pfloc" 802 | echo -e "\t PFAM last update:\t $pfdate" 803 | echo -e "\t SQL DB last update: \t $dbdate" 804 | echo -e "\t NEXTFLOW:\t\t $nextflow \n\n" 805 | cat ${confDir}/template.nextflow.config | sed -e "s|pipeInstall|pipeInstall=\"${mypwd}\"|" -e "s|busco4db|busco4db=\"${busco4db}\"|" -e "s|uniprot|uniprot=\"${uniprot}\"|" \ 806 | -e "s|uniname|uniname=\"${uniname}\"|" -e "s|pfloc|pfloc=\"${pfloc}\"|" -e "s|pfname|pfname=\"${pfname}\"|" -e "s|Tsql|Tsql=\"${Tsql}\"|" >nextflow.config 807 | rm .varfile.sh 808 | } 809 | get_var_user() { 810 | cd $mypwd 811 | echo "busco4db=${busco4db}" >>${mypwd}/.varfile.sh 812 | echo "uniname=${uniname}" >>${mypwd}/.varfile.sh 813 | echo "uniprot=${uniprot}" >>${mypwd}/.varfile.sh 814 | echo "pfloc=${pfloc}" >>${mypwd}/.varfile.sh 815 | echo "pfname=${pfname}" >>${mypwd}/.varfile.sh 816 | echo "nextflow=$mypwd/nextflow" >>${mypwd}/.varfile.sh 817 | echo "Tsql=${Tsql}" >>${mypwd}/.varfile.sh 818 | vpwd=$mypwd 819 | echo "mypwd=$mypwd" >>${vpwd}/.varfile.sh 820 | source .varfile.sh 821 | echo -e "\n\t -- INFO to use in TransPi --\n" 822 | echo -e "\t Installation PATH:\t $mypwd" 823 | echo -e "\t Using your DBs\t\t" 824 | echo -e "\t BUSCO V4 database:\t $busco4db" 825 | echo -e "\t UNIPROT database:\t $uniprot" 826 | echo -e "\t PFAM files:\t\t $pfloc" 827 | echo -e "\t NEXTFLOW:\t\t $nextflow \n\n" 828 | cat ${confDir}/template.nextflow.config | sed -e "s|pipeInstall|pipeInstall=\"${mypwd}\"|" -e "s|busco4db|busco4db=\"${busco4db}\"|" -e "s|uniprot|uniprot=\"${uniprot}\"|" \ 829 | -e "s|uniname|uniname=\"${uniname}\"|" -e "s|pfloc|pfloc=\"${pfloc}\"|" -e "s|pfname|pfname=\"${pfname}\"|" -e "s|Tsql|Tsql=\"${Tsql}\"|" >nextflow.config 830 | rm .varfile.sh 831 | } 832 | container_pipeline_setup() { 833 | if [ "${userVar}" == "y" ];then 834 | nextflow_c 835 | evi_c 836 | echo -e "\n\t -- If no \"ERROR\" was found and all the neccesary databases are installed proceed to run TransPi -- \n" 837 | get_var_user 838 | else 839 | echo -e "\n\t -- Installing databases only -- \n" 840 | dir_c 841 | bus_c 842 | uniprot_c 843 | nextflow_c 844 | evi_c 845 | buildsql_c 846 | trisql_container 847 | pfam_c 848 | echo -e "\n\t -- If no \"ERROR\" was found and all the neccesary databases are installed proceed to run TransPi -- \n" 849 | get_var_container 850 | fi 851 | } 852 | conda_pipeline_setup() { 853 | if [ "${userVar}" == "y" ];then 854 | echo -e "\n\t -- Installing conda --\n" 855 | conda_only 856 | nextflow_c 857 | evi_c 858 | echo -e "\n\t -- If no \"ERROR\" was found and all the neccesary databases are installed proceed to run TransPi -- \n" 859 | get_var_user 860 | else 861 | echo -e "\n\t -- Installing conda and the databases -- \n" 862 | conda_only 863 | dir_c 864 | bus_c 865 | uniprot_c 866 | nextflow_c 867 | evi_c 868 | buildsql_c 869 | trisql_c 870 | pfam_c 871 | echo -e "\n\t -- If no \"ERROR\" was found and all the neccesary databases are installed proceed to run TransPi -- \n" 872 | get_var 873 | fi 874 | } 875 | user_buscoDBv4(){ 876 | echo -e "\n\t -- PATH where to locate your BUSCO v4 file -- " 877 | echo -e "\n\t -- Example: /home/ubuntu/myDB/metazoa_odb10 -- " 878 | echo -e -n "\n\t -- Provide the PATH where to locate your BUSCO v4 file: " 879 | read -e ans 880 | if [ -d ${ans} ];then 881 | echo -e "\n\t -- File ${ans} found -- \n" 882 | export 
busco4db=${ans} 883 | elif [ ! -d ${ans} ];then 884 | echo -e "\n\t\e[31m -- Directory ${ans} not found -- \e[39m\n" 885 | user_buscoDBv4 886 | fi 887 | } 888 | user_uniDB(){ 889 | echo -e "\n\t -- PATH to your UNIPROT file -- " 890 | echo -e "\n\t -- Example: /home/ubuntu/myDB/uniprot_proteins.fasta -- " 891 | echo -e -n "\n\t -- Provide the PATH to your UNIPROT file: " 892 | read -e ans 893 | if [ -f ${ans} ];then 894 | echo -e "\n\t -- File ${ans} found -- \n" 895 | export uniprot=${ans} 896 | export uniname=$( basename ${ans} ) 897 | elif [ ! -f ${ans} ];then 898 | echo -e "\n\t\e[31m -- File ${ans} not found -- \e[39m\n" 899 | user_uniDB 900 | fi 901 | } 902 | user_pfDB(){ 903 | echo -e "\n\t -- PATH to your PFAM file -- " 904 | echo -e "\n\t -- Example: /home/ubuntu/myDB/Pfam-A.hmm -- " 905 | echo -e -n "\n\t -- Provide the PATH to your PFAM file: " 906 | read -e ans 907 | if [ -f ${ans} ];then 908 | echo -e "\n\t -- File ${ans} found -- \n" 909 | export pfloc=${ans} 910 | export pfname=$( basename ${ans} ) 911 | elif [ ! -f ${ans} ];then 912 | echo -e "\n\t\e[31m -- File ${ans} not found -- \e[39m\n" 913 | user_pfDB 914 | fi 915 | } 916 | user_sqlDB(){ 917 | echo -e "\n\t -- PATH to your SQL file -- " 918 | echo -e "\n\t -- Example: /home/ubuntu/myDB/Trinotate.sqlite -- " 919 | echo -e -n "\n\t -- Provide the PATH to your SQL file: " 920 | read -e ans 921 | if [ -f ${ans} ];then 922 | echo -e "\n\t -- File ${ans} found -- \n" 923 | export Tsql=${ans} 924 | elif [ ! -f ${ans} ];then 925 | echo -e "\n\t\e[31m -- File ${ans} not found -- \e[39m\n" 926 | user_sqlDB 927 | fi 928 | } 929 | userDBs(){ 930 | user_buscoDBv4 931 | user_uniDB 932 | user_pfDB 933 | user_sqlDB 934 | userVar=y 935 | } 936 | dbs(){ 937 | echo -e "\n\t -- Either TransPi installs the DBs for you or you provide the PATH of the DBs -- \n" 938 | echo -e -n "\t Do you want TransPi to handle the DBs installation? (y,n,exit): " 939 | read ans 940 | case $ans in 941 | [yY] | [yY][eE][sS]) 942 | echo -e "\n\n\t -- TransPi will handle the installation -- \n" 943 | ;; 944 | [nN] | [nN][oO]) 945 | echo -e "\n\n\t -- Using your DBs -- \n" 946 | userDBs 947 | ;; 948 | exit) 949 | echo -e "\n\n\t -- Exiting --\n"; exit 0 950 | ;; 951 | *) 952 | echo -e "\n\n\t\e[31m -- Yes or No answer not specified. Try again --\e[39m\n" 953 | dbs 954 | ;; 955 | esac 956 | if [ "$1" == "1" ];then 957 | conda_pipeline_setup 958 | elif [ "$1" == "2" ];then 959 | container_pipeline_setup 960 | fi 961 | } 962 | message(){ 963 | echo " 964 | 965 | ######################################################################################### 966 | # # 967 | # TransPi precheck script # 968 | # # 969 | # Options available: # 970 | # # 971 | # 1- Install conda (if necessary) and DBs # 972 | # # 973 | # Runs of TransPi using conda # 974 | # # 975 | # 2- Install DBs for containers use # 976 | # # 977 | # Runs of TransPi with containers (docker or singularity) # 978 | # # 979 | # 3- Update DBs # 980 | # # 981 | # SwissProt, PFAM, SQL DB used for annotation (requires conda) # 982 | # # 983 | # 4- Exit # 984 | # # 985 | ######################################################################################### 986 | 987 | " 988 | } 989 | moption(){ 990 | echo -e -n "\t Which option do you want? 
" 991 | read ans 992 | case $ans in 993 | 1 | 2) 994 | dbs $ans 995 | ;; 996 | 3) 997 | echo -e "\n\t -- Updating DBs -- \n" 998 | downd 999 | ;; 1000 | 4) 1001 | echo -e "\n\t -- Exit -- \n" 1002 | exit 0 1003 | ;; 1004 | *) 1005 | echo -e "\n\t\e[31m -- Wrong option. Try again --\e[39m\n" 1006 | moption 1007 | ;; 1008 | esac 1009 | } 1010 | main(){ 1011 | if [ "$mypwd" == "" ] || [ "$mypwd" == "-h" ] || [ "$mypwd" == "-help" ] || [ "$mypwd" == "--help" ];then 1012 | echo -e "\n\t Script for checking the requirements of TransPi \n" 1013 | echo -e "\t Usage:\n\n\t\t bash precheck_TransPi.sh WORK_PATH \n" 1014 | echo -e "\t\t\t WORK_PATH = PATH to download requirements and databases used by TransPi \n\n\t\t\t Example: /home/bioinf/run/ \n" 1015 | exit 0 1016 | elif [ ! -d "$mypwd" ];then 1017 | echo -e "\n\t -- Directory "${mypwd}" is not found -- \n" 1018 | echo -e "\n\t -- Creating "${mypwd}" -- \n" 1019 | mkdir -p ${mypwd} 1020 | if [ -d "$mypwd" ];then 1021 | echo -e "\n\t -- Directory created succesfully -- \n" 1022 | main 1023 | else 1024 | echo -e "\n\t -- Please provide a valid PATH to run TransPi -- \n" 1025 | exit 0 1026 | fi 1027 | elif [ -d "$mypwd" ];then 1028 | if [ ${mypwd} == "." ];then 1029 | mypwd=$(pwd) 1030 | confDir=$(pwd) 1031 | elif [ ${mypwd} == $(pwd) ]; then 1032 | confDir=$(pwd) 1033 | else 1034 | cd ${mypwd} && mypwd=$(pwd) && cd - 1035 | confDir=$( dirname ${BASH_SOURCE} ) 1036 | if [ ${confDir} == "." ];then 1037 | confDir=$(pwd) 1038 | else 1039 | cd ${confDir} && confDir=$(pwd) 1040 | fi 1041 | fi 1042 | message 1043 | moption 1044 | fi 1045 | } 1046 | main 1047 | --------------------------------------------------------------------------------