├── .gitignore ├── 00_scripts ├── .gitignore ├── 00_import_trinity.sh ├── 01_trimmomatic_pe.sh ├── 01_trimmomatic_se.sh ├── 02_merge.sh ├── 03_assemble.sh ├── 04_assembly_stats.sh ├── 05_prepare_ref.sh ├── 06_transcripts_abundance.sh ├── 07_build_matrix.sh ├── 07_htseq_count.sh ├── 08_de_analysis.sh ├── 09_expression_clustering.sh ├── 10_extract_de_clustering.sh ├── 11_extract_go_per_gene.sh ├── 12_goseq.sh ├── datarmor_jobs │ └── 01_trimmomatic_pe_jobs.sh └── utility_scripts │ ├── assessing_read_content.sh │ ├── blastp.sh │ ├── compare_replicates_qc.sh │ ├── corset.sh │ ├── deprecated │ ├── 07_build_matrix.sh │ ├── 08_de_analysis.sh │ ├── 09_expression_clustering.sh │ ├── 10_extract_de_clustering.sh │ ├── 11_extract_go_per_gene.sh │ └── 12_goseq.sh │ ├── fastqc.sh │ ├── htseq_count.sh │ ├── insilico_normalization.sh │ ├── pfam.sh │ ├── prepare_info_file.sh │ ├── prepare_jobs_header.sh │ ├── transdecoder_getorf.sh │ └── transdecoder_predict.sh ├── 01_info_files ├── .gitignore ├── example_pair_comparison.txt ├── example_sample_replicates.txt └── example_samples_info.txt ├── 02_data └── .gitignore ├── 03_trimmed └── .gitignore ├── 04_merged └── .gitignore ├── 06_assembly_stats └── .gitignore ├── 07_de_results ├── .gitignore └── rsem_DH-1MA.bowtie.sorted.bam.bai ├── 98_log_files └── .gitignore ├── 99_archive └── .gitignore └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | .Rhistory 3 | *.txt 4 | *transdecoder_dir/ 5 | 05_trinity_assembly_200/ 6 | 05_trinity_assembly/ 7 | chado_test 8 | -------------------------------------------------------------------------------- /00_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.gz 5 | trinity_utils/ 6 | trinotate_utils/ 7 | transdecoder_utils/ 8 | transrate_utils/ 9 | transvestigator_utils/ 10 | corset_utils/ 11 | 
-------------------------------------------------------------------------------- /00_scripts/00_import_trinity.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | cd $(pwd) 5 | 6 | #clone into trinity 7 | git clone https://github.com/trinityrnaseq/trinityrnaseq 00_scripts/trinity_utils 8 | 9 | # clone trinotate 10 | git clone https://github.com/Trinotate/Trinotate 00_scripts/trinotate_utils 11 | 12 | #clone transDecoder 13 | git clone https://github.com/TransDecoder/TransDecoder 00_scripts/transdecoder_utils 14 | 15 | # clone transvestigator 16 | git clone https://github.com/genomeannotation/transvestigator 00_scripts/transvestigator_utils 17 | 18 | #clone transrate 19 | git clone https://github.com/blahah/transrate 00_scripts/transrate_utils 20 | 21 | #clone corset 22 | git clone https://github.com/Oshlack/Corset 00_scripts/corset_utils 23 | 24 | -------------------------------------------------------------------------------- /00_scripts/01_trimmomatic_pe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -N trimmomatic__BASE__ 3 | #PBS -o trimmomatic__BASE__.out 4 | #PBS -l walltime=02:00:00 5 | #PBS -l mem=60g 6 | #####PBS -m ea 7 | #PBS -l ncpus=8 8 | #PBS -q omp 9 | #PBS -r n 10 | 11 | cd $PBS_O_WORKDIR 12 | 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | 21 | # Global variables 22 | 23 | ADAPTERFILE="univec.fasta" 24 | NCPU=8 25 | base=__BASE__ 26 | TRIMMOMATIC_JAR="trimmomatic-0.36.jar" 27 | 28 | java -Xmx40G -jar $TRIMMOMATIC_JAR PE \ 29 | -threads 8 \ 30 | -phred33 \ 31 | 02_data/"$base"_1.fastq.gz \ 32 | 02_data/"$base"_2.fastq.gz \ 33 | 03_trimmed/"$base"_R1.paired.fastq.gz \ 34 | 03_trimmed/"$base"_R1.single.fastq.gz \ 35 | 03_trimmed/"$base"_R2.paired.fastq.gz \ 36 | 03_trimmed/"$base"_R2.single.fastq.gz \ 
37 | ILLUMINACLIP:"$ADAPTERFILE":2:20:7 \ 38 | LEADING:20 \ 39 | TRAILING:20 \ 40 | SLIDINGWINDOW:30:30 \ 41 | MINLEN:40 2> 98_log_files/log.trimmomatic.pe."$TIMESTAMP" 42 | -------------------------------------------------------------------------------- /00_scripts/01_trimmomatic_se.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trim" 5 | #SBATCH -o log-trim_pe.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismini 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=type_your_mail@ulaval.ca 10 | #SBATCH --time=0-20:00 11 | #SBATCH --mem=50000 12 | 13 | cd $SLURM_SUBMIT_DIR 14 | 15 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 16 | SCRIPT=$0 17 | NAME=$(basename $0) 18 | LOG_FOLDER="98_log_files" 19 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 20 | 21 | 22 | #Global variables 23 | ADAPTERFILE="01_info_files/univec.fasta" 24 | TRIMMOMATIC_JAR="/prg/trimmomatic/0.36/trimmomatic-0.36.jar" 25 | 26 | for file in $(ls 02_data/*.f*q.gz|perl -pe 's/_R[12].f(ast)?q.gz//') 27 | do 28 | base=$(basename "$file") 29 | 30 | java -Xmx40G -jar $TRIMMOMATIC_JAR SE \ 31 | -phred33 \ 32 | -threads 8 \ 33 | 02_data/"$base"_R1.fastq.gz \ 34 | 03_trimmed/"$base"_R1.trimmed.fastq.gz \ 35 | ILLUMINACLIP:"$ADAPTERFILE":2:20:7 \ 36 | LEADING:20 \ 37 | TRAILING:20 \ 38 | SLIDINGWINDOW:30:30 \ 39 | MINLEN:60 40 | 41 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_trimmomatic_se.log 42 | -------------------------------------------------------------------------------- /00_scripts/02_merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="merge" 5 | #SBATCH -o log-merge.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=2-00:00 12 | #SBATCH --mem=10000 13 | 14 | <<<<<<< HEAD 15 | cd $SLURM_SUBMIT_DIR 16 | 
17 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 18 | SCRIPT=$0 19 | NAME=$(basename $0) 20 | LOG_FOLDER="98_log_files" 21 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 22 | 23 | INPUTFOLDER="03_trimmed" 24 | OUTPUTFOLDER="04_merged" 25 | 26 | cat "$INPUTFOLDER"/*_R1.paired.fastq.gz >> "$OUTPUTFOLDER"/all_reads.left.fastq 27 | 28 | cat "$INPUTFOLDER"/*_R2.paired.fastq.gz >> "$OUTPUTFOLDER"/all_reads.right.fastq 29 | 30 | ======= 31 | #move to present directory 32 | cd $(pwd) 33 | 34 | #cat all reads 35 | cat "$INPUTFOLDER"/*_R1.paired.fastq.gz > "$OUTPUTFOLDER"/all_reads.left.fastq.gz 36 | 37 | cat "$INPUTFOLDER"/*_R2.paired.fastq.gz > "$OUTPUTFOLDER"/all_reads.right.fastq.gz 38 | >>>>>>> 8629e02ffad09c4df82ffb278128308c999cbded 39 | -------------------------------------------------------------------------------- /00_scripts/03_assemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="assemble" 5 | #SBATCH -o log-assemble.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibis2 8 | #SBATCH -A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=10-00:00 12 | #SBATCH --mem=220000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | #Global variables 23 | READSLEFT="04_merged/all_left.fq" 24 | READSRIGHT="04_merged/all_right.fq" 25 | #READSSINGLE="03_trimmed/*.trimmed.fastq.gz" 26 | 27 | #Trinity variables 28 | ##Required 29 | seqtype="--seqType fq" # type of reads: ( fa, or fq ) 30 | 31 | mem="--max_memory 200G" # suggested max memory to use by Trinity where limiting can be enabled. (jellyfish, sorting, etc) 32 | # provied in Gb of RAM, ie. 
'--max_memory 10G' 33 | #paired reads: 34 | left="--left $READSLEFT" #left reads, one or more file names (separated by commas, no spaces) 35 | right="--right $READSRIGHT" #right reads, one or more file names (separated by commas, no spaces) 36 | #single-end: 37 | #single="--single $READSSINGLE" #single reads, one or more file names, comma-delimited 38 | #(note, if single file contains pairs, can use flag: --run_as_paired ) 39 | ##Optionnal 40 | #strand="--SS_lib_type " #Strand-specific RNA-Seq read orientation. 41 | # if paired: RF or FR, 42 | #if single: F or R. (dUTP method = RF) 43 | #See web documentation. 44 | cpu="--CPU 8" #number of CPUs to use, default: 2 45 | mincontiglength="--min_contig_length 200" #minimum assembled contig length to report 46 | #(def=200) 47 | #corlongread="--long_reads " #fasta file containing error-corrected or circular consensus (CCS) pac bio reads 48 | #genomeguided="--genome_guided_bam " #genome guided mode, provide path to coordinate-sorted bam file. 49 | #(see genome-guided param section under --show_full_usage_info) 50 | #jaccard="--jaccard_clip" #:option, set if you have paired reads and 51 | #you expect high gene density with UTR 52 | #overlap (use FASTQ input file format 53 | #for reads). 54 | #(note: jaccard_clip is an expensive 55 | #operation, so avoid using it unless 56 | #necessary due to finding excessive fusion 57 | #transcripts w/o it.) 58 | #trim="--trimmomatic" #run Trimmomatic to quality trim reads 59 | #see '--quality_trimming_params' under full usage info for tailored settings. 60 | #normalize="--normalize_reads" #run in silico normalization of reads. Defaults to max. read coverage of 50. 61 | #see '--normalize_max_read_cov' under full usage info for tailored settings. 62 | #notphase2="--no_distributed_trinity_exec" #do not run Trinity phase 2 (assembly of partitioned reads), and stop after generating command list. 
63 | output="--output 05_trinity_assembly_200/" #name of directory for output (will be 64 | #created if it doesn't already exist) 65 | #default( your current working directory: "/home/leluyer/trinity_out_dir" 66 | #note: must include 'trinity' in the name as a safety precaution! ) 67 | #cleanup="--full_cleanup" #only retain the Trinity fasta file, rename as ${output_dir}.Trinity.fasta 68 | 69 | 70 | 00_scripts/trinity_utils/Trinity $seqtype $mem $left $right $single \ 71 | $strand $cpu $mincontiglength $corlongread \ 72 | $genomeguided $jaccard $normalize $notphase2 \ 73 | $output $cleanup 2>&1 | tee 98_log_files/"$TIMESTAMP"_trinityassembly.log 74 | 75 | 76 | 77 | # --cite :show the Trinity literature citation 78 | # 79 | # --verbose :provide additional job status info during the run. 80 | # 81 | # --version :reports Trinity version (v2.1.1) and exits. 82 | # 83 | # --show_full_usage_info :show the many many more options available for running Trinity (expert usage). 84 | # 85 | # 86 | ############################################################################### 87 | # 88 | # *Note, a typical Trinity command might be: 89 | # 90 | # Trinity --seqType fq --max_memory 50G --left reads_1.fq --right reads_2.fq --CPU 6 91 | # 92 | # 93 | # and for Genome-guided Trinity: 94 | # 95 | # Trinity --genome_guided_bam rnaseq_alignments.csorted.bam --max_memory 50G 96 | # --genome_guided_max_intron 10000 --CPU 6 97 | # 98 | # see: /software6/apps/trinityrnaseq/2.1.1_gcc/sample_data/test_Trinity_Assembly/ 99 | # for sample data and 'runMe.sh' for example Trinity execution 100 | # 101 | # For more details, visit: http://trinityrnaseq.github.io 102 | 103 | -------------------------------------------------------------------------------- /00_scripts/04_assembly_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="stats" 5 | #SBATCH -o log-stat.out 6 | #SBATCH -c 1 7 | #SBATCH -p 
ibis2 8 | #SBATCH -A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=00-10:00 12 | #SBATCH --mem=2000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | 17 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 18 | SCRIPT=$0 19 | NAME=$(basename $0) 20 | LOG_FOLDER="98_log_files" 21 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 22 | 23 | 24 | #global variables 25 | INPUTFILE="05_trinity_assembly/Trinity.fasta" 26 | OUTPUTFILE="06_assembly_stats/results_stats.txt" 27 | 28 | #Check stats 29 | 00_scripts/trinity_utils/util/TrinityStats.pl "$INPUTFILE" > "$OUTPUTFILE" 2>&1 | tee 98_log_files/"$TIMESTAMP"_assemblystats.log 30 | 31 | -------------------------------------------------------------------------------- /00_scripts/05_prepare_ref.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="prep" 5 | #SBATCH -o log-prep.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=01-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #Global variables 24 | TRANSCRIPTOME="05_trinity_assembly/Trinity.filtered.fasta" 25 | 26 | ######################################################################### 27 | #Required 28 | 29 | trans="--transcripts $TRANSCRIPTOME" #transcript fasta file 30 | seq="--seqType fq" #fq|fa 31 | 32 | #single="--single $READSSINGLE" 33 | meth="--est_method RSEM" #abundance estimation method. 
34 | #alignment_based: RSEM|eXpress 35 | #alignment_free: kallisto|salmon 36 | output="--output_dir 06_assembly_stats" #write all files to output directory 37 | 38 | # if alignment_based est_method: 39 | alnmeth="--aln_method bowtie" #bowtie|bowtie2|(path to bam file) alignment method. (note: RSEM requires bowtie) 40 | #(if you already have a bam file, you can use it here instead of rerunning bowtie) 41 | #(note, no strand-specific mode for kallisto) 42 | cpu="--thread_count 8" #number of threads to use (default = 4) 43 | #debug="--debug" #retain intermediate files 44 | #genetrans="--gene_trans_map " #file containing 'gene(tab)transcript' identifiers per line. 45 | # or 46 | trinmode="--trinity_mode" #Setting --trinity_mode will automatically generate the gene_trans_map and use it. 47 | 48 | prepref="--prep_reference" #prep reference (builds target index) 49 | outpref="--output_prefix ref_bowtie" #prefix for output files. Defaults to --est_method setting. 50 | 51 | ######################################## 52 | # Parameters for single-end reads: 53 | # 54 | #fraglength="--fragment_length 200" #specify RNA-Seq fragment length (default: 200) 55 | #frgstd=" --fragment_std 80" #fragment length standard deviation (defalt: 80) 56 | ######################################## 57 | # bowtie-related parameters: (note, tool-specific settings are further below) 58 | 59 | #maxins="--max_ins_size 800" #maximum insert size (bowtie -X parameter, default: 800) 60 | #coord="--coordsort_bam" #provide coord-sorted bam in addition to the default (unsorted) bam. 
61 | ######################################## 62 | # RSEM opts: 63 | #bowtie_rsem="--bowtie_RSEM " #if using 'bowtie', default: "--all --best --strata -m 300 --chunkmbs 512" 64 | #bowtie2_rsem="--bowtie2_RSEM " #if using 'bowtie2', default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 65 | #include_rsem_bam="--include_rsem_bam" # provide the RSEM enhanced bam file including posterior probabilities of read assignments. 66 | #rsem_opt="--rsem_add_opts " #additional parameters to pass on to rsem-calculate-expression 67 | ########################################################################## 68 | # eXpress opts: 69 | # --bowtie_eXpress default: "--all --best --strata -m 300 --chunkmbs 512" 70 | # --bowtie2_eXpress default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 71 | # --eXpress_add_opts default: "" 72 | 73 | ########################################################################## 74 | # kallisto opts: 75 | # --kallisto_add_opts default: 76 | ########################################################################## 77 | # salmon opts: 78 | # --salmon_idx_type quasi|fmd (defalt: quasi) 79 | # --salmon_add_opts default: 80 | 81 | 82 | #run reference preparation 83 | 00_scripts/trinity_utils/util/align_and_estimate_abundance.pl $trans $meth $alnmeth $trinmode $outpref $prepref $output 2>&1 | tee 98_log_files/"$TIMESTAMP"_prepref.log 84 | 85 | #00_scripts/trinity_utils/util/align_and_estimate_abundance.pl --transcripts Trinity.fasta --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference 86 | #note: Not all the commands have been integrated to data 87 | -------------------------------------------------------------------------------- /00_scripts/06_transcripts_abundance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_abundance" 5 | #SBATCH -o log-trans_abundance.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismax 8 | 
#SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | for file in $(ls 03_trimmed/*.paired.f*q.gz|perl -pe 's/_R[12].paired.fastq.gz//') 24 | 25 | do 26 | sample=$(basename "$file") 27 | 28 | #Global variables 29 | TRANSCRIPTOME="05_trinity_assembly/Trinity.filtered.fasta" 30 | READSLEFT="03_trimmed/"$sample"_R1.paired.fastq.gz" 31 | READSRIGHT="03_trimmed/"$sample"_R2.paired.fastq.gz" 32 | #READSSINGLE="03_trimmed/*_R1.trimmed.fastq.gz" 33 | 34 | ######################################################################### 35 | #Required 36 | 37 | trans="--transcripts $TRANSCRIPTOME" #transcript fasta file 38 | seq="--seqType fq" #fq|fa 39 | 40 | # If Paired-end: 41 | left="--left $READSLEFT" 42 | right="--right $READSRIGHT" 43 | 44 | # or Single-end: 45 | #single="--single $READSSINGLE" 46 | meth="--est_method RSEM" #abundance estimation method. 47 | #alignment_based: RSEM|eXpress 48 | #alignment_free: kallisto|salmon 49 | output="--output_dir 07_de_results" #write all files to output directory 50 | 51 | # if alignment_based est_method: 52 | alnmeth="--aln_method bowtie" #bowtie|bowtie2|(path to bam file) alignment method. (note: RSEM requires bowtie) 53 | #(if you already have a bam file, you can use it here instead of rerunning bowtie) 54 | # Optional: 55 | #strand="--SS_lib_type " #strand-specific library type: paired('RF' or 'FR'), single('F' or 'R'). 56 | #(note, no strand-specific mode for kallisto) 57 | cpu="--thread_count 8" #number of threads to use (default = 4) 58 | #debug="--debug" #retain intermediate files 59 | #genetrans="--gene_trans_map " #file containing 'gene(tab)transcript' identifiers per line. 
60 | # or 61 | trinmode="--trinity_mode" #Setting --trinity_mode will automatically generate the gene_trans_map and use it. 62 | 63 | #prepref="--prep_reference" #prep reference (builds target index) 64 | outpref="--output_prefix rsem_"$sample"" #prefix for output files. Defaults to --est_method setting. 65 | #outpref="--output_prefix "$sample"" #prefix for output files. Defaults to --est_method setting. 66 | 67 | ######################################## 68 | # Parameters for single-end reads: 69 | # 70 | #fraglength="--fragment_length 200" #specify RNA-Seq fragment length (default: 200) 71 | #frgstd=" --fragment_std 80" #fragment length standard deviation (defalt: 80) 72 | ######################################## 73 | # bowtie-related parameters: (note, tool-specific settings are further below) 74 | 75 | #maxins="--max_ins_size 800" #maximum insert size (bowtie -X parameter, default: 800) 76 | #coord="--coordsort_bam" #provide coord-sorted bam in addition to the default (unsorted) bam. 77 | ######################################## 78 | # RSEM opts: 79 | #bowtie_rsem="--bowtie_RSEM " #if using 'bowtie', default: "--all --best --strata -m 300 --chunkmbs 512" 80 | #bowtie2_rsem="--bowtie2_RSEM " #if using 'bowtie2', default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 81 | #include_rsem_bam="--include_rsem_bam" # provide the RSEM enhanced bam file including posterior probabilities of read assignments. 
82 | #rsem_opt="--rsem_add_opts " #additional parameters to pass on to rsem-calculate-expression 83 | ########################################################################## 84 | # eXpress opts: 85 | # --bowtie_eXpress default: "--all --best --strata -m 300 --chunkmbs 512" 86 | # --bowtie2_eXpress default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 87 | # --eXpress_add_opts default: "" 88 | 89 | ########################################################################## 90 | # kallisto opts: 91 | # --kallisto_add_opts default: 92 | ########################################################################## 93 | # salmon opts: 94 | # --salmon_idx_type quasi|fmd (defalt: quasi) 95 | # --salmon_add_opts default: 96 | 97 | 98 | #Align 99 | 00_scripts/trinity_utils/util/align_and_estimate_abundance.pl $trans $seq $single $left $right $meth $output $trinmode $alnmeth $strand $cpu $outpref $maxins $coord $bowtie_rsem $bowtie2_rsem $include_rsem_bam $rsem_opt 100 | 101 | 102 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_align.log 103 | #note: Not all the commands have been integrated to data 104 | -------------------------------------------------------------------------------- /00_scripts/07_build_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_matrix" 5 | #SBATCH -o log-matrix.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | ls 07_de_results/*.genes.results >01_info_files/list.results.txt 24 | 25 | #Required: 26 | meth="--est_method RSEM" #RSEM|eXpress|kallisto (needs to know what format 
to expect) 27 | 28 | # Options: 29 | norm="--cross_sample_norm none" #TMM|UpperQuartile|none (default: TMM) 30 | #name_dir="--name_sample_by_basedir" #name sample column by dirname instead of filename 31 | # base_dir="--basedir_index -2" #default(-2) 32 | 33 | out_pref="--out_prefix matrix.nonorm" #default: 'matrix' 34 | listfile="--samples_file 01_info_files/list.results.txt" #rsem results 35 | #run estimate to matrix 36 | 00_scripts/trinity_utils/util/abundance_estimates_to_matrix.pl $meth $norm \ 37 | 07_de_results/rsem_DH-1MA.genes.results \ 38 | 07_de_results/rsem_DH-4MA.genes.results \ 39 | 07_de_results/rsem_DH-5MA.genes.results \ 40 | 07_de_results/rsem_DL-4MA.genes.results \ 41 | 07_de_results/rsem_DL-5MA.genes.results \ 42 | 07_de_results/rsem_DL-6MA.genes.results \ 43 | 07_de_results/rsem_WH-1MA.genes.results \ 44 | 07_de_results/rsem_WH-2MA.genes.results \ 45 | 07_de_results/rsem_WH-3MA.genes.results \ 46 | 07_de_results/rsem_WL-1MA.genes.results \ 47 | 07_de_results/rsem_WL-3MA.genes.results \ 48 | 07_de_results/rsem_WL-6MA.genes.results \ 49 | $name_dir $base_dir $out_pref 2>&1 | tee 98_log_files/"$TIMESTAMP"_matrix.log 50 | -------------------------------------------------------------------------------- /00_scripts/07_htseq_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="htseq" 5 | #SBATCH -o log-htseq.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #Global variables 24 | DATAINPUT="07_de_results" 25 | DATAOUTPUT="07_de_results" 26 | GFF_FOLDER="01_info_files" 27 | 
GFF_FILE="transcriptome.gff3" 28 | 29 | 30 | #sort bam files 31 | for i in $(ls 07_de_results/*.bam|sed 's/.bam//g'|sort -u) 32 | do 33 | samtools sort "$i".bam "$i".sorted 34 | samtools index "$i".sorted.bam 35 | done 36 | 37 | #create gff3 file 38 | # import function 39 | git clone https://github.com/scottcain/chado_test 40 | 41 | chado_test/chado/bin/gmod_fasta2gff3.pl --fasta_dir 05_trinity_assembly/Trinity.filtered.fasta --gfffilename "$GFF_FOLDER"/"$GFF_FILE" --nosequence --type CDS 42 | 43 | # launch htseqcount 44 | for i in $(ls 07_de_results/*sorted.bam) 45 | do 46 | base="$(basename $i)" 47 | 48 | htseq-count -f bam -s no -t CDS -r pos -i Name "$DATAINPUT"/"$base" "$GFF_FOLDER"/"$GFF_FILE" >> "$DATAOUTPUT"/htseq-count_"$base".txt 49 | 50 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_htseq.log 51 | -------------------------------------------------------------------------------- /00_scripts/08_de_analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N de_analysis 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | PAIR_COMP="01_info_files/pair_comparison.txt" 22 | MATRIX="/path/to/matrix/files" 23 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 24 | 25 | #Trinity variable 26 | #Required: 27 | 28 | matrix="--matrix $MATRIX" #matrix of raw read counts (not normalized!) 29 | 30 | method="--method DEseq2" # edgeR|DESeq2|voom|ROTS 31 | # note: you should have biological replicates. 32 | # edgeR will support having no bio replicates with 33 | # a fixed dispersion setting. 
34 | # Optional: 35 | sample_rep="--samples_file $SAMPLE_REPLICATE" #tab-delimited text file indicating biological replicate relationships. 36 | #ex. 37 | # cond_A cond_A_rep1 38 | # cond_A cond_A_rep2 39 | # cond_B cond_B_rep1 40 | # cond_B cond_B_rep2 41 | # General options: 42 | min_row_count="--min_rowSum_counts 2" #default: 2 (only those rows of matrix meeting requirement will be tested) 43 | output="--output 07_de_results/"$method"_out_dir" #name of directory to place outputs (default: $method.$pid.dir) 44 | ref_sample="--reference_sample " # name of a sample to which all other samples should be compared. 45 | # (default is doing all pairwise-comparisons among samples) 46 | contrasts="--contrasts $PAIR_COMP" # file (tab-delimited) containing the pairs of sample comparisons to perform. 47 | # ex. 48 | # cond_A cond_B 49 | # cond_Y cond_Z 50 | ## EdgeR-related parameters 51 | ## (no biological replicates) 52 | disp="--dispersion " # edgeR dispersion value (Read edgeR manual to guide your value choice) 53 | ## ROTS parameters 54 | rots_b="--ROTS_B 500" # number of bootstraps and permutation resampling (default: 500) 55 | rots_k="--ROTS_K 5000" # largest top genes size (default: 5000) 56 | 57 | #create variable for log file name 58 | METH=$(echo $method|sed 's/--method //g') 59 | 60 | # Run DE analysis 61 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/run_DE_analysis.pl $matrix $PAIR_COMP $method \ 62 | $sample_rep $min_row_count $output \ 63 | $ref_sample $contrasts $disp $rots_b $rots_k 2>&1 | tee 98_log_files/"$TIMESTAMP"_de_"$METH".log 64 | 65 | ############################################################################################### 66 | # 67 | # Documentation and manuals for various DE methods. Please read for more advanced and more 68 | # fine-tuned DE analysis than provided by this helper script. 
69 | # 70 | # edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html 71 | # DESeq2: http://bioconductor.org/packages/release/bioc/html/DESeq2.html 72 | # voom/limma: http://bioconductor.org/packages/release/bioc/html/limma.html 73 | # ROTS: http://www.btk.fi/research/research-groups/elo/software/rots/ 74 | # 75 | ############################################################################################### 76 | -------------------------------------------------------------------------------- /00_scripts/09_expression_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | R_DATA="/path/to/R.data" 22 | 23 | # Trinity variables 24 | k_cluster="-K " # define K clusters via k-means algorithm 25 | 26 | #or, cut the hierarchical tree: 27 | 28 | #k_tree="--Ktree " #cut tree into K clusters 29 | 30 | p_tree="--Ptree " #cut tree based on this percent of max(height) of tree 31 | 32 | r_data="-R &1 | tee 98_log_files/"$TIMESTAMP"_cluster.log 43 | -------------------------------------------------------------------------------- /00_scripts/10_extract_de_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster_de 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job 
submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | MATRIX="/path/to/matrix" 22 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 23 | 24 | # Trinity variable 25 | matrix="--matrix $MATRIX" #TMM.EXPR.matrix 26 | 27 | # Optional: 28 | p_value="-P 0.001" #p-value cutoff for FDR (default: 0.001) 29 | min_log2FC="-C 2" #min abs(log2(a/b)) fold change (default: 2 (meaning 2^(2) or 4-fold). 30 | output="--output " #prefix for output file (default: "diffExpr.P${Pvalue}_C${C}) 31 | 32 | # Misc: 33 | sample_replicate="--samples $SAMPLE_REPLICATE" # sample-to-replicate mappings (provided to run_DE_analysis.pl) 34 | max_de_genes="--max_DE_genes_per_comparison 100" # extract only up to the top number of DE features within each pairwise comparison. 35 | # This is useful when you have massive numbers of DE features but still want to make 36 | # useful heatmaps and other plots with more manageable numbers of data points. 37 | 38 | order_column="--order_columns_by_samples_file" # instead of clustering samples or replicates hierarchically based on gene expression patterns, 39 | # order columns according to order in the --samples file. 
40 | max_gene_clust="--max_genes_clust 10000" # default: 10000 (if more than that, heatmaps are not generated, since too time consuming) 41 | 42 | #go_enrich="--examine_GO_enrichment" #run GO enrichment analysis 43 | #go_annot="-GO_annots " # GO annotations file 44 | #gene_len="--gene_lengths float" #lengths of genes file 45 | 46 | # run clustering 47 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/analyze_diff_expr.pl $matrix $p_value $min_log2FC \ 48 | $ouput $sample_replicate $max_de_genes $order_column \ 49 | $max_gene_clust $go_enrich $go_annot $gene_len 2>&1 | tee 98_log_files/"$TIMESTAMP"_cluster_de.log 50 | -------------------------------------------------------------------------------- /00_scripts/11_extract_go_per_gene.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N go_assign 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | TRINOTATE_FILE="01_info_files/trinotate.xls" 22 | 23 | #Trinity global 24 | # Required: 25 | 26 | trinotate_file="--Trinotate_xls $TRINOTATE_FILE" # Trinotate.xls file. 27 | 28 | gene_mode="--gene" 29 | or 30 | trans_mode="--trans" #gene or transcript-mode 31 | 32 | # Optional: 33 | ances_terms="--include_ancestral_terms" # climbs the GO DAG, and incorporates 34 | # all parent terms for an assignment. 
35 | 36 | 37 | # run clustering 38 | 00_scripts/trinotate_utils/util/extract_GO_assignments_from_Trinotate_xls.pl \ 39 | $gene_mode $trans_mode $ances_terms > 07_de_resuls/go_annotations.txt 40 | -------------------------------------------------------------------------------- /00_scripts/12_goseq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N goseq 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | GO_ASSIGN="07_de_resuls/go_annotations.txt" 22 | FACTOR_LAB="01_info_files/factor_labeling.txt" 23 | LIST_GEN="path/to/list1of/gene/to/test" 24 | GEN_LENGTH="path/to/file/length/gene" 25 | 26 | /Analysis/DifferentialExpression/run_GOseq.pl \ 27 | --factor_labeling factor_labeling.txt \ 28 | --GO_assignments go_annotations.txt \ 29 | --lengths gene.lengths.txt 30 | #Trinity global 31 | # Required: 32 | ############################################################################################### 33 | # 34 | fact_label="--factor_labeling $FACTOR_LAB" #tab delimited file with format: factorfeature_id 35 | # or 36 | gen_single_fact="--genes_single_factor $LIST_GEN" #list of genes to test (can be a matrix, only the first column is used for gene IDs) 37 | 38 | go_assign="--GO_assignments $GO_ASSIGN" #extracted GO assignments with format: feature_id GO:000001,GO:00002,... 

len="--lengths $GEN_LENGTH" # feature lengths with format: feature_id <tab> length
                            # FIX: the trailing text was not commented out, so the
                            # shell tried to run "feature" as a command (with len=...
                            # only in its environment, leaving $len unset below).

###############################################################################################



# run GOseq
00_scripts/trinity_utils/Analysis/DifferentialExpression/run_GOseq.pl \
    $fact_label $gen_single_fact $go_assign $len 2>&1 | tee 98_log_files/"$TIMESTAMP"_goseq.log
--------------------------------------------------------------------------------
/00_scripts/datarmor_jobs/01_trimmomatic_pe_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Generate one trimmomatic job script per fastq pair, submit them all,
# then remove the generated job files.
for file in $(ls 02_data/*.f*q.gz|perl -pe 's/_[12].f(ast)?q.gz//'|sort -u)
do
    base=$(basename "$file")
    # FIX: substitute with sed directly instead of building a command string
    # and eval'ing it (same output, no eval on interpolated data).
    sed "s/__BASE__/$base/g" 00_scripts/01_trimmomatic_pe.sh > 00_scripts/datarmor_jobs/TRIM_"$base".sh
done


#change jobs header

#Submit jobs
for i in $(ls 00_scripts/datarmor_jobs/TRIM*sh); do qsub $i; done

# Clean up
rm 00_scripts/datarmor_jobs/TRIM*sh
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/assessing_read_content.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N assess_read_representation
#$ -M userID
#$ -m beas
#$ -pe smp 8
#$ -l h_vmem=60G
#$ -l h_rt=20:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR



#Global variables
TRANSCRIPTOME="05_trinity_assembly/Trinity.fasta"
READSLEFT="04_merged/*.left.fastq.gz"
READSRIGHT="04_merged/*.right.fastq.gz"
#READSSINGLE="03_trimmed/*.left.fastq.gz"

#########################################################################
#Required

# If Paired-end:
left="--left $READSLEFT"
right="--right
$READSRIGHT" 28 | 29 | # or Single-end: 30 | #single="--single $READSSINGLE" 31 | 32 | target="--target $TRANSCRIPTOME" #multi-fasta file containing the target sequences (should be named {refName}.fa ) 33 | seq="--seqType fq" #fa | fq (fastA or fastQ format) 34 | aligner="--aligner bowtie" #bowtie, bowtie2 35 | 36 | # Optional: 37 | #strand="--SS_lib_type " # strand-specific library type: single: F or R paired: FR or RF 38 | # 3 examples: single RNA-Ligation method: F 39 | # single dUTP method: R 40 | # paired dUTP method: RF 41 | output="--output 06_assembly_stats/assess_read_count_out" #output directory (default ${aligner}_out) 42 | 43 | 44 | tophits="--num_top_hits 20" #(default: 20) 45 | 46 | #intermediate="--retain_intermediate_files" #retain all the intermediate sam files produced (they take up lots of space! and there's lots of them) 47 | prep_rsem="--prep_rsem" # prep the rsem-ready files 48 | run_rsem="--run_rsem" # execute rsem (implies --prep_rsem) 49 | trinmode="--trinity_mode" #extract gene/trans mapping info from Trinity.fasta file directly 50 | #trans_map="--gene_trans_map " #rsem gene-to-transcript mapping file to use. 51 | max_dist="--max_dist_between_pairs 2000" #default (2000) 52 | #just_prep="--just_prep_build" #just prepare the bowtie-build and stop. 
53 | 54 | 55 | 56 | #assess read count in assembly 57 | 00_scripts/trinity_utils/util/bowtie_PE_separate_then_join.pl $target $seq $single $left $right \ 58 | $output $aligner $trinmode \ 59 | $strand $tophits $intermediate $prep_rsem \ 60 | $run_rsem $trans_map $max_dist $just_prep 2>&1 | tee 98_log_files/"$TIMESTAMP"_assess_read_count.log 61 | 62 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/blastp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="blastp" 5 | #SBATCH -o log-blastp.out 6 | #SBATCH -c 5 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=50000 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | #Global variables 21 | INPUT="longest_orfs.pep" 22 | DATAFOLDER="Trinity_cleaned.fasta.transdecoder_dir" 23 | UNIPROT="/biodata/bio_sequences/proteins/uniprot/current/uniprot_sprot.fasta" 24 | DATAFOLDEROUT="07_de_results" 25 | OUTPUT="blastp.outfmt6" 26 | 27 | 28 | cat "$DATAFOLDER"/"$INPUT" | parallel -j 5 -k --block 10k --recstart '>' --pipe blastp -db "$UNIPROT" -query - -outfmt 6 -max_target_seqs 1 -evalue 1e-6 > "$DATAFOLDEROUT"/"$OUTPUT" 29 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/compare_replicates_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N compare_duplicates 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 8 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | #Move to job submission directory 12 | cd $SGE_O_WORKDIR 13 | 14 | 15 | ####################### 16 | # Inputs and 
Outputs: # 17 | ####################### 18 | # 19 | # --matrix matrix.RAW.normalized.FPKM 20 | # 21 | # Optional: 22 | # 23 | # Sample groupings: 24 | # 25 | # --samples tab-delimited text file indicating biological replicate relationships. 26 | # ex. 27 | # cond_A cond_A_rep1 28 | # cond_A cond_A_rep2 29 | # cond_B cond_B_rep1 30 | # cond_B cond_B_rep2 31 | # 32 | # --gene_factors tab-delimited file containing gene-to-factor relationships. 33 | # ex. 34 | # liver_enriched gene1 35 | # heart_enriched gene2 36 | # ... 37 | # (use of this data in plotting is noted for corresponding plotting options) 38 | # 39 | # 40 | # --output prefix for output file (default: "${matrix_file}.heatmap") 41 | # 42 | # --save save R session (as .RData file) 43 | # --no_reuse do not reuse any existing .RData file on initial loading 44 | # 45 | ##################### 46 | # Plotting Actions # 47 | ##################### 48 | # 49 | # --compare_replicates provide scatter, MA, QQ, and correlation plots to compare replicates. 50 | # 51 | # 52 | # 53 | # --barplot_sum_counts generate a barplot that sums frag counts per replicate across all samples. 54 | # 55 | # --boxplot_log2_dist generate a boxplot showing the log2 dist of counts where counts >= min fpkm 56 | # 57 | # --sample_cor_matrix generate a sample correlation matrix plot 58 | # --sample_cor_scale_limits ex. "-0.2,0.6" 59 | # --sample_cor_sum_gene_factor_expr instead of plotting the correlation value, plot the sum of expr according to gene factor 60 | # requires --gene_factors 61 | # 62 | # --sample_cor_subset_matrix plot the sample correlation matrix, but create a disjoint set for rows,cols. 63 | # The subset of the samples to provide as the columns is provided as parameter. 
64 | # 65 | # --gene_cor_matrix generate a gene-level correlation matrix plot 66 | # 67 | # --indiv_gene_cor generate a correlation matrix and heatmaps for '--top_cor_gene_count' to specified genes (comma-delimited list) 68 | # --top_cor_gene_count (requires '--indiv_gene_cor with gene identifier specified') 69 | # --min_gene_cor_val (requires '--indiv_gene_cor with gene identifier specified') 70 | # 71 | # --heatmap genes vs. samples heatmap plot 72 | # --heatmap_scale_limits "" cap scale intensity to low,high (ie. "-5,5") 73 | # --heatmap_colorscheme default is 'purple,black,yellow' 74 | # a popular alternative is 'green,black,red' 75 | # Specify a two-color gradient like so: "black,yellow". 76 | # 77 | # # sample (column) labeling order 78 | # --lexical_column_ordering order samples by column name lexical order. 79 | # --specified_column_ordering comma-delimited list of column names (must match matrix exactly!) 80 | # --order_columns_by_samples_file order the columns in the heatmap according to replicate name ordering in the samples file. 81 | # 82 | # # gene (row) labeling order 83 | # --order_by_gene_factor order the genes by their factor (given --gene_factors) 84 | # 85 | # --gene_heatmaps generate heatmaps for just one or more specified genes 86 | # Requires a comma-delimited list of gene identifiers. 87 | # Plots one heatmap containing all specified genes, then separate heatmaps for each gene. 88 | # if --gene_factors set, will include factor annotations as color panel. 89 | # else if --prin_comp set, will include include principal component color panel. 90 | # 91 | # --prin_comp generate principal components, include top components in heatmap 92 | # --add_prin_comp_heatmaps draw heatmaps for the top features at each end of the prin. comp. axis. 93 | # (requires '--prin_comp') 94 | # --add_top_loadings_pc_heatmap draw a heatmap containing the top feature loadings across all PCs. 95 | # 96 | # --mean_vs_sd expression variability plot. 
(highlight specific genes by category via --gene_factors ) 97 | # 98 | # --var_vs_count_hist create histogram of counts of samples having feature expressed within a given expression bin. 99 | # vartype can be any of 'sd|var|cv|fano' 100 | # --count_hist_num_bins number of bins to distribute counts in the histogram (default: 10) 101 | # --count_hist_max_expr maximum value for the expression histogram (default: max(data)) 102 | # --count_hist_convert_percentages convert the histogram counts to percentage values. 103 | # 104 | # 105 | # --per_gene_plots plot each gene as a separate expression plot (barplot or lineplot) 106 | # --per_gene_plot_width default: 2.5 107 | # --per_gene_plot_height default: 2.5 108 | # --per_gene_plots_per_row default: 1 109 | # --per_gene_plots_per_col default: 2 110 | # 111 | # 112 | ######################################################## 113 | # Data Filtering, in order of operation below: ######################################################### 114 | # 115 | # 116 | # --restrict_samples comma-delimited list of samples to restrict to (comma-delim list) 117 | # 118 | # --top_rows only include the top number of rows in the matrix, as ordered. 119 | # 120 | # --min_colSums min number of fragments, default: 0 121 | # 122 | # --min_rowSums min number of fragments, default: 0 123 | # 124 | # --gene_grep grep on string to restrict to genes 125 | # 126 | # 127 | # --min_expressed_genes minimum number of genes (rows) for a column (replicate) having at least '--min_gene_expr_val' 128 | # --min_gene_expr_val a gene must be at least this value expressed across all samples. (default: 0) 129 | # 130 | # --min_across_ALL_samples_gene_expr_val a gene must have this minimum expression value across ALL samples to be retained. 131 | # 132 | # --min_across_ANY_samples_gene_expr_val a gene must have at least this expression value across ANY single sample to be retained. 
133 | # 134 | # --minValAltNA minimum cell value after above transformations, otherwise convert to NA 135 | # 136 | # 137 | # 138 | # --top_genes use only the top number of most highly expressed transcripts 139 | # 140 | # --top_variable_genes Restrict to the those genes with highest coeff. of variability across samples (use median of replicates) 141 | # 142 | # --var_gene_method method for ranking top variable genes ( 'coeffvar|anova', default: 'anova' ) 143 | # --anova_maxFDR if anova chose, require FDR value <= anova_maxFDR (default: 0.05) 144 | # or 145 | # --anova_maxP if set, over-rides anova_maxQ (default, off, uses --anova_maxQ) 146 | # 147 | ###################################### 148 | # Data transformations: # 149 | ###################################### 150 | # 151 | # --CPM convert to counts per million (uses sum of totals before filtering) 152 | # 153 | # --binary all values > 0 are set to 1. All values < 0 are set to zero. 154 | # 155 | # --log2 156 | # 157 | # --center_rows subtract row mean from each data point. (only used under '--heatmap' ) 158 | # 159 | # --Zscale_rows Z-scale the values across the rows (genes) 160 | # 161 | ######################### 162 | # Clustering methods: # 163 | ######################### 164 | # 165 | # --gene_dist Setting used for --heatmap (samples vs. genes) 166 | # Options: euclidean, gene_cor 167 | # maximum, manhattan, canberra, binary, minkowski 168 | # (default: 'gene_cor') Note: if using 'gene_cor', set method using '--gene_cor' below. 169 | # 170 | # 171 | # --sample_dist Setting used for --heatmap (samples vs. genes) 172 | # Options: euclidean, gene_cor 173 | # maximum, manhattan, canberra, binary, minkowski 174 | # (default: 'sample_cor') Note: if using 'sample_cor', set method using '--sample_cor' below. 
175 | # 176 | # 177 | # --gene_clust ward, single, complete, average, mcquitty, median, centroid, none (default: complete) 178 | # --sample_clust ward, single, complete, average, mcquitty, median, centroid, none (default: complete) 179 | # 180 | # --gene_cor Options: pearson, spearman (default: pearson) 181 | # --sample_cor Options: pearson, spearman (default: pearson) 182 | # 183 | #################### 184 | # Image settings: # 185 | #################### 186 | # 187 | # 188 | # --pdf_width 189 | # --pdf_height 190 | # 191 | ################ 192 | # Misc. params # 193 | ################ 194 | # 195 | # --write_intermediate_data_tables writes out the data table after each transformation. 196 | # 197 | # --show_pipeline_flowchart describe order of events and exit. 198 | # 199 | #################################################################################### 200 | 201 | 202 | 203 | 204 | 205 | #assess read count in assembly 206 | 00_scripts/trinity_utils/util/bowtie_PE_separate_then_join.pl $target $seq $single $left $right \ 207 | $output $aligner $trinmode \ 208 | $strand $tophits $intermediate $prep_rsem \ 209 | $run_rsem $trans_map $max_dist $just_prep 2>&1 | tee 98_log_files/"$TIMESTAMP"_assess_read_count.log 210 | 211 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/corset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="corset" 5 | #SBATCH -o log-corset.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=01-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #global variables 24 | 25 | 
#list_double="-d " #A comma separated list of distance thresholds. The range must be 26 | #between 0 and 1. e.g -d 0.4,0.5. If more than one distance threshold 27 | #is supplied, the output filenames will be of the form: 28 | #counts-.txt and clusters-.txt 29 | #Default: 0.3 30 | 31 | #log_lik_tresh="-D " #The value used for thresholding the log likelihood ratio. The default 32 | #value will depend on the number of degrees of freedom (which is the 33 | #number of groups -1). By default D = 17.5 + 2.5 * ndf, which corresponds 34 | #approximately to a p-value threshold of 10^-5, when there are fewer than 35 | #10 groups. 36 | 37 | mincov="-m 10" #Filter out any transcripts with fewer than this many reads aligning. 38 | #Default: 10 39 | #grouping="-g DH1MA,DH1MA,DH4BA,DH4MA,DH4MA,DH5BA,DH5MA,DH6BA,DL1BA,DL1BA,DL2BA,DL2BA,DL3BA,DL3BA,DL4MA,DL4MA,DL5MA,DL5MA,DL6MA,DL6MA,Small1A,Small1A,Small2A,Small2A,Small3A,Small3A,WH1BA,WH1MA,WH2BA,WH2BA,WH2MA,WH2MA,WH3BA" 40 | #groups. The parameter must be a comma separated list (no spaces), with the 41 | #groupings given in the same order as the bam filename. For example: 42 | #-g Group1,Group1,Group2,Group2 etc. If this option is not used, each sample 43 | #is treated as an independent experimental group. 44 | 45 | #outpref="-p " #Prefix for the output filenames. The output files will be of the form 46 | #-counts.txt and -clusters.txt. 
Default filenames are: 47 | #counts.txt and clusters.txt 48 | 49 | #outputover="-f &1 | tee 98_log_files/"$TIMESTAMP"_corset.log 71 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/07_build_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_matrix" 5 | #SBATCH -o log-matrix.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | ls 07_de_results/*.genes.results >01_info_files/list.results.txt 24 | 25 | #Required: 26 | meth="--est_method RSEM" #RSEM|eXpress|kallisto (needs to know what format to expect) 27 | 28 | # Options: 29 | norm="--cross_sample_norm none" #TMM|UpperQuartile|none (default: TMM) 30 | #name_dir="--name_sample_by_basedir" #name sample column by dirname instead of filename 31 | # base_dir="--basedir_index -2" #default(-2) 32 | 33 | out_pref="--out_prefix matrix.nonorm" #default: 'matrix' 34 | listfile="--samples_file 01_info_files/list.results.txt" #rsem results 35 | #run estimate to matrix 36 | 00_scripts/trinity_utils/util/abundance_estimates_to_matrix.pl $meth $norm \ 37 | 07_de_results/rsem_DH-1MA.genes.results \ 38 | 07_de_results/rsem_DH-4MA.genes.results \ 39 | 07_de_results/rsem_DH-5MA.genes.results \ 40 | 07_de_results/rsem_DL-4MA.genes.results \ 41 | 07_de_results/rsem_DL-5MA.genes.results \ 42 | 07_de_results/rsem_DL-6MA.genes.results \ 43 | 07_de_results/rsem_WH-1MA.genes.results \ 44 | 07_de_results/rsem_WH-2MA.genes.results \ 45 | 07_de_results/rsem_WH-3MA.genes.results \ 46 | 
07_de_results/rsem_WL-1MA.genes.results \ 47 | 07_de_results/rsem_WL-3MA.genes.results \ 48 | 07_de_results/rsem_WL-6MA.genes.results \ 49 | $name_dir $base_dir $out_pref 2>&1 | tee 98_log_files/"$TIMESTAMP"_matrix.log 50 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/08_de_analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N de_analysis 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | PAIR_COMP="01_info_files/pair_comparison.txt" 22 | MATRIX="/path/to/matrix/files" 23 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 24 | 25 | #Trinity variable 26 | #Required: 27 | 28 | matrix="--matrix $MATRIX" #matrix of raw read counts (not normalized!) 29 | 30 | method="--method DEseq2" # edgeR|DESeq2|voom|ROTS 31 | # note: you should have biological replicates. 32 | # edgeR will support having no bio replicates with 33 | # a fixed dispersion setting. 34 | # Optional: 35 | sample_rep="--samples_file $SAMPLE_REPLICATE" #tab-delimited text file indicating biological replicate relationships. 36 | #ex. 37 | # cond_A cond_A_rep1 38 | # cond_A cond_A_rep2 39 | # cond_B cond_B_rep1 40 | # cond_B cond_B_rep2 41 | # General options: 42 | min_row_count="--min_rowSum_counts 2" #default: 2 (only those rows of matrix meeting requirement will be tested) 43 | output="--output 07_de_results/"$method"_out_dir" #name of directory to place outputs (default: $method.$pid.dir) 44 | ref_sample="--reference_sample " # name of a sample to which all other samples should be compared. 
45 | # (default is doing all pairwise-comparisons among samples) 46 | contrasts="--contrasts $PAIR_COMP" # file (tab-delimited) containing the pairs of sample comparisons to perform. 47 | # ex. 48 | # cond_A cond_B 49 | # cond_Y cond_Z 50 | ## EdgeR-related parameters 51 | ## (no biological replicates) 52 | disp="--dispersion " # edgeR dispersion value (Read edgeR manual to guide your value choice) 53 | ## ROTS parameters 54 | rots_b="--ROTS_B 500" # number of bootstraps and permutation resampling (default: 500) 55 | rots_k="--ROTS_K 5000" # largest top genes size (default: 5000) 56 | 57 | #create variable for log file name 58 | METH=$(echo $method|sed 's/--method //g') 59 | 60 | # Run DE analysis 61 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/run_DE_analysis.pl $matrix $PAIR_COMP $method \ 62 | $sample_rep $min_row_count $output \ 63 | $ref_sample $contrasts $disp $rots_b $rots_k 2>&1 | tee 98_log_files/"$TIMESTAMP"_de_"$METH".log 64 | 65 | ############################################################################################### 66 | # 67 | # Documentation and manuals for various DE methods. Please read for more advanced and more 68 | # fine-tuned DE analysis than provided by this helper script. 
69 | # 70 | # edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html 71 | # DESeq2: http://bioconductor.org/packages/release/bioc/html/DESeq2.html 72 | # voom/limma: http://bioconductor.org/packages/release/bioc/html/limma.html 73 | # ROTS: http://www.btk.fi/research/research-groups/elo/software/rots/ 74 | # 75 | ############################################################################################### 76 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/09_expression_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | R_DATA="/path/to/R.data" 22 | 23 | # Trinity variables 24 | k_cluster="-K " # define K clusters via k-means algorithm 25 | 26 | #or, cut the hierarchical tree: 27 | 28 | #k_tree="--Ktree " #cut tree into K clusters 29 | 30 | p_tree="--Ptree " #cut tree based on this percent of max(height) of tree 31 | 32 | r_data="-R &1 | tee 98_log_files/"$TIMESTAMP"_cluster.log 43 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/10_extract_de_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster_de 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT 
$LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | MATRIX="/path/to/matrix" 22 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 23 | 24 | # Trinity variable 25 | matrix="--matrix $MATRIX" #TMM.EXPR.matrix 26 | 27 | # Optional: 28 | p_value="-P 0.001" #p-value cutoff for FDR (default: 0.001) 29 | min_log2FC="-C 2" #min abs(log2(a/b)) fold change (default: 2 (meaning 2^(2) or 4-fold). 30 | output="--output " #prefix for output file (default: "diffExpr.P${Pvalue}_C${C}) 31 | 32 | # Misc: 33 | sample_replicate="--samples $SAMPLE_REPLICATE" # sample-to-replicate mappings (provided to run_DE_analysis.pl) 34 | max_de_genes="--max_DE_genes_per_comparison 100" # extract only up to the top number of DE features within each pairwise comparison. 35 | # This is useful when you have massive numbers of DE features but still want to make 36 | # useful heatmaps and other plots with more manageable numbers of data points. 37 | 38 | order_column="--order_columns_by_samples_file" # instead of clustering samples or replicates hierarchically based on gene expression patterns, 39 | # order columns according to order in the --samples file. 
40 | max_gene_clust="--max_genes_clust 10000" # default: 10000 (if more than that, heatmaps are not generated, since too time consuming) 41 | 42 | #go_enrich="--examine_GO_enrichment" #run GO enrichment analysis 43 | #go_annot="-GO_annots " # GO annotations file 44 | #gene_len="--gene_lengths float" #lengths of genes file 45 | 46 | # run clustering 47 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/analyze_diff_expr.pl $matrix $p_value $min_log2FC \ 48 | $ouput $sample_replicate $max_de_genes $order_column \ 49 | $max_gene_clust $go_enrich $go_annot $gene_len 2>&1 | tee 98_log_files/"$TIMESTAMP"_cluster_de.log 50 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/11_extract_go_per_gene.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N go_assign 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | TRINOTATE_FILE="01_info_files/trinotate.xls" 22 | 23 | #Trinity global 24 | # Required: 25 | 26 | trinotate_file="--Trinotate_xls $TRINOTATE_FILE" # Trinotate.xls file. 27 | 28 | gene_mode="--gene" 29 | or 30 | trans_mode="--trans" #gene or transcript-mode 31 | 32 | # Optional: 33 | ances_terms="--include_ancestral_terms" # climbs the GO DAG, and incorporates 34 | # all parent terms for an assignment. 
35 | 36 | 37 | # run clustering 38 | 00_scripts/trinotate_utils/util/extract_GO_assignments_from_Trinotate_xls.pl \ 39 | $gene_mode $trans_mode $ances_terms > 07_de_resuls/go_annotations.txt 40 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/12_goseq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N goseq 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | GO_ASSIGN="07_de_resuls/go_annotations.txt" 22 | FACTOR_LAB="01_info_files/factor_labeling.txt" 23 | LIST_GEN="path/to/list1of/gene/to/test" 24 | GEN_LENGTH="path/to/file/length/gene" 25 | 26 | /Analysis/DifferentialExpression/run_GOseq.pl \ 27 | --factor_labeling factor_labeling.txt \ 28 | --GO_assignments go_annotations.txt \ 29 | --lengths gene.lengths.txt 30 | #Trinity global 31 | # Required: 32 | ############################################################################################### 33 | # 34 | fact_label="--factor_labeling $FACTOR_LAB" #tab delimited file with format: factorfeature_id 35 | # or 36 | gen_single_fact="--genes_single_factor $LIST_GEN" #list of genes to test (can be a matrix, only the first column is used for gene IDs) 37 | 38 | go_assign="--GO_assignments $GO_ASSIGN" #extracted GO assignments with format: feature_id GO:000001,GO:00002,... 

len="--lengths $GEN_LENGTH" # feature lengths with format: feature_id <tab> length
                            # FIX: comment marker was missing, so the shell tried
                            # to run "feature" as a command and $len stayed unset.

###############################################################################################



# run GOseq
00_scripts/trinity_utils/Analysis/DifferentialExpression/run_GOseq.pl \
    $fact_label $gen_single_fact $go_assign $len 2>&1 | tee 98_log_files/"$TIMESTAMP"_goseq.log
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/fastqc.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N fastqc
#$ -M userID
#$ -m beas
#$ -pe smp 1
#$ -l h_vmem=20G
#$ -l h_rt=10:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR

# Run FastQC on every gzipped fastq file in 02_data/, one output dir per file.
# FIXES: the glob "02_data/.*f*q.gz" only matched hidden files; the sed pattern
# "(ast)?" is not valid in basic regular expressions; the output directory was
# created as "fastqc_dir" but used as "fastqc.dir"; and the input file lost its
# "02_data/" prefix. Iterating over the files directly avoids all of this.
mkdir -p fastqc_dir
for file in 02_data/*.f*q.gz
do
    base=$(basename "$file")
    mkdir -p fastqc_dir/"$base".dir
    fastqc -o fastqc_dir/"$base".dir -f fastq "$file"
done
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/htseq_count.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH -D ./
#SBATCH --job-name="htseq"
#SBATCH -o log-htseq.out
#SBATCH -c 1
#SBATCH -p ibismax
#SBATCH -A ibismax
#SBATCH --mail-type=ALL
#SBATCH --mail-user=type_your_mail@ulaval.ca
#SBATCH --time=02-00:00
#SBATCH --mem=50000

cd $SLURM_SUBMIT_DIR

TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss)
SCRIPT=$0
NAME=$(basename $0)
LOG_FOLDER="98_log_files"
cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME"


#Global variables
DATAINPUT="07_de_results"
DATAOUTPUT="07_de_results"
GFF_FOLDER="01_info_files"
GFF_FILE="transcriptome.gff3"


#sort bam files
for i in $(ls
07_de_results/*.bam|sed 's/.bam//g'|sort -u)
do
    samtools sort "$i".bam "$i".sorted
    samtools index "$i".sorted.bam
done

#create gff3 file
# import function
git clone https://github.com/scottcain/chado_test

chado_test/chado/bin/gmod_fasta2gff3.pl --fasta_dir 05_trinity_assembly/Trinity.filtered.fasta --gfffilename "$GFF_FOLDER"/"$GFF_FILE" --nosequence --type CDS

# launch htseqcount
for i in $(ls 07_de_results/*sorted.bam)
do
    base="$(basename $i)"

    htseq-count -f bam -s no -t CDS -r pos -i Name "$DATAINPUT"/"$base" "$GFF_FOLDER"/"$GFF_FILE" >> "$DATAOUTPUT"/htseq-count_"$base".txt

done 2>&1 | tee 98_log_files/"$TIMESTAMP"_htseq.log
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/insilico_normalization.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N reads_normalization
#$ -M userID
#$ -m beas
#$ -pe smp 8
#$ -l h_vmem=60G
#$ -l h_rt=20:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR

# FIX: TIMESTAMP is used in the log file name below but was never defined here.
TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss)

#Global variables
READSLEFT="04_merged/*.left.fastq.gz"
READSRIGHT="04_merged/*.right.fastq.gz"
#READSSINGLE="03_trimmed/*.left.fastq.gz"

#########################################################################
#Required

# If Paired-end:
left="--left $READSLEFT"
right="--right $READSRIGHT"

# or Single-end:
#single="--single $READSSINGLE"

seq="--seqType fq" #fa | fq (fastA or fastQ format)

mem="--JM 90G" #:(Jellyfish Memory) number of GB of system memory to use for
               #k-mer counting by jellyfish (eg. 10G) *include the 'G' char
#strand="--SS_lib_type " # strand-specific library type: single: F or R  paired: FR or RF
#                          3 examples: single RNA-Ligation method: F
#                                      single dUTP method: R
#                                      paired dUTP method: RF

#Or, if you have read collections in different files you can use 'list' files, where each line in a list
# file is the full path to an input file.  This saves you the time of combining them just so you can pass
# a single file for each direction.
# FIX: these were active with an empty value, which would pass a bare
# "--left_list"/"--right_list" flag that swallows the next argument;
# keep them commented until a list file path is filled in.
#left_list="--left_list " #left reads, one file path per line
#right_list="--right_list " #right reads, one file path per line


pairs="--pairs_together" #process paired reads by averaging stats between pairs and retaining linking info.

output="--output 10_normalized"

cpu="--CPU 2" #number of threads to use (default: = 2)
parallel_stat="--PARALLEL_STATS" #:generate read stats in parallel for paired reads

kmer="--KMER_SIZE 25" #default 25

max_pct="--max_pct_stdev 200" #maximum pct of mean for stdev of kmer coverage across read (default: 200)

nocleanup="--no_cleanup" #leave intermediate files
# FIX: was active with an empty value (bare "--tmp_dir_name" flag); the tool's
# default "tmp_normalized_reads" is used when this stays commented.
#tmp_dir="--tmp_dir_name " #default("tmp_normalized_reads")


#run normalization
# FIXES: the script path had a doubled "util/util/" component, and a stray
# leftover line from assessing_read_content.sh (running undefined variables)
# followed the pipeline; it has been removed.
00_scripts/trinity_utils/util/insilico_read_normalization.pl $seq $single $left $right \
    $output $mem $left_list $right_list \
    $pairs $cpu $parallel_stat $kmer $max_pct \
    $nocleanup $tmp_dir 2>&1 | tee 98_log_files/"$TIMESTAMP"_normalization.log

--------------------------------------------------------------------------------
/00_scripts/utility_scripts/pfam.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH -D ./
#SBATCH --job-name="pfam"
#SBATCH -o log-pfam.out
#SBATCH -c 4
#SBATCH -p ibis2
#SBATCH
-A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=20000 13 | 14 | 15 | 16 | cd $SLURM_SUBMIT_DIR 17 | 18 | 19 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 20 | SCRIPT=$0 21 | NAME=$(basename $0) 22 | LOG_FOLDER="98_log_files" 23 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 24 | 25 | #Global variables 26 | INPUT="Trinity_cleaned.fasta.transdecoder_dir/longest_orfs.pep" 27 | PFAMDB="/home/jelel8/Databases/pfam/Pfam-A.hmm" 28 | OUTPUT="07_de_results/TrinotatePFAM.out" 29 | 30 | 31 | #prepare DB 32 | 33 | hmmpress $PFAMDB 34 | 35 | #run hmmer suite 36 | 37 | hmmscan --cpu 4 --domtblout $OUTPUT $PFAMDB $INPUT > pfam.log 38 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/prepare_info_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #TODO prepare info file 5 | # Example samples file layout (condition, replicate, left reads, right reads): 6 | # cond_A cond_A_rep1 A_rep1_left.fq A_rep1_right.fq 7 | # cond_A cond_A_rep2 A_rep2_left.fq A_rep2_right.fq 8 | # cond_B cond_B_rep1 B_rep1_left.fq B_rep1_right.fq 9 | # cond_B cond_B_rep2 B_rep2_left.fq B_rep2_right.fq 10 | # 11 | # # note, Trinity-specific parameter settings should be included in the samples_file like so: 12 | # # (only --max_memory is absolutely required, since defaults exist for the other settings) 13 | # --CPU=6 14 | # --max_memory=10G 15 | # --seqType=fq 16 | # --SS_lib_type=RF 17 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/prepare_jobs_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | #change information in job headers 6 | 7 | # usage prepare_jobs_header.sh userID userEmail 8 | 9 | ID=$1 10 | email=$2 11 | WORKDIR=$(pwd)  # do not reassign the shell's special PWD variable 12 | 13 | cd "$WORKDIR" 14 | 15 | for i in 00_scripts/*sh; do sed -i -e "s/userID/$ID/g" -e "s/userEmail/$email/g" "$i" 16 | 17 | done 18 
| -------------------------------------------------------------------------------- /00_scripts/utility_scripts/transdecoder_getorf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="transdecoder" 5 | #SBATCH -o log-transdecoder.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=50000 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | #Global variables 21 | INPUT="05_trinity_assembly/Trinity_cleaned.fasta" 22 | 23 | ./00_scripts/transdecoder_utils/TransDecoder.LongOrfs -t $INPUT 2>&1 | tee 98_log_files/"$TIMESTAMP"_transdecoder_getorf.log 24 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/transdecoder_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N trandecod_predict 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=20G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Global variables 18 | INPUT="05_trinity_assembly/Trinity.fasta" 19 | 20 | ./00_scripts/transdecoder_utils/TransDecoder.Predict -t $INPUT 2>&1 | tee 98_log_files/"$TIMESTAMP"_transdecoder_predict.log 21 | -------------------------------------------------------------------------------- /01_info_files/.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | *.fasta 3 | *.gff3 4 | -------------------------------------------------------------------------------- 
/01_info_files/example_pair_comparison.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_B 2 | cond_Y cond_Z 3 | -------------------------------------------------------------------------------- /01_info_files/example_sample_replicates.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_A_rep1 2 | cond_A cond_A_rep2 3 | cond_B cond_B_rep1 4 | cond_B cond_B_rep2 5 | -------------------------------------------------------------------------------- /01_info_files/example_samples_info.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_A_rep1 A_rep1_left.fq A_rep1_right.fq 2 | cond_A cond_A_rep2 A_rep2_left.fq A_rep2_right.fq 3 | cond_B cond_B_rep1 B_rep1_left.fq B_rep1_right.fq 4 | cond_B cond_B_rep2 B_rep2_left.fq B_rep2_right.fq 5 | 6 | --max_memory=10G 7 | -------------------------------------------------------------------------------- /02_data/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.fq.gz 5 | *paired* 6 | *single* 7 | *.gz 8 | *.zip 9 | *fq 10 | *fastq 11 | genome* 12 | -------------------------------------------------------------------------------- /03_trimmed/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.fq.gz 5 | *paired* 6 | *single* 7 | *.gz 8 | *.zip 9 | *.sh 10 | *r1 11 | *r2 12 | -------------------------------------------------------------------------------- /04_merged/.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | *.fq 3 | *.readcount 4 | -------------------------------------------------------------------------------- /06_assembly_stats/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.sam 3 | *.bam 4 | 
-------------------------------------------------------------------------------- /07_de_results/.gitignore: -------------------------------------------------------------------------------- 1 | *.ok 2 | *.results 3 | *.bam 4 | *.bai 5 | *stat/ 6 | *.outfmt6 7 | *temp/ 8 | -------------------------------------------------------------------------------- /07_de_results/rsem_DH-1MA.bowtie.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jleluyer/rna-seq_denovo_workflow/ac4cf95b2944f3823dde268fa7baf96264f3b603/07_de_results/rsem_DH-1MA.bowtie.sorted.bam.bai -------------------------------------------------------------------------------- /98_log_files/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.log 3 | *SC 4 | * 5 | -------------------------------------------------------------------------------- /99_archive/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RNA-seq analysis in de novo 2 | 3 | An integrated worklow for *de novo* analysis and DE gene assessment to conduct RNA-seq data analyses 4 | 5 | This Workflow is developped in [Louis Bernatchez' lab](https://www.bio.ulaval.ca/louisbernatchez/presentation.htm). 6 | 7 | **WARNING** 8 | 9 | The software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the software. 
10 | 11 | ## Downloading 12 | 13 | You can clone this repository with: 14 | 15 | ``` 16 | git clone https://github.com/jleluyer/rna-seq_denovo_workflow 17 | ``` 18 | 19 | ## Documentation 20 | 21 | ### 1. Clone git hub directory 22 | 23 | ``` 24 | git clone https://github.com/jleluyer/rna-seq_denovo_workflow 25 | ``` 26 | 27 | ### 2. Prepare utilities 28 | 29 | ``` 30 | cd rna-seq_denovo_workflow 31 | 32 | ./00_scripts/00_import_trinity.sh 33 | ``` 34 | This script will create the utilities directories that have to be configured and installed 35 | 36 | ``` 37 | cd 00_scripts/trinity_utils/ 38 | make 39 | make plugins 40 | cd ../.. 41 | ``` 42 | 43 | ``` 44 | cd 00_scripts/transdecoder_utils/ 45 | make 46 | cd ../.. 47 | ``` 48 | 49 | For Corset, small changes need to be done to correctly configure and install the software. Change **userNAME** for your username. 50 | 51 | ``` 52 | cd 00_scripts/corset_utils 53 | ./configure --with-bam_inc=/prg/samtools/0.1.19/ --with-bam_lib=/prg/samtools/0.1.19/ --prefix=/home/userNAME/local 54 | make 55 | make install 56 | cd ../.. 57 | ``` 58 | 59 | Make sure **Bowtie** is in your **$PATH**. 60 | 61 | ### 3. Import data 62 | 63 | ``` 64 | cp /path/to/the/data/repository/*.gz 02_data 65 | ``` 66 | 67 | ### 4. Trimming the data 68 | 69 | * Import univec.fasta 70 | 71 | ``` 72 | wget -O 01_info_files/univec.fasta ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec 73 | ``` 74 | Add your specific adaptors if absent in the database. 75 | 76 | * Trimming 77 | 78 | Two scripts are provided for **Single-End** or **Paired-end** data, **00_scripts/01_trimmomatic_se.sh** and **00_scripts/01_trimmomatic_pe.sh**, respectively. 79 | 80 | ``` 81 | sbatch 00_scripts/01_trimmomatic_se.sh 82 | ``` 83 | 84 | You may also want to check the quality of your data prior to trimming using **00_scripts/utility_scripts/fastqc.sh**. This will require to have **fastQC** installed in your **$PATH**. 85 | 86 | ### 4. 
Merging data for assembly 87 | 88 | Note: Trinity is memory-consuming, make sure you adapt the number of samples to the memory available. When limited in memory, you could use a in silico normalization provided in the **00_scripts/utility_scripts/insilico_normalization.sh**. Otherwise, you may want to select a subset of the data and modify **./00_scripts/02_merge.sh**. For more information regarding memory usage, please visit [Trinity memory usage](http://trinityrnaseq.github.io/performance/mem.html) 89 | 90 | ``` 91 | sbatch 00_scripts/02_merge.sh 92 | ``` 93 | 94 | ### 5. Assembly 95 | 96 | Before running assembly, you need to make sure that the folder **05_trinity_assembly** is empty. 97 | You also need to adapt the script for your own needs. For instance, you will need to adapt the input files (either SE, PE or strand-specific) in the global variables section. Other parameters need also to be carefuly chosen (_e.g_: minimum length of transcripts, ...). 98 | 99 | ``` 100 | sbatch 00_scripts/03_assemble.sh 101 | ``` 102 | 103 | ### 6. Check assembly quality 104 | 105 | ``` 106 | sbatch 00_scripts/04_assembly_stats.sh 107 | ``` 108 | This script will output file **06_assembly_stats/results_stats.txt** 109 | 110 | Other scripts are provided to assess the quality of the transcriptome: 111 | **00_scripts/utility_scripts/assessing_read_content.sh** 112 | 113 | 114 | ### 7. 
Downstream analysis 115 | 116 | #### 7.1 Transcript abundance 117 | 118 | * Prepare reference 119 | 120 | ``` 121 | sbatch 00_scripts/05_prepare_ref.sh 122 | ``` 123 | 124 | * Quantify transcripts abundance 125 | 126 | ``` 127 | sbatch 00_scripts/06_transcripts_abundance.sh 128 | ``` 129 | 130 | Several options are possible as well as several tools for quantifying transcripts abundance such as **RSEM**, **Kallisto**, **eXpress** or **salmon** as well as different aligners **Bowtie** or **Bowtie2** 131 | 132 | * Cluster transcripts 133 | 134 | ``` 135 | sbatch 00_scripts/utility_scripts/corset.sh 136 | ``` 137 | 138 | ### 7.2 Annotation 139 | 140 | * Predict longest ORFs 141 | 142 | ``` 143 | sbatch 00_scripts/utility_scripts/transdecoder_getorf.sh 144 | ``` 145 | 146 | * Blast against Uniprot 147 | 148 | ``` 149 | sbatch 00_scripts/utility_scripts/blastp.sh 150 | ``` 151 | _note: you will need access to **uniprot_sprot.fasta** database_ 152 | 153 | 154 | * Identify protein families 155 | 156 | ``` 157 | sbatch 00_scripts/utility_scripts/pfam.sh 158 | ``` 159 | _note: you will need access to **Pfam-A** database_ 160 | 161 | 162 | * Identify signal peptides 163 | 164 | ``` 165 | sbatch 00_scripts/utility_scripts/signalp.sh 166 | ``` 167 | _need to install signalp_ 168 | 169 | * Identify transmembrane regions 170 | 171 | ``` 172 | sbatch 00_scripts/utility_scripts/tmhmm.sh 173 | ``` 174 | _need to install tmhmm_ 175 | 176 | * Identify rRNA transcripts 177 | 178 | ``` 179 | sbatch 00_scripts/utility_scripts/rnammer.sh 180 | ``` 181 | _need to install rnammer_ 182 | 183 | * Compile the annotation in a final report 184 | 185 | ``` 186 | sbatch 00_scripts/utility_scripts/trinotate.sh 187 | ``` 188 | 189 | _note: you will need access to **Trinotate.sqlite** database_ 190 | 191 | 192 | ### 7.3 Differential expression analysis 193 | 194 | 195 | * Build transcripts expression matrices 196 | 197 | ``` 198 | sbatch 00_scripts/07_build_matrix.sh 199 | ``` 200 | * running DE 201 | ``` 202 
| sbatch 00_scripts/08_de_analysis.sh 203 | ``` 204 | 205 | Several packages are available and implemented in the script such as **DESeq2**, **limma/voom**, **edgeR** and **ROTS**. 206 | 207 | 208 | ## Notes 209 | 210 | ## Dependencies 211 | 212 | ### Software 213 | 214 | [Trinity](https://github.com/trinityrnaseq/trinityrnaseq) 215 | 216 | [Trinotate](https://github.com/Trinotate/Trinotate) 217 | 218 | [RSEM](https://github.com/deweylab/RSEM) 219 | 220 | [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) 221 | 222 | **java 1.7** or higher 223 | 224 | [R](https://www.r-project.org/) 225 | 226 | ### R packages 227 | 228 | [edgeR](http://bioconductor.org/packages/release/bioc/html/edgeR.html) 229 | 230 | [DESeq2](http://bioconductor.org/packages/release/bioc/html/DESeq2.html) 231 | 232 | [limma/voom](http://bioconductor.org/packages/release/bioc/html/limma.html) 233 | 234 | [ROTS](http://www.btk.fi/research/research-groups/elo/software/rots/) 235 | 236 | [goseq](http://www.bioconductor.org/packages/release/bioc/html/goseq.html) 237 | 238 | #### Install R packages 239 | 240 | ```R 241 | %R 242 | source("http://bioconductor.org/biocLite.R") 243 | biocLite('edgeR') 244 | biocLite('limma') 245 | biocLite('DESeq2') 246 | biocLite('ctc') 247 | biocLite('Biobase') 248 | biocLite("goseq") 249 | install.packages('gplots') 250 | install.packages('ape') 251 | ``` 252 | 253 | ## Citations 254 | 255 | Grabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L, Raychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F, Birren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A. (2011). Full-length transcriptome assembly from RNA-seq data without a reference genome. **_Nat. 
Biotechnol._** [doi: 10.1038/nbt.1883](http://www.ncbi.nlm.nih.gov/pubmed/21572440) 256 | 257 | Haas BJ, Papanicolaou A, Yassour M, Grabherr M, Blood PD, Bowden J, Couger MB, Eccles D, Li B, Lieber M, Macmanes MD, Ott M, Orvis J, Pochet N, Strozzi F, Weeks N, Westerman R, William T, Dewey CN, Henschel R, Leduc RD, Friedman N, Regev A. (2013). De novo transcript sequence reconstruction from RNA-Seq: reference generation and analysis with Trinity. **_Nat. Protoc._** [doi: 10.1038/nprot.2013.084](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3875132/) 258 | 259 | [Trinity documentation](https://github.com/trinityrnaseq/trinityrnaseq/wiki) 260 | 261 | ## Licence 262 | 263 | The rna-seq_denovo_workflow is licensed under the GPL3 license. See the LICENCE file for more details. 264 | --------------------------------------------------------------------------------