├── .gitignore ├── 00_scripts ├── .gitignore ├── 00_import_trinity.sh ├── 01_trimmomatic_pe.sh ├── 01_trimmomatic_se.sh ├── 02_merge.sh ├── 03_assemble.sh ├── 04_assembly_stats.sh ├── 05_prepare_ref.sh ├── 06_transcripts_abundance.sh ├── 07_build_matrix.sh ├── 07_htseq_count.sh ├── 08_de_analysis.sh ├── 09_expression_clustering.sh ├── 10_extract_de_clustering.sh ├── 11_extract_go_per_gene.sh ├── 12_goseq.sh ├── datarmor_jobs │ └── 01_trimmomatic_pe_jobs.sh └── utility_scripts │ ├── assessing_read_content.sh │ ├── blastp.sh │ ├── compare_replicates_qc.sh │ ├── corset.sh │ ├── deprecated │ ├── 07_build_matrix.sh │ ├── 08_de_analysis.sh │ ├── 09_expression_clustering.sh │ ├── 10_extract_de_clustering.sh │ ├── 11_extract_go_per_gene.sh │ └── 12_goseq.sh │ ├── fastqc.sh │ ├── htseq_count.sh │ ├── insilico_normalization.sh │ ├── pfam.sh │ ├── prepare_info_file.sh │ ├── prepare_jobs_header.sh │ ├── transdecoder_getorf.sh │ └── transdecoder_predict.sh ├── 01_info_files ├── .gitignore ├── example_pair_comparison.txt ├── example_sample_replicates.txt └── example_samples_info.txt ├── 02_data └── .gitignore ├── 03_trimmed └── .gitignore ├── 04_merged └── .gitignore ├── 06_assembly_stats └── .gitignore ├── 07_de_results ├── .gitignore └── rsem_DH-1MA.bowtie.sorted.bam.bai ├── 98_log_files └── .gitignore ├── 99_archive └── .gitignore └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | .Rhistory 3 | *.txt 4 | *transdecoder_dir/ 5 | 05_trinity_assembly_200/ 6 | 05_trinity_assembly/ 7 | chado_test 8 | -------------------------------------------------------------------------------- /00_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.gz 5 | trinity_utils/ 6 | trinotate_utils/ 7 | transdecoder_utils/ 8 | transrate_utils/ 9 | transvestigator_utils/ 10 | corset_utils/ 11 | 
-------------------------------------------------------------------------------- /00_scripts/00_import_trinity.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | cd $(pwd) 5 | 6 | #clone into trinity 7 | git clone https://github.com/trinityrnaseq/trinityrnaseq 00_scripts/trinity_utils 8 | 9 | # clone trinotate 10 | git clone https://github.com/Trinotate/Trinotate 00_scripts/trinotate_utils 11 | 12 | #clone transDecoder 13 | git clone https://github.com/TransDecoder/TransDecoder 00_scripts/transdecoder_utils 14 | 15 | # clone transvestigator 16 | git clone https://github.com/genomeannotation/transvestigator 00_scripts/transvestigator_utils 17 | 18 | #clone transrate 19 | git clone https://github.com/blahah/transrate 00_scripts/transrate_utils 20 | 21 | #clone corset 22 | git clone https://github.com/Oshlack/Corset 00_scripts/corset_utils 23 | 24 | -------------------------------------------------------------------------------- /00_scripts/01_trimmomatic_pe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -N trimmomatic__BASE__ 3 | #PBS -o trimmomatic__BASE__.out 4 | #PBS -l walltime=02:00:00 5 | #PBS -l mem=60g 6 | #####PBS -m ea 7 | #PBS -l ncpus=8 8 | #PBS -q omp 9 | #PBS -r n 10 | 11 | cd $PBS_O_WORKDIR 12 | 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | 21 | # Global variables 22 | 23 | ADAPTERFILE="univec.fasta" 24 | NCPU=8 25 | base=__BASE__ 26 | TRIMMOMATIC_JAR="trimmomatic-0.36.jar" 27 | 28 | java -Xmx40G -jar $TRIMMOMATIC_JAR PE \ 29 | -threads 8 \ 30 | -phred33 \ 31 | 02_data/"$base"_1.fastq.gz \ 32 | 02_data/"$base"_2.fastq.gz \ 33 | 03_trimmed/"$base"_R1.paired.fastq.gz \ 34 | 03_trimmed/"$base"_R1.single.fastq.gz \ 35 | 03_trimmed/"$base"_R2.paired.fastq.gz \ 36 | 03_trimmed/"$base"_R2.single.fastq.gz \ 
37 | ILLUMINACLIP:"$ADAPTERFILE":2:20:7 \ 38 | LEADING:20 \ 39 | TRAILING:20 \ 40 | SLIDINGWINDOW:30:30 \ 41 | MINLEN:40 2> 98_log_files/log.trimmomatic.pe."$TIMESTAMP" 42 | -------------------------------------------------------------------------------- /00_scripts/01_trimmomatic_se.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trim" 5 | #SBATCH -o log-trim_pe.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismini 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=type_your_mail@ulaval.ca 10 | #SBATCH --time=0-20:00 11 | #SBATCH --mem=50000 12 | 13 | cd $SLURM_SUBMIT_DIR 14 | 15 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 16 | SCRIPT=$0 17 | NAME=$(basename $0) 18 | LOG_FOLDER="98_log_files" 19 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 20 | 21 | 22 | #Global variables 23 | ADAPTERFILE="01_info_files/univec.fasta" 24 | TRIMMOMATIC_JAR="/prg/trimmomatic/0.36/trimmomatic-0.36.jar" 25 | 26 | for file in $(ls 02_data/*.f*q.gz|perl -pe 's/_R[12].f(ast)?q.gz//') 27 | do 28 | base=$(basename "$file") 29 | 30 | java -Xmx40G -jar $TRIMMOMATIC_JAR SE \ 31 | -phred33 \ 32 | -threads 8 \ 33 | 02_data/"$base"_R1.fastq.gz \ 34 | 03_trimmed/"$base"_R1.trimmed.fastq.gz \ 35 | ILLUMINACLIP:"$ADAPTERFILE":2:20:7 \ 36 | LEADING:20 \ 37 | TRAILING:20 \ 38 | SLIDINGWINDOW:30:30 \ 39 | MINLEN:60 40 | 41 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_trimmomatic_se.log 42 | -------------------------------------------------------------------------------- /00_scripts/02_merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="merge" 5 | #SBATCH -o log-merge.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=2-00:00 12 | #SBATCH --mem=10000 13 | 14 | <<<<<<< HEAD 15 | cd $SLURM_SUBMIT_DIR 16 | 
17 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 18 | SCRIPT=$0 19 | NAME=$(basename $0) 20 | LOG_FOLDER="98_log_files" 21 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 22 | 23 | INPUTFOLDER="03_trimmed" 24 | OUTPUTFOLDER="04_merged" 25 | 26 | cat "$INPUTFOLDER"/*_R1.paired.fastq.gz >> "$OUTPUTFOLDER"/all_reads.left.fastq 27 | 28 | cat "$INPUTFOLDER"/*_R2.paired.fastq.gz >> "$OUTPUTFOLDER"/all_reads.right.fastq 29 | 30 | ======= 31 | #move to present directory 32 | cd $(pwd) 33 | 34 | #cat all reads 35 | cat "$INPUTFOLDER"/*_R1.paired.fastq.gz > "$OUTPUTFOLDER"/all_reads.left.fastq.gz 36 | 37 | cat "$INPUTFOLDER"/*_R2.paired.fastq.gz > "$OUTPUTFOLDER"/all_reads.right.fastq.gz 38 | >>>>>>> 8629e02ffad09c4df82ffb278128308c999cbded 39 | -------------------------------------------------------------------------------- /00_scripts/03_assemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="assemble" 5 | #SBATCH -o log-assemble.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibis2 8 | #SBATCH -A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=10-00:00 12 | #SBATCH --mem=220000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | #Global variables 23 | READSLEFT="04_merged/all_left.fq" 24 | READSRIGHT="04_merged/all_right.fq" 25 | #READSSINGLE="03_trimmed/*.trimmed.fastq.gz" 26 | 27 | #Trinity variables 28 | ##Required 29 | seqtype="--seqType fq" # type of reads: ( fa, or fq ) 30 | 31 | mem="--max_memory 200G" # suggested max memory to use by Trinity where limiting can be enabled. (jellyfish, sorting, etc) 32 | # provied in Gb of RAM, ie. 
'--max_memory 10G' 33 | #paired reads: 34 | left="--left $READSLEFT" #left reads, one or more file names (separated by commas, no spaces) 35 | right="--right $READSRIGHT" #right reads, one or more file names (separated by commas, no spaces) 36 | #single-end: 37 | #single="--single $READSSINGLE" #single reads, one or more file names, comma-delimited 38 | #(note, if single file contains pairs, can use flag: --run_as_paired ) 39 | ##Optionnal 40 | #strand="--SS_lib_type " #Strand-specific RNA-Seq read orientation. 41 | # if paired: RF or FR, 42 | #if single: F or R. (dUTP method = RF) 43 | #See web documentation. 44 | cpu="--CPU 8" #number of CPUs to use, default: 2 45 | mincontiglength="--min_contig_length 200" #minimum assembled contig length to report 46 | #(def=200) 47 | #corlongread="--long_reads " #fasta file containing error-corrected or circular consensus (CCS) pac bio reads 48 | #genomeguided="--genome_guided_bam " #genome guided mode, provide path to coordinate-sorted bam file. 49 | #(see genome-guided param section under --show_full_usage_info) 50 | #jaccard="--jaccard_clip" #:option, set if you have paired reads and 51 | #you expect high gene density with UTR 52 | #overlap (use FASTQ input file format 53 | #for reads). 54 | #(note: jaccard_clip is an expensive 55 | #operation, so avoid using it unless 56 | #necessary due to finding excessive fusion 57 | #transcripts w/o it.) 58 | #trim="--trimmomatic" #run Trimmomatic to quality trim reads 59 | #see '--quality_trimming_params' under full usage info for tailored settings. 60 | #normalize="--normalize_reads" #run in silico normalization of reads. Defaults to max. read coverage of 50. 61 | #see '--normalize_max_read_cov' under full usage info for tailored settings. 62 | #notphase2="--no_distributed_trinity_exec" #do not run Trinity phase 2 (assembly of partitioned reads), and stop after generating command list. 
63 | output="--output 05_trinity_assembly_200/" #name of directory for output (will be 64 | #created if it doesn't already exist) 65 | #default( your current working directory: "/home/leluyer/trinity_out_dir" 66 | #note: must include 'trinity' in the name as a safety precaution! ) 67 | #cleanup="--full_cleanup" #only retain the Trinity fasta file, rename as ${output_dir}.Trinity.fasta 68 | 69 | 70 | 00_scripts/trinity_utils/Trinity $seqtype $mem $left $right $single \ 71 | $strand $cpu $mincontiglength $corlongread \ 72 | $genomeguided $jaccard $normalize $notphase2 \ 73 | $output $cleanup 2>&1 | tee 98_log_files/"$TIMESTAMP"_trinityassembly.log 74 | 75 | 76 | 77 | # --cite :show the Trinity literature citation 78 | # 79 | # --verbose :provide additional job status info during the run. 80 | # 81 | # --version :reports Trinity version (v2.1.1) and exits. 82 | # 83 | # --show_full_usage_info :show the many many more options available for running Trinity (expert usage). 84 | # 85 | # 86 | ############################################################################### 87 | # 88 | # *Note, a typical Trinity command might be: 89 | # 90 | # Trinity --seqType fq --max_memory 50G --left reads_1.fq --right reads_2.fq --CPU 6 91 | # 92 | # 93 | # and for Genome-guided Trinity: 94 | # 95 | # Trinity --genome_guided_bam rnaseq_alignments.csorted.bam --max_memory 50G 96 | # --genome_guided_max_intron 10000 --CPU 6 97 | # 98 | # see: /software6/apps/trinityrnaseq/2.1.1_gcc/sample_data/test_Trinity_Assembly/ 99 | # for sample data and 'runMe.sh' for example Trinity execution 100 | # 101 | # For more details, visit: http://trinityrnaseq.github.io 102 | 103 | -------------------------------------------------------------------------------- /00_scripts/04_assembly_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="stats" 5 | #SBATCH -o log-stat.out 6 | #SBATCH -c 1 7 | #SBATCH -p 
ibis2 8 | #SBATCH -A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=00-10:00 12 | #SBATCH --mem=2000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | 17 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 18 | SCRIPT=$0 19 | NAME=$(basename $0) 20 | LOG_FOLDER="98_log_files" 21 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 22 | 23 | 24 | #global variables 25 | INPUTFILE="05_trinity_assembly/Trinity.fasta" 26 | OUTPUTFILE="06_assembly_stats/results_stats.txt" 27 | 28 | #Check stats 29 | 00_scripts/trinity_utils/util/TrinityStats.pl "$INPUTFILE" > "$OUTPUTFILE" 2>&1 | tee 98_log_files/"$TIMESTAMP"_assemblystats.log 30 | 31 | -------------------------------------------------------------------------------- /00_scripts/05_prepare_ref.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="prep" 5 | #SBATCH -o log-prep.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=01-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #Global variables 24 | TRANSCRIPTOME="05_trinity_assembly/Trinity.filtered.fasta" 25 | 26 | ######################################################################### 27 | #Required 28 | 29 | trans="--transcripts $TRANSCRIPTOME" #transcript fasta file 30 | seq="--seqType fq" #fq|fa 31 | 32 | #single="--single $READSSINGLE" 33 | meth="--est_method RSEM" #abundance estimation method. 
34 | #alignment_based: RSEM|eXpress 35 | #alignment_free: kallisto|salmon 36 | output="--output_dir 06_assembly_stats" #write all files to output directory 37 | 38 | # if alignment_based est_method: 39 | alnmeth="--aln_method bowtie" #bowtie|bowtie2|(path to bam file) alignment method. (note: RSEM requires bowtie) 40 | #(if you already have a bam file, you can use it here instead of rerunning bowtie) 41 | #(note, no strand-specific mode for kallisto) 42 | cpu="--thread_count 8" #number of threads to use (default = 4) 43 | #debug="--debug" #retain intermediate files 44 | #genetrans="--gene_trans_map " #file containing 'gene(tab)transcript' identifiers per line. 45 | # or 46 | trinmode="--trinity_mode" #Setting --trinity_mode will automatically generate the gene_trans_map and use it. 47 | 48 | prepref="--prep_reference" #prep reference (builds target index) 49 | outpref="--output_prefix ref_bowtie" #prefix for output files. Defaults to --est_method setting. 50 | 51 | ######################################## 52 | # Parameters for single-end reads: 53 | # 54 | #fraglength="--fragment_length 200" #specify RNA-Seq fragment length (default: 200) 55 | #frgstd=" --fragment_std 80" #fragment length standard deviation (defalt: 80) 56 | ######################################## 57 | # bowtie-related parameters: (note, tool-specific settings are further below) 58 | 59 | #maxins="--max_ins_size 800" #maximum insert size (bowtie -X parameter, default: 800) 60 | #coord="--coordsort_bam" #provide coord-sorted bam in addition to the default (unsorted) bam. 
61 | ######################################## 62 | # RSEM opts: 63 | #bowtie_rsem="--bowtie_RSEM " #if using 'bowtie', default: "--all --best --strata -m 300 --chunkmbs 512" 64 | #bowtie2_rsem="--bowtie2_RSEM " #if using 'bowtie2', default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 65 | #include_rsem_bam="--include_rsem_bam" # provide the RSEM enhanced bam file including posterior probabilities of read assignments. 66 | #rsem_opt="--rsem_add_opts " #additional parameters to pass on to rsem-calculate-expression 67 | ########################################################################## 68 | # eXpress opts: 69 | # --bowtie_eXpress default: "--all --best --strata -m 300 --chunkmbs 512" 70 | # --bowtie2_eXpress default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 71 | # --eXpress_add_opts default: "" 72 | 73 | ########################################################################## 74 | # kallisto opts: 75 | # --kallisto_add_opts default: 76 | ########################################################################## 77 | # salmon opts: 78 | # --salmon_idx_type quasi|fmd (defalt: quasi) 79 | # --salmon_add_opts default: 80 | 81 | 82 | #run reference preparation 83 | 00_scripts/trinity_utils/util/align_and_estimate_abundance.pl $trans $meth $alnmeth $trinmode $outpref $prepref $output 2>&1 | tee 98_log_files/"$TIMESTAMP"_prepref.log 84 | 85 | #00_scripts/trinity_utils/util/align_and_estimate_abundance.pl --transcripts Trinity.fasta --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference 86 | #note: Not all the commands have been integrated to data 87 | -------------------------------------------------------------------------------- /00_scripts/06_transcripts_abundance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_abundance" 5 | #SBATCH -o log-trans_abundance.out 6 | #SBATCH -c 8 7 | #SBATCH -p ibismax 8 | 
#SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | for file in $(ls 03_trimmed/*.paired.f*q.gz|perl -pe 's/_R[12].paired.fastq.gz//') 24 | 25 | do 26 | sample=$(basename "$file") 27 | 28 | #Global variables 29 | TRANSCRIPTOME="05_trinity_assembly/Trinity.filtered.fasta" 30 | READSLEFT="03_trimmed/"$sample"_R1.paired.fastq.gz" 31 | READSRIGHT="03_trimmed/"$sample"_R2.paired.fastq.gz" 32 | #READSSINGLE="03_trimmed/*_R1.trimmed.fastq.gz" 33 | 34 | ######################################################################### 35 | #Required 36 | 37 | trans="--transcripts $TRANSCRIPTOME" #transcript fasta file 38 | seq="--seqType fq" #fq|fa 39 | 40 | # If Paired-end: 41 | left="--left $READSLEFT" 42 | right="--right $READSRIGHT" 43 | 44 | # or Single-end: 45 | #single="--single $READSSINGLE" 46 | meth="--est_method RSEM" #abundance estimation method. 47 | #alignment_based: RSEM|eXpress 48 | #alignment_free: kallisto|salmon 49 | output="--output_dir 07_de_results" #write all files to output directory 50 | 51 | # if alignment_based est_method: 52 | alnmeth="--aln_method bowtie" #bowtie|bowtie2|(path to bam file) alignment method. (note: RSEM requires bowtie) 53 | #(if you already have a bam file, you can use it here instead of rerunning bowtie) 54 | # Optional: 55 | #strand="--SS_lib_type " #strand-specific library type: paired('RF' or 'FR'), single('F' or 'R'). 56 | #(note, no strand-specific mode for kallisto) 57 | cpu="--thread_count 8" #number of threads to use (default = 4) 58 | #debug="--debug" #retain intermediate files 59 | #genetrans="--gene_trans_map " #file containing 'gene(tab)transcript' identifiers per line. 
60 | # or 61 | trinmode="--trinity_mode" #Setting --trinity_mode will automatically generate the gene_trans_map and use it. 62 | 63 | #prepref="--prep_reference" #prep reference (builds target index) 64 | outpref="--output_prefix rsem_"$sample"" #prefix for output files. Defaults to --est_method setting. 65 | #outpref="--output_prefix "$sample"" #prefix for output files. Defaults to --est_method setting. 66 | 67 | ######################################## 68 | # Parameters for single-end reads: 69 | # 70 | #fraglength="--fragment_length 200" #specify RNA-Seq fragment length (default: 200) 71 | #frgstd=" --fragment_std 80" #fragment length standard deviation (defalt: 80) 72 | ######################################## 73 | # bowtie-related parameters: (note, tool-specific settings are further below) 74 | 75 | #maxins="--max_ins_size 800" #maximum insert size (bowtie -X parameter, default: 800) 76 | #coord="--coordsort_bam" #provide coord-sorted bam in addition to the default (unsorted) bam. 77 | ######################################## 78 | # RSEM opts: 79 | #bowtie_rsem="--bowtie_RSEM " #if using 'bowtie', default: "--all --best --strata -m 300 --chunkmbs 512" 80 | #bowtie2_rsem="--bowtie2_RSEM " #if using 'bowtie2', default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 81 | #include_rsem_bam="--include_rsem_bam" # provide the RSEM enhanced bam file including posterior probabilities of read assignments. 
82 | #rsem_opt="--rsem_add_opts " #additional parameters to pass on to rsem-calculate-expression 83 | ########################################################################## 84 | # eXpress opts: 85 | # --bowtie_eXpress default: "--all --best --strata -m 300 --chunkmbs 512" 86 | # --bowtie2_eXpress default: "--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 " 87 | # --eXpress_add_opts default: "" 88 | 89 | ########################################################################## 90 | # kallisto opts: 91 | # --kallisto_add_opts default: 92 | ########################################################################## 93 | # salmon opts: 94 | # --salmon_idx_type quasi|fmd (defalt: quasi) 95 | # --salmon_add_opts default: 96 | 97 | 98 | #Align 99 | 00_scripts/trinity_utils/util/align_and_estimate_abundance.pl $trans $seq $single $left $right $meth $output $trinmode $alnmeth $strand $cpu $outpref $maxins $coord $bowtie_rsem $bowtie2_rsem $include_rsem_bam $rsem_opt 100 | 101 | 102 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_align.log 103 | #note: Not all the commands have been integrated to data 104 | -------------------------------------------------------------------------------- /00_scripts/07_build_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_matrix" 5 | #SBATCH -o log-matrix.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | ls 07_de_results/*.genes.results >01_info_files/list.results.txt 24 | 25 | #Required: 26 | meth="--est_method RSEM" #RSEM|eXpress|kallisto (needs to know what format 
to expect) 27 | 28 | # Options: 29 | norm="--cross_sample_norm none" #TMM|UpperQuartile|none (default: TMM) 30 | #name_dir="--name_sample_by_basedir" #name sample column by dirname instead of filename 31 | # base_dir="--basedir_index -2" #default(-2) 32 | 33 | out_pref="--out_prefix matrix.nonorm" #default: 'matrix' 34 | listfile="--samples_file 01_info_files/list.results.txt" #rsem results 35 | #run estimate to matrix 36 | 00_scripts/trinity_utils/util/abundance_estimates_to_matrix.pl $meth $norm \ 37 | 07_de_results/rsem_DH-1MA.genes.results \ 38 | 07_de_results/rsem_DH-4MA.genes.results \ 39 | 07_de_results/rsem_DH-5MA.genes.results \ 40 | 07_de_results/rsem_DL-4MA.genes.results \ 41 | 07_de_results/rsem_DL-5MA.genes.results \ 42 | 07_de_results/rsem_DL-6MA.genes.results \ 43 | 07_de_results/rsem_WH-1MA.genes.results \ 44 | 07_de_results/rsem_WH-2MA.genes.results \ 45 | 07_de_results/rsem_WH-3MA.genes.results \ 46 | 07_de_results/rsem_WL-1MA.genes.results \ 47 | 07_de_results/rsem_WL-3MA.genes.results \ 48 | 07_de_results/rsem_WL-6MA.genes.results \ 49 | $name_dir $base_dir $out_pref 2>&1 | tee 98_log_files/"$TIMESTAMP"_matrix.log 50 | -------------------------------------------------------------------------------- /00_scripts/07_htseq_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="htseq" 5 | #SBATCH -o log-htseq.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #Global variables 24 | DATAINPUT="07_de_results" 25 | DATAOUTPUT="07_de_results" 26 | GFF_FOLDER="01_info_files" 27 | 
GFF_FILE="transcriptome.gff3" 28 | 29 | 30 | #sort bam files 31 | for i in $(ls 07_de_results/*.bam|sed 's/.bam//g'|sort -u) 32 | do 33 | samtools sort "$i".bam "$i".sorted 34 | samtools index "$i".sorted.bam 35 | done 36 | 37 | #create gff3 file 38 | # import function 39 | git clone https://github.com/scottcain/chado_test 40 | 41 | chado_test/chado/bin/gmod_fasta2gff3.pl --fasta_dir 05_trinity_assembly/Trinity.filtered.fasta --gfffilename "$GFF_FOLDER"/"$GFF_FILE" --nosequence --type CDS 42 | 43 | # launch htseqcount 44 | for i in $(ls 07_de_results/*sorted.bam) 45 | do 46 | base="$(basename $i)" 47 | 48 | htseq-count -f bam -s no -t CDS -r pos -i Name "$DATAINPUT"/"$base" "$GFF_FOLDER"/"$GFF_FILE" >> "$DATAOUTPUT"/htseq-count_"$base".txt 49 | 50 | done 2>&1 | tee 98_log_files/"$TIMESTAMP"_htseq.log 51 | -------------------------------------------------------------------------------- /00_scripts/08_de_analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N de_analysis 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | PAIR_COMP="01_info_files/pair_comparison.txt" 22 | MATRIX="/path/to/matrix/files" 23 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 24 | 25 | #Trinity variable 26 | #Required: 27 | 28 | matrix="--matrix $MATRIX" #matrix of raw read counts (not normalized!) 29 | 30 | method="--method DEseq2" # edgeR|DESeq2|voom|ROTS 31 | # note: you should have biological replicates. 32 | # edgeR will support having no bio replicates with 33 | # a fixed dispersion setting. 
34 | # Optional: 35 | sample_rep="--samples_file $SAMPLE_REPLICATE" #tab-delimited text file indicating biological replicate relationships. 36 | #ex. 37 | # cond_A cond_A_rep1 38 | # cond_A cond_A_rep2 39 | # cond_B cond_B_rep1 40 | # cond_B cond_B_rep2 41 | # General options: 42 | min_row_count="--min_rowSum_counts 2" #default: 2 (only those rows of matrix meeting requirement will be tested) 43 | output="--output 07_de_results/"$method"_out_dir" #name of directory to place outputs (default: $method.$pid.dir) 44 | ref_sample="--reference_sample " # name of a sample to which all other samples should be compared. 45 | # (default is doing all pairwise-comparisons among samples) 46 | contrasts="--contrasts $PAIR_COMP" # file (tab-delimited) containing the pairs of sample comparisons to perform. 47 | # ex. 48 | # cond_A cond_B 49 | # cond_Y cond_Z 50 | ## EdgeR-related parameters 51 | ## (no biological replicates) 52 | disp="--dispersion " # edgeR dispersion value (Read edgeR manual to guide your value choice) 53 | ## ROTS parameters 54 | rots_b="--ROTS_B 500" # number of bootstraps and permutation resampling (default: 500) 55 | rots_k="--ROTS_K 5000" # largest top genes size (default: 5000) 56 | 57 | #create variable for log file name 58 | METH=$(echo $method|sed 's/--method //g') 59 | 60 | # Run DE analysis 61 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/run_DE_analysis.pl $matrix $PAIR_COMP $method \ 62 | $sample_rep $min_row_count $output \ 63 | $ref_sample $contrasts $disp $rots_b $rots_k 2>&1 | tee 98_log_files/"$TIMESTAMP"_de_"$METH".log 64 | 65 | ############################################################################################### 66 | # 67 | # Documentation and manuals for various DE methods. Please read for more advanced and more 68 | # fine-tuned DE analysis than provided by this helper script. 
69 | # 70 | # edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html 71 | # DESeq2: http://bioconductor.org/packages/release/bioc/html/DESeq2.html 72 | # voom/limma: http://bioconductor.org/packages/release/bioc/html/limma.html 73 | # ROTS: http://www.btk.fi/research/research-groups/elo/software/rots/ 74 | # 75 | ############################################################################################### 76 | -------------------------------------------------------------------------------- /00_scripts/09_expression_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | R_DATA="/path/to/R.data" 22 | 23 | # Trinity variables 24 | k_cluster="-K " # define K clusters via k-means algorithm 25 | 26 | #or, cut the hierarchical tree: 27 | 28 | #k_tree="--Ktree " #cut tree into K clusters 29 | 30 | p_tree="--Ptree " #cut tree based on this percent of max(height) of tree 31 | 32 | r_data="-R &1 | tee 98_log_files/"$TIMESTAMP"_cluster.log 43 | -------------------------------------------------------------------------------- /00_scripts/10_extract_de_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster_de 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job 
submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | MATRIX="/path/to/matrix" 22 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 23 | 24 | # Trinity variable 25 | matrix="--matrix $MATRIX" #TMM.EXPR.matrix 26 | 27 | # Optional: 28 | p_value="-P 0.001" #p-value cutoff for FDR (default: 0.001) 29 | min_log2FC="-C 2" #min abs(log2(a/b)) fold change (default: 2 (meaning 2^(2) or 4-fold). 30 | output="--output " #prefix for output file (default: "diffExpr.P${Pvalue}_C${C}) 31 | 32 | # Misc: 33 | sample_replicate="--samples $SAMPLE_REPLICATE" # sample-to-replicate mappings (provided to run_DE_analysis.pl) 34 | max_de_genes="--max_DE_genes_per_comparison 100" # extract only up to the top number of DE features within each pairwise comparison. 35 | # This is useful when you have massive numbers of DE features but still want to make 36 | # useful heatmaps and other plots with more manageable numbers of data points. 37 | 38 | order_column="--order_columns_by_samples_file" # instead of clustering samples or replicates hierarchically based on gene expression patterns, 39 | # order columns according to order in the --samples file. 
40 | max_gene_clust="--max_genes_clust 10000" # default: 10000 (if more than that, heatmaps are not generated, since too time consuming) 41 | 42 | #go_enrich="--examine_GO_enrichment" #run GO enrichment analysis 43 | #go_annot="-GO_annots " # GO annotations file 44 | #gene_len="--gene_lengths float" #lengths of genes file 45 | 46 | # run clustering 47 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/analyze_diff_expr.pl $matrix $p_value $min_log2FC \ 48 | $ouput $sample_replicate $max_de_genes $order_column \ 49 | $max_gene_clust $go_enrich $go_annot $gene_len 2>&1 | tee 98_log_files/"$TIMESTAMP"_cluster_de.log 50 | -------------------------------------------------------------------------------- /00_scripts/11_extract_go_per_gene.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N go_assign 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | TRINOTATE_FILE="01_info_files/trinotate.xls" 22 | 23 | #Trinity global 24 | # Required: 25 | 26 | trinotate_file="--Trinotate_xls $TRINOTATE_FILE" # Trinotate.xls file. 27 | 28 | gene_mode="--gene" 29 | or 30 | trans_mode="--trans" #gene or transcript-mode 31 | 32 | # Optional: 33 | ances_terms="--include_ancestral_terms" # climbs the GO DAG, and incorporates 34 | # all parent terms for an assignment. 
35 | 36 | 37 | # run clustering 38 | 00_scripts/trinotate_utils/util/extract_GO_assignments_from_Trinotate_xls.pl \ 39 | $gene_mode $trans_mode $ances_terms > 07_de_resuls/go_annotations.txt 40 | -------------------------------------------------------------------------------- /00_scripts/12_goseq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N goseq 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | GO_ASSIGN="07_de_resuls/go_annotations.txt" 22 | FACTOR_LAB="01_info_files/factor_labeling.txt" 23 | LIST_GEN="path/to/list1of/gene/to/test" 24 | GEN_LENGTH="path/to/file/length/gene" 25 | 26 | /Analysis/DifferentialExpression/run_GOseq.pl \ 27 | --factor_labeling factor_labeling.txt \ 28 | --GO_assignments go_annotations.txt \ 29 | --lengths gene.lengths.txt 30 | #Trinity global 31 | # Required: 32 | ############################################################################################### 33 | # 34 | fact_label="--factor_labeling $FACTOR_LAB" #tab delimited file with format: factorfeature_id 35 | # or 36 | gen_single_fact="--genes_single_factor $LIST_GEN" #list of genes to test (can be a matrix, only the first column is used for gene IDs) 37 | 38 | go_assign="--GO_assignments $GO_ASSIGN" #extracted GO assignments with format: feature_id GO:000001,GO:00002,... 

len="--lengths $GEN_LENGTH" # feature lengths with format: feature_id <tab> length
                            # FIX: the trailing text was not commented out, so the
                            # shell tried to run "feature" as a command (with len=...
                            # only in its environment, leaving $len unset below).

###############################################################################################



# run GOseq
00_scripts/trinity_utils/Analysis/DifferentialExpression/run_GOseq.pl \
    $fact_label $gen_single_fact $go_assign $len 2>&1 | tee 98_log_files/"$TIMESTAMP"_goseq.log
--------------------------------------------------------------------------------
/00_scripts/datarmor_jobs/01_trimmomatic_pe_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Generate one trimmomatic job script per fastq pair, submit them all,
# then remove the generated job files.
for file in $(ls 02_data/*.f*q.gz|perl -pe 's/_[12].f(ast)?q.gz//'|sort -u)
do
    base=$(basename "$file")
    # FIX: substitute with sed directly instead of building a command string
    # and eval'ing it (same output, no eval on interpolated data).
    sed "s/__BASE__/$base/g" 00_scripts/01_trimmomatic_pe.sh > 00_scripts/datarmor_jobs/TRIM_"$base".sh
done


#change jobs header

#Submit jobs
for i in $(ls 00_scripts/datarmor_jobs/TRIM*sh); do qsub $i; done

# Clean up
rm 00_scripts/datarmor_jobs/TRIM*sh
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/assessing_read_content.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N assess_read_representation
#$ -M userID
#$ -m beas
#$ -pe smp 8
#$ -l h_vmem=60G
#$ -l h_rt=20:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR



#Global variables
TRANSCRIPTOME="05_trinity_assembly/Trinity.fasta"
READSLEFT="04_merged/*.left.fastq.gz"
READSRIGHT="04_merged/*.right.fastq.gz"
#READSSINGLE="03_trimmed/*.left.fastq.gz"

#########################################################################
#Required

# If Paired-end:
left="--left $READSLEFT"
right="--right
$READSRIGHT" 28 | 29 | # or Single-end: 30 | #single="--single $READSSINGLE" 31 | 32 | target="--target $TRANSCRIPTOME" #multi-fasta file containing the target sequences (should be named {refName}.fa ) 33 | seq="--seqType fq" #fa | fq (fastA or fastQ format) 34 | aligner="--aligner bowtie" #bowtie, bowtie2 35 | 36 | # Optional: 37 | #strand="--SS_lib_type " # strand-specific library type: single: F or R paired: FR or RF 38 | # 3 examples: single RNA-Ligation method: F 39 | # single dUTP method: R 40 | # paired dUTP method: RF 41 | output="--output 06_assembly_stats/assess_read_count_out" #output directory (default ${aligner}_out) 42 | 43 | 44 | tophits="--num_top_hits 20" #(default: 20) 45 | 46 | #intermediate="--retain_intermediate_files" #retain all the intermediate sam files produced (they take up lots of space! and there's lots of them) 47 | prep_rsem="--prep_rsem" # prep the rsem-ready files 48 | run_rsem="--run_rsem" # execute rsem (implies --prep_rsem) 49 | trinmode="--trinity_mode" #extract gene/trans mapping info from Trinity.fasta file directly 50 | #trans_map="--gene_trans_map " #rsem gene-to-transcript mapping file to use. 51 | max_dist="--max_dist_between_pairs 2000" #default (2000) 52 | #just_prep="--just_prep_build" #just prepare the bowtie-build and stop. 
53 | 54 | 55 | 56 | #assess read count in assembly 57 | 00_scripts/trinity_utils/util/bowtie_PE_separate_then_join.pl $target $seq $single $left $right \ 58 | $output $aligner $trinmode \ 59 | $strand $tophits $intermediate $prep_rsem \ 60 | $run_rsem $trans_map $max_dist $just_prep 2>&1 | tee 98_log_files/"$TIMESTAMP"_assess_read_count.log 61 | 62 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/blastp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="blastp" 5 | #SBATCH -o log-blastp.out 6 | #SBATCH -c 5 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=50000 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | #Global variables 21 | INPUT="longest_orfs.pep" 22 | DATAFOLDER="Trinity_cleaned.fasta.transdecoder_dir" 23 | UNIPROT="/biodata/bio_sequences/proteins/uniprot/current/uniprot_sprot.fasta" 24 | DATAFOLDEROUT="07_de_results" 25 | OUTPUT="blastp.outfmt6" 26 | 27 | 28 | cat "$DATAFOLDER"/"$INPUT" | parallel -j 5 -k --block 10k --recstart '>' --pipe blastp -db "$UNIPROT" -query - -outfmt 6 -max_target_seqs 1 -evalue 1e-6 > "$DATAFOLDEROUT"/"$OUTPUT" 29 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/compare_replicates_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N compare_duplicates 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 8 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | #Move to job submission directory 12 | cd $SGE_O_WORKDIR 13 | 14 | 15 | ####################### 16 | # Inputs and 
Outputs: # 17 | ####################### 18 | # 19 | # --matrix matrix.RAW.normalized.FPKM 20 | # 21 | # Optional: 22 | # 23 | # Sample groupings: 24 | # 25 | # --samples tab-delimited text file indicating biological replicate relationships. 26 | # ex. 27 | # cond_A cond_A_rep1 28 | # cond_A cond_A_rep2 29 | # cond_B cond_B_rep1 30 | # cond_B cond_B_rep2 31 | # 32 | # --gene_factors tab-delimited file containing gene-to-factor relationships. 33 | # ex. 34 | # liver_enriched gene1 35 | # heart_enriched gene2 36 | # ... 37 | # (use of this data in plotting is noted for corresponding plotting options) 38 | # 39 | # 40 | # --output prefix for output file (default: "${matrix_file}.heatmap") 41 | # 42 | # --save save R session (as .RData file) 43 | # --no_reuse do not reuse any existing .RData file on initial loading 44 | # 45 | ##################### 46 | # Plotting Actions # 47 | ##################### 48 | # 49 | # --compare_replicates provide scatter, MA, QQ, and correlation plots to compare replicates. 50 | # 51 | # 52 | # 53 | # --barplot_sum_counts generate a barplot that sums frag counts per replicate across all samples. 54 | # 55 | # --boxplot_log2_dist generate a boxplot showing the log2 dist of counts where counts >= min fpkm 56 | # 57 | # --sample_cor_matrix generate a sample correlation matrix plot 58 | # --sample_cor_scale_limits ex. "-0.2,0.6" 59 | # --sample_cor_sum_gene_factor_expr instead of plotting the correlation value, plot the sum of expr according to gene factor 60 | # requires --gene_factors 61 | # 62 | # --sample_cor_subset_matrix plot the sample correlation matrix, but create a disjoint set for rows,cols. 63 | # The subset of the samples to provide as the columns is provided as parameter. 
64 | # 65 | # --gene_cor_matrix generate a gene-level correlation matrix plot 66 | # 67 | # --indiv_gene_cor generate a correlation matrix and heatmaps for '--top_cor_gene_count' to specified genes (comma-delimited list) 68 | # --top_cor_gene_count (requires '--indiv_gene_cor with gene identifier specified') 69 | # --min_gene_cor_val (requires '--indiv_gene_cor with gene identifier specified') 70 | # 71 | # --heatmap genes vs. samples heatmap plot 72 | # --heatmap_scale_limits "" cap scale intensity to low,high (ie. "-5,5") 73 | # --heatmap_colorscheme default is 'purple,black,yellow' 74 | # a popular alternative is 'green,black,red' 75 | # Specify a two-color gradient like so: "black,yellow". 76 | # 77 | # # sample (column) labeling order 78 | # --lexical_column_ordering order samples by column name lexical order. 79 | # --specified_column_ordering comma-delimited list of column names (must match matrix exactly!) 80 | # --order_columns_by_samples_file order the columns in the heatmap according to replicate name ordering in the samples file. 81 | # 82 | # # gene (row) labeling order 83 | # --order_by_gene_factor order the genes by their factor (given --gene_factors) 84 | # 85 | # --gene_heatmaps generate heatmaps for just one or more specified genes 86 | # Requires a comma-delimited list of gene identifiers. 87 | # Plots one heatmap containing all specified genes, then separate heatmaps for each gene. 88 | # if --gene_factors set, will include factor annotations as color panel. 89 | # else if --prin_comp set, will include include principal component color panel. 90 | # 91 | # --prin_comp generate principal components, include top components in heatmap 92 | # --add_prin_comp_heatmaps draw heatmaps for the top features at each end of the prin. comp. axis. 93 | # (requires '--prin_comp') 94 | # --add_top_loadings_pc_heatmap draw a heatmap containing the top feature loadings across all PCs. 95 | # 96 | # --mean_vs_sd expression variability plot. 
(highlight specific genes by category via --gene_factors ) 97 | # 98 | # --var_vs_count_hist create histogram of counts of samples having feature expressed within a given expression bin. 99 | # vartype can be any of 'sd|var|cv|fano' 100 | # --count_hist_num_bins number of bins to distribute counts in the histogram (default: 10) 101 | # --count_hist_max_expr maximum value for the expression histogram (default: max(data)) 102 | # --count_hist_convert_percentages convert the histogram counts to percentage values. 103 | # 104 | # 105 | # --per_gene_plots plot each gene as a separate expression plot (barplot or lineplot) 106 | # --per_gene_plot_width default: 2.5 107 | # --per_gene_plot_height default: 2.5 108 | # --per_gene_plots_per_row default: 1 109 | # --per_gene_plots_per_col default: 2 110 | # 111 | # 112 | ######################################################## 113 | # Data Filtering, in order of operation below: ######################################################### 114 | # 115 | # 116 | # --restrict_samples comma-delimited list of samples to restrict to (comma-delim list) 117 | # 118 | # --top_rows only include the top number of rows in the matrix, as ordered. 119 | # 120 | # --min_colSums min number of fragments, default: 0 121 | # 122 | # --min_rowSums min number of fragments, default: 0 123 | # 124 | # --gene_grep grep on string to restrict to genes 125 | # 126 | # 127 | # --min_expressed_genes minimum number of genes (rows) for a column (replicate) having at least '--min_gene_expr_val' 128 | # --min_gene_expr_val a gene must be at least this value expressed across all samples. (default: 0) 129 | # 130 | # --min_across_ALL_samples_gene_expr_val a gene must have this minimum expression value across ALL samples to be retained. 131 | # 132 | # --min_across_ANY_samples_gene_expr_val a gene must have at least this expression value across ANY single sample to be retained. 
133 | # 134 | # --minValAltNA minimum cell value after above transformations, otherwise convert to NA 135 | # 136 | # 137 | # 138 | # --top_genes use only the top number of most highly expressed transcripts 139 | # 140 | # --top_variable_genes Restrict to the those genes with highest coeff. of variability across samples (use median of replicates) 141 | # 142 | # --var_gene_method method for ranking top variable genes ( 'coeffvar|anova', default: 'anova' ) 143 | # --anova_maxFDR if anova chose, require FDR value <= anova_maxFDR (default: 0.05) 144 | # or 145 | # --anova_maxP if set, over-rides anova_maxQ (default, off, uses --anova_maxQ) 146 | # 147 | ###################################### 148 | # Data transformations: # 149 | ###################################### 150 | # 151 | # --CPM convert to counts per million (uses sum of totals before filtering) 152 | # 153 | # --binary all values > 0 are set to 1. All values < 0 are set to zero. 154 | # 155 | # --log2 156 | # 157 | # --center_rows subtract row mean from each data point. (only used under '--heatmap' ) 158 | # 159 | # --Zscale_rows Z-scale the values across the rows (genes) 160 | # 161 | ######################### 162 | # Clustering methods: # 163 | ######################### 164 | # 165 | # --gene_dist Setting used for --heatmap (samples vs. genes) 166 | # Options: euclidean, gene_cor 167 | # maximum, manhattan, canberra, binary, minkowski 168 | # (default: 'gene_cor') Note: if using 'gene_cor', set method using '--gene_cor' below. 169 | # 170 | # 171 | # --sample_dist Setting used for --heatmap (samples vs. genes) 172 | # Options: euclidean, gene_cor 173 | # maximum, manhattan, canberra, binary, minkowski 174 | # (default: 'sample_cor') Note: if using 'sample_cor', set method using '--sample_cor' below. 
175 | # 176 | # 177 | # --gene_clust ward, single, complete, average, mcquitty, median, centroid, none (default: complete) 178 | # --sample_clust ward, single, complete, average, mcquitty, median, centroid, none (default: complete) 179 | # 180 | # --gene_cor Options: pearson, spearman (default: pearson) 181 | # --sample_cor Options: pearson, spearman (default: pearson) 182 | # 183 | #################### 184 | # Image settings: # 185 | #################### 186 | # 187 | # 188 | # --pdf_width 189 | # --pdf_height 190 | # 191 | ################ 192 | # Misc. params # 193 | ################ 194 | # 195 | # --write_intermediate_data_tables writes out the data table after each transformation. 196 | # 197 | # --show_pipeline_flowchart describe order of events and exit. 198 | # 199 | #################################################################################### 200 | 201 | 202 | 203 | 204 | 205 | #assess read count in assembly 206 | 00_scripts/trinity_utils/util/bowtie_PE_separate_then_join.pl $target $seq $single $left $right \ 207 | $output $aligner $trinmode \ 208 | $strand $tophits $intermediate $prep_rsem \ 209 | $run_rsem $trans_map $max_dist $just_prep 2>&1 | tee 98_log_files/"$TIMESTAMP"_assess_read_count.log 210 | 211 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/corset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="corset" 5 | #SBATCH -o log-corset.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=01-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | #global variables 24 | 25 | 
#list_double="-d " #A comma separated list of distance thresholds. The range must be 26 | #between 0 and 1. e.g -d 0.4,0.5. If more than one distance threshold 27 | #is supplied, the output filenames will be of the form: 28 | #counts-.txt and clusters-.txt 29 | #Default: 0.3 30 | 31 | #log_lik_tresh="-D " #The value used for thresholding the log likelihood ratio. The default 32 | #value will depend on the number of degrees of freedom (which is the 33 | #number of groups -1). By default D = 17.5 + 2.5 * ndf, which corresponds 34 | #approximately to a p-value threshold of 10^-5, when there are fewer than 35 | #10 groups. 36 | 37 | mincov="-m 10" #Filter out any transcripts with fewer than this many reads aligning. 38 | #Default: 10 39 | #grouping="-g DH1MA,DH1MA,DH4BA,DH4MA,DH4MA,DH5BA,DH5MA,DH6BA,DL1BA,DL1BA,DL2BA,DL2BA,DL3BA,DL3BA,DL4MA,DL4MA,DL5MA,DL5MA,DL6MA,DL6MA,Small1A,Small1A,Small2A,Small2A,Small3A,Small3A,WH1BA,WH1MA,WH2BA,WH2BA,WH2MA,WH2MA,WH3BA" 40 | #groups. The parameter must be a comma separated list (no spaces), with the 41 | #groupings given in the same order as the bam filename. For example: 42 | #-g Group1,Group1,Group2,Group2 etc. If this option is not used, each sample 43 | #is treated as an independent experimental group. 44 | 45 | #outpref="-p " #Prefix for the output filenames. The output files will be of the form 46 | #-counts.txt and -clusters.txt. 
Default filenames are: 47 | #counts.txt and clusters.txt 48 | 49 | #outputover="-f &1 | tee 98_log_files/"$TIMESTAMP"_corset.log 71 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/07_build_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="trans_matrix" 5 | #SBATCH -o log-matrix.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismax 8 | #SBATCH -A ibismax 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=02-00:00 12 | #SBATCH --mem=50000 13 | 14 | cd $SLURM_SUBMIT_DIR 15 | 16 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 17 | SCRIPT=$0 18 | NAME=$(basename $0) 19 | LOG_FOLDER="98_log_files" 20 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 21 | 22 | 23 | ls 07_de_results/*.genes.results >01_info_files/list.results.txt 24 | 25 | #Required: 26 | meth="--est_method RSEM" #RSEM|eXpress|kallisto (needs to know what format to expect) 27 | 28 | # Options: 29 | norm="--cross_sample_norm none" #TMM|UpperQuartile|none (default: TMM) 30 | #name_dir="--name_sample_by_basedir" #name sample column by dirname instead of filename 31 | # base_dir="--basedir_index -2" #default(-2) 32 | 33 | out_pref="--out_prefix matrix.nonorm" #default: 'matrix' 34 | listfile="--samples_file 01_info_files/list.results.txt" #rsem results 35 | #run estimate to matrix 36 | 00_scripts/trinity_utils/util/abundance_estimates_to_matrix.pl $meth $norm \ 37 | 07_de_results/rsem_DH-1MA.genes.results \ 38 | 07_de_results/rsem_DH-4MA.genes.results \ 39 | 07_de_results/rsem_DH-5MA.genes.results \ 40 | 07_de_results/rsem_DL-4MA.genes.results \ 41 | 07_de_results/rsem_DL-5MA.genes.results \ 42 | 07_de_results/rsem_DL-6MA.genes.results \ 43 | 07_de_results/rsem_WH-1MA.genes.results \ 44 | 07_de_results/rsem_WH-2MA.genes.results \ 45 | 07_de_results/rsem_WH-3MA.genes.results \ 46 | 
07_de_results/rsem_WL-1MA.genes.results \ 47 | 07_de_results/rsem_WL-3MA.genes.results \ 48 | 07_de_results/rsem_WL-6MA.genes.results \ 49 | $name_dir $base_dir $out_pref 2>&1 | tee 98_log_files/"$TIMESTAMP"_matrix.log 50 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/08_de_analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N de_analysis 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | PAIR_COMP="01_info_files/pair_comparison.txt" 22 | MATRIX="/path/to/matrix/files" 23 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 24 | 25 | #Trinity variable 26 | #Required: 27 | 28 | matrix="--matrix $MATRIX" #matrix of raw read counts (not normalized!) 29 | 30 | method="--method DEseq2" # edgeR|DESeq2|voom|ROTS 31 | # note: you should have biological replicates. 32 | # edgeR will support having no bio replicates with 33 | # a fixed dispersion setting. 34 | # Optional: 35 | sample_rep="--samples_file $SAMPLE_REPLICATE" #tab-delimited text file indicating biological replicate relationships. 36 | #ex. 37 | # cond_A cond_A_rep1 38 | # cond_A cond_A_rep2 39 | # cond_B cond_B_rep1 40 | # cond_B cond_B_rep2 41 | # General options: 42 | min_row_count="--min_rowSum_counts 2" #default: 2 (only those rows of matrix meeting requirement will be tested) 43 | output="--output 07_de_results/"$method"_out_dir" #name of directory to place outputs (default: $method.$pid.dir) 44 | ref_sample="--reference_sample " # name of a sample to which all other samples should be compared. 
45 | # (default is doing all pairwise-comparisons among samples) 46 | contrasts="--contrasts $PAIR_COMP" # file (tab-delimited) containing the pairs of sample comparisons to perform. 47 | # ex. 48 | # cond_A cond_B 49 | # cond_Y cond_Z 50 | ## EdgeR-related parameters 51 | ## (no biological replicates) 52 | disp="--dispersion " # edgeR dispersion value (Read edgeR manual to guide your value choice) 53 | ## ROTS parameters 54 | rots_b="--ROTS_B 500" # number of bootstraps and permutation resampling (default: 500) 55 | rots_k="--ROTS_K 5000" # largest top genes size (default: 5000) 56 | 57 | #create variable for log file name 58 | METH=$(echo $method|sed 's/--method //g') 59 | 60 | # Run DE analysis 61 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/run_DE_analysis.pl $matrix $PAIR_COMP $method \ 62 | $sample_rep $min_row_count $output \ 63 | $ref_sample $contrasts $disp $rots_b $rots_k 2>&1 | tee 98_log_files/"$TIMESTAMP"_de_"$METH".log 64 | 65 | ############################################################################################### 66 | # 67 | # Documentation and manuals for various DE methods. Please read for more advanced and more 68 | # fine-tuned DE analysis than provided by this helper script. 
69 | # 70 | # edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html 71 | # DESeq2: http://bioconductor.org/packages/release/bioc/html/DESeq2.html 72 | # voom/limma: http://bioconductor.org/packages/release/bioc/html/limma.html 73 | # ROTS: http://www.btk.fi/research/research-groups/elo/software/rots/ 74 | # 75 | ############################################################################################### 76 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/09_expression_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | R_DATA="/path/to/R.data" 22 | 23 | # Trinity variables 24 | k_cluster="-K " # define K clusters via k-means algorithm 25 | 26 | #or, cut the hierarchical tree: 27 | 28 | #k_tree="--Ktree " #cut tree into K clusters 29 | 30 | p_tree="--Ptree " #cut tree based on this percent of max(height) of tree 31 | 32 | r_data="-R &1 | tee 98_log_files/"$TIMESTAMP"_cluster.log 43 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/10_extract_de_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N cluster_de 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT 
$LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | MATRIX="/path/to/matrix" 22 | SAMPLE_REPLICATE="01_info_files/sample_replicates.txt" 23 | 24 | # Trinity variable 25 | matrix="--matrix $MATRIX" #TMM.EXPR.matrix 26 | 27 | # Optional: 28 | p_value="-P 0.001" #p-value cutoff for FDR (default: 0.001) 29 | min_log2FC="-C 2" #min abs(log2(a/b)) fold change (default: 2 (meaning 2^(2) or 4-fold). 30 | output="--output " #prefix for output file (default: "diffExpr.P${Pvalue}_C${C}) 31 | 32 | # Misc: 33 | sample_replicate="--samples $SAMPLE_REPLICATE" # sample-to-replicate mappings (provided to run_DE_analysis.pl) 34 | max_de_genes="--max_DE_genes_per_comparison 100" # extract only up to the top number of DE features within each pairwise comparison. 35 | # This is useful when you have massive numbers of DE features but still want to make 36 | # useful heatmaps and other plots with more manageable numbers of data points. 37 | 38 | order_column="--order_columns_by_samples_file" # instead of clustering samples or replicates hierarchically based on gene expression patterns, 39 | # order columns according to order in the --samples file. 
40 | max_gene_clust="--max_genes_clust 10000" # default: 10000 (if more than that, heatmaps are not generated, since too time consuming) 41 | 42 | #go_enrich="--examine_GO_enrichment" #run GO enrichment analysis 43 | #go_annot="-GO_annots " # GO annotations file 44 | #gene_len="--gene_lengths float" #lengths of genes file 45 | 46 | # run clustering 47 | 00_scripts/trinity_utils/Analysis/DifferentialExpression/analyze_diff_expr.pl $matrix $p_value $min_log2FC \ 48 | $ouput $sample_replicate $max_de_genes $order_column \ 49 | $max_gene_clust $go_enrich $go_annot $gene_len 2>&1 | tee 98_log_files/"$TIMESTAMP"_cluster_de.log 50 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/11_extract_go_per_gene.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N go_assign 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | TRINOTATE_FILE="01_info_files/trinotate.xls" 22 | 23 | #Trinity global 24 | # Required: 25 | 26 | trinotate_file="--Trinotate_xls $TRINOTATE_FILE" # Trinotate.xls file. 27 | 28 | gene_mode="--gene" 29 | or 30 | trans_mode="--trans" #gene or transcript-mode 31 | 32 | # Optional: 33 | ances_terms="--include_ancestral_terms" # climbs the GO DAG, and incorporates 34 | # all parent terms for an assignment. 
35 | 36 | 37 | # run clustering 38 | 00_scripts/trinotate_utils/util/extract_GO_assignments_from_Trinotate_xls.pl \ 39 | $gene_mode $trans_mode $ances_terms > 07_de_resuls/go_annotations.txt 40 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/deprecated/12_goseq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N goseq 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=60G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Move to job submission directory 18 | cd $SGE_O_WORKDIR 19 | 20 | #Global variable 21 | GO_ASSIGN="07_de_resuls/go_annotations.txt" 22 | FACTOR_LAB="01_info_files/factor_labeling.txt" 23 | LIST_GEN="path/to/list1of/gene/to/test" 24 | GEN_LENGTH="path/to/file/length/gene" 25 | 26 | /Analysis/DifferentialExpression/run_GOseq.pl \ 27 | --factor_labeling factor_labeling.txt \ 28 | --GO_assignments go_annotations.txt \ 29 | --lengths gene.lengths.txt 30 | #Trinity global 31 | # Required: 32 | ############################################################################################### 33 | # 34 | fact_label="--factor_labeling $FACTOR_LAB" #tab delimited file with format: factorfeature_id 35 | # or 36 | gen_single_fact="--genes_single_factor $LIST_GEN" #list of genes to test (can be a matrix, only the first column is used for gene IDs) 37 | 38 | go_assign="--GO_assignments $GO_ASSIGN" #extracted GO assignments with format: feature_id GO:000001,GO:00002,... 

len="--lengths $GEN_LENGTH" # feature lengths with format: feature_id <tab> length
                            # FIX: comment marker was missing, so the shell tried
                            # to run "feature" as a command and $len stayed unset.

###############################################################################################



# run GOseq
00_scripts/trinity_utils/Analysis/DifferentialExpression/run_GOseq.pl \
    $fact_label $gen_single_fact $go_assign $len 2>&1 | tee 98_log_files/"$TIMESTAMP"_goseq.log
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/fastqc.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N fastqc
#$ -M userID
#$ -m beas
#$ -pe smp 1
#$ -l h_vmem=20G
#$ -l h_rt=10:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR

# Run FastQC on every gzipped fastq file in 02_data/, one output dir per file.
# FIXES: the glob "02_data/.*f*q.gz" only matched hidden files; the sed pattern
# "(ast)?" is not valid in basic regular expressions; the output directory was
# created as "fastqc_dir" but used as "fastqc.dir"; and the input file lost its
# "02_data/" prefix. Iterating over the files directly avoids all of this.
mkdir -p fastqc_dir
for file in 02_data/*.f*q.gz
do
    base=$(basename "$file")
    mkdir -p fastqc_dir/"$base".dir
    fastqc -o fastqc_dir/"$base".dir -f fastq "$file"
done
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/htseq_count.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH -D ./
#SBATCH --job-name="htseq"
#SBATCH -o log-htseq.out
#SBATCH -c 1
#SBATCH -p ibismax
#SBATCH -A ibismax
#SBATCH --mail-type=ALL
#SBATCH --mail-user=type_your_mail@ulaval.ca
#SBATCH --time=02-00:00
#SBATCH --mem=50000

cd $SLURM_SUBMIT_DIR

TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss)
SCRIPT=$0
NAME=$(basename $0)
LOG_FOLDER="98_log_files"
cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME"


#Global variables
DATAINPUT="07_de_results"
DATAOUTPUT="07_de_results"
GFF_FOLDER="01_info_files"
GFF_FILE="transcriptome.gff3"


#sort bam files
for i in $(ls
07_de_results/*.bam|sed 's/.bam//g'|sort -u)
do
    samtools sort "$i".bam "$i".sorted
    samtools index "$i".sorted.bam
done

#create gff3 file
# import function
git clone https://github.com/scottcain/chado_test

chado_test/chado/bin/gmod_fasta2gff3.pl --fasta_dir 05_trinity_assembly/Trinity.filtered.fasta --gfffilename "$GFF_FOLDER"/"$GFF_FILE" --nosequence --type CDS

# launch htseqcount
for i in $(ls 07_de_results/*sorted.bam)
do
    base="$(basename $i)"

    htseq-count -f bam -s no -t CDS -r pos -i Name "$DATAINPUT"/"$base" "$GFF_FOLDER"/"$GFF_FILE" >> "$DATAOUTPUT"/htseq-count_"$base".txt

done 2>&1 | tee 98_log_files/"$TIMESTAMP"_htseq.log
--------------------------------------------------------------------------------
/00_scripts/utility_scripts/insilico_normalization.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -N reads_normalization
#$ -M userID
#$ -m beas
#$ -pe smp 8
#$ -l h_vmem=60G
#$ -l h_rt=20:00:00
#$ -cwd
#$ -S /bin/bash

#Move to job submission directory
cd $SGE_O_WORKDIR

# FIX: TIMESTAMP is used in the log file name below but was never defined here.
TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss)

#Global variables
READSLEFT="04_merged/*.left.fastq.gz"
READSRIGHT="04_merged/*.right.fastq.gz"
#READSSINGLE="03_trimmed/*.left.fastq.gz"

#########################################################################
#Required

# If Paired-end:
left="--left $READSLEFT"
right="--right $READSRIGHT"

# or Single-end:
#single="--single $READSSINGLE"

seq="--seqType fq" #fa | fq (fastA or fastQ format)

mem="--JM 90G" #:(Jellyfish Memory) number of GB of system memory to use for
               #k-mer counting by jellyfish (eg. 10G) *include the 'G' char
#strand="--SS_lib_type " # strand-specific library type: single: F or R  paired: FR or RF
#                          3 examples: single RNA-Ligation method: F
#                                      single dUTP method: R
#                                      paired dUTP method: RF

#Or, if you have read collections in different files you can use 'list' files, where each line in a list
# file is the full path to an input file.  This saves you the time of combining them just so you can pass
# a single file for each direction.
# FIX: these were active with an empty value, which would pass a bare
# "--left_list"/"--right_list" flag that swallows the next argument;
# keep them commented until a list file path is filled in.
#left_list="--left_list " #left reads, one file path per line
#right_list="--right_list " #right reads, one file path per line


pairs="--pairs_together" #process paired reads by averaging stats between pairs and retaining linking info.

output="--output 10_normalized"

cpu="--CPU 2" #number of threads to use (default: = 2)
parallel_stat="--PARALLEL_STATS" #:generate read stats in parallel for paired reads

kmer="--KMER_SIZE 25" #default 25

max_pct="--max_pct_stdev 200" #maximum pct of mean for stdev of kmer coverage across read (default: 200)

nocleanup="--no_cleanup" #leave intermediate files
# FIX: was active with an empty value (bare "--tmp_dir_name" flag); the tool's
# default "tmp_normalized_reads" is used when this stays commented.
#tmp_dir="--tmp_dir_name " #default("tmp_normalized_reads")


#run normalization
# FIXES: the script path had a doubled "util/util/" component, and a stray
# leftover line from assessing_read_content.sh (running undefined variables)
# followed the pipeline; it has been removed.
00_scripts/trinity_utils/util/insilico_read_normalization.pl $seq $single $left $right \
    $output $mem $left_list $right_list \
    $pairs $cpu $parallel_stat $kmer $max_pct \
    $nocleanup $tmp_dir 2>&1 | tee 98_log_files/"$TIMESTAMP"_normalization.log

--------------------------------------------------------------------------------
/00_scripts/utility_scripts/pfam.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH -D ./
#SBATCH --job-name="pfam"
#SBATCH -o log-pfam.out
#SBATCH -c 4
#SBATCH -p ibis2
#SBATCH
-A ibis2 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=20000 13 | 14 | 15 | 16 | cd $SLURM_SUBMIT_DIR 17 | 18 | 19 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 20 | SCRIPT=$0 21 | NAME=$(basename $0) 22 | LOG_FOLDER="98_log_files" 23 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 24 | 25 | #Global variables 26 | INPUT="Trinity_cleaned.fasta.transdecoder_dir/longest_orfs.pep" 27 | PFAMDB="/home/jelel8/Databases/pfam/Pfam-A.hmm" 28 | OUTPUT="07_de_results/TrinotatePFAM.out" 29 | 30 | 31 | #prepare DB 32 | 33 | hmmpress $PFAMDB 34 | 35 | #run hmmer suite 36 | 37 | hmmscan --cpu 4 --domtblout $OUTPUT $PFAMDB $INPUT > pfam.log 38 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/prepare_info_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #TODO prepare info file 5 | # Example samples file layout (condition, replicate, left reads, right reads): 6 | # cond_A cond_A_rep1 A_rep1_left.fq A_rep1_right.fq 7 | # cond_A cond_A_rep2 A_rep2_left.fq A_rep2_right.fq 8 | # cond_B cond_B_rep1 B_rep1_left.fq B_rep1_right.fq 9 | # cond_B cond_B_rep2 B_rep2_left.fq B_rep2_right.fq 10 | # 11 | # # note, Trinity-specific parameter settings should be included in the samples_file like so: 12 | # # (only --max_memory is absolutely required, since defaults exist for the other settings) 13 | # --CPU=6 14 | # --max_memory=10G 15 | # --seqType=fq 16 | # --SS_lib_type=RF 17 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/prepare_jobs_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | #change information in job headers 6 | 7 | # usage prepare_jobs_header.sh userID userEmail 8 | 9 | ID=$1 10 | email=$2 11 | WORKDIR=$(pwd)  # do not reassign the shell's special PWD variable 12 | 13 | cd "$WORKDIR" 14 | 15 | for i in 00_scripts/*sh; do sed -i -e "s/userID/$ID/g" -e "s/userEmail/$email/g" "$i" 16 | 17 | done 18 
| -------------------------------------------------------------------------------- /00_scripts/utility_scripts/transdecoder_getorf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -D ./ 4 | #SBATCH --job-name="transdecoder" 5 | #SBATCH -o log-transdecoder.out 6 | #SBATCH -c 1 7 | #SBATCH -p ibismini 8 | #SBATCH -A ibismini 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --mail-user=type_your_mail@ulaval.ca 11 | #SBATCH --time=5-00:00 12 | #SBATCH --mem=50000 13 | 14 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 15 | SCRIPT=$0 16 | NAME=$(basename $0) 17 | LOG_FOLDER="98_log_files" 18 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 19 | 20 | #Global variables 21 | INPUT="05_trinity_assembly/Trinity_cleaned.fasta" 22 | 23 | ./00_scripts/transdecoder_utils/TransDecoder.LongOrfs -t $INPUT 2>&1 | tee 98_log_files/"$TIMESTAMP"_transdecoder_getorf.log 24 | -------------------------------------------------------------------------------- /00_scripts/utility_scripts/transdecoder_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -N trandecod_predict 3 | #$ -M userID 4 | #$ -m beas 5 | #$ -pe smp 1 6 | #$ -l h_vmem=20G 7 | #$ -l h_rt=20:00:00 8 | #$ -cwd 9 | #$ -S /bin/bash 10 | 11 | TIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm%Ss) 12 | SCRIPT=$0 13 | NAME=$(basename $0) 14 | LOG_FOLDER="98_log_files" 15 | cp $SCRIPT $LOG_FOLDER/"$TIMESTAMP"_"$NAME" 16 | 17 | #Global variables 18 | INPUT="05_trinity_assembly/Trinity.fasta" 19 | 20 | ./00_scripts/transdecoder_utils/TransDecoder.Predict -t $INPUT 2>&1 | tee 98_log_files/"$TIMESTAMP"_transdecoder_predict.log 21 | -------------------------------------------------------------------------------- /01_info_files/.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | *.fasta 3 | *.gff3 4 | -------------------------------------------------------------------------------- 
/01_info_files/example_pair_comparison.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_B 2 | cond_Y cond_Z 3 | -------------------------------------------------------------------------------- /01_info_files/example_sample_replicates.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_A_rep1 2 | cond_A cond_A_rep2 3 | cond_B cond_B_rep1 4 | cond_B cond_B_rep2 5 | -------------------------------------------------------------------------------- /01_info_files/example_samples_info.txt: -------------------------------------------------------------------------------- 1 | cond_A cond_A_rep1 A_rep1_left.fq A_rep1_right.fq 2 | cond_A cond_A_rep2 A_rep2_left.fq A_rep2_right.fq 3 | cond_B cond_B_rep1 B_rep1_left.fq B_rep1_right.fq 4 | cond_B cond_B_rep2 B_rep2_left.fq B_rep2_right.fq 5 | 6 | --max_memory=10G 7 | -------------------------------------------------------------------------------- /02_data/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.fq.gz 5 | *paired* 6 | *single* 7 | *.gz 8 | *.zip 9 | *fq 10 | *fastq 11 | genome* 12 | -------------------------------------------------------------------------------- /03_trimmed/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.txt 3 | *.fastq.gz 4 | *.fq.gz 5 | *paired* 6 | *single* 7 | *.gz 8 | *.zip 9 | *.sh 10 | *r1 11 | *r2 12 | -------------------------------------------------------------------------------- /04_merged/.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | *.fq 3 | *.readcount 4 | -------------------------------------------------------------------------------- /06_assembly_stats/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.sam 3 | *.bam 4 | 
-------------------------------------------------------------------------------- /07_de_results/.gitignore: -------------------------------------------------------------------------------- 1 | *.ok 2 | *.results 3 | *.bam 4 | *.bai 5 | *stat/ 6 | *.outfmt6 7 | *temp/ 8 | -------------------------------------------------------------------------------- /07_de_results/rsem_DH-1MA.bowtie.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jleluyer/rna-seq_denovo_workflow/ac4cf95b2944f3823dde268fa7baf96264f3b603/07_de_results/rsem_DH-1MA.bowtie.sorted.bam.bai -------------------------------------------------------------------------------- /98_log_files/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.log 3 | *SC 4 | * 5 | -------------------------------------------------------------------------------- /99_archive/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RNA-seq analysis in de novo 2 | 3 | An integrated worklow for *de novo* analysis and DE gene assessment to conduct RNA-seq data analyses 4 | 5 | This Workflow is developped in [Louis Bernatchez' lab](https://www.bio.ulaval.ca/louisbernatchez/presentation.htm). 6 | 7 | **WARNING** 8 | 9 | The software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the software. 
10 | 11 | ## Downloading 12 | 13 | You can clone this repository with: 14 | 15 | ``` 16 | git clone https://github.com/jleluyer/rna-seq_denovo_workflow 17 | ``` 18 | 19 | ## Documentation 20 | 21 | ### 1. Clone git hub directory 22 | 23 | ``` 24 | git clone https://github.com/jleluyer/rna-seq_denovo_workflow 25 | ``` 26 | 27 | ### 2. Prepare utilities 28 | 29 | ``` 30 | cd rna-seq_denovo_workflow 31 | 32 | ./00_scripts/00_import_trinity.sh 33 | ``` 34 | This script will create the utilities directories that have to be configured and installed 35 | 36 | ``` 37 | cd 00_scripts/trinity_utils/ 38 | make 39 | make plugins 40 | cd ../.. 41 | ``` 42 | 43 | ``` 44 | cd 00_scripts/transdecoder_utils/ 45 | make 46 | cd ../.. 47 | ``` 48 | 49 | For Corset, small changes need to be done to correctly configure and install the software. Change **userNAME** for your username. 50 | 51 | ``` 52 | cd 00_scripts/corset_utils 53 | ./configure --with-bam_inc=/prg/samtools/0.1.19/ --with-bam_lib=/prg/samtools/0.1.19/ --prefix=/home/userNAME/local 54 | make 55 | make install 56 | cd ../.. 57 | ``` 58 | 59 | Make sure **Bowtie** is in your **$PATH**. 60 | 61 | ### 3. Import data 62 | 63 | ``` 64 | cp /path/to/the/data/repository/*.gz 02_data 65 | ``` 66 | 67 | ### 4. Trimming the data 68 | 69 | * Import univec.fasta 70 | 71 | ``` 72 | wget -O 01_info_files/univec.fasta ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec 73 | ``` 74 | Add your specific adaptors if absent in the database. 75 | 76 | * Trimming 77 | 78 | Two scripts are provided for **Single-End** or **Paired-end** data, **00_scripts/01_trimmomatic_se.sh** and **00_scripts/01_trimmomatic_pe.sh**, respectively. 79 | 80 | ``` 81 | sbatch 00_scripts/01_trimmomatic_se.sh 82 | ``` 83 | 84 | You may also want to check the quality of your data prior to trimming using **00_scripts/utility_scripts/fastqc.sh**. This will require to have **fastQC** installed in your **$PATH**. 85 | 86 | ### 4. 
Merging data for assembly 87 | 88 | Note: Trinity is memory-consuming, make sure you adapt the number of samples to the memory available. When limited in memory, you could use a in silico normalization provided in the **00_scripts/utility_scripts/insilico_normalization.sh**. Otherwise, you may want to select a subset of the data and modify **./00_scripts/02_merge.sh**. For more information regarding memory usage, please visit [Trinity memory usage](http://trinityrnaseq.github.io/performance/mem.html) 89 | 90 | ``` 91 | sbatch 00_scripts/02_merge.sh 92 | ``` 93 | 94 | ### 5. Assembly 95 | 96 | Before running assembly, you need to make sure that the folder **05_trinity_assembly** is empty. 97 | You also need to adapt the script for your own needs. For instance, you will need to adapt the input files (either SE, PE or strand-specific) in the global variables section. Other parameters need also to be carefuly chosen (_e.g_: minimum length of transcripts, ...). 98 | 99 | ``` 100 | sbatch 00_scripts/03_assemble.sh 101 | ``` 102 | 103 | ### 6. Check assembly quality 104 | 105 | ``` 106 | sbatch 00_scripts/04_assembly_stats.sh 107 | ``` 108 | This script will output file **06_assembly_stats/results_stats.txt** 109 | 110 | Other scripts are provided to assess the quality of the transcriptome: 111 | **00_scripts/utility_scripts/assessing_read_content.sh** 112 | 113 | 114 | ### 7. 
Downstream analysis 115 | 116 | #### 7.1 Transcript abundance 117 | 118 | * Prepare reference 119 | 120 | ``` 121 | sbatch 00_scripts/05_prepare_ref.sh 122 | ``` 123 | 124 | * Quantify transcripts abundance 125 | 126 | ``` 127 | sbatch 00_scripts/06_transcripts_abundance.sh 128 | ``` 129 | 130 | Several options are possible as well as several tools for quantifying transcripts abundance such as **RSEM**, **Kallisto**, **eXpress** or **salmon** as well as different aligners **Bowtie** or **Bowtie2** 131 | 132 | * Cluster transcripts 133 | 134 | ``` 135 | sbatch 00_scripts/utility_scripts/corset.sh 136 | ``` 137 | 138 | ### 7.2 Annotation 139 | 140 | * Predict longest ORFs 141 | 142 | ``` 143 | sbatch 00_scripts/utility_scripts/transdecoder_getorf.sh 144 | ``` 145 | 146 | * Blast against Uniprot 147 | 148 | ``` 149 | sbatch 00_scripts/utility_scripts/blastp.sh 150 | ``` 151 | _note: you will need access to **uniprot_sprot.fasta** database_ 152 | 153 | 154 | * Identify protein families 155 | 156 | ``` 157 | sbatch 00_scripts/utility_scripts/pfam.sh 158 | ``` 159 | _note: you will need access to **Pfam-A** database_ 160 | 161 | 162 | * Identify signal peptides 163 | 164 | ``` 165 | sbatch 00_scripts/utility_scripts/signalp.sh 166 | ``` 167 | _need to install signalp_ 168 | 169 | * Identify transmembrane regions 170 | 171 | ``` 172 | sbatch 00_scripts/utility_scripts/tmhmm.sh 173 | ``` 174 | _need to install tmhmm_ 175 | 176 | * Identify rRNA transcripts 177 | 178 | ``` 179 | sbatch 00_scripts/utility_scripts/rnammer.sh 180 | ``` 181 | _need to install rnammer_ 182 | 183 | * Compile the annotation in a final report 184 | 185 | ``` 186 | sbatch 00_scripts/utility_scripts/trinotate.sh 187 | ``` 188 | 189 | _note: you will need access to **Trinotate.sqlite** database_ 190 | 191 | 192 | ### 7.3 Differential expression analysis 193 | 194 | 195 | * Build transcripts expression matrices 196 | 197 | ``` 198 | sbatch 00_scripts/07_build_matrix.sh 199 | ``` 200 | * running DE 201 | ``` 202 
| sbatch 00_scripts/08_de_analysis.sh 203 | ``` 204 | 205 | Several packages are available and implemented in the script such as **DESeq2**, **limma/voom**, **edgeR** and **ROTS**. 206 | 207 | 208 | ## Notes 209 | 210 | ## Dependencies 211 | 212 | ### Software 213 | 214 | [Trinity](https://github.com/trinityrnaseq/trinityrnaseq) 215 | 216 | [Trinotate](https://github.com/Trinotate/Trinotate) 217 | 218 | [RSEM](https://github.com/deweylab/RSEM) 219 | 220 | [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) 221 | 222 | **java 1.7** or higher 223 | 224 | [R](https://www.r-project.org/) 225 | 226 | ### R packages 227 | 228 | [edgeR](http://bioconductor.org/packages/release/bioc/html/edgeR.html) 229 | 230 | [DESeq2](http://bioconductor.org/packages/release/bioc/html/DESeq2.html) 231 | 232 | [limma/voom](http://bioconductor.org/packages/release/bioc/html/limma.html) 233 | 234 | [ROTS](http://www.btk.fi/research/research-groups/elo/software/rots/) 235 | 236 | [goseq](http://www.bioconductor.org/packages/release/bioc/html/goseq.html) 237 | 238 | #### Install R packages 239 | 240 | ```R 241 | %R 242 | source("http://bioconductor.org/biocLite.R") 243 | biocLite('edgeR') 244 | biocLite('limma') 245 | biocLite('DESeq2') 246 | biocLite('ctc') 247 | biocLite('Biobase') 248 | biocLite("goseq") 249 | install.packages('gplots') 250 | install.packages('ape') 251 | ``` 252 | 253 | ## Citations 254 | 255 | Grabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L, Raychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F, Birren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A. (2011). Full-length transcriptome assembly from RNA-seq data without a reference genome. **_Nat. 
Biotechnol._** [doi: 10.1038/nbt.1883](http://www.ncbi.nlm.nih.gov/pubmed/21572440) 256 | 257 | Haas BJ, Papanicolaou A, Yassour M, Grabherr M, Blood PD, Bowden J, Couger MB, Eccles D, Li B, Lieber M, Macmanes MD, Ott M, Orvis J, Pochet N, Strozzi F, Weeks N, Westerman R, William T, Dewey CN, Henschel R, Leduc RD, Friedman N, Regev A. (2013). De novo transcript sequence reconstruction from RNA-Seq: reference generation and analysis with Trinity. **_Nat. Protoc._** [doi: 10.1038/nprot.2013.084](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3875132/) 258 | 259 | [Trinity documentation](https://github.com/trinityrnaseq/trinityrnaseq/wiki) 260 | 261 | ## Licence 262 | 263 | The rna-seq_denovo_workflow is licensed under the GPL3 license. See the LICENCE file for more details. 264 | --------------------------------------------------------------------------------