├── 0.1_kallisto_extract_transcripts.sh ├── 0.2_bowtie_build_genome.sh ├── 0.3_Salmon_build_index.sh ├── 00_Add_to_Reference.readme ├── 00_Generate_FastQs.readme ├── 00_Kallisto_For_SmartSeq.readme ├── 00_LIST_OF_BSUB_COMMANDS.sh ├── 00_Steps ├── 0_Anno_Extract_Transcriptome.pl ├── 0_BAM2FastQ.sh ├── 0_BuildGenome.sh ├── 0_CRAM2BAM.sh ├── 0_Check_Barcodes.pl ├── 0_Convert_CRAM_to_BAM.sh ├── 0_Determine_Barcodes.pl ├── 0_Download_Files_from_Dropbox.pl ├── 0_Extract_Metadata_from_Bam.sh ├── 0_FASTQC.sh ├── 0_FASTQC_Streaming.sh ├── 0_FASTQC_limits.txt ├── 0_Flexible_Convert_Dir_CRAM_to_BAM.sh ├── 0_GBK2FASTA.pl ├── 0_Gather_Summary_Statistics.pl ├── 0_Get_Data_from_iRODS.sh ├── 0_Make_ERCC_fasta_and_gtf.pl ├── 0_Merge_FASTQs.sh ├── 0_My_Extract_Transcriptome.pl ├── 0_Process_GBK.pl ├── 0_custom_undo_demultiplexing.pl ├── 0_make_transcriptome.sh ├── 1.5_DO_Trim_Reads.sh ├── 1.5_Trim_Reads_Paired.sh ├── 1.5_Trim_UMI.pl ├── 1.6_Summarizing_Trimming.pl ├── 1_BreakDown_Files_wrapper.sh ├── 1_BreakDown_PairedEnds.pl ├── 1_BreakDown_PairedEnds_Custom_Wafergen.pl ├── 1_Breakdown_UMI_read_pairs.pl ├── 1_DO_BreakDown_Files.sh ├── 1_Flexible_FullTranscript_Demultiplexing.pl ├── 1_Flexible_UMI_Demultiplexing.pl ├── 2-5.1_DO_kallisto_quant.sh ├── 2-5.1_kallisto_quant.sh ├── 2-5.2_DO_Salmon_quant.sh ├── 2-5.2_Salmon_quant.sh ├── 2-5_DO_RSEM.sh ├── 2-5_STAR-RSEM.sh ├── 2-5_bowtie2-RSEM.sh ├── 2.2_DO_MapReads_Tophat.sh ├── 2.2_MapReads_Tophat.sh ├── 2_DO_MapReadsFile.sh ├── 2_DO_MapReadsFile_singleend.sh ├── 2_MapReadsFile.sh ├── 2_MapReadsFile_Transcriptome.sh ├── 2_MapReadsFile_singleend.sh ├── 2_STAR_Parameters.txt ├── 3_CLEANUP_MapReadFiles.sh ├── 3_Compile_Mapping_Statistics.pl ├── 3_Compile_UMI_Statistics.pl ├── 3_DO_UmiDedup.sh ├── 3_SAMtools_sort_wrapper.sh ├── 3_SortBAMs.pl ├── 3_UmiDedup.sh ├── 3_merge_dedup_MappedReads.sh ├── 4_Convert_GTF2BED_customized_for_Ensembl.pl ├── 4_DO_RSeQC_Multiple.sh ├── 4_MergeBAMs.pl ├── 4_Process_RSEQC_output.pl ├── 4_RSeQC_Multiple.sh ├── 
#!/bin/bash
# Build a kallisto index for the mouse (GRCm38, Ensembl 79) transcriptome.
# gffread first writes the transcript sequences defined by the annotation,
# then kallisto indexes that fasta. All paths are hard-coded below.

# --- Software ---------------------------------------------------------------
KALLISTO_BIN=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto
GFFREAD_BIN=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread

# --- Inputs: genome sequence and gene annotation ----------------------------
ANNOTATION_GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
GENOME_FASTA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa

# --- Outputs ----------------------------------------------------------------
INDEX_DIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES
TRANSCRIPT_FASTA=$INDEX_DIR/Transcripts.fasta
INDEX_FILE=$INDEX_DIR/kallisto_index.idx

# Step 1: transcript sequences (-w) from the annotation, using the genome (-g)
$GFFREAD_BIN $ANNOTATION_GTF -g $GENOME_FASTA -w $TRANSCRIPT_FASTA

# Step 2: kallisto index over those transcript sequences
$KALLISTO_BIN index -i $INDEX_FILE $TRANSCRIPT_FASTA
#!/bin/bash
# Build a Salmon index for the mouse (GRCm38, Ensembl 79) transcriptome.
# gffread first writes the transcript sequences defined by the annotation,
# then Salmon indexes that fasta. All paths are hard-coded below.

# --- Software ---------------------------------------------------------------
SALMON_BIN=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Salmon-0.7.2_linux_x86_64/bin/salmon
GFFREAD_BIN=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread

# --- Inputs: genome sequence and gene annotation ----------------------------
ANNOTATION_GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
GENOME_FASTA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa

# --- Outputs ----------------------------------------------------------------
INDEX_DIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES
TRANSCRIPT_FASTA=$INDEX_DIR/Transcripts.fasta

# Step 1: transcript sequences (-w) from the annotation, using the genome (-g)
$GFFREAD_BIN $ANNOTATION_GTF -g $GENOME_FASTA -w $TRANSCRIPT_FASTA

# Step 2: Salmon index (perfect-hash variant) built with a single thread (-p 1)
$SALMON_BIN index -i $INDEX_DIR/salmon_index -t $TRANSCRIPT_FASTA --perfectHash -p 1
2 | prior to mapping your reads. Reasons for doing this would be: 3 | 1) if your cell line contains inserted genetic constructs, 4 | e.g. for CRISPR, or for fluorescently labelling your cells. 5 | 2) if you've added spike-in transcripts to your sample. 6 | 7 | 1) This will require manually creating the GTF annotations for your 8 | particular construct. If you have a GenBank file (.gbk) you can use: 9 | 0_GBK2FASTA.pl 10 | to extract the raw sequence of the construct to append to the genome. 11 | 12 | 2) If using the ERCC spike in, you must download the sequences from the 13 | manufacturer, i.e. the "ERCC Controls Annotation" file from: 14 | https://www.thermofisher.com/order/catalog/product/4456740 15 | then run: 16 | 0_Make_ERCC_fasta_and_gtf.pl ERCC_Controls_Annotation.txt 17 | this will create a fasta and gtf file for the ERCCs in your working 18 | directory which you can append to the reference genome. 19 | 20 | Append your extra gtf and fasta files to the existing reference fasta and gtf using "cat" : 21 | cat ERCC_Controls.fa >> ref.fa 22 | cat ERCC_Controls.gtf >> ref.gtf 23 | 24 | 25 | -------------------------------------------------------------------------------- /00_Generate_FastQs.readme: -------------------------------------------------------------------------------- 1 | This file contains instructions for converting a variety of file-types to 2 | fastq files. This is necessary before running any of the quantification 3 | pipelines found here. 4 | 5 | There are many different ways you could receive data from a sequencing 6 | facility and not all of them will be covered here. These scripts will 7 | only cover the different formats I've had to deal with so far. 
8 | 9 | Software Requirements: 10 | samtools (>=1.3.1) 11 | bedtools2 12 | 13 | 14 | Option 1 : CRAM/BAM files 15 | First inspect your CRAM/BAM files using: 16 | 17 | samtools view -h file1.cram | less 18 | 19 | Among the header lines (start with "@" symbol) you should find 20 | some information identifying the data as belonging to your study 21 | along with any processing that has been done (e.g. how it was 22 | mapped to the genome and which genome it was mapped to) 23 | 24 | Once you scroll down to reads (press / to scroll) 25 | you can check whether they have been mapped or not. Unmapped 26 | reads will have the second entry of each row be either 77 or 141. 27 | 28 | Usually BAM/CRAM files will already be demultiplexed (one file per 29 | cell) and mapped to the appropriate genome. In which case, you may 30 | choose to skip the mapping step yourself and simply count your 31 | already mapped reads (see : 00_STAR_For_SmartSeq.readme). In which 32 | case if you have BAM files you are done. If you have CRAM files you 33 | need to convert them to BAM files: 34 | 35 | samtools view -b -h file.cram -o file.bam 36 | 37 | NOTE: Converting mapped CRAM files to BAM files will store the 38 | entire reference genome in your cache (which might be bad). To 39 | specify a new cache location use: 40 | 41 | export REF_CACHE= 42 | 43 | If you have a large number of files to convert you may want to 44 | loop over them or submit them to a cluster to run. E.g. 
45 | 46 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -o cram2bam.%J.%I 0_CRAM2BAM.sh cram_dir bam_dir work_dir 47 | 48 | or 49 | 50 | CRAM_files=$CRAM_dir/*.cram 51 | for FILE in $CRAM_files 52 | do 53 | 0_CRAM2BAM.sh $FILE bam_dir work_dir 54 | done 55 | 56 | If for whatever reason you need to remap your reads from BAM/CRAM files 57 | then you'll need to convert the BAM files to FastQ files, using : 58 | 59 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -o bam2fastq.%J.%I 0_BAM2FastQ.sh bam_dir fastq_dir work_dir "P" 60 | 61 | or 62 | 63 | BAM_files=$BAM_dir/*.bam 64 | for FILE in $BAM_files 65 | do 66 | 0_BAM2FastQ.sh $FILE fastq_dir work_dir "P" 67 | done 68 | 69 | This script assumes reads are unpaired unless the fourth argument "P" is provided. 70 | 71 | To check if your reads are paired simply run this command: 72 | 73 | samtools view -f 1 my_file.bam | wc -l 74 | 75 | This will count the number of paired reads in your bam file. 76 | 77 | Option 2 : Demultiplexing large FastQs 78 | If you get data where reads from multiple cells are mixed together then 79 | you will need to demultiplex the data so you have one pair of fastq files 80 | per cell. This can be done with : 81 | 82 | perl 1_Flexible_FullTranscript_Demultiplexing.pl read1.fq read2.fq b_pos b_len index mismatch prefix 83 | 84 | Running the script without any arguments will bring up the help for 85 | what each argument should be. Note this script assumes you have a 86 | relatively small number of possible cell-barcodes (i.e. < 5,000). 87 | It is not appropriate for droplet or microwell based experiments. 88 | 89 | Cell barcodes are assumed to be present at either the start or end of 90 | read1. And they will be removed from the sequences after demultiplexing. 91 | 92 | If you have multiple files of reads per cell (e.g. 
if you have multiple 93 | lanes of sequencing) then these can be combined with the "cat" command: 94 | 95 | cat file1_1.fq file2_1.fq file3_1.fq > all_1.fq 96 | 97 | It is easiest to combine all reads into one file and then demultiplex. 98 | -------------------------------------------------------------------------------- /00_Kallisto_For_SmartSeq.readme: -------------------------------------------------------------------------------- 1 | This outlines the scripts, software and steps for processing a 2 | SmartSeq[2]-based RNASeq experiment with Kallisto. It assumes 3 | you are starting with one pair of FastQ files per cell, and 4 | takes you through to creating a Single-Cell Experiment object. 5 | 6 | See: 00_Generate_FastQs.readme for instructions on creating one 7 | pair of FastQs per cell. 8 | 9 | This workflow assumes you are NOT using Unique Molecular Identifiers 10 | 11 | Software Requirements: 12 | fastqc 13 | trimmomatic 14 | gffread 15 | kallisto 16 | perl 17 | 18 | All scripts contain variables among the top few lines for hard-coding specific 19 | versions of the software if it is not in your path. 20 | 21 | Directory Set-up: 22 | (A) Create one directory with all the FastQ files for one experiment. 23 | (B) Create a second directory for kallisto output files. 24 | (C) Create a third directory for temporary files. 25 | 26 | SAVE A BACK-UP COPY OF YOUR RAW DATA BEFORE RUNNING 27 | 28 | Steps 29 | 30 | 1 : Build the reference transcriptome and kallisto index 31 | 32 | Download the appropriate reference fasta (.fa) and annotation (.gtf) files 33 | (https://www.ensembl.org/info/data/ftp/index.html) 34 | 35 | Add any custom sequences you need for your experiment. 36 | 37 | See: 00_Add_to_Reference.readme for instructions on adding custom sequences 38 | such as spike-ins to the reference. 
39 | 40 | Run : "Kallisto_Build_Index.sh ref.fa ref.gtf outdir" 41 | 42 | 2 : Read Quality Control with FASTQC 43 | 44 | Download my FASTQC limit file (0_FASTQC_limits.txt) 45 | 46 | Run : 47 | 0_FASTQC_Streaming.sh fastq_dir "*_1.fq" 0_FASTQC_limits.txt "Read1" outdir 48 | 0_FASTQC_Streaming.sh fastq_dir "*_2.fq" 0_FASTQC_limits.txt "Read2" outdir 49 | 50 | If your data was sequenced on multiple lanes of sequences you may want to run 51 | FASTQC on each lane separately. 52 | 53 | 3 : Read Trimming (WARNING: replaces original FastQs) 54 | Either submit 1.5_Trim_Reads_Paired.sh as a job array: 55 | 56 | NCELLS=384 57 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o trim.out.%J.%I 1.5_Trim_Reads_Paired.sh $FQ_dir NULL $work_dir NexteraPE-PE.fa 1000 58 | 59 | or loop over all pairs of fastq files : 60 | 61 | NCELLS=384 62 | FQ_files=($FQ_dir/*.fq.gz) 63 | for CELL in $(seq 1 $NCELLS) 64 | do 65 | FILE_INDEX=$((($CELL-1)*2)) 66 | FILE1=${FQ_files[$FILE_INDEX]} 67 | FILE2=${FQ_files[$FILE_INDEX+1]} 68 | bsub -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o trim.out.%J 1.5_Trim_Reads_Paired.sh $FILE1 $FILE2 $work_dir NexteraPE-PE.fa 1000 69 | done 70 | 71 | 4 : Quantification with kallisto 72 | Either submit Kallisto_Quantification_Wrapper.sh as a job array: 73 | 74 | NCELLS=384 75 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o kallisto.out.%J.%I Kallisto_Quantification_Wrapper.sh $FQ_dir NULL kallisto_index.idx 2 outdir 76 | 77 | or loop over all pairs of fastq files : 78 | 79 | NCELLS=384 80 | FQ_files=($FQ_dir/*.fq.gz) 81 | for CELL in $(seq 1 $NCELLS) 82 | do 83 | FILE_INDEX=$((($CELL-1)*2)) 84 | FILE1=${FQ_files[$FILE_INDEX]} 85 | FILE2=${FQ_files[$FILE_INDEX+1]} 86 | bsub -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o kallisto.out.%J.%I Kallisto_Quantification_Wrapper.sh $FILE1 $FILE2 kallisto_index.idx 2 outdir 87 | done 
88 | 89 | 5 : Combine results with perl script 90 | "Kallisto_Make_ExpMat.pl kallisto_dir ref.gtf [gene|trans] out_prefix" 91 | 92 | -------------------------------------------------------------------------------- /00_Steps: -------------------------------------------------------------------------------- 1 | Setup: 2 | 0_BuildGenome.sh 3 | (1) Get appropriate genome & annotations. 4 | -> from local ensembl mirror? : mysql -u anonymous -h ensembldb.internal.sanger.ac.uk 5 | -> simpler to just get from ftp site 6 | (2) get these in the right format and put on lustre and stripe them. (/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES) 7 | -> stripe at the level of directory (see wiki): 8 | lfs setstripe -c -1 (number not letter) stripe across all **This is what I have done for this directory 9 | lfs setstripe STAR seems to be better in all ways except for the number of people already using it, it is both faster and maps more reads successfully. It is also possible to make the output compatible with Cufflinks 15 | (1) Copy data in processable chunks to /lustre/scratch108/compgen/team218 (1_break..., 1_Break...) 16 | -> Q: What is the appropriate size chunk? (run in a few hours) 17 | -> Should sort by lane & cellID while breaking down. Use 0_Gather_Summary_Statistics.pl as a basis 18 | -> need to maintain ordering for paired-end reads. 19 | (2) submit jobs to cluster -> use job array, output into "output.%J.%I" or something like that 20 | -> probably want to start with 40 jobs at a time until know how to load genome data efficiently 21 | (3) check results succeeded combined results as necessary & compress (compress while still on lustre it is faster) 22 | (4) move combined-compressed output back to /nfs/team218/ 23 | (5) remove input chunks & deal with logfiles from lustre and repeat with next set 24 | --> Question? One or two passes of STAR? 
-> I think only one pass because we aren't that interested in alternative splicing and, given the relatively low read depth for each cell, rare alternative splice forms are unlikely to be quantifiable in many samples. Thus the costs of doing the second pass - filtering novel splice junctions then re-building the genome to include them as annotations - outweigh the benefits of doing so - allowing more reads to be mapped to novel splice junctions. 25 | --> Question? what state does data have to be in for cufflinks/express? -> not going to use express since it doesn't do novel transcript assembly so would have to run cufflinks anyway. Cufflinks requires: Sorted SAM/BAM with special strand field and no soft clipping, the special strand field can be added with the appropriate parameter/argument in STAR calls, and I have an awk command from : https://groups.google.com/forum/#!searchin/rna-star/cufflinks/rna-star/Ta1Z2u4bPfc/8nZ2iMkxSyMJ to remove soft clipping. 26 | (6) Sort SAM/BAM 27 | (7) Remove duplicates (can be done with STAR once alignments have been combined & sorted) -> easier to do it with samtools (4_MergeBAMs.pl) 28 | 29 | Post-Mapping QC: 30 | 3_Compile_Mapping_Statistics.pl Gather mapping statistics from STAR log files 31 | 4_DO_RSeQC_Multiple.sh Various QC things using RSeQC -> general statistics, rRNA content, GC, Gene Body coverage, splice junction saturation 32 | 33 | 34 | 35 | Calculating Abundance cufflinks vs eXpress: eXpress is much faster but unclear which is more accurate. I'm not sure whether eXpress does de novo transcript assembly though..... need to look at documentation for eXpress. eXpress does not do de novo transcript assembly. Cufflinks does de novo transcript assembly and produces 95% CIs for FPKM, therefore forget eXpress I'm only going to use Cufflinks (assuming it doesn't take forever). 36 | (1) Build novel transcripts/gene-models using all data. -> Cufflinks! 
use strict;
use warnings;

# 0_Anno_Extract_Transcriptome.pl
#
# Usage: 0_Anno_Extract_Transcriptome.pl annotation.gtf genome.fa
#
# Turns every "gene" record of the GTF into its own reference sequence:
#   Transcripts.fa  - one FASTA entry per gene (named by its gene_id) holding
#                     the gene plus the bases immediately in front of it
#   Transcripts.gtf - the gene/exon/UTR records of each gene, renamed to the
#                     gene_id and re-based onto that gene's FASTA entry
# Both files are written to the current working directory. The genome is
# streamed one chromosome at a time; the GTF is read once, up front.

# Size of the window added in front of each gene. With GTF start S and end E
# (1-based) the extracted window is [S-FLANK+1, E], so the gene itself begins
# at position FLANK of its FASTA entry and ends on the entry's last base.
my $FLANK = 10;

main(@ARGV) unless caller;

# Entry point: streams the genome FASTA and emits the per-gene FASTA/GTF.
#   $gtf_file - annotation in GTF format
#   $fa_file  - genome FASTA whose sequence names match column 1 of the GTF
# Dies if either argument is missing or a file cannot be opened/closed.
sub main {
    my ($gtf_file, $fa_file) = @_;
    if (!defined $gtf_file || !defined $fa_file) {
        die "0_Anno_Extract_Transcriptome.pl .gtf .fa\n";
    }

    my $annotations = load_annotations($gtf_file);

    open(my $fa, '<', $fa_file) or die "Cannot open $fa_file: $!\n";
    open(my $fa_out, '>', "Transcripts.fa") or die "Cannot write Transcripts.fa: $!\n";
    open(my $gtf_out, '>', "Transcripts.gtf") or die "Cannot write Transcripts.gtf: $!\n";

    my $chr;            # name of the chromosome currently being accumulated
    my $chr_seq = "";   # its sequence so far
    while (my $line = <$fa>) {
        next if $line =~ /^#/;    # skip headers
        if ($line =~ /^>/) {
            # A new chromosome starts: its name is the first word of the header.
            my ($newchr) = split(/\s+/, $line);
            $newchr =~ s/>//g;
            if (defined $chr) {
                write_chromosome_genes($chr, \$chr_seq, $annotations->{$chr}, $fa_out, $gtf_out);
                print "$chr $newchr\n";    # progress report
            }
            $chr     = $newchr;
            $chr_seq = "";
        }
        else {
            chomp $line;
            $chr_seq .= $line;
        }
    }
    # The last chromosome has no following header to trigger its output.
    if (defined $chr) {
        write_chromosome_genes($chr, \$chr_seq, $annotations->{$chr}, $fa_out, $gtf_out);
    }

    close($fa);
    # Buffered write errors only surface at close, so check them.
    close($fa_out)  or die "Error closing Transcripts.fa: $!\n";
    close($gtf_out) or die "Error closing Transcripts.gtf: $!\n";
    return;
}

# Reads the GTF once and groups its records by chromosome.
# Returns a hashref: { chr => { order => [gene ids in file order],
#                               span  => { gene id => [start, end] },
#                               lines => { gene id => [raw GTF lines] } } }
# Only gene/exon/UTR records carrying a gene_id are kept.
sub load_annotations {
    my ($gtf_file) = @_;
    open(my $gtf, '<', $gtf_file) or die "Cannot open $gtf_file: $!\n";

    my %by_chr;
    while (my $gtf_line = <$gtf>) {
        next if $gtf_line =~ /^#/;                        # ignore headers
        next unless $gtf_line =~ /gene_id "(.+?)";/;      # records without a gene id are unusable
        my $gene_id = $1;

        my @record = split(/\t/, $gtf_line);
        next if scalar(@record) < 5;                      # no coordinates to work with
        my ($chr, $feature, $start, $end) = @record[0, 2, 3, 4];

        if ($feature eq "gene") {
            push(@{ $by_chr{$chr}{order} }, $gene_id);
            $by_chr{$chr}{span}{$gene_id} = [$start, $end];
        }
        if ($feature eq "exon" || $feature eq "UTR" || $feature eq "gene") {
            push(@{ $by_chr{$chr}{lines}{$gene_id} }, $gtf_line);
        }
    }
    close($gtf);
    return \%by_chr;
}

# Writes the FASTA entry and re-based GTF records of every gene on one chromosome.
#   $chr      - chromosome name (used in diagnostics only)
#   $seq_ref  - reference to the chromosome sequence (never modified)
#   $chr_anno - that chromosome's entry from load_annotations(), or undef
#   $fa_out, $gtf_out - open output handles
sub write_chromosome_genes {
    my ($chr, $seq_ref, $chr_anno, $fa_out, $gtf_out) = @_;
    return unless defined $chr_anno && defined $chr_anno->{order};

    foreach my $gene_id (@{ $chr_anno->{order} }) {
        my ($start, $end) = @{ $chr_anno->{span}{$gene_id} };
        my $gene_seq = extract_gene_sequence($seq_ref, $start, $end, $FLANK);
        print {$fa_out} ">$gene_id\n", $gene_seq, "\n";
        print {$gtf_out} rebase_annotations($gene_id, $chr_anno->{lines}{$gene_id},
                                            length($gene_seq), $FLANK, $chr);
    }
    return;
}

# Returns the window [$start-$flank+1, $end] (1-based) of the sequence behind
# $seq_ref, i.e. $end-$start+$flank characters. Positions before the first
# base are filled with 'N'; at most $flank 'N's are added past the last base,
# so an annotation lying far beyond the sequence still yields a short window
# and is caught by the length check in rebase_annotations().
# The chromosome sequence itself is left untouched, so padding one gene cannot
# shift the coordinates of the genes that follow it.
sub extract_gene_sequence {
    my ($seq_ref, $start, $end, $flank) = @_;
    my $first = $start - $flank;           # 0-based index of the first wanted base
    my $want  = $end - $start + $flank;    # total window length
    return '' if $want <= 0;

    my $left_pad = $first < 0 ? -$first : 0;
    $left_pad = $want if $left_pad > $want;
    my $from = $first < 0 ? 0 : $first;

    my $body = $from < length($$seq_ref)
             ? substr($$seq_ref, $from, $want - $left_pad)
             : '';

    my $missing   = $want - $left_pad - length($body);
    my $right_pad = $missing > $flank ? $flank : $missing;

    return ('N' x $left_pad) . $body . ('N' x $right_pad);
}

# Re-bases the GTF records of one gene onto that gene's own FASTA entry.
#   $gene_id    - id that replaces the sequence name, transcript_id and gene_name
#   $gtf_lines  - arrayref of raw GTF lines; the first must be the "gene" record
#   $seq_length - length of the gene's FASTA entry
#   $flank      - flank used when the entry was extracted
#   $chr        - chromosome name, only used in the diagnostic message
# Returns the rewritten lines (each still ending in its original newline).
# Dies on a malformed record, if the first record is not "gene", or if a
# record ends beyond the FASTA entry.
sub rebase_annotations {
    my ($gene_id, $gtf_lines, $seq_length, $flank, $chr) = @_;
    my @rebased;
    my $shift;    # undef until the gene record has been seen; any integer is a valid shift

    foreach my $gtf_line (@{ $gtf_lines || [] }) {
        my $line = $gtf_line;    # work on a copy, leave the stored annotation alone
        $line =~ s/transcript_id "(.+?)"/transcript_id "$gene_id"/s;
        $line =~ s/gene_id "(.+?)"/gene_id "$gene_id"/s;
        $line =~ s/gene_name "(.+?)"/gene_name "$gene_id"/s;

        my @record = split(/\t/, $line);
        if (scalar(@record) < 5) {die "$line not enough entries\n";}

        if (!defined $shift) {
            if ($record[2] ne "gene") {die "ERROR: Requires first entry for each ensg to be \"gene\".\n";}
            $shift = $record[3] - $flank;
        }

        $record[0] = $gene_id;
        $record[3] = $record[3] - $shift;
        $record[4] = $record[4] - $shift;
        if ($record[4] > $seq_length) {
            my $where = defined $chr ? $chr : "";
            print STDERR "$where $gene_id $record[2] $record[3] $record[4], seq = $seq_length\n";
            die "ERROR: annotation exceeds sequence length\n";
        }
        push(@rebased, join("\t", @record));
    }
    return @rebased;
}

1;
#!/bin/bash
# Download an Ensembl (release 79) genome and annotation, optionally append
# the sequences/annotations of integrated genetic constructs, and build a STAR
# genome index from the result.

# Tallulah 07 April 2015 - added the option to just keep the GTF & Fasta files without running STAR by setting the number of threads to 0 (for getting the genome & annotations for Cufflinks later).
# Tallulah 31 March 2015 - updated to check all 5/6 arguments (which are required) have been set.
# Tallulah 26 Mar 2015 Not so obvious whether it is more efficient to get genomes from internal ensembl mirror or to download from ensembl ftp website? -> since only doing this once per organism/experiment ftp/rsync is probably fine?
# All bits tested but not all at once - Totally works now (13 Dec 2016)

# Arguments:
#   $1 = working directory on /lustre/ (downloads are stored and unpacked here)
#   $2 = striped genome directory on /lustre/ (receives the STAR index)
#   $3 = number of threads to run on; if 0 STAR is not run
#   $4 = read length
#   $5 = organism [Hsap, Mmus]
#   $6 = directory with constructs (*.fa and *.gtf) to be added (optional)

LUSTRE=$1
OUTDIR=$2
NUMTHREADS=$3
READLENGTH=$4
ORG=$5
ADDDIR=$6
ORGERR="please enter one of the following organism tags: Hsap, Mmus"
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
ENSEMBL=rsync://ftp.ensembl.org/ensembl/pub/release-79

# ---- Argument checks -------------------------------------------------------
if [ -z "$LUSTRE" ] ; then
    echo "Please set a directory for temporary working files (ARG 1/6)"
    exit 1
fi

if [ -z "$OUTDIR" ] ; then
    echo "Please set a directory for output (ARG 2/6)"
    exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
    echo "Please set number of threads to use (ARG 3/6)"
    exit 1
fi
case "$NUMTHREADS" in
    *[!0-9]*)
        echo "Number of threads must be a non-negative integer (ARG 3/6)"
        exit 1
        ;;
esac

# Test the read length itself: a value derived from it (e.g. "-1") is never empty.
if [ -z "$READLENGTH" ] ; then
    echo "Please set length of RNASeq reads (ARG 4/6)"
    exit 1
fi
case "$READLENGTH" in
    *[!0-9]*)
        echo "Length of RNASeq reads must be a positive integer (ARG 4/6)"
        exit 1
        ;;
esac
# STAR expects --sjdbOverhang = read length - 1 as a number, so do real
# arithmetic (10# forces base 10 even if the value has a leading zero).
OVERHANG=$((10#$READLENGTH - 1))
if [ "$OVERHANG" -lt 1 ] ; then
    echo "Length of RNASeq reads must be at least 2 (ARG 4/6)"
    exit 1
fi

if [ -z "$ORG" ] ; then
    echo "Sorry no organism to work with (ARG 5/6)"
    echo $ORGERR
    exit 1
fi

case "$ORG" in
    Hsap)
        SPECIES=homo_sapiens
        PREFIX=Homo_sapiens.GRCh38
        ;;
    Mmus)
        SPECIES=mus_musculus
        PREFIX=Mus_musculus.GRCm38
        ;;
    *)
        echo "$ORG not supported"
        echo $ORGERR
        exit 1
        ;;
esac

# STAR is only needed when an index will actually be built.
if [ "$NUMTHREADS" -gt 0 ] && [ ! -f "$STAR" ] ; then
    echo "Sorry STAR not available "
    exit 1
fi

# Make directories for output/temporary working files if they don't already exist
if [ ! -d "$OUTDIR" ] ; then
    mkdir -p "$OUTDIR"
    lfs setstripe "$OUTDIR" -c -1
fi

mkdir -p "$LUSTRE"

# Step 1: Get genome & annotations from Ensembl and put on lustre
FA=$LUSTRE/$PREFIX.dna.primary_assembly.fa.gz
GTF=$LUSTRE/$PREFIX.79.gtf.gz
# Genome fastas
rsync -av "$ENSEMBL/fasta/$SPECIES/dna/$PREFIX.dna.primary_assembly.fa.gz" "$LUSTRE" || exit 1
# Annotation GTFs
rsync -av "$ENSEMBL/gtf/$SPECIES/$PREFIX.79.gtf.gz" "$LUSTRE" || exit 1

# Step 2: Add genetic constructs from a directory
# Question? Should we add the genetic construct as an additional chromosome or try to place it correctly in the genome?
#   placing it in the genome would affect the locations of all other elements on that chromosome
#   expression from the construct should be independent of that of the surrounding genome so don't expect any
#   reads to span to neighbouring genes -> depends on the construct
#   what about things added to the tail of a native locus?
#   Couldn't this be dealt with by adding the full new locus as a separate thing and somehow masking the native locus?

# OK I think adding constructs as separate contigs is the best approach!
# If I assume the constructs have been pre-formatted to be fasta & gtfs,
# can the fasta just be added to the stock & gtf just concatenated to the current one?
#  -> yes as long as names are consistent across the files and not the same as any other chromosome/contig

# -f: replace any unpacked copy left by an earlier run, otherwise the
# constructs below would be appended to that stale file a second time.
gunzip -f "$FA" || exit 1
FA=${FA%.gz}
gunzip -f "$GTF" || exit 1
GTF=${GTF%.gz}
if [ -n "$ADDDIR" ] ; then
    echo "Adding files from $ADDDIR"
    cat "$ADDDIR"/*.fa >> "$FA" || exit 1
    cat "$ADDDIR"/*.gtf >> "$GTF" || exit 1
fi

if [ "$NUMTHREADS" -gt 0 ] ; then
    # Step 3: Run STAR on the finished genome & put output in striped directory.
    "$STAR" --runThreadN "$NUMTHREADS" --runMode genomeGenerate --genomeDir "$OUTDIR" --genomeFastaFiles "$FA" --sjdbGTFfile "$GTF" --sjdbOverhang "$OVERHANG" --limitGenomeGenerateRAM 31000000000

    # Step 4: delete the Ensembl-derived files
    # rm $FA
    # rm $GTF
fi
-z $LSB_JOBINDEX ]; then 31 | CRAMS=($CRAM_file/*.cram) 32 | INDEX=$(($LSB_JOBINDEX-1)) 33 | FILE=${CRAMS[$INDEX]} 34 | else 35 | FILE=$CRAM_file 36 | fi 37 | 38 | NAME=`basename ${FILE%.cram}` #remove path and .cram suffix 39 | cp $FILE $WORK_dir/$NAME.cram 40 | $SAMTOOLS view -b -h $WORK_dir/$NAME.cram -o $BAM_dir/$NAME.bam 41 | rm $WORK_dir/$NAME.cram 42 | 43 | 44 | -------------------------------------------------------------------------------- /0_Check_Barcodes.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 3) { die "Breakdown_Paired_Ends.pl INPUT1 INPUT2 ProjectName\n";} 5 | my $infile1 = $ARGV[0]; 6 | my $infile2 = $ARGV[1]; 7 | 8 | my %Barcodes = (); 9 | open (my $ifh1, $infile1) or die $!; 10 | while(<$ifh1>) { 11 | my $file1line = $_; 12 | if ($file1line =~ /^@/) { 13 | my @thing1 = split(/\s+/,$file1line); 14 | my $readname = $thing1[0]; 15 | my $barcodes = <$ifh1>; 16 | if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/){ 17 | my $UMI = $2; 18 | my $CellID = $1; 19 | $Barcodes{$UMI}++; 20 | } 21 | } else {next;} 22 | } 23 | close($ifh1); 24 | 25 | my @codes = sort{$Barcodes{$a}<=>$Barcodes{$b}} keys(%Barcodes); 26 | foreach my $code (@codes) { 27 | print "$code ".$Barcodes{$code}."\n"; 28 | } 29 | -------------------------------------------------------------------------------- /0_Convert_CRAM_to_BAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir 4 | CRAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/cramtools-3.0/cramtools-3.0.jar 5 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools 6 | FILEINDEX=$LSB_JOBINDEX 7 | LANE6="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane6/21698_6_$FILEINDEX.cram" 8 | 
LANE7="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane7/21698_7_$FILEINDEX.cram" 9 | LANE8="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane8/21698_8_$FILEINDEX.cram" 10 | 11 | OUTDIR=/lustre/scratch117/cellgen/team218/TA/LiverOrganoids/BAMS 12 | 13 | FILE1=$(basename $LANE6) 14 | FILE2=$(basename $LANE7) 15 | FILE3=$(basename $LANE8) 16 | 17 | cp $LANE6 $OUTDIR/$FILE1 18 | cp $LANE7 $OUTDIR/$FILE2 19 | cp $LANE8 $OUTDIR/$FILE3 20 | 21 | $SAMTOOLS view -b -h $OUTDIR/$FILE1 -o $OUTDIR/$FILE1.bam 22 | $SAMTOOLS view -b -h $OUTDIR/$FILE2 -o $OUTDIR/$FILE2.bam 23 | $SAMTOOLS view -b -h $OUTDIR/$FILE3 -o $OUTDIR/$FILE3.bam 24 | 25 | #$SAMTOOLS merge $OUTDIR/Cell$FILEINDEX.bam $OUTDIR/$FILE1.bam $OUTDIR/$FILE2.bam $OUTDIR/$FILE3.bam 26 | 27 | rm $OUTDIR/$FILE1 28 | rm $OUTDIR/$FILE2 29 | rm $OUTDIR/$FILE3 30 | 31 | 32 | 33 | #export _JAVA_OPTIONS="-Xmx100M -XX:MaxHeapSize=100m" 34 | #java -jar $CRAMTOOLS bam -I $OUTDIR/$FILE1 -O $OUTDIR/$FILE1.cram.bam 35 | 36 | #alias cramtools='java -jar cramtools-2.0.jar' 37 | #cramtools bam -I 9233_8#168_1.cram -O 9233_8#168_1.cram.bam 38 | #cramtools fastq -I 9233_8#168_1.cram | head 39 | #samtools view 9233_8#168_1.cram.bam | head 40 | -------------------------------------------------------------------------------- /0_Determine_Barcodes.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 2) {die "Required Argument: Barcode counting output, expected number of cells\n";} 5 | 6 | my $ExpectNCells = $ARGV[1]; # This is just used as a guide does not have to be exact. 7 | my $trunc = 1; #Only output the identified cell IDs. 8 | 9 | open(my $ifh, $ARGV[0]) or die $!; # list of the form: barcode frequencey; in decending order of frequency. 
10 | my %Barcodes = (); 11 | 12 | my $count = 0; 13 | while(<$ifh>) { 14 | chomp; 15 | my @record = split(/\s+/); 16 | my $barcode = $record[0]; 17 | my $counts = $record[1]; 18 | my @seencodes = keys(%Barcodes); 19 | $Barcodes{$barcode} = $counts; 20 | foreach my $key (@seencodes) { 21 | my $count = ( $barcode ^ $key ) =~ tr/\0//; 22 | my $mismatches = length($barcode)-$count; 23 | if ($mismatches <= 1) { 24 | $Barcodes{$barcode} = $Barcodes{$key}+$Barcodes{$barcode}; 25 | delete($Barcodes{$key}); 26 | } 27 | } 28 | $count++; 29 | if ($count > 10000) {print STDERR scalar(@seencodes)."\n"; $count=0;} 30 | } 31 | 32 | my @codes = sort{$Barcodes{$b}<=>$Barcodes{$a}} keys(%Barcodes); # most frequent first: $codes[0] must be the maximum for the threshold below to make sense 33 | my $quantile = $ExpectNCells*0.75; 34 | my $quantile_freq = $Barcodes{$codes[$quantile]}; 35 | my $threshold = $quantile_freq - ($Barcodes{$codes[0]}-$quantile_freq); 36 | 37 | $count = 0; # reuse the counter declared above as the tally of accepted barcodes 38 | foreach my $code (@codes) { 39 | if ($Barcodes{$code} < $threshold) { 40 | print STDERR "$count cell barcodes found.\n"; 41 | if ($trunc) {last;} # when truncating, stop at the first barcode below the threshold 42 | } 43 | print "$code ".$Barcodes{$code}."\n"; 44 | $count++; 45 | } 46 | -------------------------------------------------------------------------------- /0_Download_Files_from_Dropbox.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | #system("wget "LINK" > index.txt"); 4 | my %files = (); 5 | open(my $ifh, "index.txt") or die $!; 6 | while (<$ifh>) { 7 | if ($_ =~ /\.gz/) { 8 | while($_ =~ s/href="(.*?\.gz)/Done/){ 9 | # print $1."\n"; 10 | $files{$1} = 1; 11 | } 12 | } 13 | } close($ifh); 14 | 15 | foreach my $file (keys(%files)) { 16 | system("wget", $file); # list form bypasses the shell, so metacharacters in a scraped URL are never interpreted 17 | } 18 | -------------------------------------------------------------------------------- /0_Extract_Metadata_from_Bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir 4 | 
SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools 5 | 6 | OUTDIR=$1 7 | INPUTDIR=$2 8 | 9 | FILEStoMAP=($INPUTDIR/*.bam) 10 | ARRAYINDEX=$(($LSB_JOBINDEX-1)) 11 | INPUTBAM=${FILEStoMAP[$ARRAYINDEX]} 12 | OUTFILE=$(basename "$INPUTBAM").meta # name the output after the BAM selected for this array index 13 | 14 | $SAMTOOLS view -H "$INPUTBAM" > "$OUTDIR/$OUTFILE" # 'view -H' prints only the header of the BAM 15 | -------------------------------------------------------------------------------- /0_FASTQC.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Initial QC 3 | FASTQFILE=$1 4 | FASTQC=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc 5 | LIMITFILE=/nfs/users/nfs_t/ta6/RNASeqPipeline/0_FASTQC_limits.txt 6 | # There is also the -o for an appropriate output directory 7 | 8 | if [ ! -f "$FASTQFILE" ] ; then 9 | echo "Sorry $FASTQFILE does not exist " 10 | exit 1 11 | fi 12 | 13 | if [ ! -f "$FASTQC" ] ; then 14 | echo "Sorry FASTQC not available " 15 | exit 1 16 | fi 17 | 18 | export _JAVA_OPTIONS="-Xmx100M -XX:MaxHeapSize=100m" 19 | $FASTQC -l $LIMITFILE --quiet $FASTQFILE 20 | 21 | 22 | #If you want to run fastqc on a stream of data to be read from standard input then you 23 | #can do this by specifying 'stdin' as the name of the file to be processed and then 24 | #streaming uncompressed fastq format data to the program. For example: 25 | 26 | #zcat *fastq.gz | fastqc stdin 27 | #zcat C*.gz | /nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc -o placeforoutputfiles/ stdin 28 | 29 | # ^^ This is probably the best approach to use in many of my cases since this allows on the fly combining of various files without storing duplicated data. 
30 | -------------------------------------------------------------------------------- /0_FASTQC_Streaming.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Initial QC 3 | FASTQFILEDIR=$1 4 | FASTQFILEPATTERN=$2 5 | OUTNAME=$3 #outputfilenames 6 | OUTPUTDIR=$4 #directory for outputfiles 7 | FASTQC=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc 8 | LIMITFILE=/nfs/users/nfs_t/ta6/RNASeqPipeline/0_FASTQC_limits.txt 9 | # There is also the -o for an appropriate output directory 10 | 11 | if [ -z "$FASTQFILEDIR" ] ; then 12 | echo "Please provide a directory of fastq files (Argument 1/4)" 13 | exit 1 14 | fi 15 | if [ -z "$FASTQFILEPATTERN" ] ; then 16 | echo "Please provide a pattern to select fastq files with (Argument 2/4) " 17 | exit 1 18 | fi 19 | 20 | if [ -z "$OUTNAME" ] ; then 21 | echo "Please provide name for output files (Argument 3/4) " 22 | exit 1 23 | fi 24 | 25 | if [ -z "$OUTPUTDIR" ] ; then 26 | echo "Please provide a directory to put output in (Argument 4/4) " 27 | exit 1 28 | fi 29 | 30 | if [ ! -f "$FASTQC" ] ; then 31 | echo "Sorry FASTQC not available " 32 | exit 1 33 | fi 34 | 35 | mkdir -p $OUTPUTDIR 36 | 37 | export _JAVA_OPTIONS="-Xmx10000M -XX:MaxHeapSize=10000m" 38 | #zcat $FASTQFILEDIR/$FASTQFILEPATTERN | $FASTQC -l $LIMITFILE --quiet $FASTQFILE -o $OUTPUTDIR stdin 39 | cat $FASTQFILEDIR/$FASTQFILEPATTERN | $FASTQC -l $LIMITFILE --quiet $FASTQFILE -o $OUTPUTDIR stdin 40 | mv $OUTPUTDIR/stdin_fastqc.html $OUTPUTDIR/FASTQC_$OUTNAME.html 41 | mv $OUTPUTDIR/stdin_fastqc.zip $OUTPUTDIR/FASTQC_$OUTNAME.zip 42 | -------------------------------------------------------------------------------- /0_FASTQC_limits.txt: -------------------------------------------------------------------------------- 1 | # For each of the modules you can choose to not run that 2 | # module at all by setting the value below to 1 for the 3 | # modules you want to remove. 
4 | duplication ignore 0 5 | kmer ignore 0 6 | n_content ignore 0 7 | overrepresented ignore 0 8 | quality_base ignore 0 9 | sequence ignore 0 10 | gc_sequence ignore 0 11 | quality_sequence ignore 0 12 | tile ignore 0 13 | sequence_length ignore 0 14 | adapter ignore 0 15 | 16 | # For the duplication module the value is the percentage 17 | # remaining after deduplication. Measured levels below 18 | # these limits trigger the warning / error. 19 | duplication warn 70 20 | duplication error 50 21 | 22 | # For the kmer module the filter is on the -log10 binomial 23 | # pvalue for the most significant Kmer, so 5 would be 24 | # 10^-5 = p<0.00001 25 | kmer warn -1 26 | kmer error -1 27 | 28 | # For the N module the filter is on the percentage of Ns 29 | # at any position in the library 30 | n_content warn -1 31 | n_content error -1 32 | 33 | # For the overrepresented seqs the warn value sets the 34 | # threshold for the overrepresented sequences to be reported 35 | # at all as the proportion of the library which must be seen 36 | # as a single sequence 37 | overrepresented warn -1 38 | overrepresented error -1 39 | 40 | # The per base quality filter uses two values, one for the value 41 | # of the lower quartile, and the other for the value of the 42 | # median quality. 
Failing either of these will trigger the alert 43 | quality_base_lower warn 10 44 | quality_base_lower error 5 45 | quality_base_median warn 25 46 | quality_base_median error 20 47 | 48 | # The per base sequence content module tests the maximum deviation 49 | # between A and T or C and G 50 | sequence warn -1 51 | sequence error -1 52 | 53 | # The per sequence GC content tests the maximum deviation between 54 | # the theoretical distribution and the real distribution 55 | gc_sequence warn 15 56 | gc_sequence error 30 57 | 58 | # The per sequence quality module tests the phred score which is 59 | # most frequently observed 60 | quality_sequence warn -1 61 | quality_sequence error -1 62 | 63 | # The per tile module tests the maximum phred score loss between 64 | # and individual tile and the average for that base across all tiles 65 | tile warn 5 66 | tile error 10 67 | 68 | # The sequence length module tests are binary, so the values here 69 | # simply turn them on or off. The actual tests warn if you have 70 | # sequences of different length, and error if you have sequences 71 | # of zero length. 
72 | 73 | sequence_length warn -1 74 | sequence_length error -1 75 | 76 | # The adapter module's warnings and errors are based on the 77 | # percentage of reads in the library which have been observed 78 | # to contain an adapter associated Kmer at any point 79 | 80 | adapter warn 5 81 | adapter error 10 82 | -------------------------------------------------------------------------------- /0_Flexible_Convert_Dir_CRAM_to_BAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Designed for LSF job arrays 3 | 4 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir 5 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools 6 | #LSB_JOBINDEX=$3 # specify which file to run on 7 | FILEINDEX=$LSB_JOBINDEX 8 | DIR=$1 9 | OUTDIR=$2 10 | FILE="$DIR/*_$FILEINDEX.cram" 11 | 12 | #LANE8="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane8/21698_8_$FILEINDEX.cram" 13 | 14 | OUTDIR=${OUTDIR:-/lustre/scratch117/cellgen/team218/TA/LiverOrganoids/BAMS} # fall back to the historical location only when no output directory was passed as $2 15 | 16 | FILE1=$(basename $FILE) 17 | 18 | cp $FILE $OUTDIR/$FILE1 19 | 20 | $SAMTOOLS view -b -h $OUTDIR/$FILE1 -o $OUTDIR/$FILE1.bam 21 | 22 | rm $OUTDIR/$FILE1 23 | -------------------------------------------------------------------------------- /0_GBK2FASTA.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (scalar(@ARGV) < 1 || $ARGV[0] !~ /gbk$/) {die "Did not provide GBK file."} 5 | 6 | my @sequence = (); 7 | my $name = ""; 8 | my $file = $ARGV[0]; 9 | 10 | open (my $ifh, $file) or die $!; 11 | my $seq_started = 0; 12 | while (<$ifh>) { 13 | if ($_ =~ /^LOCUS/) { 14 | my @record = split(/\s+/); 15 | $name = $record[1]; 16 | } elsif ($_ =~ /^ORIGIN/) { 17 | $seq_started = 1; 18 | next; 19 | } elsif ($_ =~ /^\/\//) { 20 | last; 21 | } 22 | if ($seq_started) { 23 | chomp; 24 | my $seq = $_; 25 | $seq =~ s/\s//g; #remove all whitespace 26 | $seq =~ s/\d//g; 
#remove all base numbers 27 | push(@sequence, $seq); 28 | } 29 | } close($ifh); 30 | 31 | $file =~ s/gbk$/fa/; 32 | open (my $ofh, ">", $file) or die $!; 33 | print $ofh ">$name\n"; 34 | foreach my $seq (@sequence) {print $ofh $seq."\n";} 35 | close($ofh); 36 | 37 | -------------------------------------------------------------------------------- /0_Gather_Summary_Statistics.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | # Input a set of fastQ sequencing files 5 | # gunzip, read, and re-gzip each one in turn 6 | # Count No. Reads, Length of Reads, 7 | # Get number of unique cells & number of reads for each one (I think this is the second thing in the header line). 8 | 9 | if (@ARGV < 1) { die "Gather_Summary_Statistics.pl list_of_gzipped_fastq_files\n";} 10 | 11 | my %cell2count = (); 12 | my %lane2count = (); 13 | #print join("\n", @ARGV)."\n"; 14 | #exit(); 15 | foreach my $file (@ARGV) { 16 | my $lane = ""; 17 | if ($file =~ /(lane\d+)/) { 18 | $lane = $1; 19 | } 20 | my $workingfile = "/lustre/scratch108/compgen/team218/TA/temporaryfile1.txt.gz"; 21 | system("cp $file $workingfile"); 22 | system("gunzip $workingfile"); 23 | $workingfile =~ s/\.gz$//; 24 | open (my $ifh, $workingfile) or die "Cannot open $workingfile : $!\n"; 25 | 26 | while (<$ifh>) { 27 | if ($_ =~ /^@/) { 28 | my @record = split(/\s+/); 29 | $cell2count{$record[1]} ++; 30 | $lane2count{$lane}++; 31 | } 32 | } close ($ifh); 33 | system("rm $workingfile"); 34 | # exit(); 35 | } 36 | foreach my $cell (sort(keys(%cell2count))) { 37 | print "$cell\t$cell2count{$cell}\n"; 38 | } 39 | foreach my $lane (sort(keys(%lane2count))) { 40 | print "$lane\t$lane2count{$lane}\n"; 41 | } 42 | -------------------------------------------------------------------------------- /0_Get_Data_from_iRODS.sh: -------------------------------------------------------------------------------- 1 | #auto login for running as a job on the farm: ta6 = 
your username 2 | # create irods.keytab file 3 | #ktutil 4 | #ktutil: addent -password -p ta6 -k 1 -e aes256-cts 5 | #Password for ta6@INTERNAL.SANGER.AC.UK: 6 | #ktutil: wkt irods.keytab 7 | #ktutil: quit 8 | kinit ta6 -k -t ~/irods.keytab 9 | 10 | #nicked from /nfs/team205/tpcg/bin/scripts/dump_irods.sh on the farm 11 | #and slightly modified - sed -i 's/\/software\/irods\/icommands\/bin\///g' dump_irods.sh 12 | 13 | # make directory for each run_lane 14 | run_lane="$1" 15 | 16 | # extract run and lane 17 | run=`echo $run_lane | sed -e 's/_.*//'` 18 | lane=`echo $run_lane | sed -e 's/.*_//'` 19 | 20 | # get the cram files 21 | imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \ 22 | | grep : | awk '{ print $2 }' | paste - - -d/ \ 23 | | xargs -ixxx iget -K xxx ./ 24 | 25 | chmod 664 * 26 | 27 | # remove phiX control 28 | find ./ | grep -E '#888\.' | xargs rm 29 | 30 | # get and format the meta info. 31 | for cram in $(find ./ | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//'); do 32 | imeta ls -d /seq/$run/$cram.cram > $cram.imeta 33 | sn=$(grep -A 1 sample_supplier_name $cram.imeta | tail -1 | sed 's/ /_/g') 34 | sample_name=${sn:7} 35 | echo -e "$run_lane/$cram\t$sample_name" >> "${run}_${lane}_sampleInfo.txt" # braces are required: bash would otherwise expand the unset names $run_ and $lane_sampleInfo 36 | done 37 | -------------------------------------------------------------------------------- /0_Make_ERCC_fasta_and_gtf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl 2 | use strict; 3 | use warnings; 4 | # Converts the Annotation file from https://www.thermofisher.com/order/catalog/product/4456740 to gtf and fasta files that can be added to existing genome fasta & gtf files. 
5 | if (@ARGV != 1) {die "Usage: 0_Make_ERCC_fasta_and_gtf.pl ";} 6 | 7 | my $file = $ARGV[0]; #ERCC_Controls_Annotation.txt 8 | 9 | my @FASTAlines = (); 10 | my @GTFlines = (); 11 | open (my $ifh, $file) or die $!; 12 | <$ifh>; #header 13 | while (<$ifh>) { 14 | # Do all the important stuff 15 | chomp; 16 | my @record = split(/\t/); 17 | my $sequence = $record[4]; 18 | $sequence =~ s/\s+//g; # strip all whitespace from the sequence (leading, trailing, and embedded) 19 | $sequence = $sequence."NNNN"; 20 | my $name = $record[0]; 21 | my $genbank = $record[1]; 22 | push(@FASTAlines, ">$name\n$sequence\n"); 23 | # is GTF 1 indexed or 0 indexed? -> it is 1 indexed 24 | # + or - strand? 25 | push(@GTFlines, "$name\tERCC\tgene\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n"); 26 | push(@GTFlines, "$name\tERCC\ttranscript\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n"); 27 | push(@GTFlines, "$name\tERCC\texon\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n"); 28 | } close($ifh); 29 | 30 | # Write output 31 | open(my $ofh, ">", "ERCC_Controls.fa") or die $!; 32 | foreach my $line (@FASTAlines) { 33 | print $ofh $line; 34 | } close ($ofh); 35 | 36 | open($ofh, ">", "ERCC_Controls.gtf") or die $!; 37 | foreach my $line (@GTFlines) { 38 | print $ofh $line; 39 | } close ($ofh); 40 | -------------------------------------------------------------------------------- /0_Merge_FASTQs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Merge read files for paired-end sequencing across two lanes, where files are ordered by lane, then cell, then read 3 | FASTQDIR=$1 4 | OUTDIR=$2 5 | NCELLS=$3 6 | 7 | #LSB_JOBINDEX=1 #Testing 8 | 9 | # 
Maths 10 | NFILES=$(($NCELLS*2)) 11 | INDEX1=$(($LSB_JOBINDEX-1)) 12 | INDEX2=$(($INDEX1+$NFILES)) 13 | 14 | FILES=($FASTQDIR/*.gz) 15 | FILE1=${FILES[$INDEX1]} 16 | FILE2=${FILES[$INDEX2]} 17 | 18 | echo $FILE1 19 | echo $FILE2 20 | TAIL='_1.fq' 21 | CELLID=$LSB_JOBINDEX; 22 | if !((CELLID % 2)); then 23 | CELLID=$(($CELLID/2)) 24 | else 25 | CELLID=$(( ($CELLID+1)/2 )) 26 | fi 27 | 28 | if [[ $FILE1 =~ _1.f ]] ; then 29 | OUTFILE=Cell$CELLID$TAIL 30 | zcat $FILE1 $FILE2 > $OUTDIR/$OUTFILE 31 | else 32 | TAIL='_2.fq' 33 | OUTFILE=Cell$CELLID$TAIL 34 | zcat $FILE1 $FILE2 > $OUTDIR/$OUTFILE 35 | fi 36 | echo $OUTFILE 37 | gzip $OUTDIR/$OUTFILE 38 | -------------------------------------------------------------------------------- /0_My_Extract_Transcriptome.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 3) {die "0_My_Extract_Transcriptome.pl .gtf .fa Nascent?[0|1]\n";} 5 | # To Do: 6 | # get all exons per transcript 7 | # sort by start 8 | # if overlap (start2 < end1) then merge 9 | # else switch to new exon 10 | sub merge_transcripts { # Tested 11 | my @exons = sort{$a->{"st"} <=> $b->{"st"}} @_; 12 | my %curr = %{shift(@exons)}; 13 | my @finalexons = (); 14 | foreach my $exon2 (@exons) { 15 | if ($curr{"end"} > $exon2->{"st"}) { 16 | # overlap == merge 17 | if ($exon2->{"end"} > $curr{"end"}) { 18 | $curr{"end"} = $exon2->{"end"}; 19 | } 20 | } else { 21 | my $tmp1 = $curr{"st"}; 22 | my $tmp2 = $curr{"end"}; 23 | # print "save $tmp1 $tmp2\n"; 24 | push(@finalexons, {"st"=>$tmp1,"end"=>$tmp2}); 25 | %curr = %{$exon2}; 26 | } 27 | } 28 | my $tmp1 = $curr{"st"}; 29 | my $tmp2 = $curr{"end"}; 30 | # print "save $tmp1 $tmp2\n"; 31 | push(@finalexons, {"st"=>$tmp1,"end"=>$tmp2}); 32 | return(@finalexons); 33 | } 34 | 35 | 36 | my %Ensg2Seq = (); 37 | my %Ensg2Tail = (); 38 | my %Ensg2Gtf = (); 39 | my @Ensgs = (); 40 | my $flank = 10; 41 | 42 | my $nascent = $ARGV[2]; 43 | 44 | 
open (my $fa, $ARGV[1]) or die $!; 45 | open (my $fa_out, ">","Transcripts.fa") or die $!; 46 | open (my $gtf_out, ">","Transcripts.gtf") or die $!; 47 | my $chr = "None"; 48 | my $chr_seq = ""; 49 | my $COUNT = 0; 50 | while (<$fa>) { 51 | if($_ =~ /^#/) {next;} # skip headers 52 | if ($_ =~ /^\>/) { 53 | # New Chr 54 | my @line = split(/\s+/); 55 | my $newchr = $line[0]; $newchr =~ s/>//g; 56 | if ($chr eq "None") { 57 | $chr = $newchr; 58 | next; 59 | } 60 | # Output gene sequences for this chromosome 61 | open (my $gtf, $ARGV[0]) or die $!; 62 | my $gtf_line = ""; 63 | my %exons = (); 64 | while ($gtf_line = <$gtf>) { 65 | if ($gtf_line =~ /^#/) {next;} # ignore headers 66 | my $geneid = ""; 67 | if ($gtf_line =~ /gene_id "(.+?)";/) { 68 | $geneid = $1; 69 | } else { 70 | next; 71 | } # get gene id 72 | my @record = split(/\t/, $gtf_line); 73 | my $seq_chr = $record[0]; 74 | if ($seq_chr ne $chr) {next;} 75 | my $seq_st = $record[3]-$flank; 76 | my $seq_end = $record[4]+$flank; 77 | my %item; $item{"st"} = $seq_st; $item{"end"} = $seq_end; 78 | if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} 79 | if (!$nascent) { 80 | if ($record[2] eq "exon" || $record[2] eq "UTR") { 81 | push(@{$exons{$geneid}}, \%item); 82 | # if (exists($Ensg2Seq{$geneid})) { 83 | # $Ensg2Seq{$geneid}.= substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank)); 84 | # } else { 85 | # $Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank)); 86 | # } 87 | } 88 | } else { 89 | if ($record[2] eq "gene") { 90 | push(@{$exons{$geneid}}, \%item); 91 | # $Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank)); 92 | } 93 | } 94 | if ($record[2] eq "gene") { 95 | push(@Ensgs, $geneid); 96 | $Ensg2Gtf{$geneid} = $gtf_line; 97 | $COUNT++; 98 | } 99 | } 100 | close($gtf); 101 | foreach my $ensg (@Ensgs) { 102 | # Get sequence 103 | my @parts = @{$exons{$ensg}}; 104 | my @merged = merge_transcripts(@parts); 105 | my 
$gene_seq = ""; 106 | foreach my $item (@merged) { 107 | my $seq_st = $item->{"st"}; 108 | my $seq_end = $item->{"end"}; 109 | #print "$seq_st $seq_end aquired\n"; 110 | $gene_seq .= substr($chr_seq, $seq_st, ($seq_end-$seq_st)); 111 | } 112 | 113 | print $fa_out ">$ensg\n"; 114 | print $fa_out $gene_seq."\n"; 115 | my $seq_length = length($gene_seq); 116 | # print $fa_out $Ensg2Seq{$ensg}."\n"; 117 | # my $seq_length = length($Ensg2Seq{$ensg}); 118 | my $old_gtf = $Ensg2Gtf{$ensg}; 119 | $old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/; 120 | my @record = split(/\t/, $old_gtf); 121 | $record[0] = $ensg; 122 | $record[3] = 1; 123 | $record[4] = $seq_length-1; 124 | print $gtf_out join("\t",@record); 125 | my $lastele = scalar(@record)-1; 126 | $record[$lastele] = "gene_id \"$ensg\"; transcript_id \"$ensg\"; exon_number \"1\"; gene_name \"$ensg\"\n"; 127 | $record[2] = "exon"; 128 | print $gtf_out join("\t",@record); 129 | } 130 | print "$chr $newchr\n"; 131 | %exons = (); 132 | $chr = $newchr; 133 | $chr_seq=""; 134 | $COUNT=0; 135 | @Ensgs=(); 136 | } else { 137 | chomp; 138 | $chr_seq = $chr_seq.$_; 139 | } 140 | } 141 | # Output last chromosome 142 | # Output gene sequences 143 | open (my $gtf, $ARGV[0]) or die $!; 144 | my $gtf_line = ""; 145 | while ($gtf_line = <$gtf>) { 146 | if ($gtf_line =~ /^#/) {next;} # ignore headers 147 | 148 | my $geneid = ""; 149 | if ($gtf_line =~ /gene_id "(.+?)";/) { 150 | $geneid = $1; 151 | } else { 152 | next; 153 | } # get gene id 154 | 155 | my @record = split(/\t/, $gtf_line); 156 | my $seq_chr = $record[0]; 157 | if ($seq_chr ne $chr) {next;} 158 | my $seq_st = $record[3]; 159 | my $seq_end = $record[4]; 160 | if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} 161 | if (!$nascent) { 162 | if ($record[2] eq "exon" || $record[2] eq "UTR") { 163 | if (exists($Ensg2Seq{$geneid})) { 164 | $Ensg2Seq{$geneid}.= substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10)); 165 | } else { 166 | 
$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10)); 167 | } 168 | } 169 | } else { 170 | if ($record[2] eq "gene") { 171 | $Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10)); 172 | } 173 | } 174 | if ($record[2] eq "gene") { 175 | push(@Ensgs, $geneid); 176 | $Ensg2Gtf{$geneid} = $gtf_line; 177 | $COUNT++; 178 | if ($record[6] eq "+") { 179 | $Ensg2Tail{$geneid}->{"+"} = substr($chr_seq,$seq_end,$flank); 180 | } else { 181 | $Ensg2Tail{$geneid}->{"-"} = substr($chr_seq,$seq_st-$flank,$flank); 182 | } 183 | } 184 | } 185 | close($gtf); 186 | foreach my $ensg (@Ensgs) { 187 | print $fa_out ">$ensg\n"; 188 | print $fa_out $Ensg2Seq{$ensg}."\n"; 189 | my $seq_length = length($Ensg2Seq{$ensg}); 190 | my $old_gtf = $Ensg2Gtf{$ensg}; 191 | $old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/; 192 | my @record = split(/\t/, $old_gtf); 193 | $record[0] = $ensg; 194 | $record[3] = 1; 195 | $record[4] = $seq_length-1; 196 | print $gtf_out join("\t",@record); 197 | 198 | my $lastele = scalar(@record)-1; 199 | $record[$lastele] = "gene_id \"$ensg\"; transcript_id \"$ensg\"; exon_number \"1\"; gene_name \"$ensg\"\n"; 200 | $record[2] = "exon"; 201 | print $gtf_out join("\t",@record); 202 | } 203 | 204 | close($fa_out); 205 | close($gtf_out); 206 | close($fa); 207 | -------------------------------------------------------------------------------- /0_Process_GBK.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (scalar(@ARGV) < 1 || $ARGV[0] !~ /gbk$/) {die "Did not provide GBK file."} 5 | 6 | my $sequence = ""; 7 | my $chrname = ""; 8 | my $file = $ARGV[0]; 9 | 10 | open (my $ifh, $file) or die $!; 11 | my $seq_started = 0; 12 | my $st = 0; 13 | my $end = 0; 14 | my $name = ""; 15 | my %Items = (); 16 | my $geneid = 0; 17 | my %Gene_info=(); 18 | while (<$ifh>) { 19 | if ($_ =~/Promoter/) { 20 | $geneid++; 21 | } 22 | if ($_ =~ 
/feature\s+(\d+)\.\.(\d+)/){ 23 | $st = $1; 24 | $end = $2; 25 | } 26 | if ($_ =~ /\/label=(.+)\s+$/) { 27 | $name=$1; 28 | $name =~ s/\s//g; 29 | $Items{$geneid}->{"$st\t$end"} = $name; 30 | if (!exists($Gene_info{$geneid}->{"st"}) || $st < $Gene_info{$geneid}->{"st"}) { 31 | $Gene_info{$geneid}->{"st"} = $st; 32 | } 33 | if (!exists($Gene_info{$geneid}->{"end"}) || $end > $Gene_info{$geneid}->{"end"}) { 34 | $Gene_info{$geneid}->{"end"} = $end; 35 | } 36 | } 37 | 38 | if ($_ =~ /^LOCUS/) { 39 | my @record = split(/\s+/); 40 | $chrname = $record[1]; 41 | } elsif ($_ =~ /^ORIGIN/) { 42 | $seq_started = 1; 43 | next; 44 | } elsif ($_ =~ /^\/\//) { 45 | last; 46 | } 47 | if ($seq_started) { 48 | chomp; 49 | my $seq = $_; 50 | $seq =~ s/\s//g; #remove all whitespace 51 | $seq =~ s/\d//g; #remove all base numbers 52 | $sequence .= $seq; 53 | } 54 | } close($ifh); 55 | 56 | $file =~ s/gbk$/gtf/; 57 | open (my $ofh, ">", $file) or die $!; 58 | foreach my $gene (sort(keys(%Items))) { 59 | print $ofh "$chrname\tGBK\tgene\t".$Gene_info{$gene}->{"st"}."\t".$Gene_info{$gene}->{"end"}."\t.\t+\t.\tgene_id \"Gene$gene\"; transcript_id \"Transcript$gene\"; gene_name \"Gene$gene\"; gene_source \"GBK\";\n"; 60 | my $exon_num=0; 61 | foreach my $exon (sort(keys(%{$Items{$gene}}))) { 62 | $exon_num++; 63 | print $ofh "$chrname\tGBK\texon\t$exon\t.\t+\t.\tgene_id \"Gene$gene\"; transcript_id \"Transcript$gene\"; exon_number \"$exon_num\"; gene_name \"Gene$gene\"; transcript_name \"Transcript$gene\"; gene_source \"GBK\"; exon_name \"".$Items{$gene}->{$exon}."\";\n"; 64 | } 65 | } 66 | close($ofh); 67 | 68 | $file =~ s/gtf$/fa/; 69 | open ($ofh, ">", $file) or die $!; 70 | print $ofh ">$chrname\n$sequence"; 71 | close($ofh); 72 | 73 |
--------------------------------------------------------------------------------
/0_custom_undo_demultiplexing.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Rebuilds one multiplexed FastQ pair from per-well read files.
#   * The metadata table maps a well name (R<field 6>C<field 7>) to its barcode (field 0).
#   * For every "sampleR..C.._" file pair the barcode is prepended to the read-1 sequence and a
#     matching run of 'E' characters is prepended to the read-1 quality string.
#   * The "unassigned" file pair, if present, is appended unchanged at the end.
# Output: lane1_Waf375_1.fq and lane1_Waf375_2.fq in the current directory.

my $metadata = "/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/wta98_embl2.metadata";
open (my $ifh, '<', $metadata) or die $!;
my %Name2Barcode = ();
while (<$ifh>) {
    if ($_ =~/^#/) {next;}
    my @record = split(/\s+/);
    my $name = "R".$record[6]."C".$record[7];
    $Name2Barcode{$name} = $record[0];
}
close($ifh);


my @files1 = glob("/lustre/scratch108/compgen/team218/TA/Bergiers_Dropbox/*_1.txt");
my @files2 = glob("/lustre/scratch108/compgen/team218/TA/Bergiers_Dropbox/*_2.txt");

if (scalar(@files1) != scalar(@files2)) {die "Must have equal number of read1 & read2 files\n";}

my $unassigned1 = "";
my $unassigned2 = "";
my $out1 = "lane1_Waf375_1.fq";
my $out2 = "lane1_Waf375_2.fq";
open(my $ofh1, ">", $out1) or die $!;
open(my $ofh2, ">", $out2) or die $!;

for my $i (0 .. $#files1) {
    # Decide what to do with this pair BEFORE opening it, so that the
    # "unassigned" pair (handled at the very end) never leaves handles open.
    my $barcode = "";
    if ($files1[$i] =~ /sample(R\d+C\d+)_/) {
        my $name=$1;
        if (exists($Name2Barcode{$name})) {
            $barcode = $Name2Barcode{$name};
        } else {
            die "$name has no barcode\n";
        }
    } elsif ($files1[$i] =~/unassigned/i) {
        $unassigned1 = $files1[$i];
        $unassigned2 = $files2[$i];
        next;
    } else {
        die "$files1[$i] does not match\n";
    }

    open(my $ifh1, '<', $files1[$i]) or die $!;
    open(my $ifh2, '<', $files2[$i]) or die $!;

    while (my $file1line = <$ifh1>) {
        my $file2line = <$ifh2>;
        if (!defined($file2line)) {die "$files2[$i] is shorter than $files1[$i]\n";}
        if ($file1line =~ /^@/) {
            my @thing1 = split(/\s+/,$file1line);
            my @thing2 = split(/\s+/,$file2line);
            my $readname = $thing1[0];
            if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}
            # The whole 4-line record is consumed here, so the next line the
            # outer loop sees is always the next header.
            my $barcodeseq = <$ifh1>;
            my $read = <$ifh2>;
            my $plus1 = <$ifh1>; my $plus2 = <$ifh2>; #+'s
            my $file1qual = <$ifh1>;
            my $file2qual = <$ifh2>;
            if (!defined($barcodeseq) || !defined($read) || !defined($file1qual) || !defined($file2qual)) {
                die "Truncated FastQ record for $readname in $files1[$i] / $files2[$i]\n";
            }
            $barcodeseq = $barcode.$barcodeseq;
            # One dummy quality character per barcode base keeps sequence and quality the same length.
            $file1qual = ('E' x length($barcode)) . $file1qual;
            print $ofh1 "$readname\n$barcodeseq+\n$file1qual";
            print $ofh2 "$readname\n$read+\n$file2qual";
        }
    }
    close($ifh1);
    close($ifh2);
}

# Append the unassigned reads verbatim. Done in Perl rather than via `cat`:
# with no unassigned file the old shell command became `cat >> out` and blocked on stdin.
if ($unassigned1 ne "") {
    append_file($unassigned1, $ofh1);
    append_file($unassigned2, $ofh2);
}
close($ofh1) or die $!;
close($ofh2) or die $!;
print "Successfully Completed\n";

# Copies every line of the file at $path onto the already-open output handle $out.
sub append_file {
    my ($path, $out) = @_;
    open(my $in, '<', $path) or die "Cannot open $path: $!\n";
    while (my $line = <$in>) {
        print $out $line;
    }
    close($in);
    return;
}

--------------------------------------------------------------------------------
/0_make_transcriptome.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Submits a STAR genomeGenerate job (37GB memory request) that indexes the nascent-transcript FASTA/GTF.
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR

FA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Nascent_Transcripts.fa
GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Nascent_Transcripts.gtf

bsub -R"select[mem>37000] rusage[mem=37000]" -M37000 -o buildtranscriptome.out -e buildtranscriptome.err $STAR --runMode genomeGenerate --genomeDir /lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora --genomeFastaFiles $FA --sjdbGTFfile $GTF --sjdbOverhang 20 --limitGenomeGenerateRAM 36000000000

--------------------------------------------------------------------------------
/1.5_DO_Trim_Reads.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
# Submits one array-job element per file found in INPUTDIR.
# NOTE(review): the job runs 1.5_Trim_Reads.sh, which is not visible here; the sibling
# 1.5_Trim_Reads_Paired.sh only picks up *.fq files - confirm the job count matches.
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesQCed
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesToMap
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES))
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/1.5_Trim_Reads.sh $INPUTDIR $OUTPUTDIR
--------------------------------------------------------------------------------
/1.5_Trim_Reads_Paired.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Initial QC: trims the LSB_JOBINDEX-th *.fq file of INPUTDIR with Trimmomatic.
# NOTE(review): despite the file name, Trimmomatic is run in single-end (SE) mode.
INPUTDIR=$1 #directory of inputfiles
OUTPUTDIR=$2 #directory for outputfiles
TRIMMER=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Trimmomatic-0.36/trimmomatic-0.36.jar
# There is also the -o for an appropriate output directory

if [ -z "$INPUTDIR" ] ; then
  echo "Please provide an input directory of fastq files (Argument 1/2)"
  exit 1
fi
if [ -z "$OUTPUTDIR" ] ; then
  echo "Please provide a directory for outputfiles (Argument 2/2)"
  exit 1
fi

if [ ! -f "$TRIMMER" ] ; then
  echo "Sorry $TRIMMER not available "
  exit 1
fi

# Without a job index the arithmetic below yields -1, which silently selects the LAST file.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: this script must be run as part of an LSF job array"
  exit 1
fi

mkdir -p "$OUTPUTDIR"
FILES=("$INPUTDIR"/*.fq)
ARRAYINDEX=$((($LSB_JOBINDEX-1)))
INPUTFILE=${FILES[$ARRAYINDEX]}
if [ ! -f "$INPUTFILE" ] ; then
  echo "No input fastq file for job index $LSB_JOBINDEX in $INPUTDIR"
  exit 1
fi
FILEnopath=`basename "${INPUTFILE%.fq}"`
OUTPUTFILE="$OUTPUTDIR/TRIMMED-$FILEnopath.fq"

export _JAVA_OPTIONS="-Xmx1000M -XX:MaxHeapSize=1000m"
#java -jar $TRIMMER SE -phred33 $INPUTFILE $OUTPUTFILE ILLUMINACLIP:/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Trimmomatic-0.36/adapters/NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:20
java -jar "$TRIMMER" SE -phred33 "$INPUTFILE" "$OUTPUTFILE" ILLUMINACLIP:/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Trimmomatic-0.36/adapters/NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50
--------------------------------------------------------------------------------
/1.5_Trim_UMI.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Shortens the UMI stored as the last ':'-separated field of every read name.
#   ARGV[0] = number of bases to drop from the 5' end of the UMI
#   ARGV[1] = number of bases to drop from the 3' end of the UMI
#   ARGV[2] = directory of input *.fq files
#   ARGV[3] = directory for the rewritten files (same file names)
# Assumes standard 4-line FastQ records (the layout written by the demultiplexing scripts).

# All four arguments are used below, so all four are required.
if (@ARGV < 4) {die "1.5_Trim_UMI.pl 5'Length 3'Length inputdir outputdir\n";}

my ($trim5, $trim3, $inputdir, $outputdir) = @ARGV;

my @files = glob("$inputdir/*.fq");
foreach my $file (@files) {
    $file =~ /([^\/]+\.fq)/;
    my $filename = $1;
    # Per-file temporary name next to the destination: concurrent runs no longer
    # share one "tmp.txt", and the final rename stays on one filesystem.
    my $tmpfile = "$outputdir/$filename.tmp";
    open(my $ifh, '<', $file) or die $!;
    open(my $ofh, ">",$tmpfile) or die $!;

    while (my $line = <$ifh>) {
        # Only the header (1st line of each 4-line record) carries the UMI.
        # ':' is also a valid quality character, so testing for ':' alone
        # would corrupt quality strings.
        if ($. % 4 == 1 && $line =~ /:/) {
            chomp($line);
            my @stuff = split(/\:/, $line);
            my $UMI = $stuff[-1];
            # A 3' length of 0 must keep the tail: substr(..., -0) would return an empty string.
            my $trimmed = ($trim3 == 0) ? substr($UMI, $trim5) : substr($UMI, $trim5, -$trim3);
            $stuff[-1] = $trimmed;
            print $ofh (join(":",@stuff)."\n");
        } else {
            print $ofh ($line);
        }
    }
    close ($ifh);
    close ($ofh) or die $!;
    rename($tmpfile, "$outputdir/$filename") or die "Cannot move $tmpfile to $outputdir/$filename: $!\n";
}
--------------------------------------------------------------------------------
/1.6_Summarizing_Trimming.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Summarises trimming logs: for every file in the given directory, prints
# "<cell barcode>\t<surviving read count>", sorted by barcode.
# The barcode is taken from the last "<ACGT...>.fq" seen before the first "Surviving: <n> " line.

if (@ARGV < 1) {die "1.6_Summarizing_Trimming.pl directory of outputfiles\n";}

my @files = glob("$ARGV[0]/*");
my %Cell2ReadCount = ();
foreach my $file (@files) {
    open (my $ifh, '<', $file) or die $!;
    my $cell = "";
    my $surviving = 0;
    while(<$ifh>) {
        if ($_ =~ /([ATCG]+)\.fq/) {
            $cell = $1;
        }
        if ($_ =~ /Surviving: (\d+) /) {
            $surviving=$1;
            last;
        }
    }
    close ($ifh);
    # Files that never name a barcode used to be merged under an empty key.
    if ($cell eq "") {
        warn "No cell barcode found in $file - skipped\n";
        next;
    }
    $Cell2ReadCount{$cell} = $surviving;
}

foreach my $code (sort(keys(%Cell2ReadCount))) {
    print "$code\t$Cell2ReadCount{$code}\n";
}


--------------------------------------------------------------------------------
/1_BreakDown_Files_wrapper.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Job-array wrapper: runs 1_BreakDown_PairedEnds.pl on the LSB_JOBINDEX-th file matching ARG2/ARG3.

if [ -z "$1" ] ; then
  echo "Please set maximum number of reads per file (ARG 1/4)"
  exit 1
fi
if [ -z "$2" ] ; then
  echo "Please set input file directory (ARG 2/4)"
  exit 1
fi
if [ -z "$3" ] ; then
  echo "Please set a pattern for inputfiles (ARG 3/4)"
  exit 1
fi
if [ -z "$4" ] ; then
  echo "Please set a directory for output files (ARG 4/4)"
  exit 1
fi

OUTPUTDIR=$4
# $3 is deliberately left unquoted so that the pattern is expanded by the shell.
INPUTFILES=("$2"/$3)
ARRAYINDEX=$((($LSB_JOBINDEX-1)))

perl /nfs/users/nfs_t/ta6/RNASeqPipeline/1_BreakDown_PairedEnds.pl "$LSB_JOBINDEX" "$1" "$OUTPUTDIR" "${INPUTFILES[$ARRAYINDEX]}"
--------------------------------------------------------------------------------
/1_BreakDown_PairedEnds.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Input one or more sequencing FastQ files (plain text, or gzip-compressed when the name ends in .gz).
# Each file is split by cell ID (2nd whitespace-separated field of a 3-field read header,
# otherwise the placeholder "AAAAAAAAAAA") into chunks of at most MAXREADS reads.
# Every chunk is written to OUTPUTDIR/<JOBID>_<cell>.<n>.fq and then gzipped.
# Read order within a cell is kept.
# NOTE(review): chunk names do not include the input file, so passing several input files
# that share a cell ID to one invocation would reuse the same output names.

if (@ARGV < 3) { die "Breakdown_Paired_Ends.pl JOBID MAXREADS OUTPUTDIR FastQfile1 FastQfile2\n";}

my $JOBID = shift(@ARGV); #Job identifier; used as the prefix of every output file name.
my $MAX_READS_PER_FILE = shift(@ARGV); #Maximum number of reads per file [job].
my $OUTPUT_DIR = shift(@ARGV); #directory for output
system("mkdir", "-p", $OUTPUT_DIR);

foreach my $file (@ARGV) {
    my %cell2lines = ();
    my $workingfile = $file;

    # Compressed input is streamed through gzip instead of being read as raw bytes.
    my $ifh;
    if ($workingfile =~ /\.gz$/) {
        open ($ifh, '-|', 'gzip', '-dc', $workingfile) or die "Cannot open $workingfile : $!\n";
    } else {
        open ($ifh, '<', $workingfile) or die "Cannot open $workingfile : $!\n";
    }

    while (my $header = <$ifh>) {
        if ($header =~ /^@/) {
            my @record = split(/\s+/, $header);
            my $cell = "AAAAAAAAAAA";
            if (scalar(@record) == 3) {
                $cell = $record[1];
            }
            push(@{$cell2lines{$cell}}, $header);
            # Read exactly the three remaining lines of this record. push() gives its
            # arguments list context, where <$ifh> would return ALL remaining lines of the
            # file at once, so each line is read into a scalar first.
            for (1 .. 3) {
                my $line = <$ifh>;
                last if (!defined($line));
                push(@{$cell2lines{$cell}}, $line);
            }
        }
    }
    close ($ifh);

    foreach my $cell (sort(keys(%cell2lines))) {
        my $fileid = 0;
        my $Nlines = 0;
        my $currentfile;
        my $ofh;
        foreach my $line (@{$cell2lines{$cell}}) {
            # Open the next chunk only when there is something to write into it; this avoids
            # an empty trailing chunk when the read count is an exact multiple of the limit.
            if (!defined($ofh)) {
                $fileid++;
                $currentfile = "$OUTPUT_DIR/$JOBID\_$cell.$fileid.fq";
                open ($ofh, ">", $currentfile) or die $!;
                $Nlines = 0;
            }

            print $ofh $line;
            $Nlines++;

            if ($Nlines == $MAX_READS_PER_FILE*4) {
                close ($ofh) or die $!; #close current file
                system("gzip", $currentfile); #compress it
                undef($ofh);
            }
        }
        if (defined($ofh)) { # close and compress the last, partially filled file.
            close($ofh) or die $!;
            system("gzip", $currentfile);
        }
    }
}
--------------------------------------------------------------------------------
/1_BreakDown_PairedEnds_Custom_Wafergen.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Demultiplexes a barcode read file (INPUT1: 11bp cell barcode followed by a 10bp UMI) and its
# mate (INPUT2) into one FastQ per expected cell barcode: OUTPUTDIR/<ProjectName>_<barcode>.fq.
# Observed barcodes may differ from an expected barcode by up to 2 bases; ambiguous (non-ACGT)
# bases are treated as wildcards and take priority over mismatching candidates.
# The UMI is appended to the read name as "<name>:<UMI>". Summary counts go to STDERR.

if (@ARGV < 6) { die "Breakdown_Paired_Ends.pl OUTPUTDIR INPUT1 INPUT2 BarcodeIndexfile BarcodeColumn(0=first column) ProjectName\n";}
my $OUTPUT_DIR = $ARGV[0]; #directory for output
system("mkdir", "-p", $OUTPUT_DIR);
my $infile1 = $ARGV[1];
my $infile2 = $ARGV[2];

# Get acceptable cell barcodes
my %CellBarcodes = ();
open (my $ifh, '<', $ARGV[3]) or die "Cannot open $ARGV[3]\n";
<$ifh>; # header
my $column = $ARGV[4];
my $index=1;
my %ofhs = ();
while (<$ifh>) {
    if ($_ =~/^#/) {next;}
    my @record = split(/\s+/);
    my $barcode = $record[$column];
    $CellBarcodes{$barcode} = $index;
    open(my $fh,'>',"$OUTPUT_DIR/$ARGV[5]_$barcode.fq") or die $!;
    $ofhs{$index} = $fh;
    $index++;
}
close($ifh);

my $NotProperTail = 0;
my $NotPossibleCell = 0;
my $AmbiguousCell = 0;
my $ExactMatch = 0;
my $Mismatch1 = 0;
my $Mismatch2 = 0;
my $total_reads = 0;
open (my $ifh1, '<', $infile1) or die $!;
open (my $ifh2, '<', $infile2) or die $!;
while (my $file1line = <$ifh1>) {
    my $file2line = <$ifh2>;
    if ($file1line !~ /^@/) {next;}
    if (!defined($file2line)) {die "$infile2 is shorter than $infile1\n";}

    my @thing1 = split(/\s+/,$file1line);
    my @thing2 = split(/\s+/,$file2line);
    my $readname = $thing1[0];
    if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}

    # Consume the complete 4-line record from both files up front. If a rejected read left
    # its '+' and quality lines unread, a quality string starting with '@' would later be
    # mistaken for a read header.
    my $barcodes = <$ifh1>;
    my $read = <$ifh2>;
    my $plus1 = <$ifh1>; my $plus2 = <$ifh2>; #+'s
    my $file1qual = <$ifh1>;
    my $file2qual = <$ifh2>;
    if (!defined($barcodes) || !defined($read) || !defined($file1qual) || !defined($file2qual)) {
        die "Truncated FastQ record for $readname\n";
    }
    $total_reads++;

    my $mismatches = 0;
    if ($barcodes !~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/) {
        $NotProperTail++;
        next;
    }
    my $CellID = $1;
    my $UMI = $2;
    if (!exists($CellBarcodes{$CellID})) { # Not an expected barcode
        if ($CellID !~ /^[ATCG]+$/) {
            $mismatches = () = $CellID =~ /[^ATCG]/g; # count uncertain bases as mismatches
            $CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
        }
        my @matches = ();
        my %close = ();
        foreach my $barcode (keys(%CellBarcodes)) {
            if ($barcode =~/$CellID/) { # Match but with uncertainty
                push(@matches, $barcode);
            } elsif (scalar(@matches) == 0) { # Count mismatches
                # XOR of equal characters is \0, so this counts the matching positions.
                my $count = ( $barcode ^ $CellID ) =~ tr/\0//;
                if ($count >= length($barcode)-2) { # Allow upto 2 mismatches
                    $close{$barcode} = $count;
                }
            }
        }
        if (scalar(@matches) == 0 && scalar(keys(%close)) > 0) { # Has 1 or 2 mismatches
            my $max = my_max(values(%close)); # Closest match
            $mismatches = length($CellID)-$max;
            foreach my $code (keys(%close)) {
                if ($close{$code} == $max) {
                    push(@matches,$code);
                }
            }
        }
        if (scalar(@matches) == 1) { # single best match
            $CellID = $matches[0];
            if ($mismatches == 2) {
                $Mismatch2++;
            }
            if ($mismatches == 1) {
                $Mismatch1++;
            }
        } elsif (scalar(@matches) > 1) { #More than one equally good match
            $AmbiguousCell++;
            next;
        } else { # No match
            $NotPossibleCell++;
            next;
        }
    } else { # Exact match
        $ExactMatch++;
    }
    # ProperTailProperBarcode
    my $handle = $ofhs{$CellBarcodes{$CellID}};
    print $handle "$readname:$UMI\n$read+\n$file2qual";
}
print STDERR "Not proper read: $NotProperTail\nNot possible cell: $NotPossibleCell\nAmbiguous: $AmbiguousCell\nExact Matches:$ExactMatch\nOne mismatch: $Mismatch1\nTwo mismatch: $Mismatch2\n Total: $total_reads\n";
close($ifh1);
close($ifh2);
foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}


# Returns the numerically largest of its arguments.
sub my_max {
    if (scalar(@_) == 1) {return($_[0])};
    my $max = shift;
    foreach my $ele (@_) {
        if ($ele > $max) {$max = $ele;}
    }
    return($max);
}
--------------------------------------------------------------------------------
/1_Breakdown_UMI_read_pairs.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Matches upto two mismatches between observed cell barcodes and the expected cell barcodes
# Excludes reads with problematic UMIs: >= 80% A, >= 80% T, contained in adaptor sequence. - Note do not provide adaptors for short UMI datasets (fewer than 7 bases) since there is a high probability of real UMIs being contained in the adaptor for such cases.
# Allows barcodes to contain ambiguous bases
# Allows trailing bases at the end of the barcode sequence but requires barcodes to begin from the first base in the barcode sequence.

# Arguments:
#   ARGV[0] BarcodeFastq     : read file holding cell barcode + UMI at the start of each read
#   ARGV[1] ReadFastq        : mate read file (the reads that are written out)
#   ARGV[2] BarcodeStructure : run of C's and U's, e.g. CCCCCCCCUUUU
#   ARGV[3] BarcodeIndexFile : table of expected barcodes (first line is a header), or "UNKNOWN"
#   ARGV[4] BarcodeColumn    : 0-based column of the barcode in the index file
#   ARGV[5] OutputPrefix     : output files are <prefix>_<barcode>.fq
#   ARGV[6] AdaptorFasta     : optional; UMIs contained in an adaptor sequence are rejected
if (@ARGV < 6) { die "Usage: 1_Breakdown_UMI_read_pairs.pl BarcodeFastq ReadFastq BarcodeStructure(C=cellbarcodebase, U=UMIbase) BarcodeIndexFile(\"UNKNOWN\" triggers counting reads with every unique barcode) BarcodeColumn(0=first column) OutputPrefix AdaptorFasta(optional)\n";}
my $infile1 = $ARGV[0];
my $infile2 = $ARGV[1];
my $barcodestructure = $ARGV[2];

# Parse Barcode Structure #

$barcodestructure =~ s/[^CU]//g;

print "$barcodestructure\n";

my $order = -1; # 1 = cell barcode first, 0 = UMI first
my $C_len = -1;
my $U_len = -1;

if ($barcodestructure =~ /^(C+)(U+)$/) {
    $order=1;
    $C_len = length($1);
    $U_len = length($2);
    print "Barcode Structure: $C_len bp CellID followed by $U_len bp UMI\n";
} elsif ($barcodestructure =~ /^(U+)(C+)$/) {
    $order = 0;
    $C_len = length($2);
    $U_len = length($1);
    print "Barcode Structure: $U_len bp UMI followed by $C_len bp CellID\n";
} else {
    die "Intermingled cell & umi barcodes are not supported\n";
}
# ----------------------- #

my $OUTprefix = $ARGV[5]; #prefix for output
#Ensure output directory exists
# Everything before the last '/' is the directory. The final component may be any
# length (the previous pattern only matched a one-character file prefix).
if ($OUTprefix =~ /^(.+)\/[^\/]*$/) {
    if ($1 ne ".") {
        system("mkdir", "-p", $1);
    }
}

# Read Expected Cell Barcodes #
my %CellBarcodes = (); # known mode: barcode -> output handle index; UNKNOWN mode: barcode -> read count
my %ofhs = ();
if ($ARGV[3] ne "UNKNOWN") {
    open (my $ifh, '<', $ARGV[3]) or die "Cannot open $ARGV[3]\n";
    <$ifh>; # header
    my $column = $ARGV[4];
    my $index=1;
    while (<$ifh>) {
        chomp;
        if ($_ =~/^#/) {next;}
        my @record = split(/\s+/);
        my $barcode = $record[$column];
        $CellBarcodes{$barcode} = $index;
        open(my $fh,'>',"$OUTprefix\_$barcode.fq") or die $!;
        $ofhs{$index} = $fh;
        $index++;
    }
    close($ifh);
}
# --------------------------- #

# Read Adaptor Fasta #
# Only the first sequence line after each '>' header is used.
my @Adaptors = ();
if (defined($ARGV[6])) {
    open (my $afh, '<', $ARGV[6]) or die $!;
    while (<$afh>) {
        if ($_ =~ /^>/) {
            my $seq = <$afh>;
            if (!defined($seq)) {last;}
            chomp($seq);
            push(@Adaptors, $seq);
        }
    }
    close($afh);
}
# ------------------ #


### Process Reads ###

# Summary Statistics
my $NotProperBarcodes = 0;
my $NotPossibleCell = 0;
my $AmbiguousCell = 0;
my $ExactMatch = 0;
my $Mismatch1 = 0;
my $Mismatch2 = 0;
my $BadUMI = 0;
my $total_reads = 0;
my $OutputReads=0;

open (my $ifh1, '<', $infile1) or die $!;
open (my $ifh2, '<', $infile2) or die $!;
while (my $file1line = <$ifh1>) {
    my $file2line = <$ifh2>;
    if ($file1line !~ /^@/) {next;} #Skip any file headers
    if (!defined($file2line)) {die "$infile2 is shorter than $infile1\n";}

    # Ensure matching pair of reads
    my @thing1 = split(/\s+/,$file1line);
    my @thing2 = split(/\s+/,$file2line);
    my $readname = $thing1[0];
    if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match! $readname $thing2[0]\n";}

    # Consume the complete 4-line record from both files before any filter can reject the
    # read. Otherwise the '+' and quality lines of a rejected read stay in the stream and a
    # quality string beginning with '@' is later mistaken for a read header.
    my $barcodes = <$ifh1>;
    my $read = <$ifh2>;
    my $plus1 = <$ifh1>; my $plus2 = <$ifh2>; #+'s
    my $file1qual = <$ifh1>;
    my $file2qual = <$ifh2>;
    if (!defined($barcodes) || !defined($read) || !defined($file1qual) || !defined($file2qual)) {
        die "Truncated FastQ record for $readname\n";
    }
    $total_reads++;

    # Parse barcodes
    my $CellID = ""; my $UMI = "";
    if ($order) {
        if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$C_len})([ATCGNUKMRYSWBVHDX]{$U_len})/) {
            $CellID = $1; $UMI = $2;
        } else {$NotProperBarcodes++; next;}
    } else {
        if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$U_len})([ATCGNUKMRYSWBVHDX]{$C_len})/) {
            $CellID = $2; $UMI = $1;
        } else {$NotProperBarcodes++; next;}
    }

    # Correct for upto two mismatches between observed and expected cell barcodes
    if ($ARGV[3] ne "UNKNOWN") {
        my $mismatches = 0;
        if (!exists($CellBarcodes{$CellID})) { # Not an expected barcode

            # Barcode contains uncertain bases -> convert to wildcards and pattern match on expected barcodes. (given priority over barcodes with higher confidence mismatches)
            if ($CellID !~ /^[ATCG]+$/) {
                $mismatches = () = $CellID =~ /[^ATCG]/g; # count uncertain bases as mismatches
                $CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
            }
            my @matches = ();
            my %close = ();
            foreach my $barcode (keys(%CellBarcodes)) {
                if ($barcode =~/$CellID/) { # Match but with uncertainty
                    push(@matches, $barcode);
                } elsif (scalar(@matches) == 0) { # Count mismatches
                    # XOR of equal characters is \0, so this counts the matching positions.
                    my $count = ( $barcode ^ $CellID ) =~ tr/\0//;
                    if ($count >= length($barcode)-2) { # Allow upto 2 mismatches
                        $close{$barcode} = $count;
                    }
                }
            }
            # If exact matches with uncertainty then give those priority, otherwise keep the most similar expected barcodes
            if (scalar(@matches) == 0 && scalar(keys(%close)) > 0) { # Has 1 or 2 mismatches
                my $max = my_max(values(%close)); # Closest match
                $mismatches = length($CellID)-$max;
                foreach my $code (keys(%close)) {
                    if ($close{$code} == $max) {
                        push(@matches,$code);
                    }
                }
            }
            if (scalar(@matches) == 1) { # single best match
                $CellID = $matches[0];
                if ($mismatches == 2) {
                    $Mismatch2++;
                }
                if ($mismatches == 1) {
                    $Mismatch1++;
                }
            } elsif (scalar(@matches) > 1) { #More than one equally good match
                $AmbiguousCell++;
                next;
            } else { # No match
                $NotPossibleCell++;
                next;
            }
        } else { # Exact match
            $ExactMatch++;
        }

    } #If known barcodes

    # UMI filter

    # All As or All Ts with 2 mismatches - No I think >80% A or T is a better definition since short UMIs quite likely to get real A/T rich UMIs
    my $As_in_UMI = () = $UMI =~ /A/g;
    my $Ts_in_UMI = () = $UMI =~ /T/g;
    if ($As_in_UMI >= length($UMI)*0.8 || $Ts_in_UMI >= length($UMI)*0.8) {
        $BadUMI++; next;
    }
    # UMI contained in adaptor sequence - Don't need UMI length limit here since just don't provide adaptor sequences for short UMI datasets.
    # The decision is taken after the adaptor loop: a bare `next` inside that loop would only
    # move on to the next adaptor, leaving the read in the output and over-counting BadUMI.
    my $UMI_in_adaptor = 0;
    foreach my $adapt (@Adaptors) {
        if ($adapt =~ /$UMI/) {
            $UMI_in_adaptor = 1;
            last;
        }
    }
    if ($UMI_in_adaptor) {
        $BadUMI++; next;
    }

    if ($ARGV[3] ne "UNKNOWN") {
        # Has Acceptable Barcode
        my $handle = $ofhs{$CellBarcodes{$CellID}};
        print $handle "$readname:$UMI\n$read+\n$file2qual";
        $OutputReads++;
    } else {
        $CellBarcodes{$CellID}++;
    }
}
close($ifh1);
close($ifh2);
if ($ARGV[3] ne "UNKNOWN") {
    print STDERR "
Not proper read: $NotProperBarcodes
Not possible cell: $NotPossibleCell
Ambiguous: $AmbiguousCell
Exact Matches:$ExactMatch
One mismatch: $Mismatch1
Two mismatch: $Mismatch2
Bad UMI: $BadUMI
Input Reads: $total_reads
Output Reads: $OutputReads\n";
    foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}
} else {
    print STDERR "Bad UMI: $BadUMI\n";
    my @Codes = sort { $CellBarcodes{$b} <=> $CellBarcodes{$a} } keys(%CellBarcodes);
    foreach my $code (@Codes) {
        print "$code ".$CellBarcodes{$code}."\n";
    }
}

# Returns the numerically largest of its arguments.
sub my_max {
    if (scalar(@_) == 1) {return($_[0])};
    my $max = shift;
    foreach my $ele (@_) {
        if ($ele > $max) {$max = $ele;}
    }
    return($max);
}
--------------------------------------------------------------------------------
/1_DO_BreakDown_Files.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
# Submits one array-job element per input file matching PATTERN; each element runs
# 1_BreakDown_Files_wrapper.sh with a limit of 100000000 reads per output file.
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
mkdir -p $OUTPUTDIR
INPUTDIR=/nfs/team218/MH/2015-03-02-C6H8GANXX/2015-03-02-C6H8GANXX/
PATTERN="*exp2*_sequence.txt.gz"
INPUTFILES=($INPUTDIR/$PATTERN)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES))
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%40" -R"select[mem>4000] rusage[mem=4000]" -M4000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/1_BreakDown_Files_wrapper.sh 100000000 "$INPUTDIR" "$PATTERN" $OUTPUTDIR
--------------------------------------------------------------------------------
/1_Flexible_FullTranscript_Demultiplexing.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl
use strict;
use warnings;

# Demultiplexes a pair of FastQ files by a cell barcode located at the start or end of read 1.
# The barcode (and the matching stretch of the quality string) is removed from read 1, then
# both mates are written to <prefix>_<barcode>_read1.fq / <prefix>_<barcode>_read2.fq.
# Observed barcodes within `mismatch` substitutions of exactly one closest expected barcode
# are accepted; ties and barcodes with no close match are dropped and counted.

if (@ARGV != 7) {
    print STDERR "perl 1_Flexible_FullTranscript_Demultiplexing.pl read1.fq read2.fq b_pos b_len index mismatch prefix\n";
    print STDERR "
    read1.fq : barcode containing read
    read2.fq : non-barcode containg read
    b_pos : position of cell-barcode in the read. [\"start\" or \"end\"]
    b_len : length of cell-barcode (bp)
    index : file contain a single column of expected barcodes
    mismatch : maximum number of permitted mismatches (recommend 2)
    prefix : prefix for output fq files.\n";
    exit(1);
}

my $infile1 = $ARGV[0];
my $infile2 = $ARGV[1];
my $barcode_pos = $ARGV[2];
my $barcode_len = $ARGV[3];
my $barcode_index_file = $ARGV[4];
my $MAXmismatch = $ARGV[5];
my $OUTprefix = $ARGV[6];

# Ensure the output directory (everything before the last '/') exists. The final component
# may be any length (the previous pattern only matched a one-character file prefix).
if ($OUTprefix =~ /^(.+)\/[^\/]*$/) {
    if ($1 ne ".") {
        system("mkdir", "-p", $1);
    }
}

my %CellBarcodes = (); # expected barcode -> index of its output handles
my %ofhs1 = ();
my %ofhs2 = ();
open(my $ifh, '<', $barcode_index_file) or die "Cannot open $barcode_index_file\n";
my $index=1;
while (<$ifh>) {
    chomp;
    $CellBarcodes{$_} = $index;
    open(my $fh1, '>', "$OUTprefix\_$_\_read1.fq") or die $!;
    $ofhs1{$index} = $fh1;
    open(my $fh2, '>', "$OUTprefix\_$_\_read2.fq") or die $!;
    $ofhs2{$index} = $fh2;
    $index++;
}
close($ifh);


my $NotProperBarcodes = 0;
my $NotPossibleCell = 0;
my $AmbiguousCell = 0;
my $ExactMatch = 0;
my $Mismatch = 0;
my $total_reads = 0;
my $OutputReads = 0;

open(my $ifh1, '<', $infile1) or die $!;
open(my $ifh2, '<', $infile2) or die $!;
while (my $file1line = <$ifh1>) {
    my $file2line = <$ifh2>;
    if ($file1line !~ /^@/) {next;}
    if (!defined($file2line)) {die "$infile2 is shorter than $infile1\n";}

    # Read names of the two mates are deliberately not compared.
    my @thing1 = split(/\s+/, $file1line);
    my $readname = $thing1[0];
    #if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}

    # The complete 4-line record of both mates is consumed before any read can be rejected.
    my $barcode_read = <$ifh1>;
    my $read2 = <$ifh2>;
    my $plus1 = <$ifh1>; my $plus2 = <$ifh2>;
    my $file1qual = <$ifh1>;
    my $file2qual = <$ifh2>;
    if (!defined($barcode_read) || !defined($read2) || !defined($file1qual) || !defined($file2qual)) {
        die "Truncated FastQ record for $readname\n";
    }
    chomp $barcode_read;
    chomp $read2;
    chomp $file1qual;
    chomp $file2qual;
    $total_reads++;

    # 4-argument substr is fatal when the read is shorter than the barcode.
    if (length($barcode_read) < $barcode_len || length($file1qual) < $barcode_len) {
        $NotProperBarcodes++;
        next;
    }

    # Cut the barcode out of the read and the same positions out of the quality string.
    my $CellID = "";
    if ($barcode_pos eq "start") {
        $CellID = substr($barcode_read, 0, $barcode_len, "");
        substr($file1qual, 0, $barcode_len, "");
    } else {
        $CellID = substr($barcode_read, -$barcode_len, $barcode_len, "");
        substr($file1qual, -$barcode_len, $barcode_len, "");
    }
    my $mismatches = 0;
    if (!exists($CellBarcodes{$CellID})) {
        my @matches = ();
        my %close = ();
        foreach my $expected_barcode (keys(%CellBarcodes)) {
            # XOR of equal characters is \0, so this counts the matching positions.
            my $count = ( $expected_barcode ^ $CellID ) =~ tr/\0//;
            if ($count >= length($expected_barcode)-$MAXmismatch) {
                $close{$expected_barcode} = $count;
            }
        }
        if (scalar(keys(%close)) > 0) {
            my $max = my_max(values(%close));
            $mismatches = length($CellID) - $max;
            foreach my $code (keys(%close)) {
                if ($close{$code} == $max) {
                    push(@matches, $code);
                }
            }
        }
        if (scalar(@matches) == 1) {
            $CellID = $matches[0];
            $Mismatch++;
        } elsif (scalar(@matches) > 1) {
            $AmbiguousCell++;
            next;
        } else {
            $NotPossibleCell++;
            next;
        }
    } else {
        $ExactMatch++;
    }
    # print the read
    my $handle1 = $ofhs1{$CellBarcodes{$CellID}};
    my $handle2 = $ofhs2{$CellBarcodes{$CellID}};
    print $handle1 "$readname\n$barcode_read\n+\n$file1qual\n";
    print $handle2 "$readname\n$read2\n+\n$file2qual\n";
    $OutputReads++;
}

print STDERR "
Doesn't match any cell: $NotPossibleCell
Ambiguous: $AmbiguousCell
Exact Matches: $ExactMatch
Contain Mismatches: $Mismatch
Input Reads: $total_reads
Output Reads: $OutputReads\n";
# Reported only when it happens, so the usual summary layout is unchanged.
if ($NotProperBarcodes > 0) {
    print STDERR "Shorter than barcode: $NotProperBarcodes\n";
}
close($ifh1);
close($ifh2);
foreach my $ofh1 (keys(%ofhs1)) {close($ofhs1{$ofh1});}
foreach my $ofh2 (keys(%ofhs2)) {close($ofhs2{$ofh2});}



# Returns the numerically largest of its arguments.
sub my_max {
    if (scalar(@_) == 1) {return($_[0])};
    my $max = shift;
    foreach my $ele (@_) {
        if ($ele > $max) {$max = $ele;}
    }
    return($max);
}
--------------------------------------------------------------------------------
/1_Flexible_UMI_Demultiplexing.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Matches upto `mismatch` mismatches between observed cell barcodes and the expected cell barcodes
# Allows barcodes to contain ambiguous bases
# Allows trailing bases at the end of the barcode sequence but requires barcodes to begin from the first base in the barcode sequence.
# Unlike 1_Breakdown_UMI_read_pairs.pl, no UMI quality filter is applied here (BadUMI stays 0).

if (@ARGV != 6) {
    print STDERR "perl 1_Flexible_UMI_Demultiplexing.pl read1.fq read2.fq b_structure index mismatch prefix\n";
    print STDERR "
    read1.fq : barcode/umi containing read
    read2.fq : non-barcode containing read
    b_structure : a single string of the format C##U# or U#C##
        where C## is the cell-barcode and U# is the UMI.
        e.g. C10U4 = a 10bp cell barcode followed by a 4bp UMI
    index : file containg a single column of expected cell-barcodes.
        if equal to \"UNKNOWN\" script will output read counts for each unique barcode.
    mismatch : maximum number of permitted mismatches (recommend 2)
    prefix : prefix for output fastq files.\n";
    exit(1);
}
my $infile1 = $ARGV[0];
my $infile2 = $ARGV[1];
my $barcodestructure = $ARGV[2];
my $MAXmismatch = $ARGV[4];

# Parse Barcode Structure #

my $order = -1; # 1 = cell barcode first, 0 = UMI first
my $C_len = -1;
my $U_len = -1;

if ($barcodestructure =~ /^C(\d+)U(\d+)$/) {
    $order=1;
    $C_len = $1;
    $U_len = $2;
    print "Barcode Structure: $C_len bp CellID followed by $U_len bp UMI\n";
} elsif ($barcodestructure =~ /^U(\d+)C(\d+)$/) {
    $order = 0;
    $C_len = $2;
    $U_len = $1;
    print "Barcode Structure: $U_len bp UMI followed by $C_len bp CellID\n";
} else {
    die "$barcodestructure not recognized.\n";
}
# ----------------------- #

my $OUTprefix = $ARGV[5]; #prefix for output
#Ensure output directory exists
# Everything before the last '/' is the directory. The final component may be any
# length (the previous pattern only matched a one-character file prefix).
if ($OUTprefix =~ /^(.+)\/[^\/]*$/) {
    if ($1 ne ".") {
        system("mkdir", "-p", $1);
    }
}

# Read Expected Cell Barcodes #
my %CellBarcodes = (); # known mode: barcode -> output handle index; UNKNOWN mode: barcode -> read count
my %ofhs = ();
if ($ARGV[3] ne "UNKNOWN") {
    open (my $ifh, '<', $ARGV[3]) or die "Cannot open $ARGV[3]\n";
    my $index=1;
    while (<$ifh>) {
        chomp;
        if ($_ =~/^#/) {next;}
        my $barcode = $_;
        $CellBarcodes{$barcode} = $index;
        open(my $fh,'>',"$OUTprefix\_$barcode.fq") or die $!;
        $ofhs{$index} = $fh;
        $index++;
    }
    close($ifh);
}
# --------------------------- #


### Process Reads ###

# Summary Statistics
my $NotProperBarcodes = 0;
my $NotPossibleCell = 0;
my $AmbiguousCell = 0;
my $ExactMatch = 0;
my $Mismatch = 0;
my $BadUMI = 0;
my $total_reads = 0;
my $OutputReads=0;

open (my $ifh1, '<', $infile1) or die $!;
open (my $ifh2, '<', $infile2) or die $!;
while (my $file1line = <$ifh1>) {
    my $file2line = <$ifh2>;
    if ($file1line !~ /^@/) {next;} #Skip any file headers
    if (!defined($file2line)) {die "$infile2 is shorter than $infile1\n";}

    # Read names of the two mates are deliberately not compared.
    my @thing1 = split(/\s+/,$file1line);
    my $readname = $thing1[0];

    # Consume the complete 4-line record from both files before any filter can reject the
    # read. Otherwise the '+' and quality lines of a rejected read stay in the stream and a
    # quality string beginning with '@' is later mistaken for a read header.
    my $barcodes = <$ifh1>;
    my $read = <$ifh2>;
    my $plus1 = <$ifh1>; my $plus2 = <$ifh2>; #+'s
    my $file1qual = <$ifh1>;
    my $file2qual = <$ifh2>;
    if (!defined($barcodes) || !defined($read) || !defined($file1qual) || !defined($file2qual)) {
        die "Truncated FastQ record for $readname\n";
    }
    $total_reads++;

    # Parse barcodes
    my $CellID = ""; my $UMI = "";
    if ($order) {
        if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$C_len})([ATCGNUKMRYSWBVHDX]{$U_len})/) {
            $CellID = $1; $UMI = $2;
        } else {$NotProperBarcodes++; next;}
    } else {
        if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$U_len})([ATCGNUKMRYSWBVHDX]{$C_len})/) {
            $CellID = $2; $UMI = $1;
        } else {$NotProperBarcodes++; next;}
    }

    # Correct for upto $MAXmismatch mismatches between observed and expected cell barcodes
    if ($ARGV[3] ne "UNKNOWN") {
        my $mismatches = 0;
        if (!exists($CellBarcodes{$CellID})) { # Not an expected barcode

            # Barcode contains uncertain bases -> convert to wildcards and pattern match on expected barcodes. (given priority over barcodes with higher confidence mismatches)
            if ($CellID !~ /^[ATCG]+$/) {
                $mismatches = () = $CellID =~ /[^ATCG]/g; # count uncertain bases as mismatches
                $CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
            }
            my @matches = ();
            my %close = ();
            foreach my $barcode (keys(%CellBarcodes)) {
                if ($barcode =~/$CellID/) { # Match but with uncertainty
                    push(@matches, $barcode);
                } elsif (scalar(@matches) == 0) { # Count mismatches
                    # XOR of equal characters is \0, so this counts the matching positions.
                    my $count = ( $barcode ^ $CellID ) =~ tr/\0//;
                    if ($count >= length($barcode)-$MAXmismatch) { # Allow upto $MAXmismatch mismatches
                        $close{$barcode} = $count;
                    }
                }
            }
            # If exact matches with uncertainty then give those priority, otherwise keep the most similar expected barcodes
            if (scalar(@matches) == 0 && scalar(keys(%close)) > 0) { # Has mismatches within the limit
                my $max = my_max(values(%close)); # Closest match
                $mismatches = length($CellID)-$max;
                foreach my $code (keys(%close)) {
                    if ($close{$code} == $max) {
                        push(@matches,$code);
                    }
                }
            }
            if (scalar(@matches) == 1) { # single best match
                $CellID = $matches[0];
                $Mismatch++;
            } elsif (scalar(@matches) > 1) { #More than one equally good match
                $AmbiguousCell++;
                next;
            } else { # No match
                $NotPossibleCell++;
                next;
            }
        } else { # Exact match
            $ExactMatch++;
        }

    } #If known barcodes

    if ($ARGV[3] ne "UNKNOWN") {
        # Has Acceptable Barcode
        my $handle = $ofhs{$CellBarcodes{$CellID}};
        print $handle "$readname:$UMI\n$read+\n$file2qual";
        $OutputReads++;
    } else {
        $CellBarcodes{$CellID}++;
    }
}
close($ifh1);
close($ifh2);
if ($ARGV[3] ne "UNKNOWN") {
    print STDERR "
Doesn't match any cell: $NotPossibleCell
Ambiguous: $AmbiguousCell
Exact Matches: $ExactMatch
Contain mismatches: $Mismatch
Input Reads: $total_reads
Output Reads: $OutputReads\n";
    foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}
} else {
    print STDERR "Bad UMI: $BadUMI\n";
    my @Codes = sort { $CellBarcodes{$b} <=> $CellBarcodes{$a} } keys(%CellBarcodes);
    foreach my $code (@Codes) {
        print "$code ".$CellBarcodes{$code}."\n";
    }
}

# Returns the numerically largest of its arguments.
sub my_max {
    if (scalar(@_) == 1) {return($_[0])};
    my $max = shift;
    foreach my $ele (@_) {
        if ($ele > $max) {$max = $ele;}
    }
    return($max);
}
--------------------------------------------------------------------------------
/2-5.1_DO_kallisto_quant.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
# One array-job element is submitted per two input files (NUMFILES/2).

INDEXFILE=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/kallisto_index.idx
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Kallisto
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES/2))
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5.1_kallisto_quant.sh 1 $INPUTDIR $OUTPUTDIR $INDEXFILE Bergiers_Vivo

-------------------------------------------------------------------------------- /2-5.1_kallisto_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with KALLISTO -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps paired reads only!
#
# Each job-array element quantifies ONE pair of read files: element i takes
# entries 2(i-1) and 2(i-1)+1 of the glob listing of the input directory, so
# the two mates of a pair must be adjacent in that listing.

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = kallisto transcript index file

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
INFILE=$4
KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto

#Check appropriate arguments
if [ ! -f "$KALLISTO" ] ; then
  echo "Sorry KALLISTO not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/4)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/4)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# Output name = first mate's file name with its last extension removed
# (e.g. x_1.fastq.gz -> x_1.fastq).
NAME=${FILE1TOMAP##*/}
NAME=${NAME%.*}

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/4)"
  exit 1
fi

if [ -z "$INFILE" ] ; then
  echo "Please include a transcript index file (ARG 4/4)"
  exit 1
fi

# Make directory for output if necessary
# (kallisto writes into a per-job scratch directory that is removed afterwards)
JOBDIR="$OUTDIR/$LSB_JOBINDEX"
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi
if [ ! -d "$JOBDIR" ] ; then
  mkdir -p "$JOBDIR"
fi

# bsub -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o test_kallisto_quant.out -e test_kallisto_quant.err /nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto quant --bias -b 100 --seed=1 --plaintext --threads=1 -i /lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/kallisto_index.idx /lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/G1_Cell01_1.fastq.gz /lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/G1_Cell01_2.fastq.gz -o /lustre/scratch108/compgen/team218/TA/TEST

# Run KALLISTO
# Stop on failure so the scratch directory is left in place for inspection
# instead of being half cleaned up.
if ! "$KALLISTO" quant --bias --plaintext --threads="$NUMTHREADS" -i "$INFILE" -o "$JOBDIR" "$FILE1TOMAP" "$FILE2TOMAP" ; then
  echo "kallisto quant failed for $FILE1TOMAP $FILE2TOMAP"
  exit 1
fi
mv "$JOBDIR/abundance.tsv" "$OUTDIR/$NAME.abundances.tsv"
rm "$JOBDIR/run_info.json"
rmdir "$JOBDIR"

--------------------------------------------------------------------------------
/2-5.2_DO_Salmon_quant.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
INDEXFILE=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/salmon_index
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Salmon
GTFFILE=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
# One array element per PAIR of input files.
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES/2))
# NOTE(review): the job reserves two slots (-n2) but tells the wrapper to use 1 thread - confirm which is intended.
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%50" -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5.2_Salmon_quant.sh 1 $INPUTDIR $OUTPUTDIR $INDEXFILE $GTFFILE

--------------------------------------------------------------------------------
/2-5.2_Salmon_quant.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tallulah 3 Nov 2016 : wrapper for Mapping Reads with SALMON -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps paired reads only!
#
# Each job-array element quantifies ONE pair of read files: element i takes
# entries 2(i-1) and 2(i-1)+1 of the glob listing of the input directory.

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = transcript index file (see 0.3_Salmon_build_index.sh)
#    $5 = annotation gtf (map transcripts to genes)

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
INFILE=$4
ANNFILE=$5
SALMON=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Salmon-0.7.2_linux_x86_64/bin/salmon

#Check appropriate arguments
if [ ! -f "$SALMON" ] ; then
  echo "Sorry SALMON not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/5)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/5)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# Output name = first mate's file name with its last extension removed.
NAME=${FILE1TOMAP##*/}
NAME=${NAME%.*}

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/5)"
  exit 1
fi

if [ -z "$INFILE" ] ; then
  echo "Please include a transcript index file (ARG 4/5)"
  exit 1
fi

# The gtf is handed to "-g" below; if it were empty the next option would be
# swallowed as the file name, so it has to be checked like the other arguments.
if [ -z "$ANNFILE" ] ; then
  echo "Please include an annotation gtf file (ARG 5/5)"
  exit 1
fi

# Make directory for output if necessary
# (salmon writes into a per-job scratch directory that is removed afterwards)
JOBDIR="$OUTDIR/$LSB_JOBINDEX"
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi
if [ ! -d "$JOBDIR" ] ; then
  mkdir -p "$JOBDIR"
fi

# Run SALMON
# Stop on failure so the scratch directory (and salmon's logs in it) is not
# deleted by the clean-up below.
if ! "$SALMON" quant -i "$INFILE" -o "$JOBDIR" -1 "$FILE1TOMAP" -2 "$FILE2TOMAP" -p "$NUMTHREADS" -l A -g "$ANNFILE" --seqBias --gcBias --posBias -q ; then
  echo "salmon quant failed for $FILE1TOMAP $FILE2TOMAP"
  exit 1
fi
mv "$JOBDIR/quant.sf" "$OUTDIR/$NAME.quant.sf"
mv "$JOBDIR/quant.genes.sf" "$OUTDIR/$NAME.quant.genes.sf"
rm -r "$JOBDIR"
--------------------------------------------------------------------------------
/2-5_DO_RSEM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_RSEM
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
# One array element per PAIR of input files.
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES/2))
# The trailing argument is the thread count handed to the wrapper.
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>30000] rusage[mem=30000] span[hosts=1]" -M30000 -n2 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5_STAR-RSEM.sh $INPUTDIR $OUTPUTDIR Beuttner_STAR_RSEM 2
#bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>10000] rusage[mem=10000] span[hosts=1]" -M10000 -n5 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5_bowtie2-RSEM.sh $INPUTDIR $OUTPUTDIR Beuttner_bowtie2_RSEM 5

--------------------------------------------------------------------------------
/2-5_STAR-RSEM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Assume paired end
# Arguments:
#    $1 = directory of fastq files to map/quantify
#    $2 = output directory for final quantification file
#    $3 = prefix
#    $4 = number of threads to run on (optional)
RSEM=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/rsem-calculate-expression
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/
REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
FILESTOMAPDIR=$1
OUTDIR=$2
# The job index is part of the prefix so every array element gets its own
# output files and its own temporary folder.
PREFIX="$3-$LSB_JOBINDEX-"
THREADS=$4
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX

if [ -z "$THREADS" ] ; then
  THREADS=1
fi
if [ ! -f "$RSEM" ] ; then
  echo "Sorry RSEM not available "
  exit 1
fi
if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 1/4)"
  exit 1
fi
if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for outputfile (ARG 2/4)"
  exit 1
fi
if [ -z "$3" ] ; then
  echo "Please include a prefix for output (ARG 3/4)"
  exit 1
fi
# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

# Get fastq files
# Array element i takes entries 2(i-1) and 2(i-1)+1 of the glob listing.
FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi
# Make directory for output if necessary
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi

# "-p $THREADS" was previously missing, so the thread count passed as $4 was
# parsed but never used; it is now forwarded exactly as 2-5_bowtie2-RSEM.sh does.
if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
  "$RSEM" --star --star-path "$STAR" --gzipped-read-file --no-bam-output --single-cell-prior --temporary-folder "$WORKINGDIR" --paired-end -p "$THREADS" "$FILE1TOMAP" "$FILE2TOMAP" "$REFname" "$OUTDIR/$PREFIX"
else
  "$RSEM" --star --star-path "$STAR" --no-bam-output --single-cell-prior --temporary-folder "$WORKINGDIR" --paired-end -p "$THREADS" "$FILE1TOMAP" "$FILE2TOMAP" "$REFname" "$OUTDIR/$PREFIX"
fi
--------------------------------------------------------------------------------
/2-5_bowtie2-RSEM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Assume paired end
# Arguments:
#    $1 = directory of fastq files to map/quantify
#    $2 = output directory for final quantification file
#    $3 = prefix
#    $4 = number of threads to run on (optional)
RSEM=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/rsem-calculate-expression
BOWTIE=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bowtie2-2.2.6/
#REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
FILESTOMAPDIR=$1
OUTDIR=$2
# The job index is part of the prefix so every array element gets its own
# output files and its own temporary folder.
PREFIX="$3-$LSB_JOBINDEX-"
THREADS=$4
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX

if [ -z "$THREADS" ] ; then
  THREADS=1
fi
if [ ! -f "$RSEM" ] ; then
  echo "Sorry RSEM not available "
  exit 1
fi
if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 1/4)"
  exit 1
fi
if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for outputfile (ARG 2/4)"
  exit 1
fi
if [ -z "$3" ] ; then
  echo "Please include a prefix for output (ARG 3/4)"
  exit 1
fi
# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

# Get fastq files
# Array element i takes entries 2(i-1) and 2(i-1)+1 of the glob listing.
FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi
# Make directory for output if necessary
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi

"$RSEM" --bowtie2 --bowtie2-path "$BOWTIE" --no-bam-output --single-cell-prior --temporary-folder "$WORKINGDIR" --paired-end -p "$THREADS" "$FILE1TOMAP" "$FILE2TOMAP" "$REFname" "$OUTDIR/$PREFIX"
--------------------------------------------------------------------------------
/2.2_DO_MapReads_Tophat.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
# One array element per PAIR of input files.
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES/2))
GENOME=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/bowtie2_build
# The first wrapper argument (5) is the thread count and matches the -n5 slot request.
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%50" -R"select[mem>6000] rusage[mem=6000] span[hosts=1]" -M6000 -n5 -q long -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2.2_MapReads_Tophat.sh 5 $INPUTDIR $OUTPUTDIR $GENOME Beuttner_Tophat

--------------------------------------------------------------------------------
/2.2_MapReads_Tophat.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tallulah 31 Mar 2015 : wrapper for Mapping Reads with TOPHAT -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps paired reads only!
#
# Each job-array element maps ONE pair of read files: element i takes entries
# 2(i-1) and 2(i-1)+1 of the glob listing of the input directory.

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = genome base
#    $5 = Prefix

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
TOPHAT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/tophat-2.1.0.Linux_x86_64/tophat2
GENOME=$4
PREFIX="$5-$LSB_JOBINDEX-"
# NOTE(review): WORKINGDIR is assigned but not used anywhere below.
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX

#Check appropriate arguments
if [ ! -f "$TOPHAT" ] ; then
  echo "Sorry TOPHAT not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/4)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/4)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/4)"
  exit 1
fi

if [ -z "$GENOME" ] ; then
  echo "Please include the base genome name (ARG 4/4)"
  exit 1
fi

if [ -z "$5" ] ; then
  echo "Warning: no file prefix included"
fi

#To fix failed jobs
# NOTE(review): with this guard only array elements whose scratch directory
# already exists (left over from an earlier run) are processed; on a fresh
# output directory nothing is mapped and the mkdir below is never reached.
# Kept as-is - remove the guard for a first run.
if [ -d "$OUTDIR/$LSB_JOBINDEX" ]; then
#-----------------

# Make directory for output if necessary
if [ ! -d "$OUTDIR/$LSB_JOBINDEX" ] ; then
  mkdir -p "$OUTDIR/$LSB_JOBINDEX"
fi

# Output name = first mate's file name with its last extension removed.
NAME=${FILE1TOMAP##*/}
NAME=${NAME%.*}

echo "Job$LSB_JOBINDEX Mapping: $FILE1TOMAP $FILE2TOMAP\n"

# Run TOPHAT
# tophat writes to ./tophat_out, so it must run from inside the scratch
# directory; abort if the cd fails rather than writing somewhere else.
cd "$OUTDIR/$LSB_JOBINDEX" || exit 1
# "-p" forwards the requested thread count, which was previously validated but never used.
# On failure stop before the clean-up so the scratch directory survives.
if ! "$TOPHAT" -p "$NUMTHREADS" "$GENOME" "$FILE1TOMAP" "$FILE2TOMAP" ; then
  echo "tophat failed for $FILE1TOMAP $FILE2TOMAP"
  exit 1
fi

mv "$OUTDIR/$LSB_JOBINDEX/tophat_out/align_summary.txt" "$OUTDIR/$NAME.align_summary.txt"
# Mapped and unmapped reads are merged into one BAM per sample.
/usr/bin/samtools merge -n -f "$OUTDIR/$NAME.sorted.aligned.bam" "$OUTDIR/$LSB_JOBINDEX/tophat_out/accepted_hits.bam" "$OUTDIR/$LSB_JOBINDEX/tophat_out/unmapped.bam"
rm -r "$OUTDIR/$LSB_JOBINDEX"

#To fix failed jobs
fi
#-----------------
--------------------------------------------------------------------------------
/2_DO_MapReadsFile.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_STAR
mkdir -p $OUTPUTDIR
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
# One array element per PAIR of input files.
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES/2))
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>30000] rusage[mem=30000]" -M30000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2_MapReadsFile.sh 1 $INPUTDIR $OUTPUTDIR /nfs/users/nfs_t/ta6/RNASeqPipeline/2_STAR_Parameters.txt Beuttner_STAR

--------------------------------------------------------------------------------
/2_DO_MapReadsFile_singleend.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
# Submits the single-end STAR mapping wrapper as a bsub job array, one array
# element per file in INPUTDIR. The commented-out assignments are the settings
# of an earlier dataset, kept for reference.

#OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/NeuronsEmmyLiora/FilesMapped
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome
mkdir -p $OUTPUTDIR
#INPUTDIR=/lustre/scratch108/compgen/team218/TA/NeuronsEmmyLiora/FilesUMITrimmed
INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesQCed
GENOMEDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/Bergiers
# Single-end: the number of jobs equals the number of input files (no division by 2).
INPUTFILES=($INPUTDIR/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(($NUMFILES))
#GENOMEDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora/
# Wrapper arguments, in order: threads, input dir, output dir, STAR parameter file, genome dir, prefix.
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>35000] rusage[mem=35000]" -M35000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2_MapReadsFile_singleend.sh 1 $INPUTDIR $OUTPUTDIR /nfs/users/nfs_t/ta6/RNASeqPipeline/2_STAR_Parameters.txt $GENOMEDIR Bergiers_Trimmed_Waf375
--------------------------------------------------------------------------------
/2_MapReadsFile.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps paired reads only!
## Haven't tested since moved genomedir out of parameterfile

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = STAR Parameters file
#    $5 = Prefix

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
PARAMFILE=$4
# The job index is part of the prefix so every array element gets its own temporary directory.
PREFIX="$5-$LSB_JOBINDEX-"
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
# NOTE(review): the genome directory is hard-coded here, unlike 2_MapReadsFile_singleend.sh which takes it as an argument.
GENOME=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora

#Check appropriate arguments
if [ ! -f "$STAR" ] ; then
  echo "Sorry STAR not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/4)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/4)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

# Array element i takes entries 2(i-1) and 2(i-1)+1 of the glob listing.
FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/4)"
  exit 1
fi

if [ -z "$PARAMFILE" ] ; then
  echo "Please include a parameter file (ARG 4/4)"
  exit 1
fi

if [ -z "$5" ] ; then
  echo "Warning: no file prefix included"
fi

# Make directory for output if necessary
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi

# Run STAR
# Output files are prefixed with the first mate's base name minus .fq(.gz);
# gzipped input is decompressed on the fly with zcat.
if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
  FILEnopath=$(basename "${FILE1TOMAP%.fq.gz}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --genomeDir "$GENOME" --readFilesIn "$FILE1TOMAP" "$FILE2TOMAP" --readFilesCommand zcat --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
else
  FILEnopath=$(basename "${FILE1TOMAP%.fq}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --genomeDir "$GENOME" --readFilesIn "$FILE1TOMAP" "$FILE2TOMAP" --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
fi
--------------------------------------------------------------------------------
/2_MapReadsFile_Transcriptome.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps paired reads only!

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = STAR Parameters file
#    $5 = Prefix

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
PARAMFILE=$4
# The job index is part of the prefix so every array element gets its own temporary directory.
PREFIX="$5-$LSB_JOBINDEX-"
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX

#Check appropriate arguments
if [ ! -f "$STAR" ] ; then
  echo "Sorry STAR not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/4)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/4)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -2 and silently
# pick the wrong files (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

# Array element i takes entries 2(i-1) and 2(i-1)+1 of the glob listing.
FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( (LSB_JOBINDEX - 1) * 2 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!
FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$FILE2TOMAP" ] || [ ! -e "$FILE2TOMAP" ] ; then
  echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/4)"
  exit 1
fi

if [ -z "$PARAMFILE" ] ; then
  echo "Please include a parameter file (ARG 4/4)"
  exit 1
fi

if [ -z "$5" ] ; then
  echo "Warning: no file prefix included"
fi

# Make directory for output if necessary
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi

# Run STAR
# NOTE(review): no --genomeDir is passed here, unlike the sibling wrappers -
# presumably it is expected to come from the parameter file; confirm.
if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
  FILEnopath=$(basename "${FILE1TOMAP%.fq.gz}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --quantMode TranscriptomeSAM --readFilesIn "$FILE1TOMAP" "$FILE2TOMAP" --readFilesCommand zcat --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
else
  FILEnopath=$(basename "${FILE1TOMAP%.fq}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --quantMode TranscriptomeSAM --readFilesIn "$FILE1TOMAP" "$FILE2TOMAP" --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
fi


--------------------------------------------------------------------------------
/2_MapReadsFile_singleend.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
# Note job array requires indexing to start at 1 but array indexing starts at 0
# Maps single-end reads: one input file per job-array element.

# Arguments:
#    $1 = number of threads to run on,
#    $2 = directory of files to map
#    $3 = outputdirectory
#    $4 = STAR Parameters file
#    $5 = STAR Genome directory
#    $6 = Prefix

NUMTHREADS=$1
FILESTOMAPDIR=$2
OUTDIR=$3
STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
PARAMFILE=$4
# The job index is part of the prefix so every array element gets its own temporary directory.
PREFIX="$6-$LSB_JOBINDEX-"
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/STAR/$PREFIX
GENOME=$5

#Check appropriate arguments
if [ ! -f "$STAR" ] ; then
  echo "Sorry STAR not available "
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to use (ARG 1/5)"
  exit 1
fi

if [ -z "$FILESTOMAPDIR" ] ; then
  echo "Please include a directory of files to map (ARG 2/5)"
  exit 1
fi

# Without a job index the arithmetic below would evaluate to -1 and silently
# pick the wrong file (or fail with a bad subscript), so refuse to continue.
if [ -z "$LSB_JOBINDEX" ] ; then
  echo "LSB_JOBINDEX is not set: run this script as an element of a bsub job array"
  exit 1
fi

FILEStoMAP=("$FILESTOMAPDIR"/*)
ARRAYINDEX=$(( LSB_JOBINDEX - 1 ))
FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indices start at 0 but job array indices must start at 1!!!

# -e also catches the unexpanded "dir/*" pattern left behind by an empty directory.
if [ -z "$FILE1TOMAP" ] || [ ! -e "$FILE1TOMAP" ] ; then
  echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please include a directory for output (ARG 3/5)"
  exit 1
fi

if [ -z "$PARAMFILE" ] ; then
  echo "Please include a parameter file (ARG 4/5)"
  exit 1
fi

# $5 is the genome directory in this wrapper; it used to be tested for the
# "no file prefix" warning by mistake and was never validated itself.
if [ -z "$GENOME" ] ; then
  echo "Please include a STAR genome directory (ARG 5/5)"
  exit 1
fi

if [ -z "$6" ] ; then
  echo "Warning: no file prefix included"
fi

# Make directory for output if necessary
if [ ! -d "$OUTDIR" ] ; then
  mkdir -p "$OUTDIR"
fi

# Run STAR
# Output files are prefixed with the input's base name minus .fq(.gz);
# gzipped input is decompressed on the fly with zcat.
if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
  FILEnopath=$(basename "${FILE1TOMAP%.fq.gz}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --readFilesIn "$FILE1TOMAP" --genomeDir "$GENOME" --readFilesCommand zcat --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
else
  FILEnopath=$(basename "${FILE1TOMAP%.fq}")
  "$STAR" --runThreadN "$NUMTHREADS" --runMode alignReads --readFilesIn "$FILE1TOMAP" --genomeDir "$GENOME" --parametersFiles "$PARAMFILE" --outFileNamePrefix "$OUTDIR/$FILEnopath" --outTmpDir "$WORKINGDIR"
fi
--------------------------------------------------------------------------------
/2_STAR_Parameters.txt:
--------------------------------------------------------------------------------
outSAMstrandField intronMotif
outFilterIntronMotifs RemoveNoncanonical
outSAMtype BAM SortedByCoordinate
outFilterType BySJout
outFilterMultimapNmax 20
alignSJoverhangMin 8
alignSJDBoverhangMin 1
outFilterMismatchNmax 999
outFilterMismatchNoverLmax 0.04
alignIntronMin 20
alignIntronMax 1000000
alignMatesGapMax 1000000
--------------------------------------------------------------------------------
/3_CLEANUP_MapReadFiles.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh to break it down into steps.
# Run this after mapping is finished.
4 | 5 | # These must be consistent with 2_DO_MapReadsFile.sh 6 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome 7 | TAG="Trimmed50-Bergiers_Waf375" 8 | 9 | rm $OUTPUTDIR/*Log.progress.out 10 | 11 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/3_Compile_Mapping_Statistics.pl $OUTPUTDIR > /lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/$TAG.mapped_summary.out 12 | 13 | tar -cvzf $OUTPUTDIR/$TAG.ParameterLogfiles.tar.gz $OUTPUTDIR/*Log.out 14 | tar -cvzf $OUTPUTDIR/$TAG.SpliceJunctionfiles.tar.gz $OUTPUTDIR/*SJ.out.tab 15 | tar -cvzf $OUTPUTDIR/$TAG.FinalLogfiles.tar.gz $OUTPUTDIR/*Log.final.out 16 | rm $OUTPUTDIR/*Log.out 17 | rm $OUTPUTDIR/*SJ.out.tab 18 | rm $OUTPUTDIR/*Log.final.out 19 | -------------------------------------------------------------------------------- /3_Compile_Mapping_Statistics.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (scalar(@ARGV) != 1) {die "Please provide a directory of STAR output\n";} 5 | 6 | my @files = glob("$ARGV[0]/*Log.final.out"); 7 | print "lane\tsample\texp\tproject\tfile\tNreads\tNuniquemap\tNmultimap\tNnomap\tNsplice\tNnovelSJ\tNoMapTooManyMap\tNoMapTooManyMis\tNoMapTooShort\n"; 8 | foreach my $file (@files) { 9 | my $fullfilename = $file; 10 | 11 | # Get as much info from file names as possible 12 | $file =~ /([^\/]+)$/; $file = $1; 13 | my $laneID = "NA"; my $sampleID = "NA"; my $expID = "NA"; my $projectID = "NA"; my $fileID = "NA"; 14 | if ($file =~ s/(lane\d+)//) {$laneID = $1;} 15 | # if ($file =~ s/(exp\d+)//) {$expID = $1;} 16 | # if ($file =~ s/(sc\d)//) {$expID = $1;} 17 | if ($file =~ s/([ACTG]{5,})//) {$sampleID = $1;} 18 | # if ($file =~ s/(cell\d\d)//) {$sampleID = $1;} 19 | if ($file =~ /^([^_]+)/) { 20 | my @remnants = split(/_+/, $file); 21 | $projectID = $remnants[0]; 22 | $fileID = $remnants[1]; 23 | } 24 | 25 | # print "$fullfilename\n"; 26 | print 
"$laneID\t$sampleID\t$expID\t$projectID\t$fileID\t"; 27 | 28 | my $Nreads = 0, my $Nuniquelymapped = 0; my $Nmultimap = 0; my $Nsplice = 0; my $NspliceAnn = 0; 29 | my $UnmappedTooManyMultimapN = 0; my $UnmappedTooManyMMprop = 0; my $UnmappedTooShortprop = 0; 30 | open(my $ifh, $fullfilename) or die $!; 31 | while (<$ifh>) { 32 | if ($_ =~ /Number of input reads[\s|]+(\d+)/) {$Nreads = $1;} 33 | if ($_ =~ /Uniquely mapped reads number[\s|]+(\d+)/) {$Nuniquelymapped = $1;} 34 | if ($_ =~ /Number of reads mapped to multiple loci[\s|]+(\d+)/) {$Nmultimap = $1;} 35 | if ($_ =~ /Number of splices: Total[\s|]+(\d+)/) {$Nsplice = $1;} 36 | if ($_ =~ /Number of splices: Annotated \(sjdb\)[\s|]+(\d+)/) {$NspliceAnn = $1;} 37 | if ($_ =~ /Number of reads mapped to too many loci[\s|]+(\d+)/) {$UnmappedTooManyMultimapN = $1;} 38 | if ($_ =~ /of reads unmapped: too many mismatches[\s|]+([\d\.]+%)/) {$UnmappedTooManyMMprop = $1;} 39 | if ($_ =~ /of reads unmapped: too short[\s|]+([\d\.]+%)/) {$UnmappedTooShortprop = $1;} 40 | } close ($ifh); 41 | 42 | print "$Nreads\t$Nuniquelymapped\t$Nmultimap\t".($Nreads-$Nuniquelymapped-$Nmultimap)."\t$Nsplice\t".($Nsplice-$NspliceAnn)."\t$UnmappedTooManyMultimapN\t$UnmappedTooManyMMprop\t$UnmappedTooShortprop\n"; 43 | 44 | # exit(); #short circuit for debugging 45 | } 46 | -------------------------------------------------------------------------------- /3_Compile_UMI_Statistics.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (scalar(@ARGV) != 1) {die "Please provide a directory of UMI-tools running output\n";} 5 | 6 | my @files = glob("$ARGV[0]/*err*"); 7 | print "sample\tmethod\tNreads\tNmolecules\n"; 8 | for(my $i = 0; $i < scalar(@files); $i++) { 9 | my $file = $files[$i]; 10 | 11 | my $Nreads = 0, my $Nmolecules = 0; 12 | my $cellID = ""; my $method = ""; 13 | open(my $ifh, $file) or die $!; 14 | while (<$ifh>) { 15 | if ($_ =~ /Number of reads 
in:\s*(\d+)/) {$Nreads = $1;} 16 | if ($_ =~ /Number of reads out:\s*(\d+)/) {$Nmolecules = $1;} 17 | if ($_ =~ /([AGCT]+)Aligned/) {$cellID = $1;} 18 | if ($_ =~ /Method:\s*(\w+)/) {$method = $1;} 19 | } close ($ifh); 20 | 21 | print "$cellID\t$method\t$Nreads\t$Nmolecules\n"; 22 | } 23 | -------------------------------------------------------------------------------- /3_DO_UmiDedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline. 3 | 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDeDupped 5 | mkdir -p $OUTPUTDIR 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome 7 | INPUTFILES=($INPUTDIR/*.bam) 8 | NUMFILES=${#INPUTFILES[@]} 9 | MAXJOBS=$(($NUMFILES)) 10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o umi-tools.out.%J.%I -e umi-tools.err.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/3_UmiDedup.sh $INPUTDIR $OUTPUTDIR Rerum_DirAdj_transcriptome directional-adjacency 11 | 12 | # methods: 13 | # options_method = "directional-adjacency" 14 | # options_method = "adjacency" 15 | # options_method = "unique" 16 | 17 | -------------------------------------------------------------------------------- /3_SAMtools_sort_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ] ; then 4 | echo "Please set input file (ARG 1/3)" 5 | exit 1 6 | fi 7 | if [ -z $2 ] ; then 8 | echo "Please set an output prefix (ARG 2/3)" 9 | exit 1 10 | fi 11 | if [ -z $3 ] ; then 12 | echo "Please set an max memory limit (ARG 3/3)" 13 | exit 1 14 | fi 15 | 16 | samtools sort -m $3 $1 $2 || exit 1 # sort input BAM ($1) to output prefix ($2) with memory limit ($3); on failure stop here so the unsorted input is NOT deleted 17 | rm $1 # only reached when the sort succeeded 18 | -------------------------------------------------------------------------------- /3_SortBAMs.pl: 
-------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | foreach my $file (glob("/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/*.out.bam")) { 5 | $file =~ /(.*)\.out\.bam$/; 6 | my $outprefix = "$1.sorted"; 7 | system("bsub -R\"select[mem>3000] rusage[mem=3000]\" -M3000 -q normal -o output.%J /nfs/users/nfs_t/ta6/RNASeqPipeline/3_SAMtools_sort_wrapper.sh $file $outprefix 3000000000\n"); 8 | } 9 | -------------------------------------------------------------------------------- /3_UmiDedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command. 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0 4 | # Maps paired reads only! 5 | 6 | ## Haven't tested since moved genomedir out of parameterfile 7 | 8 | # Arguments: 9 | # $1 = directory of files to map 10 | # $2 = outputdirectory 11 | # $3 = Prefix 12 | # $4 = Method 13 | 14 | FILESTOMAPDIR=$1 15 | OUTDIR=$2 16 | PREFIX=$3 17 | METHOD=$4 18 | DIST_THRESH=0 19 | UMITOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/dedup_umi.py 20 | 21 | #Check appropriate arguments 22 | if [ ! -f "$UMITOOLS" ] ; then 23 | echo "Sorry UMI-tools not available " 24 | exit 1 25 | fi 26 | 27 | if [ -z "$FILESTOMAPDIR" ] ; then 28 | echo "Please include a directory of files to map (ARG 1/3)" 29 | exit 1 30 | fi 31 | MYFILES=($FILESTOMAPDIR/*.bam) 32 | ARRAYINDEX=$((($LSB_JOBINDEX-1))) 33 | MYFILE=${MYFILES[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!! 34 | echo $FILESTOMAPDIR 35 | echo $ARRAYINDEX 36 | echo ${#MYFILES[@]} 37 | echo $MYFILE 38 | if [ -z "$MYFILE" ] ; then 39 | echo "$MYFILE the $ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist." 
40 | exit 1 41 | fi 42 | 43 | if [ -z "$OUTDIR" ] ; then 44 | echo "Please include a directory for output (ARG 2/3)" 45 | exit 1 46 | fi 47 | 48 | if [ -z "$PREFIX" ] ; then 49 | echo "Warning: no file prefix included" 50 | fi 51 | 52 | # Make directory for output if necessary 53 | if [ ! -d "$OUTDIR" ] ; then 54 | mkdir -p $OUTDIR 55 | fi 56 | 57 | # Run the UMI deduplication script ($UMITOOLS) on the selected BAM; output is written to $OUTDIR/$PREFIX-<input basename>.bam 58 | FILEnopath=`basename ${MYFILE%.bam}` 59 | /usr/bin/python $UMITOOLS $MYFILE $DIST_THRESH $OUTDIR/$PREFIX-$FILEnopath.bam $METHOD 60 | -------------------------------------------------------------------------------- /3_merge_dedup_MappedReads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copied commands from 00_LIST_OF_BSUB_COMMANDS.sh 3 | 4 | # These must be consistent with 2_DO_MapReadsFile.sh 5 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat/ 6 | TAG="Beuttner_Tophat2_dedup" 7 | SCRIPT=/nfs/users/nfs_t/ta6/RNASeqPipeline/4_MergeBAMs.pl 8 | 9 | if [ ! -f $SCRIPT ] ; then 10 | echo "$SCRIPT not available" 11 | exit 1 12 | fi 13 | 14 | if [ -z $TAG ] ; then 15 | echo "No project tag" 16 | exit 1 17 | fi 18 | 19 | if [ -z $OUTPUTDIR ] ; then 20 | echo "No directory of sorted mapped read bam files" 21 | exit 1 22 | fi 23 | 24 | # Do I want to do this in here? 
25 | perl $SCRIPT $OUTPUTDIR $TAG 26 | 27 | MappedDedupDIR=$OUTPUTDIR/$TAG/Deduplicated 28 | MappedWdupDIR=$OUTPUTDIR/$TAG/WithDuplicates 29 | mkdir -p $MappedDedupDIR 30 | mkdir -p $MappedWdupDIR 31 | mv $OUTPUTDIR/*dedup* $MappedDedupDIR 32 | mv $OUTPUTDIR/*sorted*.bam $MappedWdupDIR 33 | #tar -cvzf $OUTPUTDIR/Bergiers_exp2_mapping_output.tar.gz $OUTPUTDIR/Bergiers*exp2* 34 | #rm $OUTPUTDIR/Bergiers_lane*.bam 35 | 36 | -------------------------------------------------------------------------------- /4_Convert_GTF2BED_customized_for_Ensembl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Downloaded from: https://code.google.com/p/ea-utils/source/browse/trunk/clipper/gtf2bed on 9 April 2015 4 | # Based on suggestion from: http://onetipperday.blogspot.com/2012/08/convert-bed-to-gtf.html 5 | 6 | # Copyright (c) 2011 Erik Aronesty (erik@q32.com) 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | # THE SOFTWARE. 25 | # 26 | # ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT. 27 | 28 | use Data::Dumper; 29 | use Getopt::Long; 30 | 31 | 32 | my $extended; 33 | GetOptions("x"=>\$extended); 34 | 35 | $in = shift @ARGV; 36 | 37 | if (!defined($in) || $in eq "" || !-e $in) {die "No input file provided $!\n";} 38 | 39 | my $in_cmd =($in =~ /\.gz$/ ? "gunzip -c $in|" : $in =~ /\.zip$/ ? "unzip -p $in|" : "$in") || die "Please provide valid gtf (or compressed gtf) file, Can't open $in: $!\n"; 40 | open(IN, $in_cmd) or die "Can't open $in: $!\n"; # 2-arg open kept on purpose: $in_cmd may be a "gunzip -c file|" / "unzip -p file|" pipe; now fails loudly instead of silently reading nothing 41 | 42 | while (<IN>) { # read the input one record at a time into $_ (the readline operator was missing, leaving an endless loop that never read any input) 43 | $gff = 2 if /^##gff-version 2/; 44 | $gff = 3 if /^##gff-version 3/; 45 | next if /^#/ && $gff; 46 | 47 | s/\s+$//; 48 | # 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr 49 | my @f = split /\t/; 50 | # Tallulah's Modifications: 51 | ($transid) = $f[8]=~ /transcript_id "([^"]+)"/; 52 | ($geneid) = $f[8]=~ /gene_id "([^"]+)"/; 53 | ($gene_type) = $f[8]=~ /gene_biotype "([^"]+)"/; 54 | ($gene_name) = $f[8]=~ /gene_name "([^"]+)"/; 55 | ($trans_type) = $f[8]=~ /transcript_biotype "([^"]+)"/; 56 | $id="${gene_name}__${geneid}__${transid}__${gene_type}.${trans_type}"; 57 | # if ($gff) { 58 | # # most ver 2's stick gene names in the id field 59 | # ($id) = $f[8]=~ /\bID="([^"]+)"/; 60 | # # most ver 3's stick unquoted names in the name field 61 | # ($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3; 62 | # } else { 63 | # ($id) = $f[8]=~ /transcript_id "([^"]+)"/; 64 | # } 65 | # End of Modifications --- 66 | 67 | next unless $id && $f[0]; 68 | 69 | if ($f[2] eq 'exon') { 70 | die "no position at exon on line $." if ! 
$f[3]; 71 | # gff3 puts :\d in exons sometimes 72 | $id =~ s/:\d+$// if $gff == 3; 73 | push @{$exons{$id}}, \@f; 74 | # save lowest start 75 | $trans{$id} = \@f if !$trans{$id}; 76 | } elsif ($f[2] eq 'start_codon') { 77 | #optional, output codon start/stop as "thick" region in bed 78 | $sc{$id}->[0] = $f[3]; 79 | } elsif ($f[2] eq 'stop_codon') { 80 | $sc{$id}->[1] = $f[4]; 81 | } elsif ($f[2] eq 'miRNA' ) { 82 | $trans{$id} = \@f if !$trans{$id}; 83 | push @{$exons{$id}}, \@f; 84 | } 85 | } 86 | 87 | for $id ( 88 | # sort by chr then pos 89 | sort { 90 | $trans{$a}->[0] eq $trans{$b}->[0] ? 91 | $trans{$a}->[3] <=> $trans{$b}->[3] : 92 | $trans{$a}->[0] cmp $trans{$b}->[0] 93 | } (keys(%trans)) ) { 94 | my ($chr, undef, undef, undef, undef, undef, $dir, undef, $attr, undef, $cds, $cde) = @{$trans{$id}}; 95 | my ($cds, $cde); 96 | ($cds, $cde) = @{$sc{$id}} if $sc{$id}; 97 | 98 | # sort by pos 99 | my @ex = sort { 100 | $a->[3] <=> $b->[3] 101 | } @{$exons{$id}}; 102 | 103 | my $beg = $ex[0][3]; 104 | my $end = $ex[-1][4]; 105 | 106 | if ($dir eq '-') { 107 | # swap 108 | $tmp=$cds; 109 | $cds=$cde; 110 | $cde=$tmp; 111 | $cds -= 2 if $cds; 112 | $cde += 2 if $cde; 113 | } 114 | 115 | # not specified, just use exons 116 | $cds = $beg if !$cds; 117 | $cde = $end if !$cde; 118 | 119 | # adjust start for bed 120 | --$beg; --$cds; 121 | 122 | my $exn = @ex; # exon count 123 | my $exst = join ",", map {$_->[3]-$beg-1} @ex; # exon start 124 | my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex; # exon size 125 | 126 | my $gene_id; 127 | my $extend = ""; 128 | if ($extended) { 129 | ($gene_id) = $attr =~ /gene_name "([^"]+)"/; 130 | ($gene_id) = $attr =~ /gene_id "([^"]+)"/ unless $gene_id; 131 | $extend="\t$gene_id"; 132 | } 133 | # added an extra comma to make it look exactly like ucsc's beds 134 | print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,$extend\n"; 135 | } 136 | 137 | 138 | close IN; 139 | 
-------------------------------------------------------------------------------- /4_DO_RSeQC_Multiple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Arguments: 3 | # $1 = Organism under consideration 4 | # $2 = input bam dir 5 | # $3 = output directory 6 | 7 | ORGANISM=$1 8 | INPUTDIR=$2 9 | OUTDIR=$3 10 | 11 | if [ -z $ORGANISM ] ; then 12 | echo "Please set organism for reference annotations (ARG 1/3)" 13 | exit 1 14 | fi 15 | 16 | if [ -z $INPUTDIR ] ; then 17 | echo "$INPUTDIR does not exist. Please provide a directory of BAMfiles (ARG 2/3)" 18 | exit 1 19 | fi 20 | 21 | if [ -z $OUTDIR ] ; then 22 | echo "Please set a directory for the output files (ARG 3/3)" 23 | exit 1 24 | fi 25 | 26 | mkdir -p $OUTDIR 27 | 28 | 29 | # Check relevant annotation/gene model files exist and have been converted to BED format -> prevent multiple jobs trying to write to the same place. 30 | # This code is duplicated in 4_RSeQC_Multiple.sh so that it can be run safely on its own (specific/detailed analyses for particular files) or from this script for bulk analysis 31 | MASKgtf=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf 32 | MASKbed="${MASKgtf%.gtf}.bed" 33 | REFGENOME="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf" 34 | REFGENOMEbed="${REFGENOME%.gtf}.bed" 35 | 36 | if [ ! -f $MASKbed ] ; then 37 | echo "Cannot find $MASKbed: attempting to make it." 38 | if [ ! -f $MASKgtf ] ; then 39 | /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0 40 | fi 41 | if [ ! -s $MASKgtf ] ; then 42 | echo "Cannot find or make $MASKgtf\n" 43 | exit 1 44 | fi 45 | # Convert to bed format 46 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $MASKgtf > $MASKbed 47 | if [ ! -s $MASKbed ] ; then 48 | echo "Failed to make $MASKbed\n" 49 | exit 1 50 | fi 51 | fi 52 | 53 | 54 | 55 | if [ ! 
-f $REFGENOMEbed ] ; then 56 | echo "Cannot find $REFGENOMEbed: attempting to make it." 57 | if [ ! -f $REFGENOME ] ; then 58 | /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0 59 | fi 60 | if [ ! -s $REFGENOME ] ; then 61 | echo "Cannot find or make $REFGENOME\n" 62 | exit 1 63 | fi 64 | # Convert to bed format 65 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $REFGENOME > $REFGENOMEbed 66 | if [ ! -s $REFGENOMEbed ] ; then 67 | echo "Failed to make $REFGENOMEbed\n" 68 | exit 1 69 | fi 70 | fi 71 | 72 | for INPUTFILE in $INPUTDIR/*.bam ; do 73 | OUTPREFIX=$(basename $INPUTFILE) 74 | OUTPREFIX=${OUTPREFIX%.bam} 75 | bsub -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o $OUTDIR/RSEQC_$OUTPREFIX.output /nfs/users/nfs_t/ta6/RNASeqPipeline/4_RSeQC_Multiple.sh $ORGANISM $INPUTFILE $OUTDIR/RSEQC_$OUTPREFIX 1 76 | done 77 | -------------------------------------------------------------------------------- /4_MergeBAMs.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | # 18 Apr 2015 : added path to samtools and the check that this samtools is available. 5 | # 10 Apr 2015 : added indexing of dedupped file. 6 | 7 | if (scalar(@ARGV) < 2) {die "Arguements: sortedmappedbamfiledirectory outputfileprefix\n";} 8 | 9 | my $dir = $ARGV[0];#"/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped"; 10 | 11 | # Sort files by cell 12 | my @files = glob("$dir/*sorted*aligned.bam"); 13 | my %sample2files = (); 14 | foreach my $file (@files) { 15 | # if ($file =~ /([ATCG]{5,})/) { 16 | if ($file =~ /([^\/]+_Cell\d\d)/) { 17 | push(@{$sample2files{$1}},$file); 18 | } else { 19 | die "$file does not match?\n"; 20 | } 21 | } 22 | 23 | if (! 
-e "/usr/bin/samtools") { die "Cannot find samtools\n";} 24 | # merge files for each cell 25 | open (my $ofh, ">", "$ARGV[1]\_DeDuppingStatistics.out") or die $!; 26 | print $ofh "sample\tdups\treads\n"; 27 | foreach my $sample (sort(keys(%sample2files))) { 28 | print STDERR "Starting $sample\n"; 29 | my $mergedfile = "$dir/$ARGV[1]\_$sample.sorted.bam"; 30 | my $dedupedfile = "$dir/$ARGV[1]\_$sample.sorted.dedupped.bam"; 31 | if (! -e $dedupedfile) { 32 | my @infiles = @{$sample2files{$sample}}; 33 | if (scalar(@infiles) > 1) { 34 | system("/usr/bin/samtools merge $mergedfile @infiles\n") == 0 or die "samtools merge failed for $sample\n"; # actually run the merge (it was only printed before); stop rather than dedup a missing/partial merged file 35 | # print STDERR "Finished Merging @infiles\n"; 36 | system("/usr/bin/samtools rmdup $mergedfile $dedupedfile 2> dup.log\n"); # run rmdup on the merged file, exactly as the single-file branch below does 37 | # print STDERR "Finished removing dups from $mergedfile\n"; 38 | # system("samtools index $dedupedfile\n"); # Changed my mind, will do this as needed from 4_RSeQC_Multiple.sh 39 | } else { 40 | $mergedfile = $infiles[0]; 41 | system("/usr/bin/samtools rmdup $mergedfile $dedupedfile 2> dup.log\n"); 42 | } 43 | my $last = ""; 44 | # open (my $ifh, "dup.log") or die $!; 45 | # while (<$ifh>) {$last=$_;} close($ifh); 46 | # if ($last =~ /(\d+) \/ (\d+) =/) { 47 | # print $ofh "$sample\t$1\t$2\n"; 48 | # } else {die "$last line does not match\n";} 49 | ## print STDERR "Finished extracting data from logfile & writing to new output file\n"; 50 | } 51 | } 52 | close ($ofh); 53 | -------------------------------------------------------------------------------- /4_Process_RSEQC_output.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | # Optionally takes a second argument of a list of samples to exclude. 
-> this is untested 5 | 6 | # Output: 7 | # sample totalreads QCfailed duplicates multimap uniquemap unmapped rRNA read1/read2 +/- non-splice splice GeneBodyskewness avgfragsize 8 | 9 | if (@ARGV < 1) {die "Please provide a directory of RSeQC output files (and optionally a file of samples to exclude)\n";} 10 | 11 | my %exclude = (); 12 | if (defined($ARGV[1])) { 13 | open(my $ifh, $ARGV[1]) or die $!; 14 | while (<$ifh>) { 15 | chomp; 16 | my @record = split(/\s+/); 17 | foreach my $ele (@record) { 18 | $exclude{$ele} = 1; 19 | } 20 | } close($ifh); 21 | } 22 | 23 | my @files = glob("$ARGV[0]/RSEQC_*.output"); 24 | my %sample2output = (); 25 | 26 | 27 | 28 | my @ordered_expected_keys = ("totalreads","QCfailed","duplicates","multimap","uniquemap","unmapped","rRNA", "read1/read2", "+/-", "non-splice", "splice", "GeneBodyskewness", "avgfragsize"); 29 | 30 | foreach my $file (@files) { 31 | # $file =~ /([ATGC]{5,})/; 32 | $file =~ /(sc\d_cell\d\d)/; 33 | my $sample = $1; 34 | if (exists($exclude{$sample})) {next;} 35 | open (my $ifh, $file) or die $!; 36 | my %outputs = (); 37 | while (<$ifh>) { 38 | if ($_ =~ /Total records:\s*(\d+)/) { 39 | $outputs{"totalreads"} = $1; 40 | } 41 | elsif ($_ =~ /Reads consumed by input gene list\):\s*(\d+)/) { 42 | $outputs{"rRNA"} = $1; 43 | } 44 | elsif ($_ =~ /QC failed:\s*(\d+)/) { 45 | $outputs{"QCfailed"} = $1; 46 | } 47 | elsif ($_ =~ /PCR duplicate:\s*(\d+)/) { 48 | $outputs{"duplicates"} = $1; 49 | } 50 | elsif ($_ =~ /Non primary hits\s*(\d+)/) { 51 | $outputs{"multimap"} += $1; 52 | } 53 | elsif ($_ =~ /mapq < mapq_cut \(non-unique\):\s*(\d+)/) { 54 | $outputs{"multimap"} += $1; 55 | } 56 | elsif ($_ =~ /mapq >= mapq_cut \(unique\):\s*(\d+)/) { 57 | $outputs{"uniquemap"} += $1; 58 | } 59 | elsif ($_ =~ /Unmapped reads:\s*(\d+)/) { 60 | $outputs{"unmapped"} = $1; 61 | } 62 | elsif ($_ =~ /Read-1:\s*(\d+)/) { 63 | $outputs{"read1"} = $1; 64 | } 65 | elsif ($_ =~ /Read-2:\s*(\d+)/) { 66 | $outputs{"read2"} = $1; 67 | } 68 | 
elsif ($_ =~ /Reads map to '\+':\s*(\d+)/) { 69 | $outputs{"+"} = $1; 70 | } 71 | elsif ($_ =~ /Reads map to '\-':\s*(\d+)/) { 72 | $outputs{"-"} = $1; 73 | } 74 | elsif ($_ =~ /Non-splice reads:\s*(\d+)/) { 75 | $outputs{"non-splice"} = $1; 76 | } 77 | elsif ($_ =~ /Splice reads:\s*(\d+)/) { 78 | $outputs{"splice"} = $1; 79 | } 80 | elsif ($_ =~ /Sample\s+Skewness/) { 81 | my $data = <$ifh>; 82 | $data =~ /\s+([-\.\d]+)/; 83 | $outputs{"GeneBodyskewness"} = $1; 84 | } 85 | else { 86 | #count number of tabs in line 87 | my @record = split(/\t/); 88 | if (scalar(@record) == 8 && $record[7] =~ /\d/) { 89 | $outputs{"sumfrag"} += $record[5]*$record[4]; 90 | $outputs{"numfrag"} += $record[4]; 91 | } 92 | } 93 | } close($ifh); 94 | if (exists($outputs{"read1"}) && exists($outputs{"read2"}) && $outputs{"read2"} > 0) { # ratio of Read-1 to Read-2 counts; a zero Read-2 count yields "NA" instead of dying on division by zero 95 | $outputs{"read1/read2"} = $outputs{"read1"}/$outputs{"read2"}; 96 | } else { 97 | $outputs{"read1/read2"} = "NA"; 98 | } 99 | if (exists($outputs{"+"}) && exists($outputs{"-"}) && $outputs{"-"} > 0) { # ratio of '+' to '-' strand read counts; same zero-denominator guard as avgfragsize below 100 | $outputs{"+/-"} = $outputs{"+"}/$outputs{"-"}; 101 | } else { 102 | $outputs{"+/-"} = "NA"; 103 | } 104 | if ((exists($outputs{"sumfrag"}) && exists($outputs{"numfrag"})) && $outputs{"numfrag"} > 0) { 105 | $outputs{"avgfragsize"} = $outputs{"sumfrag"}/$outputs{"numfrag"}; 106 | } else { 107 | $outputs{"avgfragsize"} = "NA"; 108 | } 109 | 110 | foreach my $key (@ordered_expected_keys){ 111 | if (!exists($outputs{$key})) { 112 | die "No data for $key\n"; 113 | } 114 | push(@{$sample2output{$sample}},$outputs{$key}); 115 | } 116 | } 117 | 118 | print "sample\t".join("\t", @ordered_expected_keys)."\n"; 119 | foreach my $sample (keys(%sample2output)) { 120 | print "$sample\t".join("\t", @{$sample2output{$sample}})."\n"; 121 | } 122 | 123 | # Combine Rscripts 124 | 125 | @files = glob("$ARGV[0]/RSEQC_*.GC_plot.r"); 126 | my $plotcmd = ""; 127 | my @datacmds = (); 128 | my @bincounts = (0)x100; 129 | my $pdfcmd="pdf(\"$ARGV[0]/RSEQC_GC_plot_Combined.pdf\")\n"; 130 | foreach my $file (@files) { 131 | 
132 | # $file =~ /([ATCG]{5,})/; my $sample = $1; 133 | $file =~ /(sc\d_cell\d\d)/; my $sample = $1; 134 | if (exists($exclude{$sample})) {next;} 135 | 136 | open (my $ifh, $file) or die $!; 137 | <$ifh>; #pdfcmd 138 | my $data = <$ifh>; 139 | # Data is originally "rep(c(),times=c())" how to process this? -> split the two c()'s and interate for ($i ...) through each of them. 140 | my @stuff = split(/[\(\)]/,$data); 141 | my @values = split(",",$stuff[2]); 142 | my @times = split(",",$stuff[4]); 143 | if (scalar(@values) != scalar(@times)) {die "Does not compute: Not same number of values as times\n";} 144 | for (my $i =0; $i < scalar(@values); $i++) { 145 | my $index = int($values[$i]); 146 | # floor each data point to nearest integer, add 1 to that index of @bincounts 147 | $bincounts[$index]+=$times[$i]; 148 | } 149 | $plotcmd=<$ifh>;# need new plot command 150 | <$ifh>; 151 | close($ifh); 152 | } 153 | open(my $ofh, ">", "$ARGV[0]/RSEQC_GC_plot_Combined.r") or die $!; 154 | print $ofh $pdfcmd; 155 | print $ofh "data=c(".join(",",@bincounts).")\n"; 156 | print $ofh "xes=barplot(data/sum(data), space=0, col=\"white\",ylab=\"Density of Reads\", border=\"blue\", main=\"\", xlab=\"GCcontent (%)\")\n"; 157 | print $ofh "axis(1,at=xes,labels=1:100,col=\"white\")\n"; 158 | print $ofh "dev.off()\n"; 159 | close($ofh); 160 | 161 | @files = glob("$ARGV[0]/RSEQC_*.geneBodyCoverage.r"); 162 | $plotcmd = "matplot(data,type='l', xlab=\"Gene body percentile (5'->3')\", ylab=\"Coverage\",lwd=0.8,col=colours)\n"; 163 | @datacmds = (); 164 | my $colourcmd = "colours=colorRampPalette(c(\"#7fc97f\",\"#beaed4\",\"#fdc086\",\"#ffff99\",\"#386cb0\",\"#f0027f\"))(".scalar(@files).")\n"; 165 | $pdfcmd="pdf(\"$ARGV[0]/RSEQC_geneBodyCoverage_plot_Combined.pdf\")\n"; 166 | foreach my $file (@files) { 167 | # $file =~ /([ATCG]{5,})/; my $sample = $1; 168 | $file =~ /(sc\d_cell\d\d)/; my $sample = $1; 169 | if (exists($exclude{$sample})) {next;} 170 | 171 | open (my $ifh, $file) or die $!; 
172 | my $data = <$ifh>; 173 | $data =~ /(c\(.+\))/; 174 | push(@datacmds, $1); 175 | close($ifh); 176 | } 177 | open($ofh, ">", "$ARGV[0]/RSEQC_geneBodyCoverage_plot_Combined.r") or die $!; 178 | print $ofh $pdfcmd; 179 | print $ofh "data=cbind(".join(",",@datacmds).")\n"; 180 | print $ofh $colourcmd; 181 | print $ofh $plotcmd; 182 | print $ofh "dev.off()\n"; 183 | close($ofh); 184 | 185 | # Use average at each point for each line over all samples! 186 | @files = glob("$ARGV[0]/RSEQC_*.junctionSaturation_plot.r"); 187 | my $xcmd = "x=c(5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100)"; 188 | $pdfcmd="pdf(\"$ARGV[0]/RSEQC_junctionSaturation_plot_Combined.pdf\")"; 189 | my $legendcmd = "legend(5,40, legend=c(\"All junctions\",\"known junctions\", \"novel junctions\"),col=c(\"blue\",\"red\",\"green\"),lwd=1,pch=1)"; 190 | my %data = (); 191 | foreach my $file (@files) { 192 | # $file =~ /([ATCG]{5,})/; my $sample = $1; 193 | $file =~ /(sc\d_cell\d\d)/; my $sample = $1; 194 | if (exists($exclude{$sample})) {next;} 195 | 196 | open (my $ifh, $file) or die $!; 197 | <$ifh>;<$ifh>; #pdf cmd, xes 198 | my $y = <$ifh>; $y =~ s/y=c\(//; $y =~s/\)//; 199 | my @yes = split(/,/,$y); 200 | for (my $i = 0; $i < scalar(@yes); $i++) {$data{"y"}->[$i] += $yes[$i];} 201 | 202 | my $z = <$ifh>; $z =~ s/z=c\(//; $z =~s/\)//; 203 | my @zes = split(/,/,$z); 204 | for (my $i = 0; $i < scalar(@zes); $i++) {$data{"z"}->[$i] += $zes[$i];} 205 | 206 | my $w = <$ifh>; $w =~ s/w=c\(//; $w =~s/\)//; 207 | my @wes = split(/,/,$w); 208 | for (my $i = 0; $i < scalar(@wes); $i++) {$data{"w"}->[$i] += $wes[$i];} 209 | 210 | close($ifh); 211 | } 212 | #plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(n,m)) 213 | #points(x,y/1000,type='o',col='red') 214 | #points(x,w/1000,type='o',col='green') 215 | 216 | open($ofh, ">", "$ARGV[0]/RSEQC_junctionSaturation_plot_Combined.r") or die $!; 217 | print $ofh 
"x=c(5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100)\n"; 218 | print $ofh "y=c(".join(",", @{$data{"y"}}).")/".scalar(@files)."\n"; 219 | print $ofh "z=c(".join(",", @{$data{"z"}}).")/".scalar(@files)."\n"; 220 | print $ofh "w=c(".join(",", @{$data{"w"}}).")/".scalar(@files)."\n"; 221 | print $ofh "m=max(y,z,w)/1000\nn=min(y,z,w)/1000\n"; 222 | print $ofh $pdfcmd."\n"; 223 | print $ofh "plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(n,m))\npoints(x,y/1000,type='o',col='red')\npoints(x,w/1000,type='o',col='green')\n"; 224 | print $ofh $legendcmd."\n"; 225 | 226 | print $ofh "dev.off()\n"; 227 | close($ofh); 228 | -------------------------------------------------------------------------------- /4_RSeQC_Multiple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Arguments: 3 | # $1 = Organism under consideration 4 | # $2 = input bam 5 | # $3 = output prefix 6 | # $4 = full analysis? [0/1], 1=do all six analyses, 0=only do basic stats & rRNA content 7 | 8 | ORGANISM=$1 9 | INPUTBAM=$2 10 | OUTPREFIX=$3 11 | MAP_QUALITY=30 #default=30 on Phred scale 12 | 13 | if [ -z $ORGANISM ] ; then 14 | echo "Please set organism for reference annotations (ARG 1/4)" 15 | exit 1 16 | fi 17 | 18 | if [ -z $INPUTBAM ] || [ ! -f $INPUTBAM ] ; then 19 | echo "$INPUTBAM does not exist. 
Please provide existing sorted BAM file (ARG 2/4)" 20 | exit 1 21 | fi 22 | 23 | if [ -z $OUTPREFIX ] ; then 24 | echo "Please set a prefix for the output files (ARG 3/4)" 25 | exit 1 26 | fi 27 | 28 | if [ -z $4 ] ; then 29 | echo "Please set type of analysis: 0 = basic stats & rRNA content only, 1 = full analysis (ARG 4/4)" 30 | exit 1 31 | fi 32 | 33 | # Check relevant annotation/gene model files exist and have been converted to BED format 34 | # This code is duplicated in 4_DO_RSeQC_Multiple.sh 35 | MASKgtf=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf 36 | MASKbed="${MASKgtf%.gtf}.bed" 37 | REFGENOME="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf" 38 | REFGENOMEbed="${REFGENOME%.gtf}.bed" 39 | 40 | if [ ! -s $MASKbed ] ; then 41 | echo "Cannot find $MASKbed: attempting to make it." 42 | if [ ! -s $MASKgtf ] ; then 43 | /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0 44 | fi 45 | if [ ! -s $MASKgtf ] ; then 46 | echo "Cannot find or make $MASKgtf\n" 47 | exit 1 48 | fi 49 | # Convert to bed format 50 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $MASKgtf > $MASKbed 51 | if [ ! -s $MASKbed ] ; then 52 | echo "Failed to make $MASKbed\n" 53 | exit 1 54 | fi 55 | fi 56 | 57 | if [ ! -s $REFGENOMEbed ] ; then 58 | echo "Cannot find $REFGENOMEbed: attempting to make it." 59 | if [ ! -s $REFGENOME ] ; then 60 | /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0 61 | fi 62 | if [ ! -s $REFGENOME ] ; then 63 | echo "Cannot find or make $REFGENOME\n" 64 | exit 1 65 | fi 66 | # Convert to bed format 67 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $REFGENOME > $REFGENOMEbed 68 | if [ ! 
-s $REFGENOMEbed ] ; then 69 | echo "Failed to make $REFGENOMEbed\n" 70 | exit 1 71 | fi 72 | fi 73 | 74 | 75 | # RUN RSeQC analysis 76 | # get python path 77 | bash 78 | 79 | echo $INPUTBAM 80 | python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/split_bam.py -i $INPUTBAM -r $MASKbed -o $OUTPREFIX 81 | python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/bam_stat.py -i $INPUTBAM -q $MAP_QUALITY 82 | if [ $4 -gt 0 ] ; then 83 | if [ ! -f $INPUTBAM.bai ] ; then 84 | samtools index $INPUTBAM 85 | fi 86 | python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/geneBody_coverage.py -i $INPUTBAM -r $REFGENOMEbed -o $OUTPREFIX #requires BAM indexing file *.bam.bai -> index using samtools 87 | # -m 20 = minimum intron size (keeping consistent with STAR ENCODE parameters) 88 | python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/junction_saturation.py -i $INPUTBAM -r $REFGENOMEbed -o $OUTPREFIX -m 20 -q $MAP_QUALITY 89 | python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/read_GC.py -i $INPUTBAM -o $OUTPREFIX -q $MAP_QUALITY 90 | # python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/RNA_fragment_size.py -i $INPUTBAM -r $REFGENOMEbed -q $MAP_QUALITY #requires BAM indexing file *.bam.bai, output takes up lot of memory & similar to what I will probably get from fragment counting software so not very important 91 | fi 92 | 93 | rm $OUTPREFIX*.bam 94 | -------------------------------------------------------------------------------- /5.0_Summarize_Known_Transcriptome.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | #Things to filter: 5 | # single exon non-reference transcripts (class 'u' 'i' & single exon), 6 | # transcripts with retained introns (class 'e'), 7 | # polymerase read-though (class 'p'), 8 | # class code 's' (likely read mapping error) 9 | # helpful info: 
http://seqanswers.com/forums/showthread.php?t=3518 10 | # Stats I would like to have: (1) % reference transcripts recovered (# transcripts class '=' vs # transcripts genome, (2) # novel intergenic multi-exonic transcripts, (3) # novel alternatively spliced transcripts 11 | 12 | if (@ARGV < 1) {die "Please provide reference GTF\n";} 13 | 14 | my %transcriptid2lines =(); 15 | my %transcriptid2numexons = (); 16 | open (my $ifh, $ARGV[0]) or die $!; 17 | while (<$ifh>) { 18 | chomp; 19 | $_ =~ /transcript_id "(.+?)";/; 20 | my $tid = $1; 21 | if ($_ =~ /exon_number "(\d+)"/) { 22 | if (!exists($transcriptid2numexons{$tid}) || $transcriptid2numexons{$tid} < $1) { 23 | $transcriptid2numexons{$tid} = $1; 24 | } 25 | } 26 | } close($ifh); 27 | 28 | print "Number of Transcripts: ".scalar(keys(%transcriptid2numexons))."\n"; 29 | -------------------------------------------------------------------------------- /5_Cufflinks_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Note: this may be called by 4_RSeQC_Multiple.sh, 4_DO_RSeQC_Multiple.sh 3 | # Arguments: 4 | # $1 = organism: either Mmus or Hsap 5 | # $2 = number of threads to run on 6 | # $3 = input BAM to run on (only required if number of threads > 0) 7 | # $4 = outputdir (optional, default = /lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified). 8 | # $5 = gtf file. -> if provided allows faux reads, if not provided gets genome one and does not use faux-reads 9 | 10 | CUFFLINKS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cufflinks 11 | ORGANISM=$1 12 | NUMTHREADS=$2 13 | INPUTDIR=$3 14 | OUTDIR=$4 15 | ANNOTATIONgtf=$5 16 | TEMPDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir 17 | FILEStoMAP=($INPUTDIR/*.bam) 18 | ARRAYINDEX=$(($LSB_JOBINDEX-1)) 19 | INPUTBAM=${FILEStoMAP[$ARRAYINDEX]} 20 | echo "Inputfile: $INPUTBAM" 21 | 22 | if [ ! 
-f $CUFFLINKS ] ; then 23 | echo "Sorry Cufflinks not available" 24 | exit 1 25 | fi 26 | 27 | if [ -z $ORGANISM ] ; then 28 | echo "Please set organism for reference annotations (ARG 1/4)" 29 | exit 1 30 | fi 31 | 32 | if [ -z $NUMTHREADS ] ; then 33 | echo "Please set number of threads to run on, setting = 0 will get genome & rRNA gtf but not run cufflinks (ARG 2/4)" 34 | exit 1 35 | fi 36 | 37 | if [ $NUMTHREADS -gt 0 ] ; then 38 | if [ -z $INPUTBAM ] || [ ! -f $INPUTBAM ] ; then 39 | echo "$INPUTBAM, jobindex $LSB_JOBINDEX, array index $ARRAYINDEX of $INPUTDIR does not exist. Please provide a directory containing BAMfiles (ARG 3/4)" 40 | exit 1 41 | fi 42 | fi 43 | 44 | if [ -z $OUTDIR ] ; then 45 | OUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified 46 | fi 47 | if [ -z $LSB_JOBINDEX ] ; then 48 | LSB_JOBINDEX=7 49 | fi 50 | 51 | SEED=$((100+$LSB_JOBINDEX)) 52 | echo "rgenerator seed: $SEED" 53 | 54 | FAUXREADS="" 55 | 56 | # Get stuff for cufflinks: 57 | # gtf if not already present, get genome fasta if not already present (basically run Build genome without actually running STAR. 58 | if [ -z $ANNOTATIONgtf ] ; then 59 | echo "Using mapping-genome annotations" 60 | GENOMEDIR=/lustre/scratch108/compgen/team218/TA/genomebuilding 61 | GENOMEfa=$GENOMEDIR/*.fa 62 | GENOMEgtf=$GENOMEDIR/*.gtf 63 | ANNOTATIONgtf=$GENOMEgtf 64 | FAUXREADS="--no-faux-reads" 65 | 66 | # if [ ! -s $GENOMEfa ] ; then 67 | # /nfs/users/nfs_t/ta6/RNASeqPipeline/0_BuildGenome.sh $GENOMEDIR $TEMPDIR 0 125 $ORGANISM /nfs/users/nfs_t/ta6/Collaborations/Bergiers_Italy 68 | # fi 69 | # if [ ! -s $GENOMEgtf ] ; then 70 | # /nfs/users/nfs_t/ta6/RNASeqPipeline/0_BuildGenome.sh $GENOMEDIR $TEMPDIR 0 125 $ORGANISM /nfs/users/nfs_t/ta6/Collaborations/Bergiers_Italy 71 | # fi 72 | fi 73 | # get rRNA, mitochondial transcripts, tRNAs to mask -> hummmmm....... how best to do this? 
-> use grep to select the relevant lines from the existing .gtf is fast and ensures compatibility between the two gtf files and with the .fa file. 74 | MASKgtf=$TEMPDIR/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf 75 | if [ ! -f $MASKgtf ] ; then 76 | grep -E 'rRNA|tRNA|^MT' /lustre/scratch108/compgen/team218/TA/genomebuilding/*.gtf > $MASKgtf 77 | fi 78 | 79 | 80 | if [ ! -s $GENOMEfa ] ; then 81 | echo "Failed to find or make $GENOMEfa" 82 | exit 1; 83 | fi 84 | if [ ! -s $GENOMEgtf ] ; then 85 | echo "Failed to find or make $GENOMEgtf" 86 | exit 1; 87 | fi 88 | if [ ! -f $MASKgtf ] ; then 89 | echo "Failed to find or make $MASKgtf" 90 | exit 1; 91 | fi 92 | if [ ! -s $MASKgtf ] ; then 93 | echo "Warning: Mask ($MASKgtf) is empty, continuing anyway..." 94 | fi 95 | 96 | 97 | 98 | # Cufflinks options: 99 | # --GTF-guide 100 | # --mask-file 101 | # --frag-bias-correct 102 | # --multi-read-correct 103 | # --quiet 104 | # --no-update-check 105 | # -o 106 | # --num-threads 107 | # --seed 108 | # --max-intron-length 1000000 #keep consistent with STAR parameters 109 | # --min-intron-length 20 #keep consistent with STAR parameters 110 | # --max-multiread-fraction #default is 0.75 111 | # --library-type #default is fr-unstranded 112 | 113 | #To fix failed jobs 114 | #if [ -d "$OUTDIR/JOB$LSB_JOBINDEX" ]; then 115 | #----------------- 116 | 117 | if [ $NUMTHREADS -gt 0 ] ; then 118 | OUTDIR=$OUTDIR/JOB$LSB_JOBINDEX 119 | mkdir -p $OUTDIR 120 | 121 | # Get rid of S thing from STAR. 
122 | TMPSAM=Temp$LSB_JOBINDEX.out.sam 123 | samtools view -h -o $TEMPDIR/$TMPSAM $INPUTBAM 124 | 125 | awk 'BEGIN {OFS="\t"} {split($6,C,/[0-9]*/); split($6,L,/[SMDIN]/); if (C[2]=="S") {$10=substr($10,L[1]+1); $11=substr($11,L[1]+1)}; if (C[length(C)]=="S") {L1=length($10)-L[length(L)-1]; $10=substr($10,1,L1); $11=substr($11,1,L1); }; gsub(/[0-9]*S/,"",$6); print}' $TEMPDIR/$TMPSAM > $TEMPDIR/noS.$TMPSAM 126 | 127 | NEWINPUTBAM=$TEMPDIR/noS.Temp$LSB_JOBINDEX.out.bam 128 | samtools view -bS $TEMPDIR/noS.$TMPSAM > $NEWINPUTBAM 129 | rm $TEMPDIR/$TMPSAM 130 | rm $TEMPDIR/noS.$TMPSAM 131 | 132 | # de novo assembly command 133 | # $CUFFLINKS --GTF-guide $ANNOTATIONgtf --frag-bias-correct $GENOMEfa --mask-file $MASKgtf --multi-read-correct --max-intron-length 1000000 --min-intron-length 20 -o $OUTDIR --quiet --no-update-check --no-faux-reads --seed $SEED --num-threads $NUMTHREADS $NEWINPUTBAM 134 | $CUFFLINKS --GTF-guide $ANNOTATIONgtf --mask-file $MASKgtf --multi-read-correct --max-intron-length 1000000 --min-intron-length 20 -o $OUTDIR --quiet --no-update-check $FAUXREADS --seed $SEED --num-threads $NUMTHREADS $NEWINPUTBAM 135 | rm $NEWINPUTBAM 136 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/5_TidyCufflinks.pl $OUTDIR $INPUTBAM 137 | 138 | if [ -f $TEMPDIR/noS.$TMPSAM ]; then 139 | rm $TEMPDIR/noS.$TMPSAM 140 | fi 141 | fi 142 | #To fix failed jobs 143 | #fi 144 | #----------------- 145 | -------------------------------------------------------------------------------- /5_Cuffmerge_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Note: this may be called by 4_RSeQC_Multiple.sh, 4_DO_RSeQC_Multiple.sh 3 | # Arguments: 4 | # $1 = number of threads to run on 5 | # $2 = file of files to merge 6 | # $3 = reference gtf (optional) 7 | # $4 = reference fasta (optional) 8 | 9 | CUFFMERGE=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cuffmerge 10 | NUMTHREADS=$1 11 | INPUTFILE=$2 12 | 
REFgtf=$3
REFfasta=$4

# Add gtf_to_sam and other accessory cufflinks scripts to my path
export PATH="$PATH:/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/"

# ---- Validate the executable and the required arguments ----
# All tests quote their operands: unquoted, an empty value collapses the test
# to "[ -z ]" / "[ ! -z ]", which only gives the right answer by accident and
# breaks on paths containing spaces.
if [ ! -f "$CUFFMERGE" ] ; then
  echo "Sorry Cuffmerge not available"
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to run on (ARG 1/4)"
  exit 1
fi

# Unlike the Cufflinks wrapper, 0 threads is not a valid "setup only" mode here.
# The pattern match rejects non-numeric input before the integer comparison.
if ! [[ "$NUMTHREADS" =~ ^[0-9]+$ ]] || [ "$NUMTHREADS" -lt 1 ] ; then
  echo "Number of threads must be at least 1."
  exit 1
fi

if [ -z "$INPUTFILE" ] ; then
  echo "Please provide a file with a list of gtf files to merge (ARG 2/4)"
  exit 1
fi

if [ ! -s "$INPUTFILE" ] ; then
  echo "$INPUTFILE is empty or does not exist. Please provide a file with a list of gtf files to merge (ARG 2/4)"
  exit 1
fi

# ---- Optional arguments ----
# Kept as arrays so that each option/value pair reaches cuffmerge as separate,
# intact words without relying on word splitting. An unusable reference is
# reported and skipped rather than treated as fatal.
ARGrefgtf=()
if [ -n "$REFgtf" ] ; then
  if [ -s "$REFgtf" ] ; then
    ARGrefgtf=(-g "$REFgtf")
  else
    echo "Reference GTF is empty or does not exist, will not be used"
  fi
fi

ARGreffa=()
if [ -n "$REFfasta" ] ; then
  if [ -s "$REFfasta" ] ; then
    ARGreffa=(-s "$REFfasta")
  else
    echo "Reference FASTA is empty or does not exist, will not be used"
  fi
fi


# Cuffmerge options:
#   -o outprefix->redirects stdout
#   -g ref-gtf
#   -p number of threads
#   -s ref-sequence
"$CUFFMERGE" "${ARGrefgtf[@]}" "${ARGreffa[@]}" --num-threads "$NUMTHREADS" "$INPUTFILE"
--------------------------------------------------------------------------------
/5_DO_Cufflinks.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.

# Submits one LSF array-job element per BAM file in INPUTDIR; each element runs
# 5_Cufflinks_wrapper.sh, which picks its BAM from the same glob via LSB_JOBINDEX.

# Without nullglob an unmatched pattern stays literal, NUMFILES becomes 1 and a
# bogus one-element job array would be submitted.
shopt -s nullglob

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Cufflinks
mkdir -p "$OUTPUTDIR"
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat/Beuttner_Tophat2_dedup/Deduplicated
INPUTFILES=("$INPUTDIR"/*.bam)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$NUMFILES

if [ "$NUMFILES" -lt 1 ] ; then
  echo "No BAM files found in $INPUTDIR, nothing to submit"
  exit 1
fi

# Array of MAXJOBS elements, at most 40 running at once, 5000 MB memory each.
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%40" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 "$INPUTDIR" "$OUTPUTDIR" /lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
--------------------------------------------------------------------------------
/5_DO_Cufflinks_denovo_Transcripts.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.

# See 5_DO_Cufflinks.sh: keep an unmatched glob from being counted as one file.
shopt -s nullglob

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/DeNovoTranscripts
mkdir -p "$OUTPUTDIR"
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated
INPUTFILES=("$INPUTDIR"/*.bam)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$NUMFILES
# NOTE(review): the array starts at element 48 rather than 1 - presumably left
# over from resuming a partially completed run; confirm before reusing.
FIRSTJOB=48

if [ "$MAXJOBS" -lt "$FIRSTJOB" ] ; then
  echo "Only $NUMFILES BAM files found in $INPUTDIR, cannot submit array elements $FIRSTJOB-$MAXJOBS"
  exit 1
fi

# No annotation gtf is passed as 5th argument, so the wrapper falls back to the
# mapping-genome annotations and runs cufflinks with --no-faux-reads.
bsub -J"mappingwithstararrayjob[$FIRSTJOB-$MAXJOBS]%40" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 "$INPUTDIR" "$OUTPUTDIR"

--------------------------------------------------------------------------------
/5_DO_Cuffmerge.sh:
--------------------------------------------------------------------------------
#!/bin/bash

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/DeNovoTranscripts
mkdir -p "$OUTPUTDIR"
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified
INPUTFILE=$OUTPUTDIR/List_of_GTFs_to_merge.txt
REFgtf=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
REFfa=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa

# Collect the per-sample cufflinks assemblies to merge.
ls "$INPUTDIR"/*_transcripts.gtf > "$INPUTFILE"
if [ ! -s "$INPUTFILE" ] ; then
  echo "No *_transcripts.gtf files found in $INPUTDIR, nothing to merge"
  exit 1
fi

# The temporary file and the bsub output/error logs are written relative to
# OUTPUTDIR, so do not carry on from the wrong directory.
cd "$OUTPUTDIR" || exit 1

# Replace "TNeo CDS" by "TNeoCDS" in every assembly, in place.
# The file is only replaced when sed succeeded.
readarray -t array < "$INPUTFILE"
for file in "${array[@]}" ; do
  sed "s/TNeo CDS/TNeoCDS/" "$file" > tempfile.tmp && mv tempfile.tmp "$file"
done

bsub -R"select[mem>10000] rusage[mem=10000]" -M10000 -q normal -o output.%J -e error.%J /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cuffmerge_wrapper.sh 1 "$INPUTFILE" "$REFgtf" "$REFfa"

--------------------------------------------------------------------------------
/5_DO_Quantification_X2.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# NOT TESTED

# Without nullglob an unmatched pattern stays literal and is counted as one file.
shopt -s nullglob

OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Bergiers_Vivo
mkdir -p "$OUTPUTDIR"
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Bergiers_Vivo/Deduplicated
# NOTE(review): the job count comes from Bergiers_Vivo*.bam, but both wrappers
# index into $INPUTDIR/*.bam; the two only agree if INPUTDIR holds no other BAMs.
INPUTFILES=("$INPUTDIR"/Bergiers_Vivo*.bam)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$NUMFILES
# Scratch directory for featureCounts temporary files (same location that
# 5_DO_featureCounts.sh uses). NOTE(review): confirm this is the intended path.
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/FeatureCounts

NEWANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf"

if [ "$NUMFILES" -lt 1 ] ; then
  echo "No BAM files found in $INPUTDIR, nothing to submit"
  exit 1
fi

#bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%20" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 $INPUTDIR $OUTPUTDIR $NEWANNOTATIONgtf


#Only run these one at a time because they create a huge amount of temporary files
# 5_featureCounts_wrapper.sh takes five arguments (gtf, threads, input dir,
# working dir, output dir); leaving the last two out makes it work in
# "/<jobindex>" and write its counts to "/".
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%1" -R"select[mem>10000] rusage[mem=10000]" -M10000 -q normal -o FCountoutput.%J.%I -e FCounterror.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_featureCounts_wrapper.sh "$NEWANNOTATIONgtf" 1 "$INPUTDIR" "$WORKINGDIR" "$OUTPUTDIR"

--------------------------------------------------------------------------------
/5_DO_featureCounts.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDeDupped 4 | INPUTFILES=($INPUTDIR/*.bam) 5 | NUMFILES=${#INPUTFILES[@]} 6 | MAXJOBS=$NUMFILES 7 | OUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDedupedCounted 8 | TMPDIR=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/FeatureCounts 9 | mkdir -p $OUTDIR 10 | ANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Bergiers_Transcripts.gtf" 11 | featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts 12 | NUMTHREADS=1 13 | 14 | if [ ! -f $featureCOUNT ] ; then 15 | echo "Sorry featureCounts not available" 16 | exit 1 17 | fi 18 | 19 | if [ -z $ANNOTATIONgtf ] || [ ! -f $ANNOTATIONgtf ] ; then 20 | echo "Please provide an annotation GTF file (ARG 1/3)" 21 | exit 1 22 | fi 23 | 24 | if [ -z $NUMTHREADS ] ; then 25 | echo "Please set number of threads to run on (ARG 2/3)" 26 | exit 1 27 | fi 28 | 29 | if [ ! 
$NUMTHREADS -gt 0 ] ; then 30 | echo "Error: number of threads must be > 0" 31 | exit 1 32 | fi 33 | 34 | # featureCounts options: 35 | # -t 'string' : specify the feature type to count reads for, default='exon' 36 | # -g 'string' : specify the attribute used to group features into meta-features, default='gene_id' 37 | # -f : read summarization performed at the feature level instead of the meta-feature level 38 | # -O : reads can match more than one feature/metafeature 39 | # -M : multi-mapping can be counted multiple times (once for each of their mapping locations 40 | # alternatively 41 | # --primary : only primary alignments will be counted 42 | # -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC) 43 | # -T ## : number of threads to run on (default =1) 44 | # -R : output read counting assignments of each read into a .featureCounts file 45 | # --ignoreDup : ignores any reads marked as duplicates 46 | # -p : fragments rather than reads counted for paired-end data. 
47 | # -d ## : minimum fragment/template legnth (default: 50) -> only if using -P parameter too 48 | # -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too 49 | # -B : only reads with both ends mapping considered 50 | # -C : reads with ends mapping to different Chrs excluded 51 | 52 | bsub -J"featurecountsjobarray[1-$MAXJOBS]%100" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o FCoutput.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_featureCounts_wrapper.sh $ANNOTATIONgtf $NUMTHREADS $INPUTDIR $TMPDIR $OUTDIR 53 | -------------------------------------------------------------------------------- /5_DO_featureCounts_locally.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # NOT TESTED 3 | 4 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_STAR/Beuttner_STAR_dedup/Deduplicated/ 5 | OUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_FeatureCounts 6 | mkdir -p $OUTDIR 7 | ANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf" 8 | featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts 9 | NUMTHREADS=1 10 | 11 | if [ ! -f $featureCOUNT ] ; then 12 | echo "Sorry featureCounts not available" 13 | exit 1 14 | fi 15 | 16 | if [ -z $ANNOTATIONgtf ] || [ ! -f $ANNOTATIONgtf ] ; then 17 | echo "Please provide an annotation GTF file (ARG 1/3)" 18 | exit 1 19 | fi 20 | 21 | if [ -z $NUMTHREADS ] ; then 22 | echo "Please set number of threads to run on (ARG 2/3)" 23 | exit 1 24 | fi 25 | 26 | if [ ! 
$NUMTHREADS -gt 0 ] ; then 27 | echo "Error: number of threads must be > 0" 28 | exit 1 29 | fi 30 | 31 | # featureCounts options: 32 | # -t 'string' : specify the feature type to count reads for, default='exon' 33 | # -g 'string' : specify the attribute used to group features into meta-features, default='gene_id' 34 | # -f : read summarization performed at the feature level instead of the meta-feature level 35 | # -O : reads can match more than one feature/metafeature 36 | # -M : multi-mapping can be counted multiple times (once for each of their mapping locations 37 | # alternatively 38 | # --primary : only primary alignments will be counted 39 | # -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC) 40 | # -T ## : number of threads to run on (default =1) 41 | # -R : output read counting assignments of each read into a .featureCounts file 42 | # --ignoreDup : ignores any reads marked as duplicates 43 | # -p : fragments rather than reads counted for paired-end data. 
44 | # -d ## : minimum fragment/template legnth (default: 50) -> only if using -P parameter too 45 | # -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too 46 | # -B : only reads with both ends mapping considered 47 | # -C : reads with ends mapping to different Chrs excluded 48 | 49 | 50 | for INPUTBAM in $INPUTDIR/*.bam ; do 51 | OUTPUTFILE=$(basename "${INPUTBAM%.bam}.fragmentcounts") 52 | $featureCOUNT -O -M -Q 30 -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #allow multimap 53 | #$featureCOUNT -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #No multimap 54 | rm temp* 55 | done 56 | -------------------------------------------------------------------------------- /5_Fix_Transcriptome_for_featureCounts.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 1) {die "0_My_Extract_Transcriptome.pl .gtf\n";} 5 | 6 | my %Ensg2Gtf = (); 7 | 8 | open (my $gtf_out, ">","Transcripts_featureCounts.gtf") or die $!; 9 | open (my $gtf, $ARGV[0]) or die $!; 10 | my $gtf_line = ""; 11 | while ($gtf_line = <$gtf>) { 12 | if ($gtf_line =~ /^#/) { 13 | next; 14 | } # ignore headers 15 | my $geneid = ""; 16 | if ($gtf_line =~ /gene_id "(.+?)";/) { 17 | $geneid = $1; 18 | } else { 19 | next; 20 | } # get gene id 21 | my @record = split(/\t/, $gtf_line); 22 | my $seq_chr = $record[0]; 23 | my $seq_st = $record[3]; 24 | my $seq_end = $record[4]; 25 | if ($record[2] eq "exon") { 26 | $gtf_line =~ s/transcript_id "(.+?)"/transcript_id "$geneid"/; 27 | print $gtf_out $gtf_line; 28 | } else { 29 | $record[2] = "exon"; 30 | my $lastele = scalar(@record)-1; 31 | $record[$lastele] = "gene_id \"$geneid\"; transcript_id \"$geneid\"; exon_number \"1\"; gene_name \"$geneid\"\n"; 32 | print $gtf_out join("\t", @record); 33 | } 34 | } 35 | close($gtf); 36 | close($gtf_out); 37 | 
--------------------------------------------------------------------------------
/5_RSEM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Prepares a BAM file for RSEM: keeps the reads flagged as mapped in a proper
# pair, sorts them by read name, and prints the rsem-calculate-expression
# command for the sorted file.
# Arguments:
#  $1 = BAM file to map
#  $2 = is paired end? (currently unused: the paired-end command is always used)
#  $3 = number of threads to run on (default = 1)
RSEMdir=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/
TEMPdir=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/RSEM/TMP
BAMfile=$1
paired=$2
REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
BASEname=${BAMfile##*/}   # file name without its directory
PREFIX=${BASEname%%.*}    # file name up to the first "."
BAMfileOut=$TEMPdir/Out$BASEname
BAMfiltered=$TEMPdir/Filtered$BASEname
BAMfixed=$TEMPdir/Fixed$BASEname
BAMsorted=$TEMPdir/Sorted$BASEname
THREADS=$3

if [ -z "$BAMfile" ] || [ ! -f "$BAMfile" ] ; then
  echo "$BAMfile does not exist. Please provide an existing BAM file (ARG 1/3)"
  exit 1
fi

if [ -z "$THREADS" ] ; then
  THREADS=1
fi

mkdir -p "$TEMPdir/$PREFIX"

# -f 2 : read mapped in proper pair
if ! samtools view -b -f 2 "$BAMfile" > "$BAMfiltered" ; then
  echo "samtools view failed for $BAMfile"
  exit 1
fi
# Sort by read name. The second argument is an output prefix, so the sorted
# file is $BAMsorted.bam.
# NOTE(review): this is the legacy "samtools sort <in> <out.prefix>" call form;
# confirm that the samtools on the PATH still accepts it.
if ! samtools sort -n "$BAMfiltered" "$BAMsorted" ; then
  echo "samtools sort failed for $BAMfiltered"
  exit 1
fi

#/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/utilities/subtools -i $BAMsorted.bam -o $BAMsorted --informat BAM --outformat BAM --sort byname

#$RSEMdir/convert-sam-for-rsem $BAMsorted $BAMfixed

#echo "validate file"
#$RSEMdir/rsem-sam-validator $BAMsorted

#$RSEMdir/convert-sam-for-rsem $BAMsorted.bam $BAMfileOut -T $TEMPdir/$PREFIX


# NOTE(review): the RSEM command below is only echoed, not executed - this
# looks like a dry run; drop the echo and the quotes to actually run it.
#if [ $paired ] ; then
echo "$RSEMdir/rsem-calculate-expression --bam --paired-end --num-threads $THREADS --single-cell-prior --temporary-folder $TEMPdir --no-bam-output $BAMsorted.bam $REFname $PREFIX"
#else
#  $RSEMdir/rsem-calculate-expression --bam --num-threads $THREADS --single-cell-prior --temporary-folder $TEMPdir --no-bam-output $BAMfileTmp $REFname $PREFIX
#fi

#rm $BAMfileTmp
--------------------------------------------------------------------------------
/5_RSEM_build_refrence.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Builds the RSEM reference $OUTDIR/$REFname from the genome FASTA and GTF,
# once with bowtie2 indices and once with STAR indices.
RSEMdir=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26
OUTDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/
REFname=GRCm38
GenomeGTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
GenomeFASTA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
BOWTIEpath=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bowtie2-2.2.6/
STARpath=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/

# The reference files are written with this prefix, so the directory must exist.
mkdir -p "$OUTDIR"

"$RSEMdir/rsem-prepare-reference" --gtf "$GenomeGTF" --bowtie2 --bowtie2-path "$BOWTIEpath" "$GenomeFASTA" "$OUTDIR/$REFname"
"$RSEMdir/rsem-prepare-reference" --gtf "$GenomeGTF" --star --star-path "$STARpath" "$GenomeFASTA" "$OUTDIR/$REFname"
--------------------------------------------------------------------------------
/5_Summarize_Filter_Merged_Transcriptome.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Filters a cuffmerge GTF by class code, writes the surviving transcripts to
# New_Transcriptome.gtf (current directory) and prints per-class-code
# transcript counts plus the number of transcripts recovered ('=') / removed.
#
#Things to filter:
# single exon non-reference transcripts (class 'u' 'i' & single exon),
# transcripts with retained introns (class 'e'),
# polymerase read-through (class 'p'),
# class code 's' (likely read mapping error)
# (class 'r' is filtered by the code below as well)
# helpful info: http://seqanswers.com/forums/showthread.php?t=3518
# Stats I would like to have: (1) % reference transcripts recovered (# transcripts class '=' vs # transcripts genome, (2) # novel intergenic multi-exonic transcripts, (3) # novel alternatively spliced transcripts

if (@ARGV < 1) {die "Please provide cuffmerge outputfile\n";}

my %transcriptid2lines = ();    # transcript id -> all of its GTF lines
my %transcriptid2code = ();     # transcript id -> class code
my %transcriptid2numexons = (); # transcript id -> highest exon_number seen
my @tids_in_order = ();         # transcript ids in order of first appearance

open (my $ifh, "<", $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
while (my $line = <$ifh>) {
    chomp($line);

    # Every capture is checked: after a failed match $1 still holds the value
    # of the previous successful match, which would silently mislabel the line.
    my $tid;
    if ($line =~ /transcript_id "([^"]+)"; exon_number/) {
        $tid = $1;
    } else {
        die "transcript_id not match at line $.\n";
    }
    my $code;
    if ($line =~ /class_code "([^"]+)"; tss_id/) {
        $code = $1;
    } else {
        die "class_code not match at line $.\n";
    }

    if (!exists($transcriptid2lines{$tid})) {
        push(@tids_in_order, $tid);
    }
    push(@{$transcriptid2lines{$tid}}, $line);

    if (exists($transcriptid2code{$tid}) && $transcriptid2code{$tid} ne $code) {die "Contradicting codes\n";}
    $transcriptid2code{$tid} = $code;

    if ($line =~ /exon_number "(\d+)"/) {
        if (!exists($transcriptid2numexons{$tid}) || $transcriptid2numexons{$tid} < $1) {
            $transcriptid2numexons{$tid} = $1;
        }
    } else { die "exon_num not match\n";}
}
close($ifh);


my $Nrecovered = 0;
my $Nremoved = 0;
my %code2count = (); # class code -> number of transcripts (kept or not)
open(my $ofh, ">", "New_Transcriptome.gtf") or die "Cannot write New_Transcriptome.gtf: $!\n";
# Transcripts are written in input order so the output is reproducible.
foreach my $tid (@tids_in_order) {
    my $code = $transcriptid2code{$tid};
    my $exons = $transcriptid2numexons{$tid};
    $code2count{$code}++;

    my $single_exon_novel = ($exons == 1 && ($code eq "u" || $code eq "i"));
    my $unwanted_class = ($code eq "e" || $code eq "p" || $code eq "s" || $code eq "r");
    if ($single_exon_novel || $unwanted_class) {
        $Nremoved++;
        next;
    }
    if ($code eq "=") {$Nrecovered++;}

    foreach my $gtf_line (@{$transcriptid2lines{$tid}}) {
        print $ofh $gtf_line."\n";
    }
}
close($ofh) or die "Error writing New_Transcriptome.gtf: $!\n";

foreach my $code (sort(keys(%code2count))) {
    print "$code : $code2count{$code}\n";
}
print "transcripts recovered: $Nrecovered\n";
print "transcripts removed: $Nremoved\n";
--------------------------------------------------------------------------------
/5_TidyCufflinks.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;
use Cwd qw(abs_path);
use File::Copy qw(move);

# Moves every file of a per-job cufflinks output directory up one level,
# prefixing its name with a tag taken from the name of the input file
# ("<something>_CellNN", the part after an underscore), then removes the
# emptied directory.
#
# Arguments: <cufflinks output directory> <file the output was produced from>

if (@ARGV < 2) {die "Provide: a directory of cufflinks output, and the file it was produced from\n";}

my $tag="ERR";
my $origfile = $ARGV[1];
if ($origfile =~ /_([^_]+_Cell\d\d)/) {
    $tag = $1;
} else {
    die "$origfile does not match\n";
}

#$origfile =~ /([ACGT]{5,})/;

# Resolve the directory before changing into it, so that it can still be
# removed afterwards when it was given as a relative path.
my $outdir = abs_path($ARGV[0]);
if (!defined($outdir) || !chdir($outdir)) {die "error changing directory to $ARGV[0]";}

foreach my $file (glob("*")) {
    # move() does not go through a shell, so unusual file names are safe.
    move($file, "../${tag}_$file") or warn "Could not move $file: $!\n";
}

# Step out of the directory before deleting it.
chdir("..") or die "error leaving directory $outdir: $!";
rmdir($outdir) or warn "Could not remove $outdir: $!\n";

--------------------------------------------------------------------------------
/5_featureCounts_wrapper.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Arguments:
#  $1 = Annotation GTF file
#  $2 = number of threads to run on (must be > 0)
#  $3 = directory of input BAM files; the file to count is picked with LSB_JOBINDEX
#  $4 = workingdir (a per-job subdirectory is created in it and removed afterwards)
#  $5 = outputdir
# This runs fast & efficiently, not on cluster took < 10 minutes to count one of the dedupped merged files.
# But should run this on the complete annotations after cufflinks de novo transcript assembly.

featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts
ANNOTATIONgtf=$1
NUMTHREADS=$2
INPUTDIR=$3
WORKINGDIR=$4/$LSB_JOBINDEX
OUTDIR=$5

# The job index selects the BAM file; without a positive index the array
# lookup below would be meaningless.
if ! [[ "$LSB_JOBINDEX" =~ ^[0-9]+$ ]] || [ "$LSB_JOBINDEX" -lt 1 ] ; then
  echo "LSB_JOBINDEX is not set to a positive number: this script must be run as an LSF array job"
  exit 1
fi

FILEStoMAP=("$INPUTDIR"/*.bam)
ARRAYINDEX=$(($LSB_JOBINDEX-1))
INPUTBAM=${FILEStoMAP[$ARRAYINDEX]}
echo "Inputfile: $INPUTBAM"



if [ ! -f "$featureCOUNT" ] ; then
  echo "Sorry featureCounts not available"
  exit 1
fi

if [ -z "$ANNOTATIONgtf" ] || [ ! -f "$ANNOTATIONgtf" ] ; then
  echo "Please provide an annotation GTF file (ARG 1/5)"
  exit 1
fi

if [ -z "$NUMTHREADS" ] ; then
  echo "Please set number of threads to run on (ARG 2/5)"
  exit 1
fi

if [ "$NUMTHREADS" -gt 0 ] ; then
  if [ -z "$INPUTBAM" ] || [ ! -f "$INPUTBAM" ] ; then
    echo "$INPUTBAM does not exist. Please provide a directory of existing sorted BAM files (ARG 3/5)"
    exit 1
  fi
else
  echo "Error: number of threads must be > 0"
  exit 1
fi

# Without these two the script would work in "/<jobindex>" and write to "/".
if [ -z "$4" ] ; then
  echo "Please provide a working directory (ARG 4/5)"
  exit 1
fi

if [ -z "$OUTDIR" ] ; then
  echo "Please provide an output directory (ARG 5/5)"
  exit 1
fi

# featureCounts options:
#   -t 'string' : specify the feature type to count reads for, default='exon'
#   -g 'string' : specify the attribute used to group features into meta-features, default='gene_id'
#   -f : read summarization performed at the feature level instead of the meta-feature level
#   -O : reads can match more than one feature/metafeature
#   -M : multi-mapping can be counted multiple times (once for each of their mapping locations)
#        alternatively
#   --primary : only primary alignments will be counted
#   -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC)
#   -T ## : number of threads to run on (default =1)
#   -R : output read counting assignments of each read into a .featureCounts file
#   --ignoreDup : ignores any reads marked as duplicates
#   -p : fragments rather than reads counted for paired-end data.
#   -d ## : minimum fragment/template length (default: 50) -> only if using -P parameter too
#   -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too
#   -B : only reads with both ends mapping considered
#   -C : reads with ends mapping to different Chrs excluded
mkdir -p "$OUTDIR"
mkdir -p "$WORKINGDIR"
# Never run the "rm temp*" below from an unexpected directory.
cd "$WORKINGDIR" || exit 1

OUTPUTFILE=$(basename "${INPUTBAM%.bam}.fragmentcounts")
"$featureCOUNT" -O -M -T "$NUMTHREADS" -a "$ANNOTATIONgtf" -o "$OUTDIR/$OUTPUTFILE" "$INPUTBAM" #yes multimap, single end, no quality threshold
#$featureCOUNT -O -M -T $NUMTHREADS -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #allow multimap, single end, no quality threshold
#$featureCOUNT -O -M -Q 30 -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #allow multimap
#$featureCOUNT -Q 30 -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #no multimap
# Clean up temp* files left in the working directory, then the directory itself.
rm -f temp*
cd .. && rmdir "$WORKINGDIR"
--------------------------------------------------------------------------------
/6.1_Get_Expression_Kallisto.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Combines kallisto abundance tables (*.abundance*.tsv) into two matrices,
# <outprefix>_<suffix>_counts.txt and <outprefix>_<suffix>_tpm.txt, with one
# row per transcript (or per gene when aggregating) and one column per sample.
# The sample name is the part of the file name before ".abundance".
# Values are taken from columns 4 and 5 of each abundance table.

if (@ARGV != 4) {die "Usage: perl 6.1_Get_Expression_Kallisto.pl /path/outprefix /path/genome.gtf /path/kallisto_output_directory [0/1 - aggregate by gene?].\n";}

my $outprefix = $ARGV[0];
my $gtf_file = $ARGV[1];
my $kallisto_dir = $ARGV[2];
my $agg_by_gene = $ARGV[3];

# The usage asks for a directory: accept it with or without the trailing "/".
# (Anything that is not a directory is still used as a plain file-name prefix.)
if (-d $kallisto_dir && $kallisto_dir !~ /\/$/) {
    $kallisto_dir .= "/";
}
my $abundance_string = $kallisto_dir."*.abundance*.tsv";

my $suffix = "kallisto_trans";
if ($agg_by_gene) {
    $suffix = "kallisto_gene";
}

my %Gene2ID2FragCount = ();
my %Gene2ID2TPM = ();
my @IDs = ();

# ---- transcript id -> gene id, from the annotation ----
my %transcript2gene = ();
#open (my $ifh, "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf") or die $!;
open (my $gtf_fh, "<", $gtf_file) or die "Cannot open $gtf_file: $!\n";
while (my $gtf_line = <$gtf_fh>) {
    if ($gtf_line =~ /^#/) {next;}
    my $geneid = "ERROR";
    if ($gtf_line =~ /gene_id "(.+?)"/) {
        $geneid = $1;
    } else {
        die "No gene id!\n";
    }

    my @record = split(/\t/, $gtf_line);
    if ($record[2] =~ /gene/i) {
        #gene
    } elsif ($gtf_line =~ /exon_id "(.+?)"/) {
        #exon
    } elsif ($gtf_line =~ /transcript_id "(.+?)"/) {
        #transcript
        $transcript2gene{$1} = $geneid;
    }
}
close ($gtf_fh);
print STDERR "Done reading Annotations\n";

# ---- read every abundance table ----
#foreach my $file (glob("/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Kallisto/*.abundance*.tsv")) {
my @abundance_files = glob("$abundance_string");
if (!@abundance_files) {die "No files match $abundance_string\n";}
foreach my $file (@abundance_files) {
    # $file =~ /([ATCG]{5,})/;
    my $ID = "ERROR";
    # if ($file =~ /([^\/]+_Cell\d\d)/) {
    if ($file =~ /([^\/]+)\.abundance/) {
        $ID = $1;
    } else {die "$file did not match!\n";}
    push(@IDs, $ID);

    open(my $ifh, "<", $file) or die "Cannot open $file: $!\n";
    while (my $row = <$ifh>) {
        chomp($row);
        if ($row =~ /^#/ || $row =~ /^target_id/) {next;} #skip header & comments
        my @record = split(/\t/, $row);
        my $gene = $record[0]; $gene =~ s/\s+//g;
        if (exists($transcript2gene{$gene}) && $agg_by_gene) {
            $gene = $transcript2gene{$gene};
        }
        # Several transcripts can map to the same gene: sum their values.
        if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
            $Gene2ID2FragCount{$gene}->{$ID} += $record[3];
            $Gene2ID2TPM{$gene}->{$ID} += $record[4];
        } else {
            $Gene2ID2FragCount{$gene}->{$ID} = $record[3];
            $Gene2ID2TPM{$gene}->{$ID} = $record[4];
        }
    }
    close ($ifh);
}



# ---- write the two matrices ("NA" where a sample has no value) ----
my $counts_name = "${outprefix}_${suffix}_counts.txt";
my $tpm_name = "${outprefix}_${suffix}_tpm.txt";
open(my $ofh1, ">", $counts_name) or die "Cannot write $counts_name: $!\n";
open(my $ofh2, ">", $tpm_name) or die "Cannot write $tpm_name: $!\n";
print $ofh1 "Gene\t".join("\t",@IDs)."\n";
print $ofh2 "Gene\t".join("\t",@IDs)."\n";
# Rows are sorted so that the output is reproducible between runs.
foreach my $gene (sort(keys(%Gene2ID2FragCount))) {
    print $ofh1 "$gene";
    print $ofh2 "$gene";
    foreach my $ID (@IDs) {
        my $count = "NA";
        my $tpm = "NA";
        if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
            $count = $Gene2ID2FragCount{$gene}->{$ID};
        }
        if (exists($Gene2ID2TPM{$gene}->{$ID})) {
            $tpm = $Gene2ID2TPM{$gene}->{$ID};
        }
        print $ofh1 "\t".$count;
        print $ofh2 "\t".$tpm;
    }
    print $ofh1 "\n";
    print $ofh2 "\n";
}
close($ofh1) or die "Error writing $counts_name: $!\n";
close($ofh2) or die "Error writing $tpm_name: $!\n";
--------------------------------------------------------------------------------
/6_Get_Construct_Expression_Cufflinks.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Prints, for every *genes.fpkm_tracking file, the rows mentioning "A2lox-TRE":
# sample id, first column of the row, FPKM column, locus start and locus end
# (tab separated). The sample id is the first run of 5 or more A/C/G/T
# characters in the file path.

my %ID2Things = ();
my %ID2Thingsloci = ();

my $fpkmcol = 9; # 0-based index of the FPKM column in genes.fpkm_tracking

foreach my $file (glob("/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/*genes.fpkm_tracking")) {
    # Without this check a non-matching path would leave the id undefined.
    my $ID;
    if ($file =~ /([ATCG]{5,})/) {
        $ID = $1;
    } else {
        die "$file does not match\n";
    }
    open(my $ifh, "<", $file) or die "Cannot open $file: $!\n";
    while (my $row = <$ifh>) {
        chomp($row);
        if ($row =~ /A2lox-TRE/) {
            my @record = split(/\t/, $row);
            $ID2Things{$ID}->{$record[0]} = $record[$fpkmcol];
            # The locus column is expected to contain "<start>-<end>".
            my $locus = $record[6];
            if (defined($locus) && $locus =~ /(\d+)-(\d+)/) {
                $ID2Thingsloci{$ID}->{$record[0]} = "$1\t$2";
            } else {
                $ID2Thingsloci{$ID}->{$record[0]} = "NA\tNA";
            }
        }
    }
    close ($ifh);
}


foreach my $id (sort(keys(%ID2Things))) {
    foreach my $thing (sort(keys(%{$ID2Things{$id}}))) {
        print "$id\t$thing\t".$ID2Things{$id}->{$thing}."\t".$ID2Thingsloci{$id}->{$thing}."\n";
    }
}


--------------------------------------------------------------------------------
/6_Get_Expression_featureCounts.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Combines featureCounts output (<dir>/*.fragmentcounts) into one table on
# STDOUT: one row per gene, one column per sample, 0 where a sample has no
# entry, followed by an "Unassigned_Various" row built from the matching
# *.fragmentcounts.summary files (sum of every row after the first two lines).
# The sample id is a run of 5 or more A/C/G/T characters followed by "A" in
# the file path.

if (@ARGV < 1) {die "Please provide directory of featurecounts output\n";}

my $dir = $ARGV[0];

my %Gene2ID2FragCount = ();
my @IDs = ();

foreach my $file (glob("$dir/*.fragmentcounts")) {
    my $ID = "ERR";
    if ($file =~ /([ATCG]{5,})A/) {
    # if ($file =~ /_([^_]+_Cell\d\d)/) {
        $ID = $1;
    } else {
        die "$file does not match\n";
    }
    push(@IDs, $ID);
    open(my $ifh, "<", $file) or die "Cannot open $file: $!\n";
    while (my $row = <$ifh>) {
        chomp($row);
        if ($row =~ /^#/ || $row =~ /^Geneid/) {next;} #skip header & comments
        my @record = split(/\t/, $row);
        my $gene = $record[0]; $gene =~ s/\s+//g;
        $Gene2ID2FragCount{$gene}->{$ID} = $record[6];
    }
    close ($ifh);
}

# NOTE(review): the header line has no label for the gene column, so it is one
# field shorter than the rows below - presumably deliberate; confirm with the
# downstream reader before changing it.
print join("\t",@IDs)."\n";
# Rows are sorted so that the output is reproducible between runs.
foreach my $gene (sort(keys(%Gene2ID2FragCount))) {
    print "$gene";
    foreach my $ID (@IDs) {
        my $count = "0";
        if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
            $count = $Gene2ID2FragCount{$gene}->{$ID};
        }
        print "\t".$count;
    }
    print "\n";
}

my %ID2Unassigned = ();
foreach my $file (glob("$dir/*.fragmentcounts.summary")) {
    my $ID = "ERR";
    #if ($file =~ /_([^_]+_Cell\d\d)/) {
    if ($file =~ /([ATCG]{5,})A/) {
        $ID = $1;
    } else {
        die "$file does not match\n";
    }
    open(my $ifh, "<", $file) or die "Cannot open $file: $!\n";
    <$ifh>; # header
    <$ifh>; #Assigned
    while (my $row = <$ifh>) {
        chomp($row);
        my @record = split(/\t/, $row);
        $ID2Unassigned{$ID} += $record[1];
    }
    close ($ifh);
}

print "Unassigned_Various";
foreach my $ID (@IDs) {
    # A sample without a summary file gets NA instead of an undefined value.
    print "\t".(defined($ID2Unassigned{$ID}) ? $ID2Unassigned{$ID} : "NA");
}
print "\n";
--------------------------------------------------------------------------------
/6_Get_Kallisto.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;
# Currently replaces all estimated FPKMs which are not significantly bigger than 0 with 0. -> not as of (Feb 9 2016), also changed "not detected" genes from NA to 0.
4 | 5 | if (@ARGV < 2) {die "Please supply a directory of Kallisto Output and a prefix for output\n";} 6 | 7 | my $dir = $ARGV[0]; 8 | my $outprefix = $ARGV[1]; 9 | 10 | # Now read in genome annotations 11 | my %transcript2gene = (); 12 | open (my $ifh, "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf") or die $!; 13 | while (<$ifh>) { 14 | if ($_ =~ /^#/) {next;} 15 | 16 | if ($_ =~ /transcript_id "(.+?)"/) { 17 | my $transid = $1; 18 | #transcript 19 | my $geneid = "ERROR"; 20 | if ($_ =~ /gene_id "(.+?)"/) { 21 | $geneid=$1; 22 | } else { 23 | die "No gene id!\n"; 24 | } 25 | $transcript2gene{$transid}=$geneid; 26 | } 27 | 28 | } close ($ifh); 29 | print STDERR "Done reading Annotations\n"; 30 | 31 | # First get expression for all genes & store details for all Cuff-Genes 32 | my %AllGenes = (); my %AllSamples = (); 33 | my %Gene2Sample2TPM = (); 34 | my %Gene2Sample2Counts=(); 35 | 36 | my @files = glob("$dir/*.abundances.tsv"); 37 | foreach my $file (@files) { 38 | # $file =~ /([ATCG]{5,})/; 39 | my $ID = "ERR"; 40 | if ($file =~ /([^_]+_Cell\d\d)/) { 41 | $ID = $1; 42 | } else { 43 | die "$file does not match!"; 44 | } 45 | $AllSamples{$ID}=1; 46 | open(my $ifh, $file) or die $!; 47 | <$ifh>; # header 48 | while (<$ifh>) { 49 | chomp; 50 | my @record = split(/\t/); 51 | my $trans = $record[0]; 52 | my $count = $record[3]; 53 | my $tpm = $record[4]; 54 | my $gene = $trans; 55 | if (exists($transcript2gene{$trans})) { 56 | $gene = $transcript2gene{$trans}; 57 | } 58 | if (exists($Gene2Sample2TPM{$gene})) { 59 | $Gene2Sample2TPM{$gene}->{$ID}+=$tpm; 60 | $Gene2Sample2Counts{$gene}->{$ID}+=$count; 61 | } else { 62 | $Gene2Sample2TPM{$gene}->{$ID}=$tpm; 63 | $Gene2Sample2Counts{$gene}->{$ID}=$count; 64 | } 65 | } close ($ifh); 66 | } 67 | 68 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!; 69 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!; 70 | my @IDs = sort(keys(%AllSamples)); 71 | print $ofhtpm 
"Gene\t".join("\t",@IDs)."\n"; 72 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n"; 73 | 74 | foreach my $gene (keys(%Gene2Sample2TPM)) { 75 | print $ofhtpm "$gene"; 76 | print $ofhcounts "$gene"; 77 | foreach my $ID (@IDs) { 78 | my $tpm = "NA"; 79 | if (exists($Gene2Sample2TPM{$gene}->{$ID})) { 80 | $tpm = $Gene2Sample2TPM{$gene}->{$ID}; 81 | } else { 82 | $tpm = "0"; 83 | } 84 | my $count = "NA"; 85 | if (exists($Gene2Sample2Counts{$gene}->{$ID})) { 86 | $count = $Gene2Sample2Counts{$gene}->{$ID}; 87 | } else { 88 | $count = "0"; 89 | } 90 | print $ofhcounts "\t".$count; 91 | print $ofhtpm "\t".$tpm; 92 | } 93 | print $ofhcounts "\n"; 94 | print $ofhtpm "\n"; 95 | } 96 | close($ofhcounts); 97 | close($ofhtpm); 98 | -------------------------------------------------------------------------------- /6_Get_RSEM_Expression.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 2) {die "Please supply a directory of Cufflinks Output and a prefix for outputfiles\n";} 5 | 6 | my $dir = $ARGV[0]; 7 | my $outprefix = $ARGV[1]; 8 | 9 | # First get expression for all genes & store details for all Cuff-Genes 10 | my %AllGenes = (); my %AllSamples = (); 11 | my %Gene2Sample2FPKM = (); 12 | my %Gene2Sample2TPM = (); 13 | 14 | my @files = glob("$dir/*.genes.results"); 15 | foreach my $file (@files) { 16 | # $file =~ /([ATCG]{5,})/; 17 | my $ID = "ERR"; 18 | # if ($file =~ /([^_]+_Cell\d\d)/) { 19 | if ($file =~ /RSEM-(\d+)-/) { 20 | $ID = $1; 21 | } else { 22 | die "$file does not match!"; 23 | } 24 | $AllSamples{$ID}=1; 25 | open(my $ifh, $file) or die $!; 26 | <$ifh>; # header 27 | while (<$ifh>) { 28 | chomp; 29 | my @record = split(/\t/); 30 | my $gene = $record[0]; 31 | my $fpkm = $record[6]; 32 | my $tpm = $record[5]; 33 | $Gene2Sample2FPKM{$gene}->{$ID}=$fpkm; 34 | $Gene2Sample2TPM{$gene}->{$ID} =$tpm; 35 | } close ($ifh); 36 | } 37 | 38 | print STDERR "Done reading FPKMs\n"; 39 | 40 | open 
(my $ofhfpkm, ">", "$outprefix.fpkm") or die $!; 41 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!; 42 | my @IDs = sort{$a<=>$b} keys(%AllSamples); 43 | print $ofhfpkm "Gene\t".join("\t",@IDs)."\n"; 44 | print $ofhtpm "Gene\t".join("\t",@IDs)."\n"; 45 | 46 | foreach my $gene (keys(%Gene2Sample2FPKM)) { 47 | print $ofhfpkm "$gene"; 48 | print $ofhtpm "$gene"; 49 | foreach my $ID (@IDs) { 50 | my $tpm = "NA"; 51 | my $fpkm = "NA"; 52 | if (exists($Gene2Sample2FPKM{$gene}->{$ID})) { 53 | $fpkm = $Gene2Sample2FPKM{$gene}->{$ID}; 54 | } else { 55 | $fpkm = "0"; 56 | } 57 | if (exists($Gene2Sample2TPM{$gene}->{$ID})) { 58 | $tpm = $Gene2Sample2TPM{$gene}->{$ID}; 59 | } else { 60 | $tpm = "0"; 61 | } 62 | print $ofhfpkm "\t".$fpkm; 63 | print $ofhtpm "\t".$tpm; 64 | } 65 | print $ofhfpkm "\n"; 66 | print $ofhtpm "\n"; 67 | } 68 | close($ofhtpm); 69 | close($ofhfpkm); 70 | -------------------------------------------------------------------------------- /6_Get_Salmon_Expression.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | # Currently replaces all estimated FPKMs which are not significantly bigger than 0 with 0. -> not as of (Feb 9 2016), also changed "not detected" genes from NA to 0. 
4 | 5 | if (@ARGV < 3) {die "Usage: 6_Get_Salmon_Expression.pl [directory of Salmon output] [gene|transcript] [output prefix]\n";} # all three arguments are required: $ARGV[2] is the output prefix 6 | 7 | my $dir = $ARGV[0]; 8 | my $feature = $ARGV[1]; 9 | my $outprefix = $ARGV[2]; 10 | 11 | # Process arguments 12 | my @files = (); 13 | if ($feature =~ /gene/i) { 14 | @files = glob("$dir/*.quant.genes.sf"); 15 | } else { 16 | @files = glob("$dir/*.quant.sf"); 17 | } 18 | # More efficient to do both at once 19 | #my $col = -1; 20 | #if ($type =~ /tpm/i) { 21 | # # column of salmon output corresponding to tpm 22 | # $col = 3; 23 | #} else { 24 | # # column of salmon output corresponding to read counts 25 | # $col = 4; 26 | #} 27 | 28 | 29 | # First get expression for all genes & store details for all Cuff-Genes 30 | my %AllGenes = (); my %AllSamples = (); 31 | my %Gene2Sample2TPM = (); 32 | my %Gene2Sample2Counts=(); 33 | 34 | foreach my $file (@files) { 35 | # $file =~ /([ATCG]{5,})/; 36 | # Regular expressions to extract sample name from file name 37 | my $ID = "ERR"; 38 | if ($file =~ /([^_]+_Cell\d\d)/) { 39 | $ID = $1; 40 | } else { 41 | die "$file does not match!"; 42 | } 43 | ##### 44 | $AllSamples{$ID}=1; 45 | open(my $ifh, $file) or die $!; 46 | <$ifh>; # header 47 | while (<$ifh>) { 48 | chomp; 49 | my @record = split(/\t/); 50 | my $feature = $record[0]; 51 | my $count = $record[4]; 52 | my $tpm = $record[3]; 53 | if (exists($Gene2Sample2TPM{$feature})) { 54 | $Gene2Sample2TPM{$feature}->{$ID}+=$tpm; 55 | $Gene2Sample2Counts{$feature}->{$ID}+=$count; 56 | } else { 57 | $Gene2Sample2TPM{$feature}->{$ID}=$tpm; 58 | $Gene2Sample2Counts{$feature}->{$ID}=$count; 59 | } 60 | } close ($ifh); 61 | } 62 | 63 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!; 64 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!; 65 | my @IDs = sort(keys(%AllSamples)); 66 | print $ofhtpm "Gene\t".join("\t",@IDs)."\n"; 67 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n"; 68 | 69 | foreach my $gene (sort(keys(%Gene2Sample2TPM))) { 70 |
print $ofhtpm "$gene"; 71 | print $ofhcounts "$gene"; 72 | foreach my $ID (@IDs) { 73 | my $tpm = "NA"; 74 | if (exists($Gene2Sample2TPM{$gene}->{$ID})) { 75 | $tpm = $Gene2Sample2TPM{$gene}->{$ID}; 76 | } else { 77 | $tpm = "0"; 78 | } 79 | my $count = "NA"; 80 | if (exists($Gene2Sample2Counts{$gene}->{$ID})) { 81 | $count = $Gene2Sample2Counts{$gene}->{$ID}; 82 | } else { 83 | $count = "0"; 84 | } 85 | print $ofhcounts "\t".$count; 86 | print $ofhtpm "\t".$tpm; 87 | } 88 | print $ofhcounts "\n"; 89 | print $ofhtpm "\n"; 90 | } 91 | close($ofhcounts); 92 | close($ofhtpm); 93 | -------------------------------------------------------------------------------- /99_Check_Barcodes.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 3) { die "Breakdown_Paired_Ends.pl INPUT1 INPUT2 ProjectName\n";} 5 | my $infile1 = $ARGV[0]; 6 | my $infile2 = $ARGV[1]; 7 | 8 | my %Barcodes = (); 9 | open (my $ifh1, $infile1) or die $!; 10 | while(<$ifh1>) { 11 | my $file1line = $_; 12 | if ($file1line =~ /^@/) { 13 | my @thing1 = split(/\s+/,$file1line); 14 | my $readname = $thing1[0]; 15 | my $barcodes = <$ifh1>; 16 | if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/){ 17 | my $UMI = $2; 18 | my $CellID = $1; 19 | $Barcodes{$UMI}++; 20 | } 21 | } else {next;} 22 | } 23 | close($ifh1); 24 | 25 | my @codes = sort{$Barcodes{$a}<=>$Barcodes{$b}} keys(%Barcodes); 26 | foreach my $code (@codes) { 27 | print "$code ".$Barcodes{$code}."\n"; 28 | } 29 | -------------------------------------------------------------------------------- /99_Check_RSEM_Output.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 1) {die "Please supply a directory of RSEM Output\n";} 5 | 6 | my $dir = $ARGV[0]; 7 | 8 | # First get expression for all genes 9 | my %AllGenes = (); my %AllSamples = (); 10 | my %Gene2Sample2FPKM = 
(); 11 | my %Gene2Sample2TPM = (); 12 | 13 | my @files = glob("$dir/bowtie2*.genes.results"); 14 | foreach my $file (@files) { 15 | my $ID = "ERR"; 16 | if ($file =~ /bowtie2_RSEM-(\d+)/) { # Match file name. 17 | $ID = $1; 18 | } else { 19 | next; 20 | } 21 | $AllSamples{$ID}=1; 22 | } 23 | 24 | my @IDs = sort{$a<=>$b} keys(%AllSamples); 25 | print ( join("\n", @IDs) ); 26 | -------------------------------------------------------------------------------- /99_Check_Results.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 1) {die "requires one or more LSF output files\n";} 5 | 6 | my $CPU = 0; 7 | my $Mem = 0; 8 | my $Count = 0; 9 | 10 | foreach my $file (@ARGV) { 11 | open(my $ifh, $file) or die $!; 12 | my $success = 0; 13 | while(<$ifh>) { 14 | if ($_ =~ /Successfully completed/) { 15 | $success = 1; 16 | $Count++; 17 | } 18 | if ($success && $_ =~ /CPU time :\s+([\d\.]+) sec/) { 19 | $CPU += $1; 20 | } 21 | if ($success && $_ =~ /Max Memory :\s+([\d\.]+) MB/) { 22 | my $m = $1; 23 | if ($m > $Mem) {$Mem = $m;} 24 | } 25 | } close($ifh); 26 | if ($success) { 27 | unlink($file) or warn "Could not remove $file: $!\n"; # delete directly rather than via a shell, so file names with spaces or metacharacters are safe 28 | } 29 | } 30 | print "\"Total :\" ".scalar(@ARGV)."\n\"Success:\" $Count\n\"Max Mem:\" $Mem\n\"Total CPU:\" $CPU\n"; 31 | -------------------------------------------------------------------------------- /99_NotesForImprovement: -------------------------------------------------------------------------------- 1 | Is it better to use SAMtools after mapping to sort BAM file or to have STAR sort the BAMs before writing them? (Probably the latter since less I/O) -> then just merge the sorted files using samtools (merge expects sorted bams anyway) 2 | 3 | Streaming FASTQC made Java crash but this didn't halt the job which is problematic (eventually was killed because job time limit was exceeded). Need to increase efficiency.
4 | -> worked fine when increased memory to 10 Gb and ran on each end of each lane. 5 | 6 | Samtools sort on largest BAM took 104 sec and 200MB of memory on the cluster. Need to do some finicky stuff to get it to run as a job array though (or should I just send them as individual jobs? only 192 of them...). 7 | -------------------------------------------------------------------------------- /99_get_order_chr_in_SAM.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | if (@ARGV < 1) {die "Arguments: same file\n";} 5 | 6 | my @chrs = ("NULL"); 7 | open (my $ifh, $ARGV[0]) or die $!; 8 | while (<$ifh>) { 9 | if ($_ =~ /^@/) {next;} 10 | my @record = split(/\t/); 11 | my $chr = $record[2]; 12 | my $i = scalar(@chrs) -1; 13 | if ($chr ne $chrs[$i]){ 14 | push(@chrs, $chr); 15 | } 16 | } close($ifh); 17 | 18 | shift(@chrs); # remove NULL 19 | print join("\n", @chrs); 20 | 21 | -------------------------------------------------------------------------------- /ERCC_Controls/ERCC_Controls_Analysis.txt: -------------------------------------------------------------------------------- 1 | #Re-sort ID ERCC ID subgroup concentration in Mix 1 (attomoles/ul) concentration in Mix 2 (attomoles/ul) expected fold-change ratio log2(Mix 1/Mix 2) 2 | 1 ERCC-00130 A 30000 7500 4 2 3 | 2 ERCC-00004 A 7500 1875 4 2 4 | 3 ERCC-00136 A 1875 468.75 4 2 5 | 4 ERCC-00108 A 937.5 234.375 4 2 6 | 5 ERCC-00116 A 468.75 117.1875 4 2 7 | 6 ERCC-00092 A 234.375 58.59375 4 2 8 | 7 ERCC-00095 A 117.1875 29.296875 4 2 9 | 8 ERCC-00131 A 117.1875 29.296875 4 2 10 | 9 ERCC-00062 A 58.59375 14.6484375 4 2 11 | 10 ERCC-00019 A 29.296875 7.32421875 4 2 12 | 11 ERCC-00144 A 29.296875 7.32421875 4 2 13 | 12 ERCC-00170 A 14.6484375 3.66210938 4 2 14 | 13 ERCC-00154 A 7.32421875 1.83105469 4 2 15 | 14 ERCC-00085 A 7.32421875 1.83105469 4 2 16 | 15 ERCC-00028 A 3.66210938 0.91552734 4 2 17 | 16 ERCC-00033 A 1.83105469 0.45776367 4 2 18 | 17 
ERCC-00134 A 1.83105469 0.45776367 4 2 19 | 18 ERCC-00147 A 0.91552734 0.22888184 4 2 20 | 19 ERCC-00097 A 0.45776367 0.11444092 4 2 21 | 20 ERCC-00156 A 0.45776367 0.11444092 4 2 22 | 21 ERCC-00123 A 0.22888184 0.05722046 4 2 23 | 22 ERCC-00017 A 0.11444092 0.02861023 4 2 24 | 23 ERCC-00083 A 0.02861023 0.00715256 4 2 25 | 24 ERCC-00096 B 15000 15000 1 0 26 | 25 ERCC-00171 B 3750 3750 1 0 27 | 26 ERCC-00009 B 937.5 937.5 1 0 28 | 27 ERCC-00042 B 468.75 468.75 1 0 29 | 28 ERCC-00060 B 234.375 234.375 1 0 30 | 29 ERCC-00035 B 117.1875 117.1875 1 0 31 | 30 ERCC-00025 B 58.59375 58.59375 1 0 32 | 31 ERCC-00051 B 58.59375 58.59375 1 0 33 | 32 ERCC-00053 B 29.296875 29.296875 1 0 34 | 33 ERCC-00148 B 14.6484375 14.6484375 1 0 35 | 34 ERCC-00126 B 14.6484375 14.6484375 1 0 36 | 35 ERCC-00034 B 7.32421875 7.32421875 1 0 37 | 36 ERCC-00150 B 3.66210938 3.66210938 1 0 38 | 37 ERCC-00067 B 3.66210938 3.66210938 1 0 39 | 38 ERCC-00031 B 1.83105469 1.83105469 1 0 40 | 39 ERCC-00109 B 0.91552734 0.91552734 1 0 41 | 40 ERCC-00073 B 0.91552734 0.91552734 1 0 42 | 41 ERCC-00158 B 0.45776367 0.45776367 1 0 43 | 42 ERCC-00104 B 0.22888184 0.22888184 1 0 44 | 43 ERCC-00142 B 0.22888184 0.22888184 1 0 45 | 44 ERCC-00138 B 0.11444092 0.11444092 1 0 46 | 45 ERCC-00117 B 0.05722046 0.05722046 1 0 47 | 46 ERCC-00075 B 0.01430512 0.01430512 1 0 48 | 47 ERCC-00074 C 15000 22500 0.67 -0.58 49 | 48 ERCC-00113 C 3750 5625 0.67 -0.58 50 | 49 ERCC-00145 C 937.5 1406.25 0.67 -0.58 51 | 50 ERCC-00111 C 468.75 703.125 0.67 -0.58 52 | 51 ERCC-00076 C 234.375 351.5625 0.67 -0.58 53 | 52 ERCC-00044 C 117.1875 175.78125 0.67 -0.58 54 | 53 ERCC-00162 C 58.59375 87.890625 0.67 -0.58 55 | 54 ERCC-00071 C 58.59375 87.890625 0.67 -0.58 56 | 55 ERCC-00084 C 29.296875 43.9453125 0.67 -0.58 57 | 56 ERCC-00099 C 14.6484375 21.9726563 0.67 -0.58 58 | 57 ERCC-00054 C 14.6484375 21.9726563 0.67 -0.58 59 | 58 ERCC-00157 C 7.32421875 10.9863281 0.67 -0.58 60 | 59 ERCC-00143 C 3.66210938 5.49316406 0.67 -0.58 61 | 60 
ERCC-00039 C 3.66210938 5.49316406 0.67 -0.58 62 | 61 ERCC-00058 C 1.83105469 2.74658203 0.67 -0.58 63 | 62 ERCC-00120 C 0.91552734 1.37329102 0.67 -0.58 64 | 63 ERCC-00040 C 0.91552734 1.37329102 0.67 -0.58 65 | 64 ERCC-00164 C 0.45776367 0.68664551 0.67 -0.58 66 | 65 ERCC-00024 C 0.22888184 0.34332275 0.67 -0.58 67 | 66 ERCC-00016 C 0.22888184 0.34332275 0.67 -0.58 68 | 67 ERCC-00012 C 0.11444092 0.17166138 0.67 -0.58 69 | 68 ERCC-00098 C 0.05722046 0.08583069 0.67 -0.58 70 | 69 ERCC-00057 C 0.01430512 0.02145767 0.67 -0.58 71 | 70 ERCC-00002 D 15000 30000 0.5 -1 72 | 71 ERCC-00046 D 3750 7500 0.5 -1 73 | 72 ERCC-00003 D 937.5 1875 0.5 -1 74 | 73 ERCC-00043 D 468.75 937.5 0.5 -1 75 | 74 ERCC-00022 D 234.375 468.75 0.5 -1 76 | 75 ERCC-00112 D 117.1875 234.375 0.5 -1 77 | 76 ERCC-00165 D 58.59375 117.1875 0.5 -1 78 | 77 ERCC-00079 D 58.59375 117.1875 0.5 -1 79 | 78 ERCC-00078 D 29.296875 58.59375 0.5 -1 80 | 79 ERCC-00163 D 14.6484375 29.296875 0.5 -1 81 | 80 ERCC-00059 D 14.6484375 29.296875 0.5 -1 82 | 81 ERCC-00160 D 7.32421875 14.6484375 0.5 -1 83 | 82 ERCC-00014 D 3.66210938 7.32421875 0.5 -1 84 | 83 ERCC-00077 D 3.66210938 7.32421875 0.5 -1 85 | 84 ERCC-00069 D 1.83105469 3.66210938 0.5 -1 86 | 85 ERCC-00137 D 0.91552734 1.83105469 0.5 -1 87 | 86 ERCC-00013 D 0.91552734 1.83105469 0.5 -1 88 | 87 ERCC-00168 D 0.45776367 0.91552734 0.5 -1 89 | 88 ERCC-00041 D 0.22888184 0.45776367 0.5 -1 90 | 89 ERCC-00081 D 0.22888184 0.45776367 0.5 -1 91 | 90 ERCC-00086 D 0.11444092 0.22888184 0.5 -1 92 | 91 ERCC-00061 D 0.05722046 0.11444092 0.5 -1 93 | 92 ERCC-00048 D 0.01430512 0.02861023 0.5 -1 94 | -------------------------------------------------------------------------------- /ERCC_Controls/ERCC_Controls_README.txt: -------------------------------------------------------------------------------- 1 | Mapping Reads to ERCC Control Sequences with BioScope 1.2.1 2 | ------------------------------------------------------------ 3 | 4 | This document describes how to use 
BioScope 1.2.1 to map the results 5 | of a SOLiD run containing ERCC control sequences against the ERCC 6 | reference using the BioScope Whole Transcriptome Pipeline. 7 | 8 | Once a SOLiD run containing ERCC control sequences is finished, the 9 | results must be mapped and counted against the ERCC reference files. 10 | This can be accomplished in two ways using BioScope 1.2.1, by 11 | combining the ERCC references with the genome references or by mapping 12 | directly to the ERCC references. 13 | 14 | Both methods are described below. 15 | 16 | 17 | Prerequisites 18 | ------------- 19 | 20 | BioScope 1.2.1 is required to map against the ERCC references. If you 21 | have an older version of BioScope, upgrade to 1.2.1 before proceeding. 22 | 23 | Two ERCC references are required for mapping and counting: 24 | 25 | ERCC92.fa 26 | This multi-fasta file contains the reference sequences and IDs for 27 | each ERCC control sequence. 28 | 29 | ERCC92.gtf 30 | This feature file contains feature entries for each ERCC control 31 | sequence. This is used as the exon reference in the Whole 32 | Transcriptome pipeline. 33 | 34 | Both ERCC reference files can be downloaded from: 35 | www.appliedbiosystems.com. 36 | 37 | 38 | Method 1: Mapping ERCCs Directly to the ERCC References 39 | -------------------------------------------------------- 40 | 41 | 1. Run BioScope 1.2.1 whole transcriptome analysis as directed in the 42 | BioScope documentation. Use ERCC92.fa for the genome reference and 43 | ERCC92.gtf for the exon reference. 44 | 45 | 2. When the BioScope run completes, you will find the ERCC counts in 46 | the last 92 lines of the countagresult.txt file. 47 | 48 | 49 | Method 2: Combining the ERCC References with the Genomic References 50 | ------------------------------------------------------------------- 51 | 52 | Combining the references allows you to map to both the ERCCs and the 53 | genome reference at the same time. 
This is accomplished by appending 54 | the ERCC references to the genome references. 55 | 56 | Follow these steps to combine the references: 57 | 58 | 1. Prepare your genome and feature references for use with BioScope 59 | 1.2.1. For human references, you might have two reference files: 60 | 61 | human.fa - the multi-fasta file contain the human reference genome 62 | refGene.gtf - the exon reference file for each exon in refseq 63 | 64 | 2. Append the ERCC references to the genome and feature references. 65 | If your genome reference is human.fa and your exon reference is 66 | refGene.gtf then you could use the following UNIX commands from a 67 | Bash shell to append the files (note that $ is the command prompt): 68 | 69 | $ cat ERCC92.fa >> human.fa 70 | $ cat ERCC92.gtf >> refGene.gtf 71 | 72 | 3. Run BioScope 1.2.1 whole transcriptome analysis as directed in the 73 | BioScope documentation. 74 | 75 | 4. When the BioScope run completes, you will find the ERCC counts in 76 | the last 92 lines of the countagresult.txt file. 
77 | 78 | 79 | Post-Processing: Extracting Counts to a Tab-delimited File 80 | ---------------------------------------------------------- 81 | 82 | You can use the following UNIX commands from a Bash shell to parse the 83 | results into a tab delimited table (ERCC.counts) of ERCC name, raw 84 | read count and RPKM: 85 | 86 | $ tail -n 92 countagresult.txt | cut -f9 | cut -d';' -f1 | sed 's/gene_id\|"\| //g' > gene_id 87 | $ tail -n 92 countagresult.txt | cut -f6 > raw_count 88 | $ tail -n 92 countagresult.txt | cut -f9 | cut -d';' -f3 | sed 's/RPKM\|"\| //g' > RPKM 89 | $ paste gene_id raw_count RPKM > ERCC.counts 90 | $ rm gene_id raw_count RPKM 91 | 92 | 93 | -------------------------------------------------------------------------------- /ERCC_Controls/Make_FASTA_GTF_from_Annotation.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | # Converts the Annotation file from https://www.thermofisher.com/order/catalog/product/4456740 into fasta and gtf files that can be added to the end of an existing genome fasta/gtf 4 | 5 | my @FASTAlines = (); 6 | my @GTFlines = (); 7 | open (my $ifh, "ERCC_Controls_Annotation.txt") or die $!; 8 | <$ifh>; #header 9 | while (<$ifh>) { 10 | # Do all the important stuff 11 | chomp; 12 | my @record = split(/\t/); 13 | my $sequence = $record[4]; 14 | $sequence =~ s/\s+//g; # get rid of any preceeding/tailing white space 15 | $sequence = $sequence."NNNN"; # add some buffer to the end of the sequence 16 | my $name = $record[0]; 17 | my $genbank = $record[1]; 18 | push(@FASTAlines, ">$name\n$sequence\n"); 19 | # is GTF 1 indexed or 0 indexed? -> it is 1 indexed 20 | # + or - strand? 
21 | push(@GTFlines, "$name\tERCC\tgene\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n"); 22 | push(@GTFlines, "$name\tERCC\texon\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n"); 23 | } close($ifh); 24 | 25 | # Write output 26 | open(my $ofh, ">", "ERCC_Controls.fa") or die $!; 27 | foreach my $line (@FASTAlines) { 28 | print $ofh $line; 29 | } close ($ofh); 30 | 31 | open($ofh, ">", "ERCC_Controls.gtf") or die $!; 32 | foreach my $line (@GTFlines) { 33 | print $ofh $line; 34 | } close ($ofh); 35 | -------------------------------------------------------------------------------- /ERCC_Controls/Note: -------------------------------------------------------------------------------- 1 | Downloaded from: https://www.lifetechnologies.com/order/catalog/product/4456740 2 | Date: 10 April 2015 3 | 4 | Files: 5 | ERCC_Controls_Analysis.txt 6 | ERCC_Controls_Annotation.txt 7 | ERCC_Controls_README.txt 8 | -------------------------------------------------------------------------------- /Extract_PlateID_and_WellID_from_headers.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | 5 | if (@ARGV < 1) {die "requires at least one headerfile\n";} 6 | my %CellID2WellID = (); 7 | 8 | foreach my $file (@ARGV) { 9 | #Extract cell ID 10 | my $cellid = ""; 11 | if ($file =~ /_(\d_\d+)\./) { 12 | $cellid = $1; 13 | } else { 14 | die "$file does not match\n"; 15 | } 16 | open (my $ifh, $file) or die $!; 17 | while (<$ifh>) { 18 | if ($_ =~ /^\@RG/) { 19 | # Match the plate-well ID 20 | my $wellid = ""; 21 | if ($_ =~ /SM:SCGC--(\w+)/) { 22 | $wellid = $1; 23 | } else { 24 | die "$_ does not match"; 25 | } 26 | $CellID2WellID{$cellid} = $wellid; 27 | last; 28 | } else { 29 | next; 30 | } 31 | } 
close($ifh); 32 | } 33 | 34 | foreach my $cell (sort(keys(%CellID2WellID))) { 35 | print $cell."\t".$CellID2WellID{$cell}."\n"; 36 | } 37 | -------------------------------------------------------------------------------- /Kallisto_Build_Index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build Kallisto Index from a reference fasta and gtf. 3 | USAGE="Usage: Kallisto_Build_Index.sh ref.fa ref.gtf outdir\n 4 | \tArguments:\n 5 | \t ref.fa = reference fasta file\n 6 | \t ref.gtf = reference GTF file\n 7 | \t outdir = directory for output\n" 8 | 9 | # Locations of required software 10 | GFFREAD=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread 11 | KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto 12 | 13 | # Raw genome fasta and annotation gtf 14 | FA=$1 15 | GTF=$2 16 | 17 | # Location for output files 18 | OUTDIR=$3 19 | 20 | # Checks 21 | if [ ! -f $GFFREAD ] ; then 22 | echo "Error: gffread not available" 23 | exit 1 24 | fi 25 | 26 | if [ ! -f $KALLISTO ] ; then 27 | echo "Error: kallisto not available" 28 | exit 1 29 | fi 30 | 31 | if [ -z $FA ] || [ ! -f $FA ] ; then 32 | echo -e $USAGE 33 | exit 1 34 | fi 35 | 36 | if [ -z $GTF ] || [ ! 
-f $GTF ] ; then 37 | echo -e $USAGE 38 | exit 1 39 | fi 40 | 41 | 42 | if [ -z $OUTDIR ] ; then 43 | OUTDIR=./ 44 | fi 45 | 46 | # Extract transcriptome fasta using gffread 47 | $GFFREAD $GTF -g $FA -w $OUTDIR/Transcripts.fasta 48 | 49 | # Index the extracted transcriptome 50 | $KALLISTO index -i $OUTDIR/kallisto_index.idx $OUTDIR/Transcripts.fasta 51 | 52 | -------------------------------------------------------------------------------- /Kallisto_Make_ExpMat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | if (@ARGV != 4) {die "Usage: Kallisto_Make_ExpMat.pl kallisto_dir ref.gtf [gene|trans] out_prefix\n 6 | Arguments:\n 7 | kalliso_dir = directory of kallisto output files 8 | ref.gtf = reference GTF file 9 | [gene|trans] = whether to aggregate expression at gene or trans[script] level 10 | out_prefix = prefix for output files";} 11 | 12 | my $dir = $ARGV[0]; 13 | my $gtf = $ARGV[1]; 14 | my $type = $ARGV[2]; 15 | my $outprefix = $ARGV[3]; 16 | 17 | # Now read in genome annotations 18 | my %transcript2gene = (); 19 | open (my $ifh, $gtf) or die $!; 20 | while (<$ifh>) { 21 | if ($_ =~ /^#/) {next;} 22 | 23 | if ($_ =~ /transcript_id "(.+?)"/) { 24 | my $transid = $1; 25 | #transcript 26 | my $geneid = "ERROR"; 27 | if ($_ =~ /gene_id "(.+?)"/) { 28 | $geneid=$1; 29 | } else { 30 | die "No gene id!\n"; 31 | } 32 | $transcript2gene{$transid}=$geneid; 33 | } 34 | 35 | } close ($ifh); 36 | print STDERR "Done reading Annotations\n"; 37 | 38 | # First get expression for all genes & store details for all Cuff-Genes 39 | my %AllGenes = (); my %AllSamples = (); 40 | my %Gene2Sample2TPM = (); 41 | my %Gene2Sample2Counts=(); 42 | 43 | my @files = glob("$dir/*.kallisto.abundances.tsv"); 44 | foreach my $file (@files) { 45 | # $file =~ /([ATCG]{5,})/; 46 | my $ID = "ERR"; 47 | # if ($file =~ /([^_]+_Cell\d\d)/) { 48 | if ($file =~ /(.+)\.kallisto\.abundances\.tsv/) { # Extract 
sample ID from file name -> must be customized for each dataset. 49 | $ID = $1; 50 | } else { 51 | die "$file does not match!"; 52 | } 53 | $AllSamples{$ID}=1; 54 | open(my $ifh, $file) or die $!; 55 | <$ifh>; # header 56 | while (<$ifh>) { 57 | chomp; 58 | my @record = split(/\t/); 59 | my $trans = $record[0]; 60 | my $count = $record[3]; 61 | my $tpm = $record[4]; 62 | my $gene = $trans; 63 | if ($type eq "gene") { #Aggregate by gene 64 | if (exists($transcript2gene{$trans})) { 65 | $gene = $transcript2gene{$trans}; 66 | } 67 | } else { 68 | # transcript level: keep $gene = $trans so the record is counted instead of skipped 69 | } 70 | if (exists($Gene2Sample2TPM{$gene})) { 71 | $Gene2Sample2TPM{$gene}->{$ID}+=$tpm; 72 | $Gene2Sample2Counts{$gene}->{$ID}+=$count; 73 | } else { 74 | $Gene2Sample2TPM{$gene}->{$ID}=$tpm; 75 | $Gene2Sample2Counts{$gene}->{$ID}=$count; 76 | } 77 | } close ($ifh); 78 | } 79 | 80 | #open (my $ofhtpm, ">", "$outprefix.tpm") or die $!; 81 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!; 82 | my @IDs = sort(keys(%AllSamples)); 83 | #print $ofhtpm "Gene\t".join("\t",@IDs)."\n"; 84 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n"; 85 | 86 | foreach my $gene (keys(%Gene2Sample2TPM)) { 87 | # print $ofhtpm "$gene"; 88 | print $ofhcounts "$gene"; 89 | foreach my $ID (@IDs) { 90 | my $tpm = "NA"; 91 | if (exists($Gene2Sample2TPM{$gene}->{$ID})) { 92 | $tpm = $Gene2Sample2TPM{$gene}->{$ID}; 93 | } else { 94 | $tpm = "0"; 95 | } 96 | my $count = "NA"; 97 | if (exists($Gene2Sample2Counts{$gene}->{$ID})) { 98 | $count = $Gene2Sample2Counts{$gene}->{$ID}; 99 | } else { 100 | $count = "0"; 101 | } 102 | print $ofhcounts "\t".$count; 103 | # print $ofhtpm "\t".$tpm; 104 | } 105 | print $ofhcounts "\n"; 106 | # print $ofhtpm "\n"; 107 | } 108 | close($ofhcounts); 109 | #close($ofhtpm); 110 | -------------------------------------------------------------------------------- /Kallisto_Quantification_Wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Note job
#!/bin/bash
# Kallisto_Quantification_Wrapper.sh
#
# Run "kallisto quant" on one sample and leave the result in
# $OUTDIR/<name>.kallisto.abundances.tsv, where <name> is file1's basename
# without its last extension.
#
# Note job array requires indexing to start at 1 but array indexing starts at 0.
# Paired-end mode is used when file2 is an existing file; otherwise kallisto
# is run in single-end mode on file1 only.

# The positional order below matches how the arguments are actually read.
USAGE="Usage: Kallisto_Quantification_Wrapper.sh file1 file2 index threads outdir
\tArguments:
\tfile1 = either fastq for read1 or if running in jobarray directory of fasta files
\tfile2 = either fastq for read2 or if running in jobarray \"NULL\" or for single-end
\tindex = kallisto index (see: Kallisto_Build_Index.sh)
\tthreads = number of cpus to use
\toutdir = directory for output (default: current working directory)
"

# The kallisto binary can be overridden through the environment.
KALLISTO=${KALLISTO:-/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto}
JOB_INDEX=${LSB_JOBINDEX:-0} # for array jobs, index starts at 1; 0 when unset.

FILE1=$1
FILE2=$2
KALLISTO_INDEX=$3
NUMTHREADS=$4
OUTDIR=$5

# Check appropriate arguments
if [ ! -f "$KALLISTO" ] ; then
    echo "Error: kallisto not available "
    exit 1
fi

# file2 is deliberately not required, to allow running in unpaired mode.
if [ -z "$NUMTHREADS" ] || [ -z "$KALLISTO_INDEX" ] || [ -z "$FILE1" ] ; then
    echo -e "$USAGE"
    exit 1
fi

if [ -z "$OUTDIR" ] ; then
    OUTDIR=./
fi

# Set-up for either array job or for loop.
# In array mode file1 is a directory, and job i is given the (2i-1)th and
# (2i)th entries of its glob-sorted listing as read1 and read2.
# Quoting keeps the tests valid when FILE2 is empty.
if [ "$FILE2" == "NULL" ] && [ "$JOB_INDEX" -gt 0 ] ; then
    echo "ArrayJob"
    echo "$JOB_INDEX"
    FILEStoMAP=("$FILE1"/*)
    ARRAYINDEX=$(( (JOB_INDEX - 1) * 2 ))
    FILE1=${FILEStoMAP[$ARRAYINDEX]}     # Bash array indices start at 0
    FILE2=${FILEStoMAP[$ARRAYINDEX+1]}   # Bash array indices start at 0
fi

if [ ! -d "$OUTDIR" ] ; then
    mkdir -p "$OUTDIR"
fi

if [ -z "$FILE1" ] || [ ! -f "$FILE1" ] ; then
    echo "$FILE1 does not exist."
    exit 1
fi

NAME=${FILE1##*/}   # strip directory
NAME=${NAME%.*}     # strip last extension
WORKDIR=$OUTDIR/$NAME

# Make directory for temporary output
if [ ! -d "$WORKDIR" ] ; then
    mkdir -p "$WORKDIR"
fi

# Run KALLISTO
# An unquoted "[ -f $FILE2 ]" is true when FILE2 is empty, which would send
# single-end runs down the paired branch; test non-emptiness explicitly.
if [ -n "$FILE2" ] && [ -f "$FILE2" ] ; then
    "$KALLISTO" quant --bias --plaintext --threads="$NUMTHREADS" -i "$KALLISTO_INDEX" -o "$WORKDIR" "$FILE1" "$FILE2"
else
    "$KALLISTO" quant --single --fragment-length=100 --sd=20 --bias --plaintext --threads="$NUMTHREADS" -i "$KALLISTO_INDEX" -o "$WORKDIR" "$FILE1"
fi
STATUS=$?
if [ "$STATUS" -ne 0 ] ; then
    # Leave WORKDIR in place so the failed run can be inspected.
    echo "Error: kallisto quant failed for $FILE1 (exit status $STATUS)" >&2
    exit "$STATUS"
fi

# Keep only the abundance table, then remove the per-sample work directory.
mv "$WORKDIR/abundance.tsv" "$OUTDIR/$NAME.kallisto.abundances.tsv"
rm "$WORKDIR/run_info.json"
rmdir "$WORKDIR"
# Parse_GTF_biotype.pl
#
# Print one "gene_id<TAB>gene_biotype" line to STDOUT for every gene_id seen
# in a GTF file, sorted by gene_id.
#
# Usage: perl Parse_GTF_biotype.pl annotation.gtf > gene_biotypes.tsv
use strict;
use warnings;

if (@ARGV < 1) {die "Required input: gtf file\n";}

my $gtf = $ARGV[0];

# gene_id -> gene_biotype; the value stays undef until a biotype is seen.
my %biotype_of = ();

open(my $ifh, '<', $gtf) or die "Cannot open GTF file '$gtf': $!\n";
while (my $line = <$ifh>) {
    next if $line =~ /^#/;      # GTF header/comment lines

    my $geneid;
    if ($line =~ /gene_id "(.+?)"/) {
        $geneid = $1;
    } else {
        die "No gene id!\n";
    }

    # Register the gene even when this row carries no biotype, so that every
    # gene_id in the file appears in the output.
    $biotype_of{$geneid} = undef unless exists($biotype_of{$geneid});

    # When several rows of a gene carry a biotype, the last one read wins.
    if ($line =~ /gene_biotype "(.+?)"/) {
        $biotype_of{$geneid} = $1;
    }
}
close($ifh);

foreach my $gene (sort(keys(%biotype_of))) {
    # Genes without a biotype get an empty second column, without an
    # "uninitialized value" warning.
    my $biotype = defined($biotype_of{$gene}) ? $biotype_of{$gene} : "";
    print $gene."\t".$biotype."\n";
}
# Parse_GTF_splicing.pl
#
# For every gene in a GTF file print "gene_id n_transcripts fraction", where
# fraction is the share of the gene's summed exon length that lies in exons
# used by more than one transcript. It is 0 for single-transcript genes.
#
# Usage: perl Parse_GTF_splicing.pl [annotation.gtf]
#   With no argument, the original hard-coded annotation path is used.
use strict;
use warnings;

my $default_gtf = "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf";
my $gtf = @ARGV ? $ARGV[0] : $default_gtf;

# Now read in genome annotations
my %gene2trans      = ();   # gene_id -> { transcript_id => 1 }
my %gene2exon       = ();   # gene_id -> { exon key => 1 }
my %exon2size       = ();   # exon key -> length in bases
my %exon2transcript = ();   # exon key -> { transcript_id => 1 }

open(my $ifh, '<', $gtf) or die "Cannot open GTF file '$gtf': $!\n";
while (my $line = <$ifh>) {
    next if $line =~ /^#/;      # GTF header/comment lines

    my $geneid;
    if ($line =~ /gene_id "(.+?)"/) {
        $geneid = $1;
    } else {
        die "No gene id!\n";
    }

    # Rows without a transcript_id keep the "ERROR" placeholder.
    my $transcriptID = "ERROR";
    if ($line =~ /transcript_id "(.+?)"/) {
        $transcriptID = $1;
        $gene2trans{$geneid}->{$transcriptID} = 1;
    }

    my @record = split(/\t/, $line);
    next if @record < 5;                        # no coordinates to work with
    next if $record[2] =~ /gene/i;              # gene-level row: nothing to add
    next unless $line =~ /exon_id "(.+?)"/;     # only rows describing an exon

    # Exons are keyed by gene and coordinates, so an exon reused by several
    # transcripts collapses into one entry.
    my $exonID = "$geneid $record[3] $record[4]";

    # GTF coordinates are 1-based and inclusive, hence the +1. abs() covers
    # rows whose start and end are swapped.
    $exon2size{$exonID} = abs($record[4] - $record[3]) + 1;
    $exon2transcript{$exonID}->{$transcriptID} = 1;
    $gene2exon{$geneid}->{$exonID} = 1;
}
close($ifh);

# Genes are sorted so repeated runs give identical output.
foreach my $g (sort(keys(%gene2trans))) {
    my $total_trans = scalar(keys(%{$gene2trans{$g}}));
    my $Perc_diff = 0;
    if ($total_trans > 1) {
        my $total_size = 0;
        my $var_size = 0;
        my $exons = exists($gene2exon{$g}) ? $gene2exon{$g} : {};
        foreach my $e (keys(%{$exons})) {
            $total_size += $exon2size{$e};
            if (scalar(keys(%{$exon2transcript{$e}})) > 1) {
                $var_size += $exon2size{$e};
            }
        }
        # A gene with transcripts but no exon rows would otherwise divide by 0.
        $Perc_diff = $var_size/$total_size if $total_size > 0;
    }
    print "$g $total_trans $Perc_diff\n";
}
$var_size = 0; 60 | foreach my $e (keys(%{$gene2exon{$g}})) { 61 | $total_size += $exon2size{$e}; 62 | if (scalar(keys(%{$exon2transcript{$e}})) > 1) { 63 | $var_size += $exon2size{$e}; 64 | } 65 | } 66 | $Perc_diff = $var_size/$total_size; 67 | } 68 | print "$g $total_trans $Perc_diff\n"; 69 | } 70 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This is a collection of scripts I use (or have used in the past) to process scRNASeq data. They are free to use by anyone else for any purpose, but come with no assurances or guarantees of correctness or functionality. The general workflow is as follows: 2 | 3 | 0 : Create the appropriate genome for the dataset, and obtain the read files & initial QC 4 | - Building mapping indexes generally requires ~30Gb of memory for a mouse-sized genome 5 | 1 : Split the files by well (cell), Trim reads as appropriate based on QC 6 | 2 : Map the reads to the genome 7 | 3 : Clean up mapping output & remove duplicates 8 | 4 : Mapping QC 9 | 5 : Quantify expression 10 | 6 : Assemble expression matrix 11 | 12 | Finished Pipelines: 13 | 00_Kallisto_For_SmartSeq.readme = Smartseq2 + Kallisto (no UMIs) 14 | 15 | 16 | Brief Descriptions of Useful files: 17 | 0_Extract_barcodes_from_BAM.sh : open the first line of each BAM file and find the barcode (tagged with BC:) - for matching up metadata. 18 | 19 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Integrate with ESAT: End Sequencing Analysis Toolkit 2 | Improve Transcriptome Extraction to reduce duplicate sequences. 3 | --------------------------------------------------------------------------------