├── 0.1_kallisto_extract_transcripts.sh
├── 0.2_bowtie_build_genome.sh
├── 0.3_Salmon_build_index.sh
├── 00_Add_to_Reference.readme
├── 00_Generate_FastQs.readme
├── 00_Kallisto_For_SmartSeq.readme
├── 00_LIST_OF_BSUB_COMMANDS.sh
├── 00_Steps
├── 0_Anno_Extract_Transcriptome.pl
├── 0_BAM2FastQ.sh
├── 0_BuildGenome.sh
├── 0_CRAM2BAM.sh
├── 0_Check_Barcodes.pl
├── 0_Convert_CRAM_to_BAM.sh
├── 0_Determine_Barcodes.pl
├── 0_Download_Files_from_Dropbox.pl
├── 0_Extract_Metadata_from_Bam.sh
├── 0_FASTQC.sh
├── 0_FASTQC_Streaming.sh
├── 0_FASTQC_limits.txt
├── 0_Flexible_Convert_Dir_CRAM_to_BAM.sh
├── 0_GBK2FASTA.pl
├── 0_Gather_Summary_Statistics.pl
├── 0_Get_Data_from_iRODS.sh
├── 0_Make_ERCC_fasta_and_gtf.pl
├── 0_Merge_FASTQs.sh
├── 0_My_Extract_Transcriptome.pl
├── 0_Process_GBK.pl
├── 0_custom_undo_demultiplexing.pl
├── 0_make_transcriptome.sh
├── 1.5_DO_Trim_Reads.sh
├── 1.5_Trim_Reads_Paired.sh
├── 1.5_Trim_UMI.pl
├── 1.6_Summarizing_Trimming.pl
├── 1_BreakDown_Files_wrapper.sh
├── 1_BreakDown_PairedEnds.pl
├── 1_BreakDown_PairedEnds_Custom_Wafergen.pl
├── 1_Breakdown_UMI_read_pairs.pl
├── 1_DO_BreakDown_Files.sh
├── 1_Flexible_FullTranscript_Demultiplexing.pl
├── 1_Flexible_UMI_Demultiplexing.pl
├── 2-5.1_DO_kallisto_quant.sh
├── 2-5.1_kallisto_quant.sh
├── 2-5.2_DO_Salmon_quant.sh
├── 2-5.2_Salmon_quant.sh
├── 2-5_DO_RSEM.sh
├── 2-5_STAR-RSEM.sh
├── 2-5_bowtie2-RSEM.sh
├── 2.2_DO_MapReads_Tophat.sh
├── 2.2_MapReads_Tophat.sh
├── 2_DO_MapReadsFile.sh
├── 2_DO_MapReadsFile_singleend.sh
├── 2_MapReadsFile.sh
├── 2_MapReadsFile_Transcriptome.sh
├── 2_MapReadsFile_singleend.sh
├── 2_STAR_Parameters.txt
├── 3_CLEANUP_MapReadFiles.sh
├── 3_Compile_Mapping_Statistics.pl
├── 3_Compile_UMI_Statistics.pl
├── 3_DO_UmiDedup.sh
├── 3_SAMtools_sort_wrapper.sh
├── 3_SortBAMs.pl
├── 3_UmiDedup.sh
├── 3_merge_dedup_MappedReads.sh
├── 4_Convert_GTF2BED_customized_for_Ensembl.pl
├── 4_DO_RSeQC_Multiple.sh
├── 4_MergeBAMs.pl
├── 4_Process_RSEQC_output.pl
├── 4_RSeQC_Multiple.sh
├── 5.0_Summarize_Known_Transcriptome.pl
├── 5_Cufflinks_wrapper.sh
├── 5_Cuffmerge_wrapper.sh
├── 5_DO_Cufflinks.sh
├── 5_DO_Cufflinks_denovo_Transcripts.sh
├── 5_DO_Cuffmerge.sh
├── 5_DO_Quantification_X2.sh
├── 5_DO_featureCounts.sh
├── 5_DO_featureCounts_locally.sh
├── 5_Fix_Transcriptome_for_featureCounts.pl
├── 5_RSEM.sh
├── 5_RSEM_build_refrence.sh
├── 5_Summarize_Filter_Merged_Transcriptome.pl
├── 5_TidyCufflinks.pl
├── 5_featureCounts_wrapper.sh
├── 6.1_Get_Expression_Kallisto.pl
├── 6_Get_Construct_Expression_Cufflinks.pl
├── 6_Get_Cufflinks_Gene_Level_Expression.pl
├── 6_Get_Expression_featureCounts.pl
├── 6_Get_Kallisto.pl
├── 6_Get_RSEM_Expression.pl
├── 6_Get_Salmon_Expression.pl
├── 99_Check_Barcodes.pl
├── 99_Check_RSEM_Output.pl
├── 99_Check_Results.pl
├── 99_NotesForImprovement
├── 99_get_order_chr_in_SAM.pl
├── ERCC_Controls
    ├── ERCC_Controls.fa
    ├── ERCC_Controls.gtf
    ├── ERCC_Controls_Analysis.txt
    ├── ERCC_Controls_Annotation.txt
    ├── ERCC_Controls_README.txt
    ├── Make_FASTA_GTF_from_Annotation.pl
    └── Note
├── Extract_PlateID_and_WellID_from_headers.pl
├── Kallisto_Build_Index.sh
├── Kallisto_Make_ExpMat.pl
├── Kallisto_Quantification_Wrapper.sh
├── Parse_GTF_biotype.pl
├── Parse_GTF_splicing.pl
├── README
├── TODO
└── software
    └── dedup_umi.py


/0.1_kallisto_extract_transcripts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Extract a transcriptome fasta from a genome fasta + annotation gtf with
 3 | # gffread, then build a kallisto index from it.
 4 | # Takes no command-line arguments; all paths are set in the variables below.
 5 | 
 6 | # Abort on the first failing command or unset variable, so that a failed
 7 | # gffread run never leads to indexing a missing or stale Transcripts.fasta.
 8 | set -euo pipefail
 9 | 
10 | # Raw genome fasta and annotation gtf
11 | FA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
12 | GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
13 | 
14 | # Location for output files
15 | OUTDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES
16 | 
17 | # Locations of required software
18 | GFFREAD=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread
19 | KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto
20 | 
21 | # Extract transcriptome fasta using gffread (quoted so paths with spaces work)
22 | "$GFFREAD" "$GTF" -g "$FA" -w "$OUTDIR/Transcripts.fasta"
23 | 
24 | # Index the extracted transcriptome
25 | "$KALLISTO" index -i "$OUTDIR/kallisto_index.idx" "$OUTDIR/Transcripts.fasta"
26 | 


--------------------------------------------------------------------------------
/0.2_bowtie_build_genome.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Prints the bowtie2-build command line for the mouse GRCm38 primary assembly.
 3 | # The command is only written to stdout; nothing is executed here.
 4 | # No command-line arguments are read; all paths are hard-coded below.
 5 | 
 6 | # Genome fasta given to bowtie2-build as its input
 7 | GENOME_FASTA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
 8 | # Output location, passed as the last argument of the command
 9 | INDEX_OUT=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/bowtie2_build
10 | # bowtie2-build executable
11 | BOWTIE_BUILD=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bowtie2-2.2.6/bowtie2-build
12 | 
13 | # Emit the full command line (fixed seed 10101) as a single line of text
14 | printf '%s\n' "$BOWTIE_BUILD --seed=10101 $GENOME_FASTA $INDEX_OUT"
15 | 


--------------------------------------------------------------------------------
/0.3_Salmon_build_index.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Extract a transcriptome fasta from a genome fasta + annotation gtf with
 3 | # gffread, then build a Salmon index from it.
 4 | # Takes no command-line arguments; all paths are set in the variables below.
 5 | 
 6 | # Abort on the first failing command or unset variable, so that a failed
 7 | # gffread run never leads to indexing a missing or stale Transcripts.fasta.
 8 | set -euo pipefail
 9 | 
10 | # Raw genome fasta and annotation gtf files
11 | FA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
12 | GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
13 | 
14 | # Location for output
15 | OUTDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES
16 | 
17 | # Location of required software
18 | GFFREAD=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread
19 | SALMON=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Salmon-0.7.2_linux_x86_64/bin/salmon
20 | 
21 | # Extract transcriptome fasta (quoted so paths with spaces work)
22 | "$GFFREAD" "$GTF" -g "$FA" -w "$OUTDIR/Transcripts.fasta"
23 | 
24 | # Build the index with a single thread (-p 1) and the --perfectHash option
25 | "$SALMON" index -i "$OUTDIR/salmon_index" -t "$OUTDIR/Transcripts.fasta" --perfectHash -p 1
26 | 


--------------------------------------------------------------------------------
/00_Add_to_Reference.readme:
--------------------------------------------------------------------------------
 1 | This provides instructions for adding extra sequences to a reference genome 
 2 | prior to mapping your reads. Reasons for doing this would be:
 3 | 	1) if your cell line contains inserted genetic constructs, 
 4 | 		e.g. for CRISPR, or for fluorescently labelling your cells. 
 5 | 	2) if you've added spike-in transcripts to your sample.
 6 | 
 7 | 1) This will require manually creating the GTF annotations for your 
 8 | 	particular construct. If you have a GenBank file (.gbk) you can use:
 9 | 	0_GBK2FASTA.pl
10 | 	to extract the raw sequence of the construct to append to the genome.
11 | 
12 | 2) If using the ERCC spike in, you must download the sequences from the 
13 | 	manufacturer, i.e. the "ERCC Controls Annotation" file from: 
14 | 	https://www.thermofisher.com/order/catalog/product/4456740
15 | 	then run:
16 | 	0_Make_ERCC_fasta_and_gtf.pl ERCC_Controls_Annotation.txt
17 | 	this will create a fasta and gtf file for the ERCCs in your working 
18 | 	directory which you can append to the reference genome.
19 | 
20 | Append your extra gtf and fasta files to the existing reference fasta and gtf using "cat" :
21 | cat ERCC_Controls.fa >> ref.fa
22 | cat ERCC_Controls.gtf >> ref.gtf
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/00_Generate_FastQs.readme:
--------------------------------------------------------------------------------
 1 | This file contains instructions for converting a variety of file-types to
 2 | fastq files. This is necessary before running any of the quantification 
 3 | pipelines found here.
 4 | 
 5 | There are many different ways you could receive data from a sequencing 
 6 | facility and not all of them will be covered here. These scripts will
 7 | only cover the different formats I've had to deal with so far.
 8 | 
 9 | Software Requirements:
10 | samtools (>=1.3.1)
11 | bedtools2
12 | 
13 | 
14 | Option 1 : CRAM/BAM files
15 | 	First inspect your CRAM/BAM files using:
16 | 
17 | 	samtools view -h file1.cram | less
18 | 	
19 | 	Among the header lines (start with "@" symbol) you should find 
20 | 	some information identifying the data as belonging to your study
21 | 	along with any processing that has been done (e.g. how it was 
22 | 	mapped to the genome and which genome it was mapped to)
23 | 
24 | 	Once you scroll down to reads (press <enter>/<space> to scroll)
25 | 	you can check whether they have been mapped or not. Unmapped 
26 | 	reads will have the second entry of each row be either 77 or 141.
27 | 
28 | 	Usually BAM/CRAM files will already be demultiplexed (one file per 
29 | 	cell) and mapped to the appropriate genome. In which case, you may
30 | 	choose to skip the mapping step yourself and simply count your
31 | 	already mapped reads (see : 00_STAR_For_SmartSeq.readme). In which
32 | 	case if you have BAM files you are done. If you have CRAM files you
33 | 	need to convert them to BAM files:
34 | 
35 | 	samtools view -b -h file.cram -o file.bam
36 | 
37 | 	NOTE: Converting mapped CRAM files to BAM files will store the 
38 | 	entire reference genome in your cache (which might be bad). To
39 | 	specify a new cache location use:
40 | 
41 | 	export REF_CACHE=<full path to new cache location>
42 | 
43 | 	If you have a large number of files to convert you may want to 
44 | 	loop over them or submit them to a cluster to run. E.g.
45 | 
46 | 	bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -o cram2bam.%J.%I 0_CRAM2BAM.sh cram_dir bam_dir work_dir
47 | 	
48 | 	or
49 | 
50 | 	CRAM_files=$CRAM_dir/*.cram
51 | 	for FILE in $CRAM_files
52 | 	do
53 | 	  0_CRAM2BAM.sh $FILE bam_dir work_dir
54 | 	done
55 | 
56 | 	If for whatever reason you need to remap your reads from BAM/CRAM files
57 | 	then you'll need to convert the BAM files to FastQ files, using :
58 | 	
59 | 	bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -o bam2fastq.%J.%I 0_BAM2FastQ.sh bam_dir fastq_dir work_dir "P"
60 | 
61 | 	or
62 | 
63 | 	BAM_files=$BAM_dir/*.bam
64 |         for FILE in $BAM_files
65 |         do
66 |           0_BAM2FastQ.sh $FILE fastq_dir work_dir "P"
67 |         done
68 | 
69 | 	This script assumes reads are unpaired unless the fourth argument "P" is provided. 
70 | 
71 | 	To check if your reads are paired simply run this command:
72 | 
73 | 	samtools view -f 1 my_file.bam | wc -l 
74 | 
75 | 	This will count the number of paired reads in your bam file.
76 | 
77 | Option 2 : Demultiplexing large FastQs
78 | 	If you get data where reads from multiple cells are mixed together then
79 | 	you will need to demultiplex the data so you have one pair of fastq files
80 | 	per cell. This can be done with :
81 | 
82 | 	perl 1_Flexible_FullTranscript_Demultiplexing.pl read1.fq read2.fq b_pos b_len index mismatch prefix
83 | 
84 | 	Running the script without any arguments will bring up the help for 
85 | 	what each argument should be. Note this script assumes you have a 
86 | 	relatively small number of possible cell-barcodes (i.e. < 5,000).
87 | 	It is not appropriate for droplet or microwell based experiments.
88 | 
89 | 	Cell barcodes are assumed to be present at either the start or end of 
90 | 	read1. And they will be removed from the sequences after demultiplexing.
91 | 
92 | 	If you have multiple files of reads per cell (e.g. if you have multiple 
93 | 	lanes of sequencing) then these can be combined with the "cat" function:
94 | 	
95 | 	cat file1_1.fq file2_1.fq file3_1.fq > all_1.fq
96 | 
97 | 	It is easiest to combine all reads into one file and then demultiplex.
98 | 


--------------------------------------------------------------------------------
/00_Kallisto_For_SmartSeq.readme:
--------------------------------------------------------------------------------
 1 | This outlines the scripts, software and steps for processing a 
 2 | SmartSeq[2]-based RNASeq experiment with Kallisto. It assumes 
 3 | you are starting with one pair of FastQ files per cell, and
 4 | takes you through to creating a Single-Cell Experiment object.
 5 | 
 6 | See: 00_Generate_FastQs.readme for instructions on creating one
 7 | pair of FastQs per cell.
 8 | 
 9 | This workflow assumes you are NOT using Unique Molecular Identifiers
10 | 
11 | Software Requirements:
12 | fastqc
13 | trimmomatic
14 | gffread
15 | kallisto
16 | perl
17 | 
18 | All scripts contain variables among the top few lines for hard-coding specific
19 | versions of the software if it is not in your path. 
20 | 
21 | Directory Set-up:
22 | (A) Create one directory with all the FastQ files for one experiment.
23 | (B) Create a second directory for kallisto output files.
24 | (C) Create a third directory for temporary files.
25 | 
26 | SAVE A BACK-UP COPY OF YOUR RAW DATA BEFORE RUNNING
27 | 
28 | Steps
29 | 
30 | 1 : Build the reference transcriptome and kallisto index
31 | 
32 | Download the appropriate reference fasta (.fa) and annotation (.gtf) files
33 | (https://www.ensembl.org/info/data/ftp/index.html)
34 | 
35 | Add any custom sequences you need for your experiment.
36 | 
37 | See: 00_Add_to_Reference.readme for instructions on adding custom sequences 
38 |      such as spike-ins to the reference.
39 | 
40 | Run : "Kallisto_Build_Index.sh ref.fa ref.gtf outdir"
41 | 
42 | 2 : Read Quality Control with FASTQC
43 | 
44 | Download my FASTQC limit file (0_FASTQC_limits.txt) 
45 | 
46 | Run : 
47 | 0_FASTQC_Streaming.sh fastq_dir "*_1.fq" 0_FASTQC_limits.txt "Read1" outdir
48 | 0_FASTQC_Streaming.sh fastq_dir "*_2.fq" 0_FASTQC_limits.txt "Read2" outdir
49 | 
50 | If your data was sequenced on multiple lanes you may want to run 
51 | FASTQC on each lane separately. 
52 | 
53 | 3 : Read Trimming (WARNING: replaces original FastQs)
54 | Either submit 1.5_Trim_Reads_Paired.sh as a job array:
55 | 
56 | NCELLS=384
57 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o trim.out.%J.%I 1.5_Trim_Reads_Paired.sh $FQ_dir NULL $work_dir NexteraPE-PE.fa 1000
58 | 
59 | or loop over all pairs of fastq files :
60 | 
61 | NCELLS=384
62 | FQ_files=($FQ_dir/*.fq.gz)
63 | for CELL in $(seq 1 $NCELLS)
64 | do
65 |   FILE_INDEX=$((($CELL-1)*2))
66 |   FILE1=${FQ_files[$FILE_INDEX]}
67 |   FILE2=${FQ_files[$FILE_INDEX+1]}
68 |   bsub -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o trim.out.%J 1.5_Trim_Reads_Paired.sh $FILE1 $FILE2 $work_dir NexteraPE-PE.fa 1000
69 | done
70 | 
71 | 4 : Quantification with kallisto
72 | Either submit Kallisto_Quantification_Wrapper.sh as a job array:
73 | 
74 | NCELLS=384
75 | bsub -J"arrayjob[1-$NCELLS]%50" -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o kallisto.out.%J.%I Kallisto_Quantification_Wrapper.sh $FQ_dir NULL kallisto_index.idx 2 outdir
76 | 
77 | or loop over all pairs of fastq files :
78 | 
79 | NCELLS=384
80 | FQ_files=($FQ_dir/*.fq.gz)
81 | for CELL in $(seq 1 $NCELLS)
82 | do
83 |   FILE_INDEX=$((($CELL-1)*2))
84 |   FILE1=${FQ_files[$FILE_INDEX]}
85 |   FILE2=${FQ_files[$FILE_INDEX+1]}
86 |   bsub -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o kallisto.out.%J.%I Kallisto_Quantification_Wrapper.sh $FILE1 $FILE2 kallisto_index.idx 2 outdir
87 | done
88 | 
89 | 5 : Combine results with perl script
90 | "Kallisto_Make_ExpMat.pl kallisto_dir ref.gtf [gene|trans] out_prefix"
91 | 
92 | 


--------------------------------------------------------------------------------
/00_Steps:
--------------------------------------------------------------------------------
 1 | Setup:
 2 | 0_BuildGenome.sh
 3 | (1) Get appropriate genome & annotations. 
 4 | 	-> from local ensembl mirror? : mysql -u anonymous -h ensembldb.internal.sanger.ac.uk
 5 | 	-> simpler to just get from ftp site 
 6 | (2) get these in the right format and put on lustre and stripe them. (/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES) 
 7 | 	-> stripe at the level of directory (see wiki):
 8 | 		lfs setstripe <directory> -c -1 (number not letter) stripe across all **This is what I have done for this directory
 9 | 		lfs setstripe <directory> -c 1 (number not letter) stripe across single OST
10 | 
11 | Initial QC : 
12 | 0_FASTQC_Streaming.sh : FASTQC run on each end of each lane efficiently by streaming the data rather than combining the files then running on the bigger files.
13 | 
14 | Mapping Reads: Tophat or STAR? -> STAR seems to be better in all ways except for the number of people already using it, it is both faster and maps more reads successfully. It is also possible to make the output compatible with Cufflinks
15 | (1) Copy data in processable chunks to /lustre/scratch108/compgen/team218  (1_break..., 1_Break...)
16 | 	-> Q: What is the appropriate size chunk? (run in a few hours)
17 | 	-> Should sort by lane & cellID while breaking down. Use 0_Gather_Summary_Statistics.pl as a basis
18 | 	-> need to maintain ordering for paired-end reads.
19 | (2) submit jobs to cluster -> use job array, output into "output.%J.%I" or something like that
20 | 	-> probably want to start with 40 jobs at a time until know how to load genome data efficiently
21 | (3) check results succeeded, combine results as necessary & compress (compressing while still on lustre is faster)
22 | (4) move combined-compressed output back to /nfs/team218/
23 | (5) remove input chunks & deal with logfiles from lustre and repeat with next set
24 | --> Question? One or two passes of STAR? -> I think only one pass because we aren't that interested in alternative splicing and since relatively low read depth for each cell rare alternative splice forms are unlikely to be quantifiable in many samples. Thus the costs of doing the second pass - filtering novel splice junctions then re-building the genome to include them as annotations - outweigh the benefits of doing so - allowing more reads to be mapped to novel splice junctions.
25 | --> Question? what state does data have to be in for cufflinks/express? -> not going to use express since it doesn't do novel transcript assembly so would have to run cufflinks anyway. Cufflinks requires: Sorted SAM/BAM with special strand field and no soft clipping, the special strand field can be added with the appropriate parameter/argument in STAR calls, and I have an awk command from : https://groups.google.com/forum/#!searchin/rna-star/cufflinks/rna-star/Ta1Z2u4bPfc/8nZ2iMkxSyMJ to remove soft clipping.
26 | (6) Sort SAM/BAM
27 | (7) Remove duplicates (can be done with STAR once alignments have been combined & sorted) -> easier to do it with samtools (4_MergeBAMs.pl)
28 | 
29 | Post-Mapping QC:
30 | 3_Compile_Mapping_Statistics.pl Gather mapping statistics from STAR log files
31 | 4_DO_RSeQC_Multiple.sh Various QC things using RSeQC -> general statistics, rRNA content, GC, Gene Body coverage, splice junction saturation
32 | 
33 | 
34 | 
35 | Calculating Abundance  cufflinks vs eXpress: eXpress is much faster but unclear which is more accurate. I'm not sure whether eXpress does de novo transcript assembly though..... need to look at documentations for eXpress. eXpress does not do de novo transcript assembly. Cufflinks does de novo transcript assembly and produces 95% CIs for FPKM, therefore forget eXpress I'm only going to use Cufflinks (assuming it doesn't take forever).
36 | (1) Build novel transcripts/gene-models using all data. -> Cufflinks!
37 | (2) Calculate abundance for each cell by mapping to the combination of novel transcripts & existing annotations. 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/0_Anno_Extract_Transcriptome.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | # Writes one fasta record per GTF gene feature to Transcripts.fa and the re-based gene/exon/UTR annotation lines to Transcripts.gtf (both in the current directory).
  4 | if (@ARGV < 2) {die "0_My_Extract_Transcriptome.pl .gtf .fa\n";} # arguments: GTF path, then genome fasta path
  5 | 
  6 | my %Ensg2Seq = (); # gene id -> extracted sequence
  7 | my %Ensg2Gtf = (); # gene id -> list of stored GTF lines (gene, exon and UTR records)
  8 | my @Ensgs = (); # gene ids found on the chromosome currently being processed
  9 | my $flank = 10; # number of bases used for N-padding, for the substr start offset and for the coordinate shift
 10 | 
 11 | my $nascent = 1; # assigned here but not read anywhere in this script
 12 | 
 13 | open (my $fa, $ARGV[1]) or die $!; # input: genome fasta
 14 | open (my $fa_out, ">","Transcripts.fa") or die $!; # output: per-gene sequences
 15 | open (my $gtf_out, ">","Transcripts.gtf") or die $!; # output: per-gene annotations
 16 | my $chr = "None"; # name of the sequence being accumulated; stays None until the first fasta header
 17 | my $chr_seq = ""; # sequence accumulated so far for $chr
 18 | my $COUNT = 0; # only ever reset to 0 in this script; never read
 19 | while (<$fa>) { # stream the genome fasta line by line
 20 | 	if($_ =~ /^#/) {next;} # skip headers
 21 | 	if ($_ =~ /^\>/) {
 22 | 		# New Chr: a fasta header means the previously accumulated sequence ($chr) is complete
 23 | 		my @line = split(/\s+/);
 24 | 		my $newchr = $line[0]; $newchr =~ s/>//g; # sequence name = first whitespace-delimited token without the leading marker
 25 | 		if ($chr eq "None") {
 26 | 			$chr = $newchr;
 27 | 			next;
 28 | 		}  # very first header: nothing has been accumulated yet
 29 | 		# Output gene sequences for this chromosome: the whole GTF is re-read once per completed chromosome
 30 | 		open (my $gtf, $ARGV[0]) or die $!;
 31 | 		my $gtf_line = "";
 32 | 		while ($gtf_line = <$gtf>) {
 33 | 			# Extract sequence for each gene on this Chr.
 34 | 			if ($gtf_line =~ /^#/) {next;} # ignore headers
 35 | 
 36 | 			my $geneid = "";
 37 | 			if ($gtf_line =~ /gene_id "(.+?)";/) {
 38 | 				$geneid = $1;
 39 | 			} else {
 40 | 				next;
 41 | 			} # get gene id; records without a gene_id attribute are skipped
 42 | 
 43 | 			# Get coordinates (tab-separated columns: 0 = sequence name, 2 = feature type, 3 = start, 4 = end)
 44 | 			my @record = split(/\t/, $gtf_line);
 45 | 			my $seq_chr = $record[0];
 46 | 			if ($seq_chr ne $chr) {next;} # only records on the chromosome being processed
 47 | 			my $seq_st = $record[3];
 48 | 			my $seq_end = $record[4];
 49 | 			if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} # unreachable: mismatching records were already skipped above
 50 | 			# Get sequence
 51 | 			if ($record[2] eq "gene") {
 52 | 				# Add null flanks as necessary
 53 | 				if ($seq_st-$flank < 0) {
 54 | 					$chr_seq = ('N' x $flank) . $chr_seq; # NOTE(review): this permanently shifts $chr_seq relative to GTF coordinates, yet only this gene has its start/end adjusted - confirm later genes on the chromosome are handled as intended
 55 | 					$seq_st = $seq_st+$flank;
 56 | 					$seq_end = $seq_end+$flank;
 57 | 				}
 58 | 				if ($seq_end+$flank > length($chr_seq)) {
 59 | 					$chr_seq = $chr_seq . ('N' x $flank);
 60 | 				}
 61 | 				$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank)); # substr offset is 0-based: start at seq_st-flank, length seq_end-seq_st+flank
 62 | 				push(@Ensgs, $geneid);
 63 | 			}
 64 | 			# Store Annotations (raw GTF lines, grouped per gene id)
 65 | 			if ($record[2] eq "exon" || $record[2] eq "UTR" || $record[2] eq "gene") {
 66 | 				push(@{$Ensg2Gtf{$geneid}}, $gtf_line);
 67 | 			}
 68 | 		}
 69 | 		close($gtf);
 70 | 		# Write output for all genes on this Chr
 71 | 		foreach my $ensg (@Ensgs) {
 72 | 			print $fa_out ">$ensg\n";
 73 | 			print $fa_out $Ensg2Seq{$ensg}."\n";
 74 | 			my $seq_length = length($Ensg2Seq{$ensg});
 75 | 			my $shift = -1; # -1 means the shift has not been set yet for this gene
 76 | 			foreach my $old_gtf (@{$Ensg2Gtf{$ensg}}) {
 77 | 				$old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/s; # relabel transcript_id, gene_id and gene_name with the gene id
 78 | 				$old_gtf =~ s/gene_id "(.+?)"/gene_id "$ensg"/s;
 79 | 				$old_gtf =~ s/gene_name "(.+?)"/gene_name "$ensg"/s;
 80 | 
 81 | 				my @record = split(/\t/, $old_gtf);
 82 | 				if($shift == -1 && $record[2] ne "gene") {die "ERROR: Requires first entry for each ensg to be \"gene\".\n";}
 83 | 				if ($shift == -1) {
 84 | 					$shift = $record[3]-$flank; # re-base so that the gene start maps to coordinate $flank
 85 | 				}
 86 | 				if (scalar(@record) < 5) {die "$old_gtf not enough entries\n";}
 87 | 
 88 | 				$record[0] = $ensg; # the gene itself becomes the sequence name
 89 | 				$record[3] = $record[3]-$shift;
 90 | 				$record[4] = $record[4]-$shift;
 91 | 				if ($record[4] > $seq_length) {
 92 | 					print STDERR "$chr $ensg $record[2] $record[3] $record[4], seq = $seq_length\n";
 93 | 					die "ERROR: annotation exceeds sequence length\n";
 94 | 				}
 95 | 				print $gtf_out join("\t",@record);
 96 | 			}
 97 | 		}
 98 | 		print "$chr $newchr\n"; # progress message on STDOUT: finished chromosome, next chromosome
 99 | 		$chr = $newchr;
100 | 		$chr_seq="";
101 | 		$COUNT=0;
102 | 		@Ensgs=(); # only the gene list is reset; %Ensg2Seq and %Ensg2Gtf keep entries from earlier chromosomes
103 | 	} else {
104 | 		# Read in chr sequence: append this line (newline removed) to the current chromosome
105 | 		chomp;
106 | 		$chr_seq = $chr_seq.$_;
107 | 	}
108 | }
109 | # Output last chromosome: the loop above only processes a chromosome when the next fasta header is seen,
110 | # so the final one is handled here with a copy of the same per-chromosome steps.
111 | {
112 | 			# Output gene sequences for this chromosome
113 | 			open (my $gtf, $ARGV[0]) or die $!;
114 | 			my $gtf_line = "";
115 | 			while ($gtf_line = <$gtf>) {
116 | 				# Extract sequence for each gene on this Chr.
117 | 				if ($gtf_line =~ /^#/) {next;} # ignore headers
118 | 
119 | 				my $geneid = "";
120 | 				if ($gtf_line =~ /gene_id "(.+?)";/) {
121 | 					$geneid = $1;
122 | 				} else {
123 | 					next;
124 | 				} # get gene id; records without a gene_id attribute are skipped
125 | 
126 | 				# Get coordinates (tab-separated columns: 0 = sequence name, 2 = feature type, 3 = start, 4 = end)
127 | 				my @record = split(/\t/, $gtf_line);
128 | 				my $seq_chr = $record[0];
129 | 				if ($seq_chr ne $chr) {next;} # only records on the chromosome being processed
130 | 				my $seq_st = $record[3];
131 | 				my $seq_end = $record[4];
132 | 				if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} # unreachable: mismatching records were already skipped above
133 | 				# Get sequence
134 | 				if ($record[2] eq "gene") {
135 | 					# Add null flanks as necessary
136 | 					if ($seq_st-$flank < 0) {
137 | 						$chr_seq = ('N' x $flank) . $chr_seq; # NOTE(review): permanently shifts $chr_seq relative to GTF coordinates for later genes - confirm intended
138 | 						$seq_st = $seq_st+$flank;
139 | 						$seq_end = $seq_end+$flank;
140 | 					}
141 | 					if ($seq_end+$flank > length($chr_seq)) {
142 | 						$chr_seq = $chr_seq . ('N' x $flank);
143 | 					}
144 | 					$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank)); # substr offset is 0-based: start at seq_st-flank, length seq_end-seq_st+flank
145 | 					push(@Ensgs, $geneid);
146 | 				}
147 | 				# Store Annotations (raw GTF lines, grouped per gene id)
148 | 				if ($record[2] eq "exon" || $record[2] eq "UTR" || $record[2] eq "gene") {
149 | 					push(@{$Ensg2Gtf{$geneid}}, $gtf_line);
150 | 				}
151 | 			}
152 | 			close($gtf);
153 | 			# Write output for all genes on this Chr
154 | 			foreach my $ensg (@Ensgs) {
155 | 				print $fa_out ">$ensg\n";
156 | 				print $fa_out $Ensg2Seq{$ensg}."\n";
157 | 				my $seq_length = length($Ensg2Seq{$ensg});
158 | 				my $shift = -1; # -1 means the shift has not been set yet for this gene
159 | 				foreach my $old_gtf (@{$Ensg2Gtf{$ensg}}) {
160 | 					$old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/s; # relabel transcript_id, gene_id and gene_name with the gene id
161 | 					$old_gtf =~ s/gene_id "(.+?)"/gene_id "$ensg"/s;
162 | 					$old_gtf =~ s/gene_name "(.+?)"/gene_name "$ensg"/s;
163 | 
164 | 					my @record = split(/\t/, $old_gtf);
165 | 					if($shift == -1 && $record[2] ne "gene") {die "ERROR: Requires first entry for each ensg to be \"gene\".\n";}
166 | 					if ($shift == -1) {
167 | 						$shift = $record[3]-$flank; # re-base so that the gene start maps to coordinate $flank
168 | 					}
169 | 
170 | 					$record[0] = $ensg; # the gene itself becomes the sequence name
171 | 					$record[3] = $record[3]-$shift;
172 | 					$record[4] = $record[4]-$shift;
173 | 					if ($record[4] > $seq_length) {die "ERROR: annotation exceeds sequence length\n";}
174 | 					print $gtf_out join("\t",@record);
175 | 				}
176 | 			}
177 | 			exit(); # the script ends here; the resets below and the close calls after this block are never reached
178 | 			$chr_seq="";
179 | 			$COUNT=0;
180 | 			@Ensgs=();
181 | 		}
182 | close($gtf_out);
183 | close($fa);
184 | 


--------------------------------------------------------------------------------
/0_BAM2FastQ.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Convert a BAM file into a pair of gzipped FastQ files
 3 | # (<name>_1.fq.gz and <name>_2.fq.gz). Assumes paired-end sequencing.
 4 | #
 5 | # Arguments:
 6 | #   $1 = BAM file, or a directory of BAM files when run as a job array
 7 | #   $2 = directory for the FastQ files
 8 | #   $3 = working directory for temporary BAMs (also exported as REF_CACHE)
 9 | 
10 | # Stop at the first failing step so later steps never run on missing input.
11 | set -euo pipefail
12 | 
13 | BAM_file=${1:-}
14 | OUT_dir=${2:-}
15 | WORK_dir=${3:-}
16 | 
17 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools
18 | BEDTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bedtools2/bin/bedtools
19 | 
20 | USAGE="Usage: 0_BAM2FastQ.sh bam_file out_dir work_dir
21 |   Assumes paired-end reads.
22 |   Arguments:
23 |     bam_file = BAM file or directory of BAM files if running in job array
24 |     out_dir = directory for FastQ files
25 |     work_dir = fast I/O location with space to store genome"
26 | 
27 | # All three arguments are required.
28 | if [ -z "$BAM_file" ] || [ -z "$OUT_dir" ] || [ -z "$WORK_dir" ] ; then
29 |   printf '%s\n' "$USAGE" >&2
30 |   exit 1
31 | fi
32 | 
33 | export REF_CACHE="$WORK_dir"
34 | 
35 | if [ ! -f "$SAMTOOLS" ] ; then
36 |   echo "$SAMTOOLS not available" >&2
37 |   exit 1
38 | fi
39 | 
40 | if [ ! -f "$BEDTOOLS" ] ; then
41 |   echo "$BEDTOOLS not available" >&2
42 |   exit 1
43 | fi
44 | 
45 | # Get BAM file. In a job array, bam_file is a directory and the 1-based
46 | # LSB_JOBINDEX selects one of its *.bam files. An unset or zero index is
47 | # treated as a plain single-file run.
48 | if [ "${LSB_JOBINDEX:-0}" -gt 0 ]; then
49 |   BAMS=("$BAM_file"/*.bam)
50 |   INDEX=$((LSB_JOBINDEX-1))
51 |   if [ "$INDEX" -ge "${#BAMS[@]}" ]; then
52 |     echo "Job index $LSB_JOBINDEX exceeds the number of BAM files in $BAM_file" >&2
53 |     exit 1
54 |   fi
55 |   FILE=${BAMS[$INDEX]}
56 | else
57 |   FILE=$BAM_file
58 | fi
59 | 
60 | if [ ! -f "$FILE" ] ; then
61 |   echo "$FILE is not a file" >&2
62 |   exit 1
63 | fi
64 | 
65 | NAME=$(basename "${FILE%.bam}") # remove path and .bam suffix
66 | 
67 | FASTQ1="${NAME}_1.fq"
68 | FASTQ2="${NAME}_2.fq"
69 | 
70 | # Temporary BAMs live in the working directory and are removed on any exit.
71 | TMP="$WORK_dir/Tmp$NAME.bam"
72 | TMP2="$WORK_dir/Tmp2_$NAME.bam"
73 | trap 'rm -f -- "$TMP" "$TMP2"' EXIT
74 | 
75 | # Write all reads to fastq: name-sort so mates are adjacent, drop secondary
76 | # alignments (flag 256), then split into the two read files.
77 | "$SAMTOOLS" sort -n "$FILE" -o "$TMP"
78 | "$SAMTOOLS" view -b -F 256 "$TMP" -o "$TMP2" # remove secondary alignments
79 | "$BEDTOOLS" bamtofastq -i "$TMP2" -fq "$OUT_dir/$FASTQ1" -fq2 "$OUT_dir/$FASTQ2"
80 | 
81 | gzip "$OUT_dir/$FASTQ1"
82 | gzip "$OUT_dir/$FASTQ2"
83 | 


--------------------------------------------------------------------------------
/0_BuildGenome.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #This should be flexible enough to get the commonly used genomes: human, mouse, fly, worm from ensembl, and have options to add genetic constructs that have been integrated into the system.
  3 | 
  4 | # Tallulah 07 April 2015 - added the option to just keep the GTF & Fasta files without running STAR by setting the number of threads to 0 (for getting the genome & annotations for Cufflinks later).
  5 | # Tallulah 31 March 2015 - updated to check all 5/6 arguments (which are required) have been set.
  6 | # Tallulah 26 Mar 2015 Not so obvious whether it is more efficient to get genomes from internal ensembl mirror or to download from ensembl ftp website? -> since only doing this once per organism/experiment ftp/rsync is probably fine?
  7 | # All bits tested but not all at once - Totally works now (13 Dec 2016)
  8 | 
  9 | # Arguments: 
 10 | #    $1 = working directory on /lustre/
 11 | #    $2 = striped genome directory on /lustre/
 12 | #    $3 = number of threads to run on, # if 0 does not run star
 13 | #    $4= readlength, 
 14 | #    $5 = organism [Hsap, Mmus, Dmel, Cele]; 
 15 | #    $6 = directory with constructs to be added (optional)
 16 | 
 17 | NUMTHREADS=$3
 18 | READLEN=$4
 19 | ORG=$5
 20 | ADDDIR=$6
 21 | ORGERR="please enter one of the following organism tags: Hsap, Mmus"
 22 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
 23 | LUSTRE=$1
 24 | OUTDIR=$2
 25 | ENSEMBL=rsync://ftp.ensembl.org/ensembl/pub/release-79 # the FASTA and the GTF both come from this release
 26 | 
 27 | if [ ! -f "$STAR" ] ; then
 28 |   echo "Sorry STAR not available "
 29 |   exit 1
 30 | fi
 31 | 
 32 | if [ -z "$LUSTRE" ] ; then
 33 |   echo "Please set a directory for temporary working files (ARG 1/6)"
 34 |   exit 1
 35 | fi
 36 | 
 37 | if [ -z "$OUTDIR" ] ; then
 38 |   echo "Please set a directory for output (ARG 2/6)"
 39 |   exit 1
 40 | fi
 41 | 
 42 | # NUMTHREADS and READLEN are used as integers further down, so empty or non-numeric values stop here.
 43 | case "$NUMTHREADS" in
 44 |   ''|*[!0-9]*)
 45 |     echo "Please set number of threads to use (ARG 3/6)"
 46 |     exit 1
 47 |     ;;
 48 | esac
 49 | 
 50 | case "$READLEN" in
 51 |   ''|*[!0-9]*)
 52 |     echo "Please set length of RNASeq reads (ARG 4/6)"
 53 |     exit 1
 54 |     ;;
 55 | esac
 56 | 
 57 | # --sjdbOverhang is read length - 1. A plain assignment of "$4-1" stores the text "<len>-1",
 58 | # which is never empty and never a number, so the subtraction is done arithmetically (10# = force base 10).
 59 | OVERHANG=$((10#$READLEN - 1))
 60 | 
 61 | if [ -z "$ORG" ] ; then
 62 |   echo "Sorry no organism to work with (ARG 5/6)"
 63 |   echo $ORGERR
 64 |   exit 1
 65 | fi
 66 | 
 67 | # Make directories for output/temporary working files if they don't already exist
 68 | if [ ! -d "$OUTDIR" ] ; then
 69 |   mkdir -p "$OUTDIR" || exit 1
 70 |   lfs setstripe "$OUTDIR" -c -1
 71 | fi
 72 | 
 73 | mkdir -p "$LUSTRE" || exit 1
 74 | 
 75 | # Step 1: Get genome & annotations from Ensembl and put on lustre
 76 | case "$ORG" in
 77 |   Hsap)
 78 |     FA_REMOTE=fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
 79 |     GTF_REMOTE=gtf/homo_sapiens/Homo_sapiens.GRCh38.79.gtf.gz
 80 |     ;;
 81 |   Mmus)
 82 |     FA_REMOTE=fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
 83 |     GTF_REMOTE=gtf/mus_musculus/Mus_musculus.GRCm38.79.gtf.gz
 84 |     ;;
 85 |   *)
 86 |     echo "$ORG not supported"
 87 |     echo $ORGERR
 88 |     exit 1
 89 |     ;;
 90 | esac
 91 | rsync -av "$ENSEMBL/$FA_REMOTE" "$LUSTRE" || exit 1
 92 | rsync -av "$ENSEMBL/$GTF_REMOTE" "$LUSTRE" || exit 1
 93 | FA=$LUSTRE/${FA_REMOTE##*/}
 94 | GTF=$LUSTRE/${GTF_REMOTE##*/}
 95 | 
 96 | # Step 2: Add genetic constructs from a directory
 97 | # Question? Should we add the genetic construct as an additional chromosome or try to place it correctly in the genome?
 98 | 		# placing it in the genome would affect the locations of all other elements on that chromosome
 99 | 		# expression from the construct should be independent of that of the surrounding genome so don't expect any 
100 | 		# reads to span to neighbouring genes -> depends on the construct 
101 | 		# what about things added to the tail of a native locus?  
102 | 		# Couldn't this be dealt with by adding the full new locus as a separate thing and somehow masking the native locus?
103 | 
104 | 	# OK I think adding constructs as separate contigs is the best approach!
105 | 		# If I assume the constructs have been pre-formatted to be fasta & gtfs, 
106 | 		# can the fasta just be added to the stock & gtf just concatenated to the current one? 
107 | 		# -> yes as long as names are consistent across the files and not the same as any other chromosome/contig
108 | 
109 | # -f: the unpacked files of an earlier run are still in place (Step 4 is commented out). Without -f
110 | # gunzip will not overwrite them, and the constructs below would be appended to the old copy again.
111 | gunzip -f "$FA" || exit 1
112 | FA=${FA%.*}
113 | gunzip -f "$GTF" || exit 1
114 | GTF=${GTF%.*}
115 | if [ -n "$ADDDIR" ] ; then
116 |   echo "Adding files from $ADDDIR"
117 |   cat "$ADDDIR"/*.fa >> "$FA"
118 |   cat "$ADDDIR"/*.gtf >> "$GTF"
119 | fi
120 | 
121 | if [ "$NUMTHREADS" -gt 0 ] ; then
122 |   # Step 3: Run STAR on the finished genome & put output in striped directory.
123 |   # The index is written to $OUTDIR (ARG 2), i.e. the directory created and striped above.
124 |   "$STAR" --runThreadN "$NUMTHREADS" --runMode genomeGenerate --genomeDir "$OUTDIR" --genomeFastaFiles "$FA" --sjdbGTFfile "$GTF" --sjdbOverhang "$OVERHANG" --limitGenomeGenerateRAM 31000000000
125 | 
126 |   # Step 4: delete the Ensembl-derived files
127 | #  rm $FA
128 | #  rm $GTF
129 | fi
130 | 


--------------------------------------------------------------------------------
/0_CRAM2BAM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Convert one CRAM file to BAM with samtools.
 3 | #   $1 = CRAM file, or a directory of *.cram files when run as an LSF job array (LSB_JOBINDEX >= 1)
 4 | #   $2 = directory that receives <name>.bam
 5 | #   $3 = work directory; exported as REF_CACHE and used for a temporary copy of the CRAM
 6 | CRAM_file=$1
 7 | BAM_dir=$2
 8 | WORK_dir=$3 # fast I/O location with space to store genome & temporary files
 9 | 
10 | export REF_CACHE=$WORK_dir
11 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools
12 | 
13 | # Checks
14 | USAGE="Usage: 0_CRAM2BAM.sh cram_file bam_dir work_dir\n
15 | 	\tArguments:\n
16 | 		\tcram_file = CRAM file or directory of CRAM files if running in job array\n
17 | 		\tbam_dir = directory to be filled with BAM files\n
18 | 		\twork_dir = fast I/O location with space to store genome\n"
19 | 
20 | if [ -z "$CRAM_file" ] || [ -z "$BAM_dir" ] || [ -z "$WORK_dir" ] ; then
21 |   echo -e $USAGE # left unquoted on purpose: word splitting drops the layout whitespace, \n and \t do the formatting
22 |   exit 1
23 | fi
24 | 
25 | if [ ! -f "$SAMTOOLS" ] ; then
26 |   echo "$SAMTOOLS not available"
27 |   exit 1
28 | fi
29 | 
30 | # Get the CRAM file for this job.
31 | # Array mode only for an index >= 1: LSF sets LSB_JOBINDEX to 0 for jobs that are not array elements.
32 | if [ "${LSB_JOBINDEX:-0}" -gt 0 ] 2>/dev/null ; then
33 |   CRAMS=("$CRAM_file"/*.cram)
34 |   INDEX=$((LSB_JOBINDEX-1))
35 |   FILE=${CRAMS[$INDEX]}
36 | else 
37 |   FILE=$CRAM_file
38 | fi
39 | 
40 | # An index past the end of the list, or a glob that matched nothing, does not name a real file
41 | if [ ! -f "$FILE" ] ; then
42 |   echo "CRAM file '$FILE' not found"
43 |   exit 1
44 | fi
45 | 
46 | NAME=$(basename "${FILE%.cram}") #remove path and .cram suffix
47 | cp "$FILE" "$WORK_dir/$NAME.cram" || exit 1
48 | "$SAMTOOLS" view -b -h "$WORK_dir/$NAME.cram" -o "$BAM_dir/$NAME.bam"
49 | STATUS=$? # keep the conversion result; the rm below would otherwise decide the exit code
50 | rm "$WORK_dir/$NAME.cram"
51 | exit $STATUS
52 | 


--------------------------------------------------------------------------------
/0_Check_Barcodes.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Tally the 10-base tag that follows the 11-base cell ID at the start of each read in a FASTQ
 5 | # file and print "tag count" lines, least frequent first.
 6 | # Only INPUT1 is read; INPUT2 and ProjectName are still accepted on the command line but unused.
 7 | if (@ARGV < 1) { die "Usage: 0_Check_Barcodes.pl INPUT1 [INPUT2 ProjectName]\n";}
 8 | my $infile1 = $ARGV[0];
 9 | 
10 | my %Barcodes = ();
11 | open (my $ifh1, "<", $infile1) or die "Cannot open $infile1: $!\n";
12 | # Read the file as 4-line records (header, sequence, '+', quality). Testing every line for a
13 | # leading '@' would mistake a quality string that begins with '@' for a header.
14 | # Assumes one sequence line per record (no line-wrapped FASTQ).
15 | while (defined(my $header = <$ifh1>)) {
16 | 	my $sequence = <$ifh1>;
17 | 	last unless defined($sequence); # truncated final record
18 | 	scalar(<$ifh1>); # '+' separator line
19 | 	scalar(<$ifh1>); # quality line
20 | 	next unless $header =~ /^@/;
21 | 	if ($sequence =~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/){
22 | 		my $UMI = $2; # $1 is the cell ID; it is not part of this tally
23 | 		$Barcodes{$UMI}++;
24 | 	}
25 | }
26 | close($ifh1);
27 | 
28 | my @codes = sort{$Barcodes{$a}<=>$Barcodes{$b}} keys(%Barcodes);
29 | foreach my $code (@codes) {
30 | 	print "$code ".$Barcodes{$code}."\n";
31 | }
32 | 


--------------------------------------------------------------------------------
/0_Convert_CRAM_to_BAM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir
 4 | CRAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/cramtools-3.0/cramtools-3.0.jar
 5 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools
 6 | FILEINDEX=$LSB_JOBINDEX
 7 | LANE6="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane6/21698_6_$FILEINDEX.cram"
 8 | LANE7="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane7/21698_7_$FILEINDEX.cram"
 9 | LANE8="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane8/21698_8_$FILEINDEX.cram"
10 | 
11 | OUTDIR=/lustre/scratch117/cellgen/team218/TA/LiverOrganoids/BAMS
12 | 
13 | FILE1=$(basename $LANE6)
14 | FILE2=$(basename $LANE7)
15 | FILE3=$(basename $LANE8)
16 | 
17 | cp $LANE6 $OUTDIR/$FILE1
18 | cp $LANE7 $OUTDIR/$FILE2
19 | cp $LANE8 $OUTDIR/$FILE3
20 | 
21 | $SAMTOOLS view -b -h $OUTDIR/$FILE1 -o $OUTDIR/$FILE1.bam
22 | $SAMTOOLS view -b -h $OUTDIR/$FILE2 -o $OUTDIR/$FILE2.bam
23 | $SAMTOOLS view -b -h $OUTDIR/$FILE3 -o $OUTDIR/$FILE3.bam
24 | 
25 | #$SAMTOOLS merge $OUTDIR/Cell$FILEINDEX.bam $OUTDIR/$FILE1.bam $OUTDIR/$FILE2.bam $OUTDIR/$FILE3.bam 
26 | 
27 | rm $OUTDIR/$FILE1
28 | rm $OUTDIR/$FILE2
29 | rm $OUTDIR/$FILE3
30 | 
31 | 
32 | 
33 | #export _JAVA_OPTIONS="-Xmx100M -XX:MaxHeapSize=100m"
34 | #java -jar $CRAMTOOLS bam -I $OUTDIR/$FILE1 -O $OUTDIR/$FILE1.cram.bam
35 | 
36 | #alias cramtools='java -jar cramtools-2.0.jar'
37 | #cramtools bam -I 9233_8#168_1.cram -O 9233_8#168_1.cram.bam
38 | #cramtools fastq -I 9233_8#168_1.cram | head
39 | #samtools view 9233_8#168_1.cram.bam | head
40 | 


--------------------------------------------------------------------------------
/0_Determine_Barcodes.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Collapse barcodes that differ from one another at no more than one position, then print the
 5 | # barcodes whose counts clear a frequency threshold as "barcode count" lines, most frequent first.
 6 | #   ARGV[0] = barcode counting output ("barcode count" per line)
 7 | #   ARGV[1] = expected number of cells (rough guide only)
 8 | if (@ARGV < 2) {die "Required Argument: Barcode counting output, expected number of cells\n";}
 9 | 
10 | my $ExpectNCells = $ARGV[1]; # This is just used as a guide does not have to be exact.
11 | my $trunc = 1; #Only output the identified cell IDs.
12 | 
13 | open(my $ifh, "<", $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n"; # list of the form: barcode frequency; in descending order of frequency.
14 | my %Barcodes = ();
15 | 
16 | my $progress = 0;
17 | while(<$ifh>) {
18 | 	chomp;
19 | 	my @record = split(/\s+/);
20 | 	next if (@record < 2); # blank or malformed line
21 | 	my ($barcode, $counts) = @record[0,1];
22 | 	my @seencodes = keys(%Barcodes);
23 | 	$Barcodes{$barcode} += $counts; # a barcode listed twice accumulates instead of being reset
24 | 	foreach my $key (@seencodes) {
25 | 		next if ($key eq $barcode); # never merge a barcode into itself (that would delete it)
26 | 		# XOR of two identical characters is \0, so counting \0 bytes counts the matching positions
27 | 		my $matches = ( $barcode ^ $key ) =~ tr/\0//;
28 | 		my $mismatches = length($barcode)-$matches;
29 | 		if ($mismatches <= 1) {
30 | 			# NOTE(review): the newer barcode absorbs the earlier one; with descending input that keeps
31 | 			# the lower-count spelling as the name - confirm this is the intended representative.
32 | 			$Barcodes{$barcode} += $Barcodes{$key};
33 | 			delete($Barcodes{$key});
34 | 		}
35 | 	}
36 | 	$progress++;
37 | 	if ($progress > 10000) {print STDERR scalar(@seencodes)."\n"; $progress=0;}
38 | }
39 | close($ifh);
40 | 
41 | # Most frequent first: the threshold below measures distances from the top barcode ($codes[0])
42 | my @codes = sort{$Barcodes{$b}<=>$Barcodes{$a}} keys(%Barcodes);
43 | if (!@codes) {die "No barcodes read from $ARGV[0]\n";}
44 | 
45 | # Reference barcode sits at 75% of the expected cell count, clamped to the list
46 | my $quantile = int($ExpectNCells*0.75);
47 | if ($quantile < 0) {$quantile = 0;}
48 | if ($quantile > $#codes) {$quantile = $#codes;}
49 | my $quantile_freq = $Barcodes{$codes[$quantile]};
50 | # Threshold lies as far below the reference count as the top count lies above it
51 | my $threshold = $quantile_freq - ($Barcodes{$codes[0]}-$quantile_freq);
52 | 
53 | my $ncells = 0;
54 | foreach my $code (@codes) {
55 | 	if ($Barcodes{$code} < $threshold) {
56 | 		print STDERR "$ncells cell barcodes found.\n";
57 | 		if ($trunc) {last;}
58 | 	}
59 | 	print "$code ".$Barcodes{$code}."\n";
60 | 	$ncells++;
61 | }
62 | 


--------------------------------------------------------------------------------
/0_Download_Files_from_Dropbox.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | # Download every link ending in .gz that appears in an href attribute of index.txt
 4 | # (a saved copy of the share page) by calling wget once per distinct link.
 5 | #system("wget "LINK" > index.txt");
 6 | my %files = ();
 7 | open(my $ifh, "<", "index.txt") or die "Cannot open index.txt: $!\n";
 8 | while (my $line = <$ifh>) {
 9 | 	# [^"] keeps the capture inside one attribute value; /g walks every href on the line
10 | 	while ($line =~ /href="([^"]*?\.gz)/g) {
11 | #		print $1."\n"; 
12 | 		$files{$1} = 1; # hash keys drop links that occur more than once
13 | 	}
14 | } close($ifh);
15 | 
16 | foreach my $file (keys(%files)) {
17 | 	# List form skips the shell, so '&', '?' or ';' inside a URL reach wget unchanged
18 | 	system("wget", $file) == 0 or warn "wget failed for $file\n";
19 | }
20 | 


--------------------------------------------------------------------------------
/0_Extract_Metadata_from_Bam.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir
 4 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools
 5 | 
 6 | OUTDIR=$1
 7 | INPUTDIR=$2
 8 | 
 9 | FILEStoMAP=($INPUTDIR/*.bam)
10 | ARRAYINDEX=$(($LSB_JOBINDEX-1))
11 | INPUTBAM=${FILEStoMAP[$ARRAYINDEX]}
12 | OUTFILE=$(basename $FILE).meta
13 | 
14 | $SAMTOOLS -H $INPUTBAM > $OUTDIR/$OUTFILE
15 | 


--------------------------------------------------------------------------------
/0_FASTQC.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Initial QC
 3 | FASTQFILE=$1
 4 | FASTQC=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc
 5 | LIMITFILE=/nfs/users/nfs_t/ta6/RNASeqPipeline/0_FASTQC_limits.txt
 6 | # There is also the -o for an appropriate output directory
 7 | 
 8 | if [ ! -f "$FASTQFILE" ] ; then
 9 |   echo "Sorry $FASTQFILE does not exist "
10 |   exit 1
11 | fi
12 | 
13 | if [ ! -f "$FASTQC" ] ; then
14 |   echo "Sorry FASTQC not available "
15 |   exit 1
16 | fi
17 | 
18 | export _JAVA_OPTIONS="-Xmx100M -XX:MaxHeapSize=100m"
19 | $FASTQC -l $LIMITFILE --quiet $FASTQFILE 
20 | 
21 | 
22 | #If you want to run fastqc on a stream of data to be read from standard input then you
23 | #can do this by specifing 'stdin' as the name of the file to be processed and then 
24 | #streaming uncompressed fastq format data to the program.  For example:
25 | 
26 | #zcat *fastq.gz | fastqc stdin
27 | #zcat C*.gz | /nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc -o placeforoutputfiles/ stdin
28 | 
29 | # ^^ This is probably the best approach to use in many of my cases since this allows on the fly combining of various files without storing duplicated data.
30 | 


--------------------------------------------------------------------------------
/0_FASTQC_Streaming.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Initial QC on several fastq files at once: concatenate every file matching
 3 | # FASTQFILEDIR/FASTQFILEPATTERN, stream the result into FastQC and rename the report after OUTNAME.
 4 | FASTQFILEDIR=$1
 5 | FASTQFILEPATTERN=$2
 6 | OUTNAME=$3 #outputfilenames
 7 | OUTPUTDIR=$4 #directory for outputfiles
 8 | FASTQC=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/FastQC/fastqc
 9 | LIMITFILE=/nfs/users/nfs_t/ta6/RNASeqPipeline/0_FASTQC_limits.txt
10 | 
11 | if [ -z "$FASTQFILEDIR" ] ; then
12 |   echo "Please provide a directory of fastq files (Argument 1/4)"
13 |   exit 1
14 | fi
15 | if [ -z "$FASTQFILEPATTERN" ] ; then
16 |   echo "Please provide a pattern to select fastq files with (Argument 2/4) "
17 |   exit 1
18 | fi
19 | 
20 | if [ -z "$OUTNAME" ] ; then
21 |   echo "Please provide name for output files (Argument 3/4) "
22 |   exit 1
23 | fi
24 | 
25 | if [ -z "$OUTPUTDIR" ] ; then
26 |   echo "Please provide a directory to put output in (Argument 4/4) "
27 |   exit 1
28 | fi
29 | 
30 | if [ ! -f "$FASTQC" ] ; then
31 |   echo "Sorry FASTQC not available "
32 |   exit 1
33 | fi
34 | 
35 | mkdir -p "$OUTPUTDIR" || exit 1
36 | 
37 | # Expand the pattern once. It stays unquoted so the shell globs it; nullglob turns
38 | # "nothing matched" into an empty list instead of the literal pattern text.
39 | shopt -s nullglob
40 | INPUTS=("$FASTQFILEDIR"/$FASTQFILEPATTERN)
41 | shopt -u nullglob
42 | if [ ${#INPUTS[@]} -eq 0 ] ; then
43 |   echo "No files match $FASTQFILEDIR/$FASTQFILEPATTERN"
44 |   exit 1
45 | fi
46 | 
47 | export _JAVA_OPTIONS="-Xmx10000M -XX:MaxHeapSize=10000m"
48 | # For gzipped input use zcat in place of cat:
49 | #zcat "${INPUTS[@]}" | "$FASTQC" -l "$LIMITFILE" --quiet -o "$OUTPUTDIR" stdin
50 | # The pipeline's status is FastQC's; without a report there is nothing to rename
51 | cat "${INPUTS[@]}" | "$FASTQC" -l "$LIMITFILE" --quiet -o "$OUTPUTDIR" stdin || exit 1
52 | mv "$OUTPUTDIR/stdin_fastqc.html" "$OUTPUTDIR/FASTQC_$OUTNAME.html"
53 | mv "$OUTPUTDIR/stdin_fastqc.zip" "$OUTPUTDIR/FASTQC_$OUTNAME.zip"
54 | 


--------------------------------------------------------------------------------
/0_FASTQC_limits.txt:
--------------------------------------------------------------------------------
 1 | # For each of the modules you can choose to not run that
 2 | # module at all by setting the value below to 1 for the
 3 | # modules you want to remove.
 4 | duplication 		ignore 		0
 5 | kmer 				ignore 		0
 6 | n_content 			ignore 		0
 7 | overrepresented 	ignore 		0
 8 | quality_base 		ignore 		0
 9 | sequence 			ignore 		0
10 | gc_sequence			ignore 		0
11 | quality_sequence	ignore		0
12 | tile				ignore		0
13 | sequence_length		ignore		0
14 | adapter				ignore		0
15 | 
16 | # For the duplication module the value is the percentage
17 | # remaining after deduplication.  Measured levels below
18 | # these limits trigger the warning / error.
19 | duplication	warn	70
20 | duplication error	50
21 | 
22 | # For the kmer module the filter is on the -log10 binomial
23 | # pvalue for the most significant Kmer, so 5 would be 
24 | # 10^-5 = p<0.00001
25 | kmer	warn	-1
26 | kmer	error	-1
27 | 
28 | # For the N module the filter is on the percentage of Ns
29 | # at any position in the library
30 | n_content	warn	-1
31 | n_content	error	-1
32 | 
33 | # For the overrepresented seqs the warn value sets the
34 | # threshold for the overrepresented sequences to be reported
35 | # at all as the proportion of the library which must be seen
36 | # as a single sequence
37 | overrepresented	warn	-1
38 | overrepresented	error	-1
39 | 
40 | # The per base quality filter uses two values, one for the value
41 | # of the lower quartile, and the other for the value of the
42 | # median quality.  Failing either of these will trigger the alert
43 | quality_base_lower	warn	10
44 | quality_base_lower	error	5
45 | quality_base_median	warn	25
46 | quality_base_median	error	20
47 | 
48 | # The per base sequence content module tests the maximum deviation
49 | # between A and T or C and G
50 | sequence	warn	-1
51 | sequence	error	-1
52 | 
53 | # The per sequence GC content tests the maximum deviation between
54 | # the theoretical distribution and the real distribution
55 | gc_sequence	warn	15
56 | gc_sequence	error	30
57 | 
58 | # The per sequence quality module tests the phred score which is
59 | # most frequently observed
60 | quality_sequence	warn	-1
61 | quality_sequence	error	-1
62 | 
63 | # The per tile module tests the maximum phred score loss between 
64 | # an individual tile and the average for that base across all tiles
65 | tile	warn	5
66 | tile	error	10
67 | 
68 | # The sequence length module tests are binary, so the values here
69 | # simply turn them on or off.  The actual tests warn if you have
70 | # sequences of different length, and error if you have sequences
71 | # of zero length.
72 | 
73 | sequence_length	warn	-1
74 | sequence_length	error	-1
75 | 
76 | # The adapter module's warnings and errors are based on the 
77 | # percentage of reads in the library which have been observed
78 | # to contain an adapter associated Kmer at any point
79 | 
80 | adapter	warn	5
81 | adapter	error	10
82 | 


--------------------------------------------------------------------------------
/0_Flexible_Convert_Dir_CRAM_to_BAM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Designed for LSF job arrays
 3 | 
 4 | export REF_CACHE=/lustre/scratch117/cellgen/team218/TA/TemporaryFileDir
 5 | SAMTOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/CRAM/samtools-1.3.1/samtools
 6 | #LSB_JOBINDEX=$3 # specify which file to run on
 7 | FILEINDEX=$LSB_JOBINDEX
 8 | DIR=$1
 9 | OUTDIR=$2
10 | FILE="$DIR/*_$FILEINDEX.cram"
11 | 
12 | #LANE8="/warehouse/team218_wh01/MH/LBHsLiverData/21698/Lane8/21698_8_$FILEINDEX.cram"
13 | 
14 | OUTDIR=/lustre/scratch117/cellgen/team218/TA/LiverOrganoids/BAMS
15 | 
16 | FILE1=$(basename $FILE)
17 | 
18 | cp $FILE $OUTDIR/$FILE1
19 | 
20 | $SAMTOOLS view -b -h $OUTDIR/$FILE1 -o $OUTDIR/$FILE1.bam
21 | 
22 | rm $OUTDIR/$FILE1
23 | 


--------------------------------------------------------------------------------
/0_GBK2FASTA.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Convert the first record of a GenBank file (name must end in "gbk") to FASTA.
 5 | # The sequence name is the second field of the LOCUS line; the sequence is everything between
 6 | # the ORIGIN line and the closing "//". Output goes to the input path with "gbk" replaced by "fa".
 7 | if (scalar(@ARGV) < 1 || $ARGV[0] !~ /gbk$/) {die "Did not provide GBK file."}
 8 | 
 9 | my @sequence = ();
10 | my $name = "";
11 | my $file = $ARGV[0];
12 | 
13 | open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
14 | my $seq_started = 0;
15 | while (my $line = <$ifh>) {
16 | 	if ($line =~ /^LOCUS/) {
17 | 		my @record = split(/\s+/, $line);
18 | 		$name = $record[1] if (defined($record[1])); # a bare "LOCUS" line leaves the name empty
19 | 	} elsif ($line =~ /^ORIGIN/) {
20 | 		$seq_started = 1;
21 | 		next;
22 | 	} elsif ($line =~ /^\/\//) {
23 | 		last; # end of the first record
24 | 	}
25 | 	if ($seq_started) {
26 | 		chomp($line);
27 | 		$line =~ s/\s//g; #remove all whitespace
28 | 		$line =~ s/\d//g; #remove all base numbers
29 | 		push(@sequence, $line);
30 | 	}
31 | } close($ifh);
32 | 
33 | if (!@sequence) {warn "No sequence lines found after ORIGIN in $file\n";}
34 | 
35 | $file =~ s/gbk$/fa/;
36 | open (my $ofh, ">", $file) or die "Cannot write $file: $!\n";
37 | print $ofh ">$name\n";
38 | foreach my $seq (@sequence) {print $ofh $seq."\n";}
39 | close($ofh);
40 | 
41 | 


--------------------------------------------------------------------------------
/0_Gather_Summary_Statistics.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Input a set of gzipped fastQ sequencing files and count reads
 5 | #   per cell: the second whitespace-separated field of each header line, and
 6 | #   per lane: the "lane<digits>" part of the file name ("" when the name has none).
 7 | # Each file is streamed through "gzip -dc", so nothing is copied or unpacked on disk and
 8 | # several instances can run at the same time.
 9 | 
10 | if (@ARGV < 1) { die "Gather_Summary_Statistics.pl list_of_gzipped_fastq_files\n";}
11 | 
12 | my %cell2count = ();
13 | my %lane2count = ();
14 | foreach my $file (@ARGV) {
15 | 	my $lane = "";
16 | 	if ($file =~ /(lane\d+)/) {
17 | 		$lane = $1;
18 | 	}
19 | 	# List-form pipe open: gzip is started without a shell, so any file name is passed through as is
20 | 	open (my $ifh, "-|", "gzip", "-dc", $file) or die "Cannot read $file :  $!\n";
21 | 
22 | 	# Step through 4-line records so that a quality line beginning with '@' is never counted as a read
23 | 	while (defined(my $header = <$ifh>)) {
24 | 		scalar(<$ifh>) for (1..3); # sequence, '+', quality
25 | 		next unless ($header =~ /^@/);
26 | 		my @record = split(/\s+/, $header);
27 | 		my $cell = defined($record[1]) ? $record[1] : ""; # header without a second field
28 | 		$cell2count{$cell} ++;
29 | 		$lane2count{$lane}++;
30 | 	}
31 | 	close ($ifh) or warn "gzip reported a problem while reading $file\n";
32 | }
33 | foreach my $cell (sort(keys(%cell2count))) {
34 | 	print "$cell\t$cell2count{$cell}\n";
35 | }
36 | foreach my $lane (sort(keys(%lane2count))) {
37 | 	print "$lane\t$lane2count{$lane}\n";
38 | }
39 | 


--------------------------------------------------------------------------------
/0_Get_Data_from_iRODS.sh:
--------------------------------------------------------------------------------
 1 | #auto login for running as a job on the farm: ta6 = your username
 2 | # create irods.keytab file
 3 | #ktutil
 4 | #ktutil:  addent -password -p ta6 -k 1 -e aes256-cts
 5 | #Password for ta6@INTERNAL.SANGER.AC.UK: 
 6 | #ktutil:  wkt irods.keytab
 7 | #ktutil:  quit
 8 | kinit ta6 -k -t ~/irods.keytab
 9 | 
10 | #nicked from /nfs/team205/tpcg/bin/scripts/dump_irods.sh on the farm
11 | #and slightly modified - sed -i 's/\/software\/irods\/icommands\/bin\///g' dump_irods.sh
12 | 
13 | # make directory for each run_lane
14 | run_lane="$1"
15 | 
16 | # extract run and lane
17 | run=`echo $run_lane | sed -e 's/_.*//'`
18 | lane=`echo $run_lane | sed -e 's/.*_//'`
19 | 
20 | # get the cram files
21 | imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \
22 | | grep : | awk '{ print $2 }' | paste - - -d/ \
23 | | xargs -ixxx iget -K xxx ./
24 | 
25 | chmod 664 *
26 | 
27 | # remove phiX control
28 | find ./ | grep -E '#888\.' | xargs rm
29 | 
30 | # get and format the meta info.
31 | for cram in $(find ./ | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//'); do
32 |     imeta ls -d /seq/$run/$cram.cram > $cram.imeta
33 |     sn=$(grep -A 1 sample_supplier_name $cram.imeta | tail -1 | sed 's/ /_/g')
34 |     sample_name=${sn:7}
35 |     echo -e "$run_lane/$cram\t$sample_name" >> $run_$lane_sampleInfo.txt
36 | done
37 | 


--------------------------------------------------------------------------------
/0_Make_ERCC_fasta_and_gtf.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/local/bin/perl
 2 | use strict;
 3 | use warnings;
 4 | # Converts the Annotation file from https://www.thermofisher.com/order/catalog/product/4456740 to gtf and fasta files that can be added to existing genome fasta & gtf files.
 5 | # Reads a tab-separated table (first line = header; column 1 = name, column 2 = GenBank id,
 6 | # column 5 = sequence) and writes ERCC_Controls.fa and ERCC_Controls.gtf in the current directory.
 7 | if (@ARGV != 1) {die "Usage: 0_Make_ERCC_fasta_and_gtf.pl <ERCC Annotation file>";}
 8 | 
 9 | my $file = $ARGV[0]; #ERCC_Controls_Annotation.txt
10 | 
11 | my @FASTAlines = ();
12 | my @GTFlines = ();
13 | open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
14 | scalar(<$ifh>); #header
15 | while (<$ifh>) {
16 | 	chomp;
17 | 	my @record = split(/\t/);
18 | 	next if (@record < 5); # blank or short line: no sequence column
19 | 	my $sequence = $record[4];
20 | 	$sequence =~ s/\s+//g; # strip every whitespace character from the sequence
21 | 	$sequence = $sequence."NNNN";
22 | 	my $name = $record[0];
23 | 	my $genbank = $record[1];
24 | 	push(@FASTAlines, ">$name\n$sequence\n");
25 | # is GTF 1 indexed or 0 indexed? -> it is 1 indexed
26 | # + or - strand?
27 | 	# NOTE(review): the end coordinate is taken from the N-padded sequence (length - 2), so it
28 | 	# reaches two bases into the NNNN pad - confirm that this is intended.
29 | 	my $end = length($sequence)-2;
30 | 	my $attributes = "gene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"";
31 | 	# One gene, one transcript and one exon record, all spanning the same interval
32 | 	foreach my $feature ("gene", "transcript", "exon") {
33 | 		push(@GTFlines, "$name\tERCC\t$feature\t1\t$end\t.\t+\t.\t$attributes\n");
34 | 	}
35 | } close($ifh);
36 | 
37 | # Write output
38 | open(my $ofh, ">", "ERCC_Controls.fa") or die $!;
39 | foreach my $line (@FASTAlines) {
40 | 	print $ofh $line;
41 | } close ($ofh);
42 | 
43 | open($ofh, ">", "ERCC_Controls.gtf") or die $!;
44 | foreach my $line (@GTFlines) {
45 | 	print $ofh $line;
46 | } close ($ofh);
47 | 


--------------------------------------------------------------------------------
/0_Merge_FASTQs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Merge read files for paired-end sequencing across two lanes, where files are ordered by lane, then cell, then read
 3 | FASTQDIR=$1
 4 | OUTDIR=$2
 5 | NCELLS=$3
 6 | 
 7 | #LSB_JOBINDEX=1 #Testing
 8 | 
 9 | # Maths
10 | NFILES=$(($NCELLS*2))
11 | INDEX1=$(($LSB_JOBINDEX-1))
12 | INDEX2=$(($INDEX1+$NFILES))
13 | 
14 | FILES=($FASTQDIR/*.gz)
15 | FILE1=${FILES[$INDEX1]}
16 | FILE2=${FILES[$INDEX2]}
17 | 
18 | echo $FILE1
19 | echo $FILE2
20 | TAIL='_1.fq'
21 | CELLID=$LSB_JOBINDEX;
22 | if !((CELLID % 2)); then
23 | 	CELLID=$(($CELLID/2))
24 | else 
25 | 	CELLID=$(( ($CELLID+1)/2 ))
26 | fi
27 | 
28 | if [[ $FILE1 =~ _1.f ]] ; then
29 | 	OUTFILE=Cell$CELLID$TAIL
30 | 	zcat $FILE1 $FILE2 > $OUTDIR/$OUTFILE
31 | else 
32 | 	TAIL='_2.fq'
33 | 	OUTFILE=Cell$CELLID$TAIL
34 | 	zcat $FILE1 $FILE2 > $OUTDIR/$OUTFILE
35 | fi
36 | echo $OUTFILE
37 | gzip $OUTDIR/$OUTFILE
38 | 


--------------------------------------------------------------------------------
/0_My_Extract_Transcriptome.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | # Writes Transcripts.fa / Transcripts.gtf (one sequence per gene) from a GTF (ARGV[0]) and a genome FASTA (ARGV[1]); ARGV[2] true = use whole "gene" records, false = use exon/UTR records.
  4 | if (@ARGV < 3) {die "0_My_Extract_Transcriptome.pl .gtf .fa Nascent?[0|1]\n";}
  5 | # To Do:
  6 | # get all exons per transcript
  7 | # sort by start
  8 | # if overlap (start2 < end1) then merge
  9 | # else switch to new exon
 10 | sub merge_transcripts { # Tested; takes {"st","end"} hash refs, returns them sorted by start with overlapping intervals merged
 11 | 	my @exons = sort{$a->{"st"} <=> $b->{"st"}} @_;
 12 | 	my %curr = %{shift(@exons)}; # interval currently being extended (a copy, so inputs are not modified)
 13 | 	my @finalexons = ();
 14 | 	foreach my $exon2 (@exons) {
 15 | 		if ($curr{"end"} > $exon2->{"st"}) { # strict '>': an interval starting exactly at the current end is not merged
 16 | 			# overlap == merge
 17 | 			if ($exon2->{"end"} > $curr{"end"}) {
 18 | 				$curr{"end"} = $exon2->{"end"};
 19 | 			}
 20 | 		} else {
 21 | 			my $tmp1 = $curr{"st"};
 22 | 			my $tmp2 = $curr{"end"};
 23 | #			print "save $tmp1 $tmp2\n";
 24 | 			push(@finalexons, {"st"=>$tmp1,"end"=>$tmp2});
 25 | 			%curr = %{$exon2};
 26 | 		}
 27 | 	}
 28 | 	my $tmp1 = $curr{"st"};
 29 | 	my $tmp2 = $curr{"end"};
 30 | #	print "save $tmp1 $tmp2\n";
 31 | 	push(@finalexons, {"st"=>$tmp1,"end"=>$tmp2}); # flush the last open interval
 32 | 	return(@finalexons);
 33 | }
 34 | 
 35 | # Script state shared by the per-chromosome pass inside the FASTA loop and the final pass after it
 36 | my %Ensg2Seq = (); # gene id -> sequence; only filled and read by the final (last chromosome) pass
 37 | my %Ensg2Tail = (); # written in the final pass, never read in this script
 38 | my %Ensg2Gtf = (); # gene id -> original "gene" GTF line
 39 | my @Ensgs = (); # gene ids of the chromosome being processed, in GTF order
 40 | my $flank = 10; # bases added on each side of a feature in the in-loop pass
 41 | 
 42 | my $nascent = $ARGV[2];
 43 | 
 44 | open (my $fa, $ARGV[1]) or die $!;
 45 | open (my $fa_out, ">","Transcripts.fa") or die $!;
 46 | open (my $gtf_out, ">","Transcripts.gtf") or die $!;
 47 | my $chr = "None"; # name of the chromosome whose sequence is being accumulated
 48 | my $chr_seq = ""; # its sequence, concatenated line by line
 49 | my $COUNT = 0; # incremented per gene record; never read in this script
 50 | while (<$fa>) {
 51 | 	if($_ =~ /^#/) {next;} # skip headers
 52 | 	if ($_ =~ /^\>/) {
 53 | 		# New Chr
 54 | 		my @line = split(/\s+/);
 55 | 		my $newchr = $line[0]; $newchr =~ s/>//g;
 56 | 		if ($chr eq "None") { # very first header: nothing accumulated yet, just remember the name
 57 | 			$chr = $newchr;
 58 | 			next;
 59 | 		} 
 60 | 		# Output gene sequences for this chromosome (the one just completed, $chr); the GTF is re-read from the top each time
 61 | 		open (my $gtf, $ARGV[0]) or die $!;
 62 | 		my $gtf_line = "";
 63 | 		my %exons = (); # gene id -> list of {"st","end"} intervals on $chr
 64 | 		while ($gtf_line = <$gtf>) {
 65 | 			if ($gtf_line =~ /^#/) {next;} # ignore headers
 66 | 			my $geneid = "";
 67 | 			if ($gtf_line =~ /gene_id "(.+?)";/) {
 68 | 				$geneid = $1;
 69 | 			} else {
 70 | 				next;
 71 | 			} # get gene id
 72 | 			my @record = split(/\t/, $gtf_line);
 73 | 			my $seq_chr = $record[0];
 74 | 			if ($seq_chr ne $chr) {next;}
 75 | 			my $seq_st = $record[3]-$flank; # NOTE(review): negative for features within $flank of the contig start; substr then counts from the end - confirm
 76 | 			my $seq_end = $record[4]+$flank;
 77 | 			my %item; $item{"st"} = $seq_st; $item{"end"} = $seq_end; 
 78 | 			if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} # cannot trigger: the same condition already hit 'next' above
 79 | 			if (!$nascent) {
 80 | 				if ($record[2] eq "exon" || $record[2] eq "UTR") {
 81 | 					push(@{$exons{$geneid}}, \%item);
 82 | #					if (exists($Ensg2Seq{$geneid})) {
 83 | #						$Ensg2Seq{$geneid}.= substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank));
 84 | #					} else {
 85 | #						$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank));
 86 | #					}
 87 | 				}
 88 | 			} else {
 89 | 				if ($record[2] eq "gene") {
 90 | 					push(@{$exons{$geneid}}, \%item);
 91 | #					$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-$flank, ($seq_end-$seq_st+$flank));
 92 | 				}
 93 | 			}
 94 | 			if ($record[2] eq "gene") {
 95 | 				push(@Ensgs, $geneid);
 96 | 				$Ensg2Gtf{$geneid} = $gtf_line;
 97 | 				$COUNT++;
 98 | 			}
 99 | 		}
100 | 		close($gtf);
101 | 		foreach my $ensg (@Ensgs) {
102 | 			# Get sequence
103 | 			my @parts = @{$exons{$ensg}}; # NOTE(review): looks like this dies under strict refs when a gene has no collected intervals - confirm inputs always provide them
104 | 			my @merged = merge_transcripts(@parts);
105 | 			my $gene_seq = "";
106 | 			foreach my $item (@merged) {
107 | 				my $seq_st  = $item->{"st"};
108 | 				my $seq_end = $item->{"end"};
109 | #print "$seq_st $seq_end aquired\n";
110 | 				$gene_seq .= substr($chr_seq, $seq_st, ($seq_end-$seq_st)); # merged pieces are joined in ascending start order
111 | 			}
112 | 			
113 | 			print $fa_out ">$ensg\n";
114 | 			print $fa_out $gene_seq."\n";
115 | 			my $seq_length = length($gene_seq);
116 | #			print $fa_out $Ensg2Seq{$ensg}."\n";
117 | #			my $seq_length = length($Ensg2Seq{$ensg});
118 | 			my $old_gtf = $Ensg2Gtf{$ensg};
119 | 			$old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/;
120 | 			my @record = split(/\t/, $old_gtf);
121 | 			$record[0] = $ensg; # each gene becomes its own contig in the output GTF
122 | 			$record[3] = 1;
123 | 			$record[4] = $seq_length-1;
124 | 			print $gtf_out join("\t",@record); # first the re-based "gene" record ...
125 | 			my $lastele = scalar(@record)-1;
126 | 			$record[$lastele] = "gene_id \"$ensg\"; transcript_id \"$ensg\"; exon_number \"1\"; gene_name \"$ensg\"\n";
127 | 			$record[2] = "exon";
128 | 			print $gtf_out join("\t",@record); # ... then a single "exon" record over the same span
129 | 		}
130 | 		print "$chr $newchr\n";
131 | 		%exons = ();
132 | 		$chr = $newchr;
133 | 		$chr_seq="";
134 | 		$COUNT=0;
135 | 		@Ensgs=();
136 | 	} else {
137 | 		chomp;
138 | 		$chr_seq = $chr_seq.$_;
139 | 	}
140 | }
141 | # Output last chromosome
142 | # Output gene sequences; NOTE(review): unlike the in-loop pass this one does no interval merging (pieces are appended in GTF order via %Ensg2Seq) and slices with a different offset/length - confirm whether intended
143 | open (my $gtf, $ARGV[0]) or die $!;
144 | my $gtf_line = "";
145 | while ($gtf_line = <$gtf>) {
146 | 	if ($gtf_line =~ /^#/) {next;} # ignore headers
147 | 
148 | 	my $geneid = "";
149 | 	if ($gtf_line =~ /gene_id "(.+?)";/) {
150 | 		$geneid = $1;
151 | 	} else {
152 | 		next;
153 | 	} # get gene id
154 | 
155 | 	my @record = split(/\t/, $gtf_line);
156 | 	my $seq_chr = $record[0];
157 | 	if ($seq_chr ne $chr) {next;}
158 | 	my $seq_st = $record[3]; # raw GTF coordinates here; the flank is applied inside the substr calls below
159 | 	my $seq_end = $record[4];
160 | 	if ($seq_chr ne $chr) {die "Something has gone terribly wrong $seq_chr $chr\n";} # cannot trigger: the same condition already hit 'next' above
161 | 	if (!$nascent) {
162 | 		if ($record[2] eq "exon" || $record[2] eq "UTR") {
163 | 			if (exists($Ensg2Seq{$geneid})) {
164 | 				$Ensg2Seq{$geneid}.= substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10));
165 | 			} else {
166 | 				$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10));
167 | 			}
168 | 		}
169 | 	} else {
170 | 		if ($record[2] eq "gene") {
171 | 			$Ensg2Seq{$geneid} = substr($chr_seq, $seq_st-10, ($seq_end-$seq_st+10));
172 | 		}
173 | 	}
174 | 	if ($record[2] eq "gene") {
175 | 		push(@Ensgs, $geneid);
176 | 		$Ensg2Gtf{$geneid} = $gtf_line;
177 | 		$COUNT++;
178 | 		if ($record[6] eq "+") { # strand column
179 | 			$Ensg2Tail{$geneid}->{"+"} = substr($chr_seq,$seq_end,$flank);
180 | 		} else {
181 | 			$Ensg2Tail{$geneid}->{"-"} = substr($chr_seq,$seq_st-$flank,$flank);
182 | 		}
183 | 	}
184 | }
185 | close($gtf);
186 | foreach my $ensg (@Ensgs) {
187 | 	print $fa_out ">$ensg\n";
188 | 	print $fa_out $Ensg2Seq{$ensg}."\n"; # NOTE(review): undefined when no matching record filled %Ensg2Seq for this gene - confirm inputs
189 | 	my $seq_length = length($Ensg2Seq{$ensg});
190 | 	my $old_gtf = $Ensg2Gtf{$ensg};
191 | 	$old_gtf =~ s/transcript_id "(.+?)"/transcript_id "$ensg"/;
192 | 	my @record = split(/\t/, $old_gtf);
193 | 	$record[0] = $ensg;
194 | 	$record[3] = 1;
195 | 	$record[4] = $seq_length-1;
196 | 	print $gtf_out join("\t",@record);
197 | 
198 | 	my $lastele = scalar(@record)-1;
199 | 	$record[$lastele] = "gene_id \"$ensg\"; transcript_id \"$ensg\"; exon_number \"1\"; gene_name \"$ensg\"\n";
200 | 	$record[2] = "exon";
201 | 	print $gtf_out join("\t",@record);
202 | }
203 | 
204 | close($fa_out);
205 | close($gtf_out);
206 | close($fa);
207 | 


--------------------------------------------------------------------------------
/0_Process_GBK.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Convert a GenBank (.gbk) file into a GTF annotation and a one-record FASTA.
 5 | # Usage: perl 0_Process_GBK.pl construct.gbk
 6 | # Writes construct.gtf and construct.fa (input path with the extension swapped).
 7 | #
 8 | # Parsing rules: every line containing "Promoter" starts a new gene; each
 9 | # /label= line adds one exon to the current gene, using the coordinates of the
10 | # most recent "feature st..end" line; a gene spans the smallest start to the
11 | # largest end of its exons. Every feature is written on the "+" strand.
12 | 
13 | if (scalar(@ARGV) < 1 || $ARGV[0] !~ /gbk$/) {die "Did not provide GBK file."}
14 | 
15 | my $sequence = "";
16 | my $chrname = "";
17 | my $file = $ARGV[0];
18 | 
19 | open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
20 | my $seq_started = 0;
21 | my $st = 0;
22 | my $end = 0;
23 | my $name = "";
24 | my %Items = ();     # gene number -> { "st\tend" -> exon label }
25 | my $geneid = 0;
26 | my %Gene_info=();   # gene number -> { st => smallest start, end => largest end }
27 | while (<$ifh>) {
28 | 	if ($_ =~/Promoter/) {
29 | 		$geneid++;
30 | 	}
31 | 	if ($_ =~ /feature\s+(\d+)\.\.(\d+)/){
32 | 		$st = $1;
33 | 		$end = $2;
34 | 	}
35 | 	if ($_ =~ /\/label=(.+)\s+$/) {
36 | 		$name=$1;
37 | 		$name =~ s/\s//g;
38 | 		$Items{$geneid}->{"$st\t$end"} = $name;
39 | 		if (!exists($Gene_info{$geneid}->{"st"}) || $st < $Gene_info{$geneid}->{"st"}) {
40 | 			$Gene_info{$geneid}->{"st"} = $st;
41 | 		}
42 | 		if (!exists($Gene_info{$geneid}->{"end"}) || $end > $Gene_info{$geneid}->{"end"}) {
43 | 			$Gene_info{$geneid}->{"end"} = $end;
44 | 		}
45 | 	}
46 | 
47 | 	if ($_ =~ /^LOCUS/) {
48 | 		my @record = split(/\s+/);
49 | 		$chrname = $record[1];
50 | 	} elsif ($_ =~ /^ORIGIN/) {
51 | 		$seq_started = 1;
52 | 		next;
53 | 	} elsif ($_ =~ /^\/\//) {
54 | 		last; # a line starting with // ends the record
55 | 	}
56 | 	if ($seq_started) {
57 | 		chomp;
58 | 		my $seq = $_;
59 | 		$seq =~ s/\s//g; #remove all whitespace
60 | 		$seq =~ s/\d//g; #remove all base numbers
61 | 		$sequence .= $seq;
62 | 	}
63 | } close($ifh);
64 | 
65 | $file =~ s/gbk$/gtf/;
66 | open (my $ofh, ">", $file) or die "Cannot write $file: $!\n";
67 | # Gene numbers are integers: sort numerically so Gene10 follows Gene9 rather than Gene1.
68 | foreach my $gene (sort { $a <=> $b } keys(%Items)) {
69 | 	print $ofh "$chrname\tGBK\tgene\t".$Gene_info{$gene}->{"st"}."\t".$Gene_info{$gene}->{"end"}."\t.\t+\t.\tgene_id \"Gene$gene\"; transcript_id \"Transcript$gene\"; gene_name \"Gene$gene\"; gene_source \"GBK\";\n";
70 | 	my $exon_num=0;
71 | 	# Order exons by start, then end, so that exon_number follows position
72 | 	# (a plain string sort of the keys would put 1000 before 200).
73 | 	my @exons = map { $_->[2] }
74 | 		sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] }
75 | 		map { [split(/\t/, $_), $_] } keys(%{$Items{$gene}});
76 | 	foreach my $exon (@exons) {
77 | 		$exon_num++;
78 | 		print $ofh "$chrname\tGBK\texon\t$exon\t.\t+\t.\tgene_id \"Gene$gene\"; transcript_id \"Transcript$gene\"; exon_number \"$exon_num\"; gene_name \"Gene$gene\"; transcript_name \"Transcript$gene\"; gene_source \"GBK\"; exon_name \"".$Items{$gene}->{$exon}."\";\n";
79 | 	}
80 | }
81 | close($ofh);
82 | 
83 | $file =~ s/gtf$/fa/;
84 | open ($ofh, ">", $file) or die "Cannot write $file: $!\n";
85 | print $ofh ">$chrname\n$sequence\n"; # end the sequence line so the FASTA finishes with a newline
86 | close($ofh);
87 | 


--------------------------------------------------------------------------------
/0_custom_undo_demultiplexing.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Re-multiplex per-sample FASTQ pairs into one read pair: the barcode listed in
 5 | # the metadata table is put back in front of every read-1 sequence, padded with
 6 | # a matching run of 'E' quality characters. The pair of files whose name
 7 | # contains "unassigned" is appended unchanged at the end.
 8 | 
 9 | my $metadata = "/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/wta98_embl2.metadata";
10 | open (my $ifh, "<", $metadata) or die "Cannot open $metadata: $!\n";
11 | my %Name2Barcode = (); # sample name ("R" . field 7 . "C" . field 8) -> barcode (field 1)
12 | while (<$ifh>) {
13 | 	if ($_ =~/^#/) {next;}
14 | 	my @record = split(/\s+/);
15 | 	my $name = "R".$record[6]."C".$record[7];
16 | 	$Name2Barcode{$name} = $record[0];
17 | } close($ifh);
18 | 
19 | 
20 | my @files1 = glob("/lustre/scratch108/compgen/team218/TA/Bergiers_Dropbox/*_1.txt");
21 | my @files2 = glob("/lustre/scratch108/compgen/team218/TA/Bergiers_Dropbox/*_2.txt");
22 | 
23 | if (scalar(@files1) != scalar(@files2)) {die "Must have equal number of read1 & read2 files\n";}
24 | 
25 | my $unassigned1 = "";
26 | my $unassigned2 = "";
27 | my $out1 = "lane1_Waf375_1.fq";
28 | my $out2 = "lane1_Waf375_2.fq";
29 | open(my $ofh1, ">", $out1) or die "Cannot write $out1: $!\n";
30 | open(my $ofh2, ">", $out2) or die "Cannot write $out2: $!\n";
31 | 
32 | for(my $i = 0; $i < scalar(@files1); $i++) {
33 | 	my $barcode = "";
34 | 	if ($files1[$i] =~ /sample(R\d+C\d+)_/) {
35 | 		my $name=$1;
36 | 		if (exists($Name2Barcode{$name})) {
37 | 			$barcode = $Name2Barcode{$name};
38 | 		} else {
39 | 			die "$name has no barcode\n";
40 | 		}
41 | 	} elsif ($files1[$i] =~/unassigned/i) {
42 | 		# Remembered here and copied verbatim once all samples are written.
43 | 		$unassigned1 = $files1[$i];
44 | 		$unassigned2 = $files2[$i];
45 | 		next;
46 | 	} else {
47 | 		die "$files1[$i] does not match\n";
48 | 	}
49 | 
50 | 	open(my $ifh1, "<", $files1[$i]) or die "Cannot open $files1[$i]: $!\n";
51 | 	open(my $ifh2, "<", $files2[$i]) or die "Cannot open $files2[$i]: $!\n";
52 | 	while(<$ifh1>) {
53 | 		my $file1line = $_;
54 | 		my $file2line = <$ifh2>;
55 | 		if (!defined($file2line)) {die "$files2[$i] has fewer lines than $files1[$i]\n";}
56 | 		if ($file1line =~ /^@/) {
57 | 			my @thing1 = split(/\s+/,$file1line);
58 | 			my @thing2 = split(/\s+/,$file2line);
59 | 			my $readname = $thing1[0];
60 | 			if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}
61 | 			my $barcodeseq = <$ifh1>;
62 | 			$barcodeseq = $barcode.$barcodeseq;
63 | 			my $read = <$ifh2>;
64 | 			<$ifh1>;<$ifh2>; #+'s
65 | 			my $file1qual = <$ifh1>;
66 | 			$file1qual =  'E' x length($barcode) . $file1qual; # one quality character per re-attached barcode base
67 | 			my $file2qual = <$ifh2>;
68 | 			print $ofh1 "$readname\n$barcodeseq+\n$file1qual";
69 | 			print $ofh2 "$readname\n$read+\n$file2qual";
70 | 		}
71 | 	}
72 | 	close($ifh1);
73 | 	close($ifh2);
74 | }
75 | # Skipped when no unassigned file was seen: the former shell command
76 | # (cat with an empty file name) would have waited on STDIN forever.
77 | append_file($unassigned1, $ofh1) if ($unassigned1 ne "");
78 | append_file($unassigned2, $ofh2) if ($unassigned2 ne "");
79 | close ($ofh1); close($ofh2);
80 | print "Successfully Completed\n";
81 | 
82 | # Copy every line of the file at $path to the already-open output handle $out.
83 | sub append_file {
84 | 	my ($path, $out) = @_;
85 | 	open(my $in, "<", $path) or die "Cannot open $path: $!\n";
86 | 	while (my $line = <$in>) {
87 | 		print $out $line;
88 | 	}
89 | 	close($in);
90 | }
91 | 


--------------------------------------------------------------------------------
/0_make_transcriptome.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
3 | 
4 | FA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Nascent_Transcripts.fa
5 | GTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Nascent_Transcripts.gtf
6 | 
7 | bsub -R"select[mem>37000] rusage[mem=37000]" -M37000 -o buildtranscriptome.out -e buildtranscriptome.err $STAR --runMode genomeGenerate --genomeDir /lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora --genomeFastaFiles $FA --sjdbGTFfile $GTF --sjdbOverhang 20 --limitGenomeGenerateRAM 36000000000
8 | 


--------------------------------------------------------------------------------
/1.5_DO_Trim_Reads.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesQCed
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesToMap
 7 | INPUTFILES=($INPUTDIR/*)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES))
10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/1.5_Trim_Reads.sh $INPUTDIR $OUTPUTDIR
11 | 


--------------------------------------------------------------------------------
/1.5_Trim_Reads_Paired.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Trim one FastQ file per LSF job-array task with Trimmomatic.
 3 | # Task i (LSB_JOBINDEX, 1-based) handles the i-th *.fq file of INPUTDIR.
 4 | # NOTE: despite the file name, each file is trimmed on its own in SE mode.
 5 | INPUTDIR=$1 #directory of inputfiles
 6 | OUTPUTDIR=$2 #directory for outputfiles
 7 | TRIMMER=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Trimmomatic-0.36/trimmomatic-0.36.jar
 8 | ADAPTERS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Trimmomatic-0.36/adapters/NexteraPE-PE.fa
 9 | 
10 | if [ -z "$INPUTDIR" ] ; then
11 |   echo "Please provide an input directory of fastq files (Argument 1/2)"
12 |   exit 1
13 | fi
14 | if [ -z "$OUTPUTDIR" ] ; then
15 |   echo "Please provide a directory for outputfiles (Argument 2/2)"
16 |   exit 1
17 | fi
18 | 
19 | if [ ! -f "$TRIMMER" ] ; then
20 |   echo "Sorry $TRIMMER not available "
21 |   exit 1
22 | fi
23 | # An unset or zero index used to become array index -1 and pick the wrong file.
24 | if [ -z "$LSB_JOBINDEX" ] || [ "$LSB_JOBINDEX" -lt 1 ] ; then
25 |   echo "LSB_JOBINDEX must be a positive job-array index" >&2
26 |   exit 1
27 | fi
28 | 
29 | mkdir -p "$OUTPUTDIR"
30 | shopt -s nullglob # no *.fq files must give an empty list, not the literal pattern
31 | FILES=("$INPUTDIR"/*.fq)
32 | ARRAYINDEX=$((LSB_JOBINDEX-1))
33 | if [ "$ARRAYINDEX" -ge "${#FILES[@]}" ] ; then
34 |   echo "No .fq file number $LSB_JOBINDEX in $INPUTDIR" >&2
35 |   exit 1
36 | fi
37 | INPUTFILE=${FILES[$ARRAYINDEX]}
38 | FILEnopath=$(basename "${INPUTFILE%.fq}")
39 | OUTPUTFILE="$OUTPUTDIR/TRIMMED-$FILEnopath.fq"
40 | 
41 | export _JAVA_OPTIONS="-Xmx1000M -XX:MaxHeapSize=1000m"
42 | # An earlier version of this command used MINLEN:20.
43 | java -jar "$TRIMMER" SE -phred33 "$INPUTFILE" "$OUTPUTFILE" "ILLUMINACLIP:$ADAPTERS:2:30:10" LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50
44 | 


--------------------------------------------------------------------------------
/1.5_Trim_UMI.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | use File::Copy qw(move);
 4 | 
 5 | # Trim the UMI stored after the last ':' of every FastQ read name.
 6 | # Usage: 1.5_Trim_UMI.pl 5'Length 3'Length inputdir outputdir
 7 | #   5'Length : number of bases removed from the start of the UMI
 8 | #   3'Length : number of bases removed from the end of the UMI
 9 | # Every *.fq file of inputdir is rewritten under the same name in outputdir.
10 | # Assumes four-line FastQ records (read name, sequence, +, quality).
11 | 
12 | # All four arguments are used below, so require four (the old check accepted three).
13 | if (@ARGV < 4) {die "1.5_Trim_UMI.pl 5'Length 3'Length inputdir outputdir\n";}
14 | 
15 | my ($trim5, $trim3, $indir, $outdir) = @ARGV;
16 | my @files = glob("$indir/*.fq");
17 | my $tmpfile = "tmp.$$.txt"; # per-process name: concurrent runs in one directory must not share it
18 | foreach my $file (@files) {
19 | 	$file =~ /([^\/]+\.fq)/;
20 | 	my $filename = $1;
21 | 	open(my $ifh, "<", $file) or die "Cannot open $file: $!\n";
22 | 	open(my $ofh, ">", $tmpfile) or die "Cannot write $tmpfile: $!\n";
23 | 
24 | 	while (<$ifh>) {
25 | 		# Only the read-name line (1st of every 4) carries the UMI. The other
26 | 		# lines are copied untouched: ':' is a valid quality character, so
27 | 		# testing for ':' alone would also rewrite quality strings.
28 | 		if ($. % 4 == 1 && $_ =~ /:/) {
29 | 			chomp;
30 | 			my @stuff = split(/\:/);
31 | 			my $UMI = $stuff[scalar(@stuff)-1];
32 | 			# A negative length of 0 would return an empty string, hence the special case.
33 | 			my $trimmed = ($trim3 == 0) ? substr($UMI, $trim5) : substr($UMI, $trim5, -$trim3);
34 | 			$stuff[scalar(@stuff)-1]=$trimmed;
35 | 			print $ofh (join(":",@stuff)."\n");
36 | 		} else {
37 | 			print $ofh ($_);
38 | 		}
39 | 	} close ($ifh); close ($ofh);
40 | 	move($tmpfile, "$outdir/$filename") or die "Cannot move $tmpfile to $outdir/$filename: $!\n";
41 | }
42 | 


--------------------------------------------------------------------------------
/1.6_Summarizing_Trimming.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Tabulate, for every file in the given directory, the count that follows
 5 | # "Surviving: " against the barcode taken from the last <ACGT...>.fq name seen
 6 | # up to that line. Prints "barcode<TAB>count", sorted by barcode.
 7 | if (@ARGV < 1) {die "1.6_Summarizing_Trimming.pl directory of outputfiles\n";}
 8 | 
 9 | my %SurvivingByCell = ();
10 | for my $logfile (glob("$ARGV[0]/*")) {
11 | 	open (my $log, $logfile) or die $!;
12 | 	my ($barcode, $kept) = ("", 0);
13 | 	while (my $line = <$log>) {
14 | 		$barcode = $1 if ($line =~ /([ATCG]+)\.fq/);
15 | 		next unless ($line =~ /Surviving: (\d+) /);
16 | 		# Only the first Surviving line of a file is used.
17 | 		$kept = $1;
18 | 		last;
19 | 	}
20 | 	close ($log);
21 | 	$SurvivingByCell{$barcode} = $kept;
22 | }
23 | 
24 | print "$_\t$SurvivingByCell{$_}\n" foreach (sort(keys(%SurvivingByCell)));
25 | 


--------------------------------------------------------------------------------
/1_BreakDown_Files_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # LSF job-array wrapper: task i (LSB_JOBINDEX, 1-based) runs 1_BreakDown_PairedEnds.pl on the i-th file matching ARG2/ARG3.
 3 | 
 4 | if [ -z "$1" ] ; then
 5 |   echo "Please set maximum number of reads per file (ARG 1/4)"
 6 |   exit 1
 7 | fi
 8 | if [ -z "$2" ] ; then
 9 |   echo "Please set input file directory (ARG 2/4)"
10 |   exit 1
11 | fi
12 | # Quoted: an unquoted pattern is glob-expanded inside the test and can break it.
13 | if [ -z "$3" ] ; then
14 |   echo "Please set a pattern for inputfiles (ARG 3/4)"
15 |   exit 1
16 | fi
17 | if [ -z "$4" ] ; then
18 |   echo "Please set a directory for output files (ARG 4/4)"
19 |   exit 1
20 | fi
21 | if [ -z "$LSB_JOBINDEX" ] || [ "$LSB_JOBINDEX" -lt 1 ] ; then
22 |   echo "LSB_JOBINDEX must be a positive job-array index" >&2
23 |   exit 1
24 | fi
25 | 
26 | OUTPUTDIR=$4
27 | # $3 stays unquoted on purpose: the shell expands the pattern into the file list here.
28 | INPUTFILES=("$2"/$3)
29 | ARRAYINDEX=$((LSB_JOBINDEX-1))
30 | INPUTFILE=${INPUTFILES[$ARRAYINDEX]}
31 | # Catches both an index past the end of the list and a pattern that matched nothing.
32 | if [ ! -f "$INPUTFILE" ] ; then
33 |   echo "No input file number $LSB_JOBINDEX matching $2/$3" >&2
34 |   exit 1
35 | fi
36 | 
37 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/1_BreakDown_PairedEnds.pl "$LSB_JOBINDEX" "$1" "$OUTPUTDIR" "$INPUTFILE"
38 | 


--------------------------------------------------------------------------------
/1_BreakDown_PairedEnds.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Split FastQ file(s) into per-cell chunks of at most MAXREADS reads and gzip each chunk,
 5 | # giving files suitable for submitting as a job array.
 6 | # Reads are grouped by cell ID (second whitespace-separated field of a three-field read name;
 7 | # a placeholder ID is used otherwise) and keep their input order within a cell.
 8 | # Works equally well for single-end reads and takes any number of files as arguments.
 9 | # Input may be plain text or gzip-compressed (name ending in .gz).
10 | # NOTE: all reads of one input file are held in memory before they are written.
11 | 
12 | # At least one FastQ file is needed in addition to the three settings.
13 | if (@ARGV < 4) { die "Breakdown_Paired_Ends.pl JOBID MAXREADS OUTPUTDIR FastQfile1 FastQfile2\n";}
14 | 
15 | my $JOBID = shift(@ARGV); #Job identifier; prefix of every output file name.
16 | my $MAX_READS_PER_FILE = shift(@ARGV); #Maximum number of reads per file [job].
17 | my $OUTPUT_DIR = shift(@ARGV); #directory for output
18 | system("mkdir", "-p", $OUTPUT_DIR) == 0 or die "Cannot create $OUTPUT_DIR\n";
19 | 
20 | foreach my $file (@ARGV) {
21 | 	my %cell2lines = (); # cell ID -> all FastQ lines of that cell, in input order
22 | 	my $ifh;
23 | 	if ($file =~ /\.gz$/) {
24 | 		# Decompress on the fly; the compressed input itself is left untouched.
25 | 		open ($ifh, "-|", "gzip", "-dc", $file) or die "Cannot open $file :  $!\n";
26 | 	} else {
27 | 		open ($ifh, "<", $file) or die "Cannot open $file :  $!\n";
28 | 	}
29 | 
30 | 	while (<$ifh>) {
31 | 		if ($_ =~ /^@/) {
32 | 			my @record = split(/\s+/);
33 | 			my $cell = "AAAAAAAAAAA"; # placeholder for read names without a cell field
34 | 			if (scalar(@record) == 3) {
35 | 				$cell = $record[1];
36 | 			}
37 | 			push(@{$cell2lines{$cell}}, $_);
38 | 			# Read exactly three more lines, one at a time. A bare <$ifh> inside
39 | 			# push() is evaluated in list context and would swallow the whole
40 | 			# rest of the file into this one cell.
41 | 			foreach my $i (1..3) {
42 | 				my $line = <$ifh>;
43 | 				last unless defined($line); # truncated final record
44 | 				push(@{$cell2lines{$cell}}, $line);
45 | 			}
46 | 		}
47 | 	} close ($ifh);
48 | 
49 | 	foreach my $cell (sort(keys(%cell2lines))) {
50 | 		my $fileid = 0; my $Nlines = 0;
51 | 		my $currentfile; my $ofh;
52 | 		foreach my $line (@{$cell2lines{$cell}}) {
53 | 			if (!defined($ofh)) {
54 | 				# Open the next chunk only when there is a line to put in it, so a
55 | 				# read count that is an exact multiple leaves no empty file behind.
56 | 				$fileid++;
57 | 				$currentfile = "$OUTPUT_DIR/$JOBID\_$cell.$fileid.fq";
58 | 				open ($ofh, ">", $currentfile) or die "Cannot write $currentfile : $!\n";
59 | 				$Nlines = 0; #reset line counter
60 | 			}
61 | 
62 | 			print $ofh $line;
63 | 			$Nlines++;
64 | 
65 | 			if ($Nlines == $MAX_READS_PER_FILE*4) {
66 | 				close ($ofh); #close current file
67 | 				undef($ofh);
68 | 				system("gzip", $currentfile); #compress it
69 | 			}
70 | 		}
71 | 		if (defined($ofh)) {
72 | 			close($ofh); system("gzip", $currentfile); # close and compress the last, partly filled file.
73 | 		}
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------
/1_BreakDown_PairedEnds_Custom_Wafergen.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | 	# Split a barcode/read FastQ pair into one FastQ file per expected cell barcode.
  4 | if (@ARGV < 6) { die "Breakdown_Paired_Ends.pl OUTPUTDIR INPUT1 INPUT2 BarcodeIndexfile BarcodeColumn(0=first column) ProjectName\n";}
  5 | my $OUTPUT_DIR = $ARGV[0]; #directory for output
  6 | system("mkdir -p $OUTPUT_DIR");
  7 | my $infile1 = $ARGV[1]; # FastQ whose sequence line holds the cell barcode followed by the UMI
  8 | my $infile2 = $ARGV[2]; # FastQ whose sequence and quality lines are written to the output
  9 | 
 10 | # Get acceptable cell barcodes
 11 | my %CellBarcodes = (); # expected barcode -> index of its output handle in %ofhs
 12 | open (my $ifh, $ARGV[3]) or die "Cannot open $ARGV[3]\n";
 13 | <$ifh>; # header
 14 | my $column = $ARGV[4];
 15 | my $index=1;
 16 | my %ofhs = (); # index -> output file handle, kept open until all reads are processed
 17 | while (<$ifh>) {
 18 | 	if ($_ =~/^#/) {next;}
 19 | 	my @record = split(/\s+/);
 20 | 	my $barcode = $record[$column];
 21 | 	$CellBarcodes{$barcode} = $index;
 22 | 	open(my $fh,'>',"$OUTPUT_DIR/$ARGV[5]_$barcode.fq") or die $!; # one output file per listed barcode, created even if no read matches it
 23 | 	$ofhs{$index} = $fh;
 24 | 	$index++;
 25 | } close($ifh);
 26 | 
 27 | my $NotProperTail = 0;
 28 | my $NotPossibleCell = 0;
 29 | my $AmbiguousCell = 0;
 30 | my $ExactMatch = 0;
 31 | my $Mismatch1 = 0;
 32 | my $Mismatch2 = 0;
 33 | my $total_reads = 0;
 34 | open (my $ifh1, $infile1) or die $!;
 35 | open (my $ifh2, $infile2) or die $!;
 36 | while(<$ifh1>) { # both files are walked in lockstep
 37 | 	my $file1line = $_;
 38 | 	my $file2line = <$ifh2>;
 39 | 	if ($file1line =~ /^@/) {
 40 | 		my @thing1 = split(/\s+/,$file1line);
 41 | 		my @thing2 = split(/\s+/,$file2line);
 42 | 		my $readname = $thing1[0];
 43 | 		if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}
 44 | 		my $barcodes = <$ifh1>;
 45 | 		my $read = <$ifh2>;
 46 | 		# Consume the rest of the record before any skip: a rejected read must not leave its
 47 | 		# quality line behind, because a quality string starting with '@' looks like a read name.
 48 | 		<$ifh1>;<$ifh2>; #+'s
 49 | 		<$ifh1>; # barcode-read quality line (not written out)
 50 | 		my $file2qual = <$ifh2>;
 51 | 		$total_reads++;
 52 | 		my $mismatches = 0;
 53 | 		if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/) {
 54 | 			my $UMI = $2;
 55 | 			my $CellID = $1;
 56 | 			if (!exists($CellBarcodes{$CellID})) { # Not an expected barcode
 57 | 				if ($CellID !~ /^[ATCG]+$/) {
 58 | 					$mismatches = () = $CellID =~ /[^ATCG]/g; # count uncertain bases as mismatches
 59 | 					$CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
 60 | 				}
 61 | 				my @matches = ();
 62 | 				my %close = ();
 63 | 				foreach my $barcode (keys(%CellBarcodes)) {
 64 | 					if ($barcode =~/$CellID/) { # Match but with uncertainty
 65 | 						push(@matches, $barcode);
 66 | 					} else {
 67 | 						if (scalar(@matches) == 0) { # Count mismatches
 68 | 							my $count = ( $barcode ^ $CellID ) =~ tr/\0//; # XOR gives a NUL byte wherever the two strings agree
 69 | 							if ($count >= length($barcode)-2) { # Allow upto 2 mismatches
 70 | 								$close{$barcode} = $count;
 71 | 							}
 72 | 						}
 73 | 					}
 74 | 				}
 75 | 				if (scalar(@matches) == 0 && scalar(keys(%close)) > 0) { # Has 1 or 2 mismatches
 76 | 					my $max = my_max(values(%close)); # Closest match
 77 | 					$mismatches = length($CellID)-$max;
 78 | 					foreach my $code (keys(%close)) {
 79 | 						if ($close{$code} == $max) {
 80 | 							push(@matches,$code);
 81 | 						}
 82 | 					}
 83 | 				}
 84 | 				if (scalar(@matches) == 1) { # single best match
 85 | 					$CellID = $matches[0];
 86 | 					if ($mismatches == 2) {
 87 | 						$Mismatch2++;
 88 | 					}
 89 | 					if ($mismatches == 1) {
 90 | 						$Mismatch1++;
 91 | 					}
 92 | 				} elsif (scalar(@matches) > 1) { #More than one equally good match
 93 | 					$AmbiguousCell++;
 94 | 					next;
 95 | 				} else { # No match
 96 | 					$NotPossibleCell++;
 97 | 					next;
 98 | 				}
 99 | 			} else { # Exact match
100 | 				$ExactMatch++;
101 | 			}
102 | 			my $handle = $ofhs{$CellBarcodes{$CellID}};
103 | 			print $handle "$readname:$UMI\n$read+\n$file2qual";
104 | 		} else {
105 | 			$NotProperTail++;
106 | 			next;
107 | 		}
108 | 	} else {next;}
109 | }
110 | print STDERR "Not proper read: $NotProperTail\nNot possible cell: $NotPossibleCell\nAmbiguous: $AmbiguousCell\nExact Matches:$ExactMatch\nOne mismatch: $Mismatch1\nTwo mismatch: $Mismatch2\n Total: $total_reads\n";
111 | close($ifh1);
112 | close($ifh2);
113 | foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}
114 | 
115 | 
116 | sub my_max { # Return the numerically largest argument (undef when called with no arguments).
117 | 	if (scalar(@_) == 1) {return($_[0])}; # a single value needs no comparison
118 | 	my $max = shift; # start from the first value, compare against the rest
119 | 	foreach my $ele (@_) {
120 | 		if ($ele > $max) {$max = $ele;}
121 | 	}
122 | 	return($max);
123 | }
124 | 


--------------------------------------------------------------------------------
/1_Breakdown_UMI_read_pairs.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | 
  4 | # Matches upto two mismatches between observed cell barcodes and the expected cell barcodes
  5 | # Excludes reads with problematic UMIs: >= 80% A, >= 80% T, contained in adaptor sequence. - Note do not provide adaptors for short UMI datasets (fewer than 7 bases) since there is a high probability of real UMIs being contained in the adaptor for such cases.
  6 | # Allows barcodes to contain ambiguous bases
  7 | # Allows trailing bases at the end of the barcode sequence but requires barcodes to begin from the first base in the barcode sequence.
  8 | 
  9 | if (@ARGV < 6) { die "Usage: 1_Breakdown_UMI_read_pairs.pl BarcodeFastq ReadFastq BarcodeStructure(C=cellbarcodebase, U=UMIbase) BarcodeIndexFile(\"UNKNOWN\" triggers counting reads with every unique barcode) BarcodeColumn(0=first column) OutputPrefix AdaptorFasta(optional)\n";}
 10 | my $infile1 = $ARGV[0];
 11 | my $infile2 = $ARGV[1];
 12 | my $barcodestructure = $ARGV[2];
 13 | 
 14 | # Parse Barcode Structure #
 15 | 
 16 | $barcodestructure =~ s/[^CU]//g;
 17 | 
 18 | print "$barcodestructure\n";
 19 | 
 20 | my $order = -1;
 21 | my $C_len = -1;
 22 | my $U_len = -1;
 23 | 
 24 | if ($barcodestructure =~ /^(C+)(U+)$/) {
 25 | 	$order=1;
 26 | 	$C_len = length($1);
 27 | 	$U_len = length($2);
 28 | 	print "Barcode Structure: $C_len bp CellID followed by $U_len bp UMI\n";
 29 | } elsif ($barcodestructure =~ /^(U+)(C+)$/) {
 30 | 	$order = 0;
 31 | 	$C_len = length($2);
 32 | 	$U_len = length($1);
 33 | 	print "Barcode Structure: $U_len bp UMI followed by $C_len bp CellID\n";
 34 | } else {
 35 | 	die "Intermingled cell & umi barcodes are not supported\n";
 36 | }
 37 | # ----------------------- #
 38 | 
 39 | my $OUTprefix = $ARGV[5]; #prefix for output
 40 | #Ensure output directory exists: everything before the last '/' of the prefix is the directory
 41 | if ($OUTprefix =~ /^(.+)\/[^\/]*$/) { # was [^\/] (exactly one character), so the directory was almost never created
 42 | 	if ($1 ne ".") {
 43 | 		system("mkdir", "-p", $1) == 0 or die "Cannot create output directory $1\n"; # list form: the path is not re-parsed by a shell
 44 | 	}
 45 | }
 46 | 
 47 | # Read Expected Cell Barcodes #
 48 | my %CellBarcodes = (); # expected barcode -> output handle index; in UNKNOWN mode it is later used as observed barcode -> read count
 49 | my %ofhs = (); # index -> output file handle, kept open until all reads are processed
 50 | if ($ARGV[3] ne "UNKNOWN") {
 51 | 	open (my $ifh, $ARGV[3]) or die "Cannot open $ARGV[3]\n";
 52 | 	<$ifh>; # header
 53 | 	my $column = $ARGV[4];
 54 | 	my $index=1;
 55 | 	while (<$ifh>) {
 56 | 		chomp;
 57 | 		if ($_ =~/^#/) {next;}
 58 | 		my @record = split(/\s+/);
 59 | 		my $barcode = $record[$column];
 60 | 		$CellBarcodes{$barcode} = $index;
 61 | 		open(my $fh,'>',"$OUTprefix\_$barcode.fq") or die $!; # one output file per listed barcode, created even if no read matches it
 62 | 		$ofhs{$index} = $fh;
 63 | 		$index++;
 64 | 	} close($ifh);
 65 | }
 66 | # --------------------------- #
 67 | 
 68 | # Read Adaptor Fasta #
 69 | my @Adaptors = (); # adaptor sequences; stays empty when no adaptor file is given
 70 | if (defined($ARGV[6])) {
 71 | 	open (my $afh, $ARGV[6]) or die $!;
 72 | 	while (<$afh>) {
 73 | 		if ($_ =~ /^>/) {
 74 | 			my $seq = <$afh>; # only the single line after each header is kept as the sequence
 75 | 			chomp($seq);
 76 | 			push(@Adaptors, $seq);
 77 | 		}
 78 | 	} close($afh);
 79 | }
 80 | # ------------------ #
 81 | 		
 82 | 
 83 | ### Process Reads ###
 84 | 
 85 | # Summary Statistics
 86 | my $NotProperBarcodes = 0;
 87 | my $NotPossibleCell = 0;
 88 | my $AmbiguousCell = 0;
 89 | my $ExactMatch = 0;
 90 | my $Mismatch1 = 0;
 91 | my $Mismatch2 = 0;
 92 | my $BadUMI = 0;
 93 | my $total_reads = 0;
 94 | my $OutputReads=0;
 95 | 
 96 | open (my $ifh1, $infile1) or die $!;
 97 | open (my $ifh2, $infile2) or die $!;
 98 | READ: while(<$ifh1>) { # labelled so that the adaptor loop below can skip the whole read
 99 | 	my $file1line = $_;
100 | 	my $file2line = <$ifh2>;
101 | 	if ($file1line =~ /^@/) { #Skip any file headers
102 | 
103 | 		# Ensure matching pair of reads
104 | 		my @thing1 = split(/\s+/,$file1line);
105 | 		my @thing2 = split(/\s+/,$file2line);
106 | 		my $readname = $thing1[0];
107 | 		if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match! $readname $thing2[0]\n";}
108 | 		my $barcodes = <$ifh1>;
109 | 		my $read = <$ifh2>;
110 | 		# Consume the rest of the record before any skip (and in UNKNOWN mode too): a quality line left unread and starting with '@' would be parsed as a read name.
111 | 		<$ifh1>;<$ifh2>; #+'s
112 | 		<$ifh1>; # barcode-read quality line (not written out)
113 | 		my $file2qual = <$ifh2>;
114 | 		$total_reads++;
115 | 
116 | 		# Parse barcodes
117 | 		my $CellID = ""; my $UMI = "";
118 | 		if ($order) {
119 | 			if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$C_len})([ATCGNUKMRYSWBVHDX]{$U_len})/) {
120 | 				$CellID = $1; $UMI = $2;
121 | 			} else {$NotProperBarcodes++; next;}
122 | 		} else {
123 | 			if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{$U_len})([ATCGNUKMRYSWBVHDX]{$C_len})/) {
124 | 				$CellID = $2; $UMI = $1;
125 | 			} else {$NotProperBarcodes++; next;}
126 | 		}
127 | 		# Correct for upto two mismatches between observed and expected cell barcodes
128 | 		if ($ARGV[3] ne "UNKNOWN") {
129 | 		my $mismatches = 0;
130 | 		if (!exists($CellBarcodes{$CellID})) { # Not an expected barcode
131 | 
132 | 			# Barcode contains uncertain bases -> convert to wildcards and pattern match on expected barcodes. (given priority over barcodes with higher confidence mismatches)
133 | 			if ($CellID !~ /^[ATCG]+$/) {
134 | 				$mismatches = () = $CellID =~ /[^ATCG]/g; # count uncertain bases as mismatches
135 | 				$CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
136 | 			}
137 | 			my @matches = ();
138 | 			my %close = ();
139 | 			foreach my $barcode (keys(%CellBarcodes)) {
140 | 				if ($barcode =~/$CellID/) { # Match but with uncertainty
141 | 					push(@matches, $barcode);
142 | 				} else {
143 | 					if (scalar(@matches) == 0) { # Count mismatches
144 | 						my $count = ( $barcode ^ $CellID ) =~ tr/\0//; # XOR gives a NUL byte wherever the two strings agree
145 | 						if ($count >= length($barcode)-2) { # Allow upto 2 mismatches
146 | 							$close{$barcode} = $count;
147 | 						}
148 | 					}
149 | 				}
150 | 			}
151 | 			# If exact matches with uncertainty then give those priority, otherwise keep the most similar expected barcodes
152 | 			if (scalar(@matches) == 0 && scalar(keys(%close)) > 0) { # Has 1 or 2 mismatches
153 | 				my $max = my_max(values(%close)); # Closest match
154 | 				$mismatches = length($CellID)-$max;
155 | 				foreach my $code (keys(%close)) {
156 | 					if ($close{$code} == $max) {
157 | 						push(@matches,$code);
158 | 					}
159 | 				}
160 | 			}
161 | 			if (scalar(@matches) == 1) { # single best match
162 | 				$CellID = $matches[0];
163 | 				if ($mismatches == 2) {
164 | 					$Mismatch2++;
165 | 				}
166 | 				if ($mismatches == 1) {
167 | 					$Mismatch1++;
168 | 				}
169 | 			} elsif (scalar(@matches) > 1) { #More than one equally good match
170 | 				$AmbiguousCell++;
171 | 				next;
172 | 			} else { # No match
173 | 				$NotPossibleCell++;
174 | 				next;
175 | 			}
176 | 		} else { # Exact match
177 | 			$ExactMatch++;
178 | 		}
179 | 
180 | 		} #If known barcodes
181 | 
182 | 		# UMI filter
183 | 
184 | 		# All As or All Ts with 2 mismatches - No I think >80% A or T is a better definition since short UMIs quite likely to get real A/T rich UMIs
185 | 		my $As_in_UMI = () = $UMI =~ /A/g;
186 | 		my $Ts_in_UMI = () = $UMI =~ /T/g;
187 | 		if ($As_in_UMI >= length($UMI)*0.8 || $Ts_in_UMI >= length($UMI)*0.8) {
188 | 			$BadUMI++; next;
189 | 		}
190 | 		# UMI contained in adaptor sequence - Don't need UMI length limit here since just don't provide adaptor sequences for short UMI datasets.
191 | 		if (scalar(@Adaptors) > 0) {
192 | 			foreach my $adapt (@Adaptors) {
193 | 				if ($adapt =~ /$UMI/) {
194 | 					$BadUMI++; next READ; # a bare next would only move on to the next adaptor and still output the read
195 | 				}
196 | 			}
197 | 		}
198 | 
199 | 		if ($ARGV[3] ne "UNKNOWN") {
200 | 			# Has Acceptable Barcode
201 | 			# Only the second file's read is written, named <read name>:<UMI>;
202 | 			# its separator and quality lines were already read at the top of
203 | 			# the record.
204 | 			my $handle = $ofhs{$CellBarcodes{$CellID}};
205 | 			print $handle "$readname:$UMI\n$read+\n$file2qual";
206 | 			$OutputReads++;
207 | 		} else {
208 | 			$CellBarcodes{$CellID}++;
209 | 		}
210 | 	} else {next;}
211 | }
212 | if ($ARGV[3] ne "UNKNOWN") {
213 | 	print STDERR "
214 | 	Not proper read: $NotProperBarcodes
215 | 	Not possible cell: $NotPossibleCell
216 | 	Ambiguous: $AmbiguousCell
217 | 	Exact Matches:$ExactMatch
218 | 	One mismatch: $Mismatch1
219 | 	Two mismatch: $Mismatch2
220 | 	Bad UMI: $BadUMI
221 | 	Input Reads: $total_reads
222 | 	Output Reads: $OutputReads\n";
223 | 	close($ifh1);
224 | 	close($ifh2);
225 | 	foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}
226 | } else {
227 | 	print STDERR "Bad UMI: $BadUMI\n";
228 | 	my @Codes = sort { $CellBarcodes{$b} <=> $CellBarcodes{$a} } keys(%CellBarcodes);
229 | 	foreach my $code (@Codes) {
230 | 		print "$code ".$CellBarcodes{$code}."\n";
231 | 	}
232 | }
233 | 
234 | sub my_max { # Return the numerically largest argument (undef when called with no arguments).
235 | 	if (scalar(@_) == 1) {return($_[0])}; # a single value needs no comparison
236 | 	my $max = shift; # start from the first value, compare against the rest
237 | 	foreach my $ele (@_) {
238 | 		if ($ele > $max) {$max = $ele;}
239 | 	}
240 | 	return($max);
241 | }
242 | 


--------------------------------------------------------------------------------
/1_DO_BreakDown_Files.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
 4 | mkdir -p $OUTPUTDIR
 5 | INPUTDIR=/nfs/team218/MH/2015-03-02-C6H8GANXX/2015-03-02-C6H8GANXX/
 6 | PATTERN="*exp2*_sequence.txt.gz"
 7 | INPUTFILES=($INPUTDIR/$PATTERN)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES))
10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%40" -R"select[mem>4000] rusage[mem=4000]" -M4000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/1_BreakDown_Files_wrapper.sh 100000000 "$INPUTDIR" "$PATTERN" $OUTPUTDIR
11 | 


--------------------------------------------------------------------------------
/1_Flexible_FullTranscript_Demultiplexing.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/perl
  2 | use strict;
  3 | use warnings;
  4 | 
  5 | # Input a pair of sequencing FastQ files
  6 | # gunzip, read, write out smaller broken down files in a format suitable for submitting job array, and re-gzip each one in turn
  7 | # Breakdown by lane & cellID
  8 | # Keep order. 
  9 | # This should work equally well for single-end reads and can take any number of files as arguments.
 10 | # TESTED
 11 | 
 12 | if (@ARGV != 7) { 
 13 | 	print STDERR "perl 1_Flexible_FullTranscript_Demultiplexing.pl read1.fq read2.fq b_pos b_len index mismatch prefix\n";
 14 | 	print STDERR "
 15 | 		read1.fq : barcode containing read
 16 | 		read2.fq : non-barcode containg read
 17 | 		b_pos : position of cell-barcode in the read. [\"start\" or \"end\"]
 18 | 		b_len : length of cell-barcode (bp)
 19 | 		index : file contain a single column of expected barcodes
 20 | 		mismatch : maximum number of permitted mismatches (recommend 2)
 21 | 		prefix : prefix for output fq files.\n";
 22 | 	exit(1);
 23 | }
 24 | 
 25 | my $infile1 = $ARGV[0];
 26 | my $infile2 = $ARGV[1];
 27 | my $barcode_pos = $ARGV[2];
 28 | my $barcode_len = $ARGV[3];
 29 | my $barcode_index_file = $ARGV[4];
 30 | my $MAXmismatch = $ARGV[5];
 31 | my $OUTprefix = $ARGV[6];
 32 | 
 33 | if ($OUTprefix =~ /^(.+)\/[^\/]*$/) { # directory part of the prefix; was [^\/] (exactly one character), so mkdir almost never ran
 34 | 	if ($1 ne ".") {
 35 | 		system("mkdir", "-p", $1) == 0 or die "Cannot create output directory $1\n"; # list form: the path is not re-parsed by a shell
 36 | 	}
 37 | }
 38 | 
 39 | my %CellBarcodes = ();
 40 | my %ofhs1 = ();
 41 | my %ofhs2 = ();
 42 | open(my $ifh, $barcode_index_file) or die "Cannot open $barcode_index_file\n";
 43 | my $index=1;
 44 | while (<$ifh>) {
 45 | 	chomp;
 46 | 	$CellBarcodes{$_} = $index;
 47 | 	open(my $fh1, '>', "$OUTprefix\_$_\_read1.fq") or die $!;
 48 | 	$ofhs1{$index} = $fh1;
 49 | 	open(my $fh2, '>', "$OUTprefix\_$_\_read2.fq") or die $!;
 50 | 	$ofhs2{$index} = $fh2;
 51 | 	$index++;
 52 | } close($ifh);
 53 | 
 54 | 
 55 | my $NotProperBarcodes = 0;
 56 | my $NotPossibleCell = 0;
 57 | my $AmbiguousCell = 0;
 58 | my $ExactMatch = 0;
 59 | my $Mismatch = 0;
 60 | my $total_reads = 0;
 61 | my $OutputReads = 0;
 62 | 
 63 | open(my $ifh1, $infile1) or die $!;
 64 | open(my $ifh2, $infile2) or die $!;
 65 | while(<$ifh1>) {
 66 | 	my $file1line=$_;
 67 | 	my $file2line = <$ifh2>;
 68 | 	if ($file1line =~ /^@/) {
 69 | 		# Ensure matching pair of reads
 70 | 		my @thing1 = split(/\s+/, $file1line);
 71 | 		my @thing2 = split(/\s+/, $file2line);
 72 | 		my $readname = $thing1[0];
 73 | 		#if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}
 74 | 		my $barcode_read = <$ifh1>;
 75 | 		chomp $barcode_read;
 76 | 		my $read2 = <$ifh2>;
 77 | 		chomp $read2;
 78 | 		$total_reads++;
 79 | 
 80 | 		<$ifh1>; <$ifh2>;
 81 | 		my $file1qual = <$ifh1>;
 82 | 		chomp $file1qual;
 83 | 		my $file2qual = <$ifh2>;
 84 | 		chomp $file2qual;
 85 | 		my $CellID = "";
 86 | 		if ($barcode_pos eq "start") {
 87 | 			$CellID = substr($barcode_read, 0, $barcode_len, "");
 88 | 			substr($file1qual, 0, $barcode_len, "");
 89 | 		} else {
 90 | 			$CellID = substr($barcode_read, -$barcode_len, $barcode_len, "");
 91 | 			substr($file1qual, -$barcode_len, $barcode_len, "");
 92 | 		}
 93 | 		my $mismatches = 0;
 94 | 		if (!exists($CellBarcodes{$CellID})) {
 95 | 			my @matches = ();
 96 | 			my %close = ();
 97 | 			foreach my $expected_barcode (keys(%CellBarcodes)) {
 98 | 				my $count = ( $expected_barcode ^ $CellID ) =~ tr/\0//;
 99 | 				if ($count >= length($expected_barcode)-$MAXmismatch) {
100 | 					$close{$expected_barcode} = $count;
101 | 				}
102 | 			}
103 | 			if (scalar(keys(%close)) > 0) {
104 | 				my $max = my_max(values(%close));
105 | 				$mismatches = length($CellID) - $max;
106 | 				foreach my $code (keys(%close)) {
107 | 					if ($close{$code} == $max) {
108 | 						push(@matches, $code);
109 | 					}
110 | 				}
111 | 			}
112 | 			if (scalar(@matches) == 1) {
113 | 				$CellID = $matches[0];
114 | 				$Mismatch++;
115 | 			} elsif (scalar(@matches) > 1) {
116 | 				$AmbiguousCell++;
117 | 				next;
118 | 			} else {
119 | 				$NotPossibleCell++;
120 | 				next;
121 | 			}
122 | 		} else {
123 | 			$ExactMatch++;
124 | 		}
125 | 		# print the read
126 | 		my $handle1 = $ofhs1{$CellBarcodes{$CellID}};
127 | 		my $handle2 = $ofhs2{$CellBarcodes{$CellID}};
128 | 		print $handle1 "$readname\n$barcode_read\n+\n$file1qual\n";
129 | 		print $handle2 "$readname\n$read2\n+\n$file2qual\n";
130 | 		$OutputReads++;
131 | 	} else {next;}
132 | }
133 | 
134 | print STDERR "
135 | Doesn't match any cell: $NotPossibleCell
136 | Ambiguous: $AmbiguousCell
137 | Exact Matches: $ExactMatch
138 | Contain Mismatches: $Mismatch
139 | Input Reads: $total_reads
140 | Output Reads: $OutputReads\n";
141 | close($ifh1);
142 | close($ifh2);
143 | foreach my $ofh1 (keys(%ofhs1)) {close($ofhs1{$ofh1});}
144 | foreach my $ofh2 (keys(%ofhs2)) {close($ofhs2{$ofh2});}
145 | 
146 | 
147 | 
148 | sub my_max {
149 | 	# Returns the numerically largest argument (undef when called with no arguments).
150 | 	my ($largest, @others) = @_;
151 | 	foreach my $value (@others) {
152 | 		$largest = $value if ($value > $largest);
153 | 	}
154 | 	return($largest);
155 | }
156 | 


--------------------------------------------------------------------------------
/1_Flexible_UMI_Demultiplexing.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | use File::Path qw(make_path);
  4 | 
  5 | # Demultiplexes paired-end, UMI-tagged reads by cell barcode.
  6 | #
  7 | # Read 1 carries the cell barcode and the UMI; read 2 carries the sequence that is kept.
  8 | # For every pair whose cell barcode is assigned to exactly one expected barcode, read 2 is
  9 | # written to <prefix>_<barcode>.fq with the UMI appended to the read name (<name>:<UMI>).
 10 | # If the index argument is "UNKNOWN" nothing is demultiplexed; instead every observed cell
 11 | # barcode is printed to STDOUT with its read count, most frequent first.
 12 | #
 13 | # Matches up to <mismatch> mismatches between observed cell barcodes and the expected cell barcodes
 14 | # Allows barcodes to contain ambiguous bases
 15 | # Allows trailing bases at the end of the barcode sequence but requires barcodes to begin from the first base in the barcode sequence.
 16 | # NOTE: exclusion of reads with problematic UMIs (>= 80% A, >= 80% T, contained in adaptor
 17 | # sequence) is NOT implemented below: no adaptor argument is accepted and the "Bad UMI" count
 18 | # printed in UNKNOWN mode is always 0.
 19 | 
 20 | if (@ARGV != 6) { 
 21 | print STDERR "perl 1_Flexible_UMI_Demultiplexing.pl read1.fq read2.fq b_structure index mismatch prefix\n";
 22 | print STDERR "
 23 | 		read1.fq : barcode/umi containing read
 24 | 		read2.fq : non-barcode containing read
 25 | 		b_structure : a single string of the format C##U# or U#C## 
 26 | 			where C## is the cell-barcode and U# is the UMI.
 27 | 			e.g. C10U4 = a 10bp cell barcode followed by a 4bp UMI
 28 | 		index : file containg a single column of expected cell-barcodes.
 29 | 			if equal to \"UNKNOWN\" script will output read counts for each unique barcode.
 30 | 		mismatch : maximum number of permitted mismatches (recommend 2)
 31 | 		prefix : prefix for output fastq files.\n";
 32 | exit(1);}
 33 | my ($infile1, $infile2, $barcodestructure, $barcode_index, $MAXmismatch, $OUTprefix) = @ARGV;
 34 | my $known_barcodes = ($barcode_index ne "UNKNOWN"); # false => only tabulate observed barcodes
 35 | 
 36 | # Parse Barcode Structure #
 37 | # $order : 1 = cell barcode precedes the UMI (C##U#), 0 = UMI precedes the cell barcode (U#C##)
 38 | my $order = -1;
 39 | my $C_len = -1;
 40 | my $U_len = -1;
 41 | 
 42 | if ($barcodestructure =~ /^C(\d+)U(\d+)$/) {
 43 | 	$order=1;
 44 | 	$C_len = $1;
 45 | 	$U_len = $2;
 46 | 	print "Barcode Structure: $C_len bp CellID followed by $U_len bp UMI\n";
 47 | } elsif ($barcodestructure =~ /^U(\d+)C(\d+)$/) {
 48 | 	$order = 0;
 49 | 	$C_len = $2;
 50 | 	$U_len = $1;
 51 | 	print "Barcode Structure: $U_len bp UMI followed by $C_len bp CellID\n";
 52 | } else {
 53 | 	die "$barcodestructure not recognized.\n";
 54 | }
 55 | 
 56 | # Compiled once: captures the two barcode segments from the start of read 1 in the declared
 57 | # order. Ambiguity codes are accepted and bases after the second segment are ignored.
 58 | my $barcode_pattern = $order
 59 | 	? qr/^([ATCGNUKMRYSWBVHDX]{$C_len})([ATCGNUKMRYSWBVHDX]{$U_len})/
 60 | 	: qr/^([ATCGNUKMRYSWBVHDX]{$U_len})([ATCGNUKMRYSWBVHDX]{$C_len})/;
 61 | # ----------------------- #
 62 | 
 63 | #Ensure output directory exists
 64 | # Everything before the last '/' of the prefix is treated as the output directory; the final
 65 | # path component may be of any length (or empty when the prefix ends in '/').
 66 | if ($OUTprefix =~ /^(.+)\/[^\/]*$/) {
 67 | 	my $outdir = $1;
 68 | 	make_path($outdir) if ($outdir ne ".");
 69 | }
 70 | 
 71 | # Read Expected Cell Barcodes #
 72 | # Known mode: %CellBarcodes maps barcode -> index and %ofhs maps index -> output filehandle.
 73 | # UNKNOWN mode: %CellBarcodes instead accumulates observed barcode -> number of reads.
 74 | my %CellBarcodes = ();
 75 | my %ofhs = ();
 76 | if ($known_barcodes) {
 77 | 	open (my $ifh, '<', $barcode_index) or die "Cannot open $barcode_index\n";
 78 | 	my $index=1;
 79 | 	while (my $barcode = <$ifh>) {
 80 | 		chomp $barcode;
 81 | 		next if ($barcode =~ /^#/ || $barcode eq ""); # skip comment lines and blank lines
 82 | 		$CellBarcodes{$barcode} = $index;
 83 | 		open(my $fh,'>',"$OUTprefix\_$barcode.fq") or die $!;
 84 | 		$ofhs{$index} = $fh;
 85 | 		$index++;
 86 | 	}
 87 | 	close($ifh);
 88 | }
 89 | # --------------------------- #
 90 | 
 91 | 
 92 | ### Process Reads ###
 93 | 
 94 | # Summary Statistics
 95 | my $NotProperBarcodes = 0; # counted but not part of the printed summary
 96 | my $NotPossibleCell = 0;
 97 | my $AmbiguousCell = 0;
 98 | my $ExactMatch = 0;
 99 | my $Mismatch = 0;
100 | my $BadUMI = 0; # never incremented (no UMI filter is implemented)
101 | my $total_reads = 0;
102 | my $OutputReads=0;
103 | 
104 | open (my $ifh1, '<', $infile1) or die "Cannot open $infile1: $!\n";
105 | open (my $ifh2, '<', $infile2) or die "Cannot open $infile2: $!\n";
106 | while (my $file1line = <$ifh1>) {
107 | 	my $file2line = <$ifh2>; # keep both files in step, line for line
108 | 	next unless ($file1line =~ /^@/); #Skip any file headers
109 | 
110 | 	# Consume the rest of both 4-line records immediately. If a rejected read left its quality
111 | 	# line unread, a quality string beginning with '@' would be mistaken for the next header.
112 | 	my $barcodes = <$ifh1>;
113 | 	my $read = <$ifh2>;
114 | 	my @plus_lines = (scalar(<$ifh1>), scalar(<$ifh2>)); #+'s
115 | 	my $file1qual = <$ifh1>;
116 | 	my $file2qual = <$ifh2>;
117 | 	unless (defined($file1qual) && defined($read) && defined($file2qual)) {
118 | 		# Stop rather than die so that the summary for the reads already processed is still printed.
119 | 		warn "Incomplete or unpaired FASTQ record in $infile1 / $infile2 - stopping early\n";
120 | 		last;
121 | 	}
122 | 	my ($readname) = split(/\s+/,$file1line);
123 | 	$total_reads++;
124 | 
125 | 	# Parse barcodes
126 | 	my @segments = $barcodes =~ $barcode_pattern;
127 | 	unless (@segments) {$NotProperBarcodes++; next;}
128 | 	my ($CellID, $UMI) = $order ? @segments : reverse(@segments);
129 | 
130 | 	if ($known_barcodes) {
131 | 		# Correct for mismatches between observed and expected cell barcodes
132 | 		if (exists($CellBarcodes{$CellID})) { # Exact match
133 | 			$ExactMatch++;
134 | 		} else { # Not an expected barcode
135 | 			# Barcode contains uncertain bases -> convert to wildcards and pattern match on expected barcodes. (given priority over barcodes with higher confidence mismatches)
136 | 			$CellID =~ s/[^ATCG]/./g; #Turn non-ATCG bases into wildcards
137 | 			my @matches = grep { $_ =~ /$CellID/ } keys(%CellBarcodes);
138 | 
139 | 			if (@matches == 0) {
140 | 				# No wildcard match: keep the expected barcode(s) agreeing at the most positions,
141 | 				# provided no more than $MAXmismatch positions differ.
142 | 				my %close = ();
143 | 				foreach my $barcode (keys(%CellBarcodes)) {
144 | 					# String XOR gives a NUL byte at every position where the two barcodes agree.
145 | 					my $count = ( $barcode ^ $CellID ) =~ tr/\0//;
146 | 					if ($count >= length($barcode)-$MAXmismatch) {
147 | 						$close{$barcode} = $count;
148 | 					}
149 | 				}
150 | 				if (%close) {
151 | 					my $max = my_max(values(%close)); # Closest match
152 | 					@matches = grep { $close{$_} == $max } keys(%close);
153 | 				}
154 | 			}
155 | 
156 | 			if (scalar(@matches) == 1) { # single best match
157 | 				$CellID = $matches[0];
158 | 				$Mismatch++;
159 | 			} elsif (scalar(@matches) > 1) { #More than one equally good match
160 | 				$AmbiguousCell++;
161 | 				next;
162 | 			} else { # No match
163 | 				$NotPossibleCell++;
164 | 				next;
165 | 			}
166 | 		}
167 | 
168 | 		# Has Acceptable Barcode
169 | 		# $read and $file2qual still carry their newlines, so none is added after them.
170 | 		my $handle = $ofhs{$CellBarcodes{$CellID}};
171 | 		print $handle "$readname:$UMI\n$read+\n$file2qual";
172 | 		$OutputReads++;
173 | 	} else {
174 | 		$CellBarcodes{$CellID}++;
175 | 	}
176 | }
177 | close($ifh1);
178 | close($ifh2);
179 | 
180 | if ($known_barcodes) {
181 | 	print STDERR "
182 | 	Doesn't match any cell: $NotPossibleCell
183 | 	Ambiguous: $AmbiguousCell
184 | 	Exact Matches: $ExactMatch
185 | 	Contain mismatches: $Mismatch
186 | 	Input Reads: $total_reads
187 | 	Output Reads: $OutputReads\n";
188 | 	foreach my $ofh (keys(%ofhs)) {close($ofhs{$ofh});}
189 | } else {
190 | 	print STDERR "Bad UMI: $BadUMI\n";
191 | 	my @Codes = sort { $CellBarcodes{$b} <=> $CellBarcodes{$a} } keys(%CellBarcodes);
192 | 	foreach my $code (@Codes) {
193 | 		print "$code ".$CellBarcodes{$code}."\n";
194 | 	}
195 | }
196 | 
197 | # Returns the numerically largest argument (undef when called with no arguments).
198 | sub my_max {
199 | 	my ($max, @rest) = @_;
200 | 	foreach my $ele (@rest) {
201 | 		if ($ele > $max) {$max = $ele;}
202 | 	}
203 | 	return($max);
204 | }
205 | 


--------------------------------------------------------------------------------
/2-5.1_DO_kallisto_quant.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | INDEXFILE=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/kallisto_index.idx
 5 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Kallisto
 6 | mkdir -p $OUTPUTDIR
 7 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
 8 | INPUTFILES=($INPUTDIR/*)
 9 | NUMFILES=${#INPUTFILES[@]}
10 | MAXJOBS=$(($NUMFILES/2))
11 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5.1_kallisto_quant.sh 1 $INPUTDIR $OUTPUTDIR $INDEXFILE Bergiers_Vivo
12 | 
13 | 


--------------------------------------------------------------------------------
/2-5.1_kallisto_quant.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with KALLISTO -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | # Arguments: 
 7 | #    $1 = number of threads to run on, 
 8 | #    $2 = directory of files to map
 9 | #    $3 = outputdirectory
10 | 
11 | NUMTHREADS=$1
12 | FILESTOMAPDIR=$2
13 | OUTDIR=$3
14 | INFILE=$4
15 | KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto
16 | 
17 | #Check appropriate arguments
18 | if [ ! -f "$KALLISTO" ] ; then
19 |   echo "Sorry KALLISTO not available "
20 |   exit 1
21 | fi
22 | 
23 | if [ -z "$NUMTHREADS" ] ; then
24 |   echo "Please set number of threads to use (ARG 1/4)"
25 |   exit 1
26 | fi
27 | 
28 | if [ -z "$FILESTOMAPDIR" ] ; then
29 |   echo "Please include a directory of files to map (ARG 2/4)"
30 |   exit 1
31 | fi
32 | FILEStoMAP=($FILESTOMAPDIR/*)
33 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
34 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
35 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
36 | 
37 | NAME=${FILE1TOMAP##*/}
38 | NAME=${NAME%.*}
39 | 
40 | if [ -z "$FILE1TOMAP" ] ; then
41 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
42 |   exit 1
43 | fi
44 | 
45 | if [ -z "$FILE2TOMAP" ] ; then
46 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
47 |   exit 1
48 | fi
49 | 
50 | if [ -z "$OUTDIR" ] ; then
51 |   echo "Please include a directory for output (ARG 3/4)"
52 |   exit 1
53 | fi
54 | 
55 | if [ -z "$INFILE" ] ; then
56 |   echo "Please include a transcript index file (ARG 4/4)"
57 |   exit 1
58 | fi
59 | 
60 | # Make directory for output if necessary
61 | if [ ! -d "$OUTDIR" ] ; then
62 |   mkdir -p $OUTDIR
63 | fi
64 | if [ ! -d "$OUTDIR/$LSB_JOBINDEX" ] ; then
65 |   mkdir -p $OUTDIR/$LSB_JOBINDEX
66 | fi
67 | 
68 | # bsub -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o test_kallisto_quant.out -e test_kallisto_quant.err /nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto quant --bias -b 100 --seed=1 --plaintext --threads=1 -i /lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/kallisto_index.idx /lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/G1_Cell01_1.fastq.gz /lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/G1_Cell01_2.fastq.gz -o /lustre/scratch108/compgen/team218/TA/TEST
69 | 
70 | # Run KALLISTO 
71 | $KALLISTO quant --bias --plaintext --threads=$NUMTHREADS -i $INFILE -o $OUTDIR/$LSB_JOBINDEX $FILE1TOMAP $FILE2TOMAP 
72 | mv $OUTDIR/$LSB_JOBINDEX/abundance.tsv $OUTDIR/$NAME.abundances.tsv
73 | rm $OUTDIR/$LSB_JOBINDEX/run_info.json
74 | rmdir $OUTDIR/$LSB_JOBINDEX
75 | 
76 | 


--------------------------------------------------------------------------------
/2-5.2_DO_Salmon_quant.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | INDEXFILE=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/salmon_index
 5 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Salmon
 6 | GTFFILE=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
 7 | mkdir -p $OUTPUTDIR
 8 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
 9 | INPUTFILES=($INPUTDIR/*)
10 | NUMFILES=${#INPUTFILES[@]}
11 | MAXJOBS=$(($NUMFILES/2))
12 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%50" -R"select[mem>5000] rusage[mem=5000] span[hosts=1]" -M5000 -n2 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5.2_Salmon_quant.sh 1 $INPUTDIR $OUTPUTDIR $INDEXFILE $GTFFILE
13 | 
14 | 


--------------------------------------------------------------------------------
/2-5.2_Salmon_quant.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 3 Nov 2016 : wrapper for Mapping Reads with SALMON -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | # Arguments: 
 7 | #    $1 = number of threads to run on, 
 8 | #    $2 = directory of files to map
 9 | #    $3 = outputdirectory
10 | #    $4 = transcript index file (see 0.3_Salmon_build_index.sh)
11 | #    $5 = annotation gtf (map transcripts to genes)
12 | 
13 | NUMTHREADS=$1
14 | FILESTOMAPDIR=$2
15 | OUTDIR=$3
16 | INFILE=$4
17 | ANNFILE=$5
18 | SALMON=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/Salmon-0.7.2_linux_x86_64/bin/salmon
19 | 
20 | #Check appropriate arguments
21 | if [ ! -f "$SALMON" ] ; then
22 |   echo "Sorry SALMON not available "
23 |   exit 1
24 | fi
25 | 
26 | if [ -z "$NUMTHREADS" ] ; then
27 |   echo "Please set number of threads to use (ARG 1/4)"
28 |   exit 1
29 | fi
30 | 
31 | if [ -z "$FILESTOMAPDIR" ] ; then
32 |   echo "Please include a directory of files to map (ARG 2/4)"
33 |   exit 1
34 | fi
35 | FILEStoMAP=($FILESTOMAPDIR/*)
36 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
37 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
38 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
39 | 
40 | NAME=${FILE1TOMAP##*/}
41 | NAME=${NAME%.*}
42 | 
43 | if [ -z "$FILE1TOMAP" ] ; then
44 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
45 |   exit 1
46 | fi
47 | 
48 | if [ -z "$FILE2TOMAP" ] ; then
49 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
50 |   exit 1
51 | fi
52 | 
53 | if [ -z "$OUTDIR" ] ; then
54 |   echo "Please include a directory for output (ARG 3/4)"
55 |   exit 1
56 | fi
57 | 
58 | if [ -z "$INFILE" ] ; then
59 |   echo "Please include a transcript index file (ARG 4/4)"
60 |   exit 1
61 | fi
62 | 
63 | # Make directory for output if necessary
64 | if [ ! -d "$OUTDIR" ] ; then
65 |   mkdir -p $OUTDIR
66 | fi
67 | if [ ! -d "$OUTDIR/$LSB_JOBINDEX" ] ; then
68 |   mkdir -p $OUTDIR/$LSB_JOBINDEX
69 | fi
70 | 
71 | # Run SALMON 
72 | $SALMON quant -i $INFILE -o $OUTDIR/$LSB_JOBINDEX -1 $FILE1TOMAP -2 $FILE2TOMAP -p $NUMTHREADS -l A -g $ANNFILE --seqBias --gcBias --posBias -q
73 | mv $OUTDIR/$LSB_JOBINDEX/quant.sf $OUTDIR/$NAME.quant.sf
74 | mv $OUTDIR/$LSB_JOBINDEX/quant.genes.sf $OUTDIR/$NAME.quant.genes.sf
75 | rm -r $OUTDIR/$LSB_JOBINDEX
76 | 


--------------------------------------------------------------------------------
/2-5_DO_RSEM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_RSEM
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
 7 | INPUTFILES=($INPUTDIR/*)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES/2))
10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>30000] rusage[mem=30000] span[hosts=1]" -M30000 -n2 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5_STAR-RSEM.sh $INPUTDIR $OUTPUTDIR Beuttner_STAR_RSEM 2
11 | #bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>10000] rusage[mem=10000] span[hosts=1]" -M10000 -n5 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2-5_bowtie2-RSEM.sh $INPUTDIR $OUTPUTDIR Beuttner_bowtie2_RSEM 5
12 | 
13 | 


--------------------------------------------------------------------------------
/2-5_STAR-RSEM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Assume paired end
 3 | # Arguments:
 4 | # $1 = directory of fastq files to map/quantify
 5 | # $2 = output directory for final quantification file
 6 | # $3 = prefix
 7 | # $4 = number of threads to run on (optional)
 8 | RSEM=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/rsem-calculate-expression
 9 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/
10 | REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
11 | FILESTOMAPDIR=$1
12 | OUTDIR=$2
13 | PREFIX="$3-$LSB_JOBINDEX-"
14 | THREADS=$4
15 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
16 | 
17 | if [ -z "$THREADS" ] ; then
18 |   THREADS=1
19 | fi
20 | if [ ! -f "$RSEM" ] ; then
21 |   echo "Sorry RSEM not available "
22 |   exit 1
23 | fi
24 | if [ -z "$FILESTOMAPDIR" ] ; then
25 |   echo "Please include a directory of files to map (ARG 1/4)"
26 |   exit 1
27 | fi
28 | if [ -z "$OUTDIR" ] ; then
29 |   echo "Please include a directory for outputfile (ARG 2/4)"
30 |   exit 1
31 | fi
32 | if [ -z "$3" ] ; then
33 |   echo "Please include a prefix for output (ARG 3/4)"
34 |   exit 1
35 | fi
36 | 
37 | # Get fastq files
38 | FILEStoMAP=($FILESTOMAPDIR/*)
39 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
40 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
41 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
42 | 
43 | if [ -z "$FILE1TOMAP" ] ; then
44 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
45 |   exit 1
46 | fi
47 | 
48 | if [ -z "$FILE2TOMAP" ] ; then
49 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
50 |   exit 1
51 | fi
52 | # Make directory for output if necessary
53 | if [ ! -d "$OUTDIR" ] ; then
54 |   mkdir -p $OUTDIR
55 | fi
56 | 
57 | if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
58 |   $RSEM --star --star-path $STAR --gzipped-read-file --no-bam-output --single-cell-prior --temporary-folder $WORKINGDIR --paired-end $FILE1TOMAP $FILE2TOMAP $REFname $OUTDIR/$PREFIX
59 | else 
60 |   $RSEM --star --star-path $STAR --no-bam-output --single-cell-prior --temporary-folder $WORKINGDIR --paired-end $FILE1TOMAP $FILE2TOMAP $REFname $OUTDIR/$PREFIX
61 | fi
62 | 


--------------------------------------------------------------------------------
/2-5_bowtie2-RSEM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Assume paired end
 3 | # Arguments:
 4 | # $1 = directory of fastq files to map/quantify
 5 | # $2 = output directory for final quantification file
 6 | # $3 = prefix
 7 | # $4 = number of threads to run on (optional)
 8 | RSEM=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/rsem-calculate-expression
 9 | BOWTIE=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bowtie2-2.2.6/
10 | #REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
11 | REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
12 | FILESTOMAPDIR=$1
13 | OUTDIR=$2
14 | PREFIX="$3-$LSB_JOBINDEX-"
15 | THREADS=$4
16 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
17 | 
18 | if [ -z "$THREADS" ] ; then
19 |   THREADS=1
20 | fi
21 | if [ ! -f "$RSEM" ] ; then
22 |   echo "Sorry RSEM not available "
23 |   exit 1
24 | fi
25 | if [ -z "$FILESTOMAPDIR" ] ; then
26 |   echo "Please include a directory of files to map (ARG 1/4)"
27 |   exit 1
28 | fi
29 | if [ -z "$OUTDIR" ] ; then
30 |   echo "Please include a directory for outputfile (ARG 2/4)"
31 |   exit 1
32 | fi
33 | if [ -z "$3" ] ; then
34 |   echo "Please include a prefix for output (ARG 3/4)"
35 |   exit 1
36 | fi
37 | 
38 | # Get fastq files
39 | FILEStoMAP=($FILESTOMAPDIR/*)
40 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
41 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
42 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
43 | 
44 | if [ -z "$FILE1TOMAP" ] ; then
45 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
46 |   exit 1
47 | fi
48 | 
49 | if [ -z "$FILE2TOMAP" ] ; then
50 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
51 |   exit 1
52 | fi
53 | # Make directory for output if necessary
54 | if [ ! -d "$OUTDIR" ] ; then
55 |   mkdir -p $OUTDIR
56 | fi
57 | 
58 | $RSEM --bowtie2 --bowtie2-path $BOWTIE --no-bam-output --single-cell-prior --temporary-folder $WORKINGDIR --paired-end -p $THREADS $FILE1TOMAP $FILE2TOMAP $REFname $OUTDIR/$PREFIX
59 | 


--------------------------------------------------------------------------------
/2.2_DO_MapReads_Tophat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
 7 | INPUTFILES=($INPUTDIR/*)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES/2))
10 | GENOME=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/bowtie2_build
11 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%50" -R"select[mem>6000] rusage[mem=6000] span[hosts=1]" -M6000 -n5 -q long -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2.2_MapReads_Tophat.sh 5 $INPUTDIR $OUTPUTDIR $GENOME Beuttner_Tophat
12 | 
13 | 


--------------------------------------------------------------------------------
/2.2_MapReads_Tophat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with TOPHAT -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | # Arguments: 
 7 | #    $1 = number of threads to run on, 
 8 | #    $2 = directory of files to map
 9 | #    $3 = outputdirectory
10 | #    $4 = genome base
11 | #    $5 = Prefix
12 | 
13 | NUMTHREADS=$1
14 | FILESTOMAPDIR=$2
15 | OUTDIR=$3
16 | TOPHAT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/tophat-2.1.0.Linux_x86_64/tophat2
17 | GENOME=$4
18 | PREFIX="$5-$LSB_JOBINDEX-"
19 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
20 | 
21 | #Check appropriate arguments
22 | if [ ! -f "$TOPHAT" ] ; then
23 |   echo "Sorry TOPHAT not available "
24 |   exit 1
25 | fi
26 | 
27 | if [ -z "$NUMTHREADS" ] ; then
28 |   echo "Please set number of threads to use (ARG 1/4)"
29 |   exit 1
30 | fi
31 | 
32 | if [ -z "$FILESTOMAPDIR" ] ; then
33 |   echo "Please include a directory of files to map (ARG 2/4)"
34 |   exit 1
35 | fi
36 | FILEStoMAP=($FILESTOMAPDIR/*)
37 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
38 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
39 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
40 | 
41 | if [ -z "$FILE1TOMAP" ] ; then
42 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
43 |   exit 1
44 | fi
45 | 
46 | if [ -z "$FILE2TOMAP" ] ; then
47 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
48 |   exit 1
49 | fi
50 | 
51 | if [ -z "$OUTDIR" ] ; then
52 |   echo "Please include a directory for output (ARG 3/4)"
53 |   exit 1
54 | fi
55 | 
56 | if [ -z "$GENOME" ] ; then
57 |   echo "Please include the base genome name (ARG 4/4)"
58 |   exit 1
59 | fi
60 | 
61 | if [ -z "$5" ] ; then
62 |   echo "Warning: no file prefix included"
63 | fi
64 | 
65 | #To fix failed jobs
66 | if [ -d "$OUTDIR/$LSB_JOBINDEX" ]; then
67 | #-----------------
68 | 
69 | # Make directory for output if necessary
70 | if [ ! -d "$OUTDIR/$LSB_JOBINDEX" ] ; then
71 |   mkdir -p $OUTDIR/$LSB_JOBINDEX
72 | fi
73 | 
74 | NAME=${FILE1TOMAP##*/}
75 | NAME=${NAME%.*}
76 | 
77 | echo "Job$LSB_JOBINDEX Mapping: $FILE1TOMAP $FILE2TOMAP\n"
78 | 
79 | # Run TOPHAT 
80 | FILEnopath=`basename ${FILE1TOMAP%.fq.gz}`
81 | cd $OUTDIR/$LSB_JOBINDEX
82 | $TOPHAT $GENOME $FILE1TOMAP $FILE2TOMAP
83 | 
84 | mv $OUTDIR/$LSB_JOBINDEX/tophat_out/align_summary.txt $OUTDIR/$NAME.align_summary.txt
85 | /usr/bin/samtools merge -n -f $OUTDIR/$NAME.sorted.aligned.bam $OUTDIR/$LSB_JOBINDEX/tophat_out/accepted_hits.bam $OUTDIR/$LSB_JOBINDEX/tophat_out/unmapped.bam
86 | rm -r $OUTDIR/$LSB_JOBINDEX
87 | 
88 | #To fix failed jobs
89 | fi
90 | #-----------------
91 | 


--------------------------------------------------------------------------------
/2_DO_MapReadsFile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_STAR
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap
 7 | INPUTFILES=($INPUTDIR/*)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES/2))
10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%30" -R"select[mem>30000] rusage[mem=30000]" -M30000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2_MapReadsFile.sh 1 $INPUTDIR $OUTPUTDIR /nfs/users/nfs_t/ta6/RNASeqPipeline/2_STAR_Parameters.txt Beuttner_STAR
11 | 
12 | 


--------------------------------------------------------------------------------
/2_DO_MapReadsFile_singleend.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | #OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/NeuronsEmmyLiora/FilesMapped
 5 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome
 6 | mkdir -p $OUTPUTDIR
 7 | #INPUTDIR=/lustre/scratch108/compgen/team218/TA/NeuronsEmmyLiora/FilesUMITrimmed
 8 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesQCed
 9 | GENOMEDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/Bergiers
10 | INPUTFILES=($INPUTDIR/*)
11 | NUMFILES=${#INPUTFILES[@]}
12 | MAXJOBS=$(($NUMFILES))
13 | #GENOMEDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora/
14 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>35000] rusage[mem=35000]" -M35000 -q normal -o output.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/2_MapReadsFile_singleend.sh 1 $INPUTDIR $OUTPUTDIR /nfs/users/nfs_t/ta6/RNASeqPipeline/2_STAR_Parameters.txt $GENOMEDIR Bergiers_Trimmed_Waf375
15 | 


--------------------------------------------------------------------------------
/2_MapReadsFile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | ## Haven't tested since moved genomedir out of parameterfile
 7 | 
 8 | # Arguments: 
 9 | #    $1 = number of threads to run on, 
10 | #    $2 = directory of files to map
11 | #    $3 = outputdirectory
12 | #    $4 = STAR Parameters file
13 | #    $5 = Prefix
14 | 
15 | NUMTHREADS=$1
16 | FILESTOMAPDIR=$2
17 | OUTDIR=$3
18 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
19 | PARAMFILE=$4
20 | PREFIX="$5-$LSB_JOBINDEX-"
21 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
22 | GENOME=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/NeuronsLiora
23 | 
24 | #Check appropriate arguments
25 | if [ ! -f "$STAR" ] ; then
26 |   echo "Sorry STAR not available "
27 |   exit 1
28 | fi
29 | 
30 | if [ -z "$NUMTHREADS" ] ; then
31 |   echo "Please set number of threads to use (ARG 1/4)"
32 |   exit 1
33 | fi
34 | 
35 | if [ -z "$FILESTOMAPDIR" ] ; then
36 |   echo "Please include a directory of files to map (ARG 2/4)"
37 |   exit 1
38 | fi
39 | FILEStoMAP=($FILESTOMAPDIR/*)
40 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
41 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
42 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
43 | 
44 | if [ -z "$FILE1TOMAP" ] ; then
45 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
46 |   exit 1
47 | fi
48 | 
49 | if [ -z "$FILE2TOMAP" ] ; then
50 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
51 |   exit 1
52 | fi
53 | 
54 | if [ -z "$OUTDIR" ] ; then
55 |   echo "Please include a directory for output (ARG 3/4)"
56 |   exit 1
57 | fi
58 | 
59 | if [ -z "$PARAMFILE" ] ; then
60 |   echo "Please include a parameter file (ARG 4/4)"
61 |   exit 1
62 | fi
63 | 
64 | if [ -z "$5" ] ; then
65 |   echo "Warning: no file prefix included"
66 | fi
67 | 
68 | # Make directory for output if necessary
69 | if [ ! -d "$OUTDIR" ] ; then
70 |   mkdir -p $OUTDIR
71 | fi
72 | 
73 | # Run STAR 
74 | if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
75 |     FILEnopath=`basename ${FILE1TOMAP%.fq.gz}`
76 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --genomeDir $GENOME --readFilesIn $FILE1TOMAP $FILE2TOMAP --readFilesCommand zcat --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
77 | else
78 |     FILEnopath=`basename ${FILE1TOMAP%.fq}`
79 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --genomeDir $GENOME --readFilesIn $FILE1TOMAP $FILE2TOMAP --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
80 | fi
81 | 


--------------------------------------------------------------------------------
/2_MapReadsFile_Transcriptome.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | # Arguments: 
 7 | #    $1 = number of threads to run on, 
 8 | #    $2 = directory of files to map
 9 | #    $3 = outputdirectory
10 | #    $4 = STAR Parameters file
11 | #    $5 = Prefix
12 | 
13 | NUMTHREADS=$1
14 | FILESTOMAPDIR=$2
15 | OUTDIR=$3
16 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
17 | PARAMFILE=$4
18 | PREFIX="$5-$LSB_JOBINDEX-"
19 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$PREFIX
20 | 
21 | #Check appropriate arguments
22 | if [ ! -f "$STAR" ] ; then
23 |   echo "Sorry STAR not available "
24 |   exit 1
25 | fi
26 | 
27 | if [ -z "$NUMTHREADS" ] ; then
28 |   echo "Please set number of threads to use (ARG 1/4)"
29 |   exit 1
30 | fi
31 | 
32 | if [ -z "$FILESTOMAPDIR" ] ; then
33 |   echo "Please include a directory of files to map (ARG 2/4)"
34 |   exit 1
35 | fi
36 | FILEStoMAP=($FILESTOMAPDIR/*)
37 | ARRAYINDEX=$((($LSB_JOBINDEX-1)*2))
38 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
39 | FILE2TOMAP=${FILEStoMAP[$ARRAYINDEX+1]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
40 | 
41 | if [ -z "$FILE1TOMAP" ] ; then
42 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
43 |   exit 1
44 | fi
45 | 
46 | if [ -z "$FILE2TOMAP" ] ; then
47 |   echo "$ARRAYINDEX+1-th file in the $FILESTOMAPDIR does not exist."
48 |   exit 1
49 | fi
50 | 
51 | if [ -z "$OUTDIR" ] ; then
52 |   echo "Please include a directory for output (ARG 3/4)"
53 |   exit 1
54 | fi
55 | 
56 | if [ -z "$PARAMFILE" ] ; then
57 |   echo "Please include a parameter file (ARG 4/4)"
58 |   exit 1
59 | fi
60 | 
61 | if [ -z "$5" ] ; then
62 |   echo "Warning: no file prefix included"
63 | fi
64 | 
65 | # Make directory for output if necessary
66 | if [ ! -d "$OUTDIR" ] ; then
67 |   mkdir -p $OUTDIR
68 | fi
69 | 
70 | # Run STAR 
71 | if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
72 |     FILEnopath=`basename ${FILE1TOMAP%.fq.gz}`
73 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --quantMode TranscriptomeSAM --readFilesIn $FILE1TOMAP $FILE2TOMAP --readFilesCommand zcat --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
74 | else
75 |     FILEnopath=`basename ${FILE1TOMAP%.fq}`
76 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --quantMode TranscriptomeSAM --readFilesIn $FILE1TOMAP $FILE2TOMAP --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
77 | fi
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/2_MapReadsFile_singleend.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | # Arguments: 
 7 | #    $1 = number of threads to run on, 
 8 | #    $2 = directory of files to map
 9 | #    $3 = outputdirectory
10 | #    $4 = STAR Parameters file
11 | #    $5 = STAR Genome directory
12 | #    $6 = Prefix
13 | 
14 | NUMTHREADS=$1
15 | FILESTOMAPDIR=$2
16 | OUTDIR=$3
17 | STAR=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/STAR
18 | PARAMFILE=$4
19 | PREFIX="$6-$LSB_JOBINDEX-"
20 | WORKINGDIR=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/STAR/$PREFIX
21 | GENOME=$5
22 | 
23 | #Check appropriate arguments
24 | if [ ! -f "$STAR" ] ; then
25 |   echo "Sorry STAR not available "
26 |   exit 1
27 | fi
28 | 
29 | if [ -z "$NUMTHREADS" ] ; then
30 |   echo "Please set number of threads to use (ARG 1/4)"
31 |   exit 1
32 | fi
33 | 
34 | if [ -z "$FILESTOMAPDIR" ] ; then
35 |   echo "Please include a directory of files to map (ARG 2/4)"
36 |   exit 1
37 | fi
38 | FILEStoMAP=($FILESTOMAPDIR/*)
39 | ARRAYINDEX=$((($LSB_JOBINDEX-1)))
40 | FILE1TOMAP=${FILEStoMAP[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
41 | 
42 | if [ -z "$FILE1TOMAP" ] ; then
43 |   echo "$ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
44 |   exit 1
45 | fi
46 | 
47 | if [ -z "$OUTDIR" ] ; then
48 |   echo "Please include a directory for output (ARG 3/4)"
49 |   exit 1
50 | fi
51 | 
52 | if [ -z "$PARAMFILE" ] ; then
53 |   echo "Please include a parameter file (ARG 4/4)"
54 |   exit 1
55 | fi
56 | 
57 | if [ -z "$5" ] ; then
58 |   echo "Warning: no file prefix included"
59 | fi
60 | 
61 | # Make directory for output if necessary
62 | if [ ! -d "$OUTDIR" ] ; then
63 |   mkdir -p $OUTDIR
64 | fi
65 | 
66 | # Run STAR 
67 | if [[ $FILE1TOMAP =~ \.gz$ ]] ; then
68 |     FILEnopath=`basename ${FILE1TOMAP%.fq.gz}`
69 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --readFilesIn $FILE1TOMAP --genomeDir $GENOME --readFilesCommand zcat --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
70 | else
71 |     FILEnopath=`basename ${FILE1TOMAP%.fq}`
72 |     $STAR --runThreadN $NUMTHREADS --runMode alignReads --readFilesIn $FILE1TOMAP --genomeDir $GENOME --parametersFiles $PARAMFILE --outFileNamePrefix $OUTDIR/$FILEnopath --outTmpDir $WORKINGDIR
73 | fi
74 | 


--------------------------------------------------------------------------------
/2_STAR_Parameters.txt:
--------------------------------------------------------------------------------
 1 | outSAMstrandField intronMotif
 2 | outFilterIntronMotifs RemoveNoncanonical
 3 | outSAMtype BAM SortedByCoordinate
 4 | outFilterType BySJout 
 5 | outFilterMultimapNmax 20 
 6 | alignSJoverhangMin 8
 7 | alignSJDBoverhangMin 1 
 8 | outFilterMismatchNmax 999 
 9 | outFilterMismatchNoverLmax 0.04 
10 | alignIntronMin 20
11 | alignIntronMax 1000000 
12 | alignMatesGapMax 1000000 
13 | 


--------------------------------------------------------------------------------
/3_CLEANUP_MapReadFiles.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh to break it down into steps.
 3 | # Run this after mapping is finished.
 4 | 
 5 | # These must be consistent with 2_DO_MapReadsFile.sh
 6 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome
 7 | TAG="Trimmed50-Bergiers_Waf375"
 8 | 
 9 | rm $OUTPUTDIR/*Log.progress.out
10 | 
11 | perl /nfs/users/nfs_t/ta6/RNASeqPipeline/3_Compile_Mapping_Statistics.pl $OUTPUTDIR > /lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/$TAG.mapped_summary.out
12 | 
13 | tar -cvzf $OUTPUTDIR/$TAG.ParameterLogfiles.tar.gz  $OUTPUTDIR/*Log.out
14 | tar -cvzf $OUTPUTDIR/$TAG.SpliceJunctionfiles.tar.gz  $OUTPUTDIR/*SJ.out.tab
15 | tar -cvzf $OUTPUTDIR/$TAG.FinalLogfiles.tar.gz  $OUTPUTDIR/*Log.final.out
16 | rm $OUTPUTDIR/*Log.out
17 | rm $OUTPUTDIR/*SJ.out.tab
18 | rm $OUTPUTDIR/*Log.final.out
19 | 


--------------------------------------------------------------------------------
/3_Compile_Mapping_Statistics.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | if (scalar(@ARGV) != 1) {die "Please provide a directory of STAR output\n";}
 5 | 
 6 | my @files = glob("$ARGV[0]/*Log.final.out");
 7 | print "lane\tsample\texp\tproject\tfile\tNreads\tNuniquemap\tNmultimap\tNnomap\tNsplice\tNnovelSJ\tNoMapTooManyMap\tNoMapTooManyMis\tNoMapTooShort\n";
 8 | foreach my $file (@files) {
 9 | 	my $fullfilename = $file;
10 | 
11 | 	# Get as much info from file names as possible
12 | 	$file =~ /([^\/]+)$/; $file = $1;
13 | 	my $laneID = "NA"; my $sampleID = "NA"; my $expID = "NA"; my $projectID = "NA"; my $fileID = "NA";
14 | 	if ($file =~ s/(lane\d+)//) {$laneID = $1;}
15 | #	if ($file =~ s/(exp\d+)//) {$expID = $1;}
16 | #	if ($file =~ s/(sc\d)//) {$expID = $1;}
17 | 	if ($file =~ s/([ACTG]{5,})//) {$sampleID = $1;}
18 | #	if ($file =~ s/(cell\d\d)//) {$sampleID = $1;}
19 | 	if ($file =~ /^([^_]+)/) {
20 | 		my @remnants = split(/_+/, $file);
21 | 		$projectID = $remnants[0];
22 | 		$fileID = $remnants[1];
23 | 	}
24 | 
25 | #	print "$fullfilename\n";
26 | 	print "$laneID\t$sampleID\t$expID\t$projectID\t$fileID\t";
27 | 
28 | 	my $Nreads = 0, my $Nuniquelymapped = 0; my $Nmultimap = 0; my $Nsplice = 0; my $NspliceAnn = 0;
29 | 	my $UnmappedTooManyMultimapN = 0; my $UnmappedTooManyMMprop = 0; my $UnmappedTooShortprop = 0;
30 | 	open(my $ifh, $fullfilename) or die $!;
31 | 	while (<$ifh>) {
32 | 		if ($_ =~ /Number of input reads[\s|]+(\d+)/) {$Nreads = $1;}
33 | 		if ($_ =~ /Uniquely mapped reads number[\s|]+(\d+)/) {$Nuniquelymapped = $1;}
34 | 		if ($_ =~ /Number of reads mapped to multiple loci[\s|]+(\d+)/) {$Nmultimap = $1;}
35 | 		if ($_ =~ /Number of splices: Total[\s|]+(\d+)/) {$Nsplice = $1;}
36 | 		if ($_ =~ /Number of splices: Annotated \(sjdb\)[\s|]+(\d+)/) {$NspliceAnn = $1;}
37 | 		if ($_ =~ /Number of reads mapped to too many loci[\s|]+(\d+)/) {$UnmappedTooManyMultimapN = $1;}
38 | 		if ($_ =~ /of reads unmapped: too many mismatches[\s|]+([\d\.]+%)/) {$UnmappedTooManyMMprop = $1;}
39 | 		if ($_ =~ /of reads unmapped: too short[\s|]+([\d\.]+%)/) {$UnmappedTooShortprop = $1;}
40 | 	} close ($ifh);
41 | 
42 | 	print "$Nreads\t$Nuniquelymapped\t$Nmultimap\t".($Nreads-$Nuniquelymapped-$Nmultimap)."\t$Nsplice\t".($Nsplice-$NspliceAnn)."\t$UnmappedTooManyMultimapN\t$UnmappedTooManyMMprop\t$UnmappedTooShortprop\n";
43 | 
44 | #	exit(); #short circuit for debugging
45 | }
46 | 


--------------------------------------------------------------------------------
/3_Compile_UMI_Statistics.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | if (scalar(@ARGV) != 1) {die "Please provide a directory of UMI-tools running output\n";}
 5 | 
 6 | my @files = glob("$ARGV[0]/*err*");
 7 | print "sample\tmethod\tNreads\tNmolecules\n";
 8 | for(my $i = 0; $i < scalar(@files); $i++) {
 9 | 	my $file = $files[$i];
10 | 
11 | 	my $Nreads = 0, my $Nmolecules = 0;
12 | 	my $cellID = ""; my $method = "";
13 | 	open(my $ifh, $file) or die $!;
14 | 	while (<$ifh>) {
15 | 		if ($_ =~ /Number of reads in:\s*(\d+)/) {$Nreads = $1;}
16 | 		if ($_ =~ /Number of reads out:\s*(\d+)/) {$Nmolecules = $1;}
17 | 		if ($_ =~ /([AGCT]+)Aligned/) {$cellID = $1;}
18 | 		if ($_ =~ /Method:\s*(\w+)/) {$method = $1;}
19 | 	} close ($ifh);
20 | 
21 | 	print "$cellID\t$method\t$Nreads\t$Nmolecules\n";
22 | }
23 | 


--------------------------------------------------------------------------------
/3_DO_UmiDedup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDeDupped
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedTranscriptome
 7 | INPUTFILES=($INPUTDIR/*.bam)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES))
10 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%100" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o umi-tools.out.%J.%I -e umi-tools.err.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/3_UmiDedup.sh $INPUTDIR $OUTPUTDIR Rerum_DirAdj_transcriptome directional-adjacency
11 | 
12 | #    methods:
13 | #    options_method = "directional-adjacency"
14 | #    options_method = "adjacency"
15 | #    options_method = "unique"
16 | 
17 | 


--------------------------------------------------------------------------------
/3_SAMtools_sort_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z $1 ] ; then
 4 |   echo "Please set input file (ARG 1/3)"
 5 |   exit 1
 6 | fi
 7 | if [ -z $2 ] ; then
 8 |   echo "Please set an output prefix (ARG 2/3)"
 9 |   exit 1
10 | fi
11 | if [ -z $3 ] ; then
12 |   echo "Please set an max memory limit (ARG 3/3)"
13 |   exit 1
14 | fi
15 | 
16 | samtools sort -m $3 $1 $2
17 | rm $1
18 | 


--------------------------------------------------------------------------------
/3_SortBAMs.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

# Submits one bsub job per *.out.bam file in the RNASeqFilesMapped directory;
# each job runs 3_SAMtools_sort_wrapper.sh on the file with an output prefix
# of "<name>.sorted" and a memory limit of 3000000000.

my $wrapper = "/nfs/users/nfs_t/ta6/RNASeqPipeline/3_SAMtools_sort_wrapper.sh";

foreach my $file (glob("/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/*.out.bam")) {
	next unless $file =~ /(.*)\.out\.bam$/;
	my $outprefix = "$1.sorted";
	# List-form system() bypasses the shell, so file names containing spaces
	# or shell metacharacters reach bsub intact.
	my @cmd = ("bsub", "-Rselect[mem>3000] rusage[mem=3000]", "-M3000", "-q", "normal", "-o", "output.%J",
	           $wrapper, $file, $outprefix, "3000000000");
	# Report a failed submission but carry on with the remaining files.
	system(@cmd) == 0 or warn "bsub submission failed for $file (status $?)\n";
}
9 | 


--------------------------------------------------------------------------------
/3_UmiDedup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tallulah 31 Mar 2015 : wrapper for Mapping Reads with STAR -> to be called from a job-array bsub command.
 3 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 4 | # Maps paired reads only!
 5 | 
 6 | ## Haven't tested since moved genomedir out of parameterfile
 7 | 
 8 | # Arguments: 
 9 | #    $1 = directory of files to map
10 | #    $2 = outputdirectory
11 | #    $3 = Prefix
12 | #    $4 = Method
13 | 
14 | FILESTOMAPDIR=$1
15 | OUTDIR=$2
16 | PREFIX=$3
17 | METHOD=$4
18 | DIST_THRESH=0
19 | UMITOOLS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/dedup_umi.py
20 | 
21 | #Check appropriate arguments
22 | if [ ! -f "$UMITOOLS" ] ; then
23 |   echo "Sorry UMI-tools not available "
24 |   exit 1
25 | fi
26 | 
27 | if [ -z "$FILESTOMAPDIR" ] ; then
28 |   echo "Please include a directory of files to map (ARG 1/3)"
29 |   exit 1
30 | fi
31 | MYFILES=($FILESTOMAPDIR/*.bam)
32 | ARRAYINDEX=$((($LSB_JOBINDEX-1)))
33 | MYFILE=${MYFILES[$ARRAYINDEX]} #Note bash array indicies start at 0 but job array indices must start at 1!!!
34 | echo $FILESTOMAPDIR
35 | echo $ARRAYINDEX
36 | echo ${#MYFILES[@]}
37 | echo $MYFILE
38 | if [ -z "$MYFILE" ] ; then
39 |   echo "$MYFILE the $ARRAYINDEX-th file in the $FILESTOMAPDIR does not exist."
40 |   exit 1
41 | fi
42 | 
43 | if [ -z "$OUTDIR" ] ; then
44 |   echo "Please include a directory for output (ARG 2/3)"
45 |   exit 1
46 | fi
47 | 
48 | if [ -z "$PREFIX" ] ; then
49 |   echo "Warning: no file prefix included"
50 | fi
51 | 
52 | # Make directory for output if necessary
53 | if [ ! -d "$OUTDIR" ] ; then
54 |   mkdir -p $OUTDIR
55 | fi
56 | 
57 | # Run STAR 
58 | FILEnopath=`basename ${MYFILE%.bam}`
59 | /usr/bin/python $UMITOOLS $MYFILE $DIST_THRESH $OUTDIR/$PREFIX-$FILEnopath.bam $METHOD
60 | 


--------------------------------------------------------------------------------
/3_merge_dedup_MappedReads.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Coppied commands from 00_LIST_OF_BSUB_COMMANDS.sh
 3 | 
 4 | # These must be consistent with 2_DO_MapReadsFile.sh
 5 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat/
 6 | TAG="Beuttner_Tophat2_dedup"
 7 | SCRIPT=/nfs/users/nfs_t/ta6/RNASeqPipeline/4_MergeBAMs.pl
 8 | 
 9 | if [ ! -f $SCRIPT ] ; then
10 |   echo "$SCRIPT not available"
11 |   exit 1
12 | fi
13 | 
14 | if [ -z $TAG ] ; then
15 |   echo "No project tag"
16 |   exit 1
17 | fi
18 | 
19 | if [ -z $OUTPUTDIR ] ; then
20 |   echo "No directory of sorted mapped read bam files"
21 |   exit 1
22 | fi
23 | 
24 | # Do I want to do this in here?
25 | perl $SCRIPT $OUTPUTDIR $TAG
26 | 
27 | MappedDedupDIR=$OUTPUTDIR/$TAG/Deduplicated
28 | MappedWdupDIR=$OUTPUTDIR/$TAG/WithDuplicates
29 | mkdir -p $MappedDedupDIR
30 | mkdir -p $MappedWdupDIR
31 | mv $OUTPUTDIR/*dedup* $MappedDedupDIR
32 | mv $OUTPUTDIR/*sorted*.bam $MappedWdupDIR
33 | #tar -cvzf $OUTPUTDIR/Bergiers_exp2_mapping_output.tar.gz $OUTPUTDIR/Bergiers*exp2*
34 | #rm $OUTPUTDIR/Bergiers_lane*.bam
35 | 
36 | 


--------------------------------------------------------------------------------
/4_Convert_GTF2BED_customized_for_Ensembl.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Downloaded from: https://code.google.com/p/ea-utils/source/browse/trunk/clipper/gtf2bed on 9 April 2015
  4 | # Based on suggestion from: http://onetipperday.blogspot.com/2012/08/convert-bed-to-gtf.html
  5 | 
  6 | # Copyright (c) 2011 Erik Aronesty (erik@q32.com)
  7 | # 
  8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  9 | # of this software and associated documentation files (the "Software"), to deal
 10 | # in the Software without restriction, including without limitation the rights
 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 | # copies of the Software, and to permit persons to whom the Software is
 13 | # furnished to do so, subject to the following conditions:
 14 | # 
 15 | # The above copyright notice and this permission notice shall be included in
 16 | # all copies or substantial portions of the Software.
 17 | # 
 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 24 | # THE SOFTWARE.
 25 | # 
 26 | # ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
 27 | 
 28 | use Data::Dumper;
 29 | use Getopt::Long;
 30 | 
 31 | 
 32 | my $extended;
 33 | GetOptions("x"=>\$extended);
 34 | 
 35 | $in = shift @ARGV;
 36 | 
 37 | if (!defined($in) || $in eq "" || !-e $in) {die "No input file provided $!\n";}
 38 | 
 39 | my $in_cmd =($in =~ /\.gz$/ ? "gunzip -c $in|" : $in =~ /\.zip$/ ? "unzip -p $in|" : "$in") || die "Please provide valid gtf (or compressed gtf) file, Can't open $in: $!\n";
 40 | open IN, $in_cmd;
 41 | 
 42 | while (<IN>) {
 43 | 	$gff = 2 if /^##gff-version 2/;
 44 | 	$gff = 3 if /^##gff-version 3/;
 45 | 	next if /^#/ && $gff;
 46 | 
 47 | 	s/\s+$//;
 48 | 	# 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
 49 | 	my @f = split /\t/;
 50 | # Tallulah's Modifications:
 51 | 	($transid) = $f[8]=~ /transcript_id "([^"]+)"/;
 52 | 	($geneid) = $f[8]=~ /gene_id "([^"]+)"/;
 53 | 	($gene_type) = $f[8]=~ /gene_biotype "([^"]+)"/;
 54 | 	($gene_name) = $f[8]=~ /gene_name "([^"]+)"/;
 55 | 	($trans_type) = $f[8]=~ /transcript_biotype "([^"]+)"/;
 56 | 	$id="${gene_name}__${geneid}__${transid}__${gene_type}.${trans_type}";
 57 | #	if ($gff) {
 58 | #        # most ver 2's stick gene names in the id field
 59 | #		($id) = $f[8]=~ /\bID="([^"]+)"/;
 60 | #        # most ver 3's stick unquoted names in the name field
 61 | #		($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
 62 | #	} else {
 63 | #		($id) = $f[8]=~ /transcript_id "([^"]+)"/;
 64 | #	}
 65 | # End of Modifications ---
 66 | 
 67 | 	next unless $id && $f[0];
 68 | 
 69 | 	if ($f[2] eq 'exon') {
 70 | 		die "no position at exon on line $." if ! $f[3];
 71 |         # gff3 puts :\d in exons sometimes
 72 |         $id =~ s/:\d+$// if $gff == 3;
 73 | 		push @{$exons{$id}}, \@f;
 74 | 		# save lowest start
 75 | 		$trans{$id} = \@f if !$trans{$id};
 76 | 	} elsif ($f[2] eq 'start_codon') {
 77 | 		#optional, output codon start/stop as "thick" region in bed
 78 | 		$sc{$id}->[0] = $f[3];
 79 | 	} elsif ($f[2] eq 'stop_codon') {
 80 | 		$sc{$id}->[1] = $f[4];
 81 | 	} elsif ($f[2] eq 'miRNA' ) {
 82 | 		$trans{$id} = \@f if !$trans{$id};
 83 | 		push @{$exons{$id}}, \@f;
 84 | 	}
 85 | }
 86 | 
 87 | for $id ( 
 88 | 	# sort by chr then pos
 89 | 	sort {
 90 | 		$trans{$a}->[0] eq $trans{$b}->[0] ? 
 91 | 		$trans{$a}->[3] <=> $trans{$b}->[3] : 
 92 | 		$trans{$a}->[0] cmp $trans{$b}->[0]
 93 | 	} (keys(%trans)) ) {
 94 | 		my ($chr, undef, undef, undef, undef, undef, $dir, undef, $attr, undef, $cds, $cde) = @{$trans{$id}};
 95 |         my ($cds, $cde);
 96 |         ($cds, $cde) = @{$sc{$id}} if $sc{$id};
 97 | 
 98 | 		# sort by pos
 99 | 		my @ex = sort {
100 | 			$a->[3] <=> $b->[3]
101 | 		} @{$exons{$id}};
102 | 
103 | 		my $beg = $ex[0][3];
104 | 		my $end = $ex[-1][4];
105 | 		
106 | 		if ($dir eq '-') {
107 | 			# swap
108 | 			$tmp=$cds;
109 | 			$cds=$cde;
110 | 			$cde=$tmp;
111 | 			$cds -= 2 if $cds;
112 | 			$cde += 2 if $cde;
113 | 		}
114 | 
115 | 		# not specified, just use exons
116 | 		$cds = $beg if !$cds;
117 | 		$cde = $end if !$cde;
118 | 
119 | 		# adjust start for bed
120 | 		--$beg; --$cds;
121 | 	
122 | 		my $exn = @ex;												# exon count
123 | 		my $exst = join ",", map {$_->[3]-$beg-1} @ex;				# exon start
124 | 		my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;			# exon size
125 | 
126 |         my $gene_id;
127 |         my $extend = "";
128 |         if ($extended) {
129 |     	    ($gene_id) = $attr =~ /gene_name "([^"]+)"/;
130 |     	    ($gene_id) = $attr =~ /gene_id "([^"]+)"/ unless $gene_id;
131 |             $extend="\t$gene_id";
132 |         }
133 | 		# added an extra comma to make it look exactly like ucsc's beds
134 | 		print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,$extend\n";
135 | }
136 | 
137 | 
138 | close IN;
139 | 


--------------------------------------------------------------------------------
/4_DO_RSeQC_Multiple.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Arguments:
 3 | # $1 = Organism under consideration
 4 | # $2 = input bam dir
 5 | # $3 = output directory
 6 | 
 7 | ORGANISM=$1
 8 | INPUTDIR=$2
 9 | OUTDIR=$3
10 | 
11 | if [ -z $ORGANISM ] ; then
12 |   echo "Please set organism for reference annotations (ARG 1/3)"
13 |   exit 1
14 | fi
15 | 
16 | if [ -z $INPUTDIR ] ; then
17 |   echo "$INPUTDIR does not exist. Please provide a directory of BAMfiles (ARG 2/3)"
18 |   exit 1
19 | fi
20 | 
21 | if [ -z $OUTDIR ] ; then
22 |   echo "Please set a directory for the output files (ARG 3/3)"
23 |   exit 1
24 | fi
25 | 
26 | mkdir -p $OUTDIR
27 | 
28 | 
29 | # Check relevant annotation/gene model files exist and have been converted to BED format -> prevent multiple jobs trying to write to the same place.
30 | # This code is duplicated in 4_RSeQC_Multiple.sh so that it can be run safely on its own (specific/detailed analyses for particular files) or from this script for bulk analysis
31 | MASKgtf=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf
32 | MASKbed="${MASKgtf%.gtf}.bed"
33 | REFGENOME="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf"
34 | REFGENOMEbed="${REFGENOME%.gtf}.bed"
35 | 
36 | if [ ! -f $MASKbed ] ; then
37 |   echo "Cannot find $MASKbed: attempting to make it."
38 |   if [ ! -f $MASKgtf ] ; then
39 |     /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0
40 |   fi
41 |   if [ ! -s $MASKgtf ] ; then
42 |     echo "Cannot find or make $MASKgtf\n"
43 |     exit 1
44 |   fi
45 | # Convert to bed format
46 |   perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $MASKgtf > $MASKbed
47 |   if [ ! -s $MASKbed ] ; then
48 |     echo "Failed to make $MASKbed\n"
49 |     exit 1
50 |   fi
51 | fi
52 | 
53 | 
54 | 
55 | if [ ! -f $REFGENOMEbed ] ; then
56 |   echo "Cannot find $REFGENOMEbed: attempting to make it."
57 |   if [ ! -f $REFGENOME ] ; then
58 |     /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh $ORGANISM 0
59 |   fi
60 |   if [ ! -s $REFGENOME ] ; then
61 |     echo "Cannot find or make $REFGENOME\n"
62 |     exit 1
63 |   fi
64 | # Convert to bed format
65 |   perl /nfs/users/nfs_t/ta6/RNASeqPipeline/4_Convert_GTF2BED_customized_for_Ensembl.pl $REFGENOME > $REFGENOMEbed
66 |   if [ ! -s $REFGENOMEbed ] ; then
67 |     echo "Failed to make $REFGENOMEbed\n"
68 |     exit 1
69 |   fi
70 | fi
71 | 
72 | for INPUTFILE in $INPUTDIR/*.bam ; do
73 |   OUTPREFIX=$(basename $INPUTFILE)
74 |   OUTPREFIX=${OUTPREFIX%.bam}
75 |   bsub -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o $OUTDIR/RSEQC_$OUTPREFIX.output /nfs/users/nfs_t/ta6/RNASeqPipeline/4_RSeQC_Multiple.sh $ORGANISM $INPUTFILE $OUTDIR/RSEQC_$OUTPREFIX 1
76 | done
77 | 


--------------------------------------------------------------------------------
/4_MergeBAMs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # 18 Apr 2015 : added path to samtools and the check that this samtools is available.
 5 | # 10 Apr 2015 : added indexing of dedupped file.
 6 | 
 7 | if (scalar(@ARGV) < 2) {die "Arguements: sortedmappedbamfiledirectory outputfileprefix\n";}
 8 | 
 9 | my $dir = $ARGV[0];#"/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped";
10 | 
11 | # Sort files by cell
12 | my @files = glob("$dir/*sorted*aligned.bam");
13 | my %sample2files = ();
14 | foreach my $file (@files) {
15 | #	if ($file =~ /([ATCG]{5,})/) {
16 | 	if ($file =~ /([^\/]+_Cell\d\d)/) {
17 | 		push(@{$sample2files{$1}},$file);
18 | 	} else {
19 | 		die "$file does not match?\n";
20 | 	}
21 | }
22 | 
23 | if (! -e "/usr/bin/samtools") { die "Cannot find samtools\n";}
24 | # merge files for each cell 
25 | open (my $ofh, ">", "$ARGV[1]\_DeDuppingStatistics.out") or die $!;
26 | print $ofh "sample\tdups\treads\n";
27 | foreach my $sample (sort(keys(%sample2files))) {
28 | 	print STDERR "Starting $sample\n";
29 | 	my $mergedfile = "$dir/$ARGV[1]\_$sample.sorted.bam";
30 | 	my $dedupedfile = "$dir/$ARGV[1]\_$sample.sorted.dedupped.bam";
31 | 	if (! -e $dedupedfile) {
32 | 		my @infiles = @{$sample2files{$sample}};
33 | 		if (scalar(@infiles) > 1) {
34 | 			print("/usr/bin/samtools merge $mergedfile @infiles\n");
35 | 	#		print STDERR "Finished Merging @infiles\n";
36 | 			print("/usr/bin/samtools rmdup $mergedfile $dedupedfile 2> dup.log\n");
37 | 	#		print STDERR "Finished removing dups from $mergedfile\n";
38 | 		#	system("samtools index $dedupedfile\n"); # Changed my mind, will do this as needed from 4_RSeQC_Multiple.sh
39 | 		} else {
40 | 			$mergedfile = $infiles[0];
41 | 			system("/usr/bin/samtools rmdup $mergedfile $dedupedfile 2> dup.log\n");
42 | 		}
43 | 		my $last = ""; 
44 | #		open (my $ifh, "dup.log") or die $!;
45 | #		while (<$ifh>) {$last=$_;} close($ifh);
46 | #		if ($last =~ /(\d+) \/ (\d+) =/) {
47 | #			print $ofh "$sample\t$1\t$2\n";
48 | #		} else {die "$last line does not match\n";}
49 | ##		print STDERR "Finished extracting data from logfile & writing to new output file\n";
50 | 	}
51 | }
52 | close ($ofh);
53 | 


--------------------------------------------------------------------------------
/4_Process_RSEQC_output.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | 
  4 | # Optionally takes a second arguement of a list of sample to exclude. -> this is untested
  5 | 
  6 | # Output:
  7 | # sample	totalreads	QCfailed	duplicates	multimap	uniquemap	unmapped	rRNA	read1/read2	+/-	non-splice	splice	GeneBodyskewness avgfragsize
  8 | 
  9 | if (@ARGV < 1) {die "Please provide a directory of RSeQC output files (and optionally a file of samples to exclude)\n";}
 10 | 
 11 | my %exclude = ();
 12 | if (defined($ARGV[1])) {
 13 | 	open(my $ifh, $ARGV[1]) or die $!;
 14 | 	while (<$ifh>) {
 15 | 		chomp;
 16 | 		my @record = split(/\s+/);
 17 | 		foreach my $ele (@record) {
 18 | 			$exclude{$ele} = 1;
 19 | 		}
 20 | 	} close($ifh);
 21 | }
 22 | 
 23 | my @files = glob("$ARGV[0]/RSEQC_*.output");
 24 | my %sample2output = ();
 25 | 
 26 | 
 27 | 
 28 | my @ordered_expected_keys = ("totalreads","QCfailed","duplicates","multimap","uniquemap","unmapped","rRNA", "read1/read2", "+/-", "non-splice", "splice", "GeneBodyskewness", "avgfragsize");
 29 | 
 30 | foreach my $file (@files) {
 31 | #	$file =~ /([ATGC]{5,})/;
 32 | 	$file =~ /(sc\d_cell\d\d)/;
 33 | 	my $sample = $1;
 34 | 	if (exists($exclude{$sample})) {next;}
 35 | 	open (my $ifh, $file) or die $!;
 36 | 	my %outputs = ();
 37 | 	while (<$ifh>) {
 38 | 		if ($_ =~ /Total records:\s*(\d+)/) {
 39 | 			$outputs{"totalreads"} = $1;
 40 | 		}
 41 | 		elsif ($_ =~ /Reads consumed by input gene list\):\s*(\d+)/) {
 42 | 			$outputs{"rRNA"} = $1;
 43 | 		}
 44 | 		elsif ($_ =~ /QC failed:\s*(\d+)/) {
 45 | 			$outputs{"QCfailed"} = $1;
 46 | 		}
 47 | 		elsif ($_ =~ /PCR duplicate:\s*(\d+)/) {
 48 | 			$outputs{"duplicates"} = $1;
 49 | 		}
 50 | 		elsif ($_ =~ /Non primary hits\s*(\d+)/) {
 51 | 			$outputs{"multimap"} += $1;
 52 | 		}
 53 | 		elsif ($_ =~ /mapq < mapq_cut \(non-unique\):\s*(\d+)/) {
 54 | 			$outputs{"multimap"} += $1;
 55 | 		}
 56 | 		elsif ($_ =~ /mapq >= mapq_cut \(unique\):\s*(\d+)/) {
 57 | 			$outputs{"uniquemap"} += $1;
 58 | 		}
 59 | 		elsif ($_ =~ /Unmapped reads:\s*(\d+)/) {
 60 | 			$outputs{"unmapped"} = $1;
 61 | 		}
 62 | 		elsif ($_ =~ /Read-1:\s*(\d+)/) {
 63 | 			$outputs{"read1"} = $1;
 64 | 		}
 65 | 		elsif ($_ =~ /Read-2:\s*(\d+)/) {
 66 | 			$outputs{"read2"} = $1;
 67 | 		}
 68 | 		elsif ($_ =~ /Reads map to '\+':\s*(\d+)/) {
 69 | 			$outputs{"+"} = $1;
 70 | 		}
 71 | 		elsif ($_ =~ /Reads map to '\-':\s*(\d+)/) {
 72 | 			$outputs{"-"} = $1;
 73 | 		}
 74 | 		elsif ($_ =~ /Non-splice reads:\s*(\d+)/) {
 75 | 			$outputs{"non-splice"} = $1;
 76 | 		}
 77 | 		elsif ($_ =~ /Splice reads:\s*(\d+)/) {
 78 | 			$outputs{"splice"} = $1;
 79 | 		}
 80 | 		elsif ($_ =~ /Sample\s+Skewness/) {
 81 | 			my $data = <$ifh>;
 82 | 			$data =~ /\s+([-\.\d]+)/;
 83 | 			$outputs{"GeneBodyskewness"} = $1;
 84 | 		}
 85 | 		else {
 86 | 			#count number of tabs in line
 87 | 			my @record = split(/\t/);
 88 | 			if (scalar(@record) == 8 && $record[7] =~ /\d/) {
 89 | 				$outputs{"sumfrag"} += $record[5]*$record[4];
 90 | 				$outputs{"numfrag"} += $record[4];
 91 | 			}
 92 | 		}
 93 | 	} close($ifh);
 94 | 	if (exists($outputs{"read1"}) && exists($outputs{"read2"})) {
 95 | 		$outputs{"read1/read2"} = $outputs{"read1"}/$outputs{"read2"};
 96 | 	} else {
 97 | 		$outputs{"read1/read2"} = "NA";
 98 | 	}
 99 | 	if (exists($outputs{"+"}) && exists($outputs{"-"})) {
100 | 		$outputs{"+/-"} = $outputs{"+"}/$outputs{"-"};
101 | 	} else {
102 | 		$outputs{"+/-"} = "NA";
103 | 	}
104 | 	if ((exists($outputs{"sumfrag"}) && exists($outputs{"numfrag"})) && $outputs{"numfrag"} > 0) {
105 | 		$outputs{"avgfragsize"} = $outputs{"sumfrag"}/$outputs{"numfrag"};
106 | 	} else {
107 | 		$outputs{"avgfragsize"} = "NA";
108 | 	}
109 | 
110 | 	foreach my $key (@ordered_expected_keys){
111 | 		if (!exists($outputs{$key})) {
112 | 			die "No data for $key\n";
113 | 		}
114 | 		push(@{$sample2output{$sample}},$outputs{$key});
115 | 	}
116 | }
117 |  
118 | # Print the summary table: a header row, then one row per sample.
119 | print "sample\t".join("\t", @ordered_expected_keys)."\n";
120 | foreach my $sample (keys(%sample2output)) {
121 | 	print "$sample\t".join("\t", @{$sample2output{$sample}})."\n";
122 | }
123 | 
124 | # True when the sample ID embedded in a file name is listed in %exclude.
125 | # A file name without a recognisable sample ID is never excluded.
126 | sub sample_is_excluded {
127 | 	my ($path) = @_;
128 | #	if ($path =~ /([ATCG]{5,})/) {return exists($exclude{$1});}
129 | 	if ($path =~ /(sc\d_cell\d\d)/) {return exists($exclude{$1});}
130 | 	return 0;
131 | }
132 | 
133 | # Combine Rscripts
134 | 
135 | # GC content: pool every sample's histogram into bins one GC-percent wide.
136 | @files = glob("$ARGV[0]/RSEQC_*.GC_plot.r");
137 | my $nbins = 100;
138 | my @bincounts = (0) x $nbins;
139 | my $pdfcmd="pdf(\"$ARGV[0]/RSEQC_GC_plot_Combined.pdf\")\n";
140 | foreach my $file (@files) {
141 | 	if (sample_is_excluded($file)) {next;}
142 | 
143 | 	open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
144 | 	<$ifh>; #pdfcmd
145 | 	my $data = <$ifh>;
146 | 	close($ifh);
147 | 	if (!defined($data)) {die "No data line in $file\n";}
148 | # Data is "rep(c(values),times=c(counts))": splitting on brackets puts the values in field 2 and the counts in field 4.
149 | 	my @stuff = split(/[\(\)]/,$data);
150 | 	if (!defined($stuff[2]) || !defined($stuff[4])) {die "Cannot parse data line of $file\n";}
151 | 	my @values = split(",",$stuff[2]);
152 | 	my @times = split(",",$stuff[4]);
153 | 	if (scalar(@values) != scalar(@times)) {die "Does not compute: Not same number of values as times\n";}
154 | 	for (my $i =0; $i < scalar(@values); $i++) {
155 | # floor each data point to the nearest integer and add its count to that bin
156 | 		my $index = int($values[$i]);
157 | # a value of exactly 100 goes in the top bin, so there are never more bins than the 100 axis labels written below
158 | 		if ($index > $nbins-1) {$index = $nbins-1;}
159 | 		$bincounts[$index]+=$times[$i];
160 | 	}
161 | }
162 | open(my $ofh, ">", "$ARGV[0]/RSEQC_GC_plot_Combined.r") or die $!;
163 | print $ofh $pdfcmd;
164 | print $ofh "data=c(".join(",",@bincounts).")\n";
165 | print $ofh "xes=barplot(data/sum(data), space=0, col=\"white\",ylab=\"Density of Reads\", border=\"blue\", main=\"\", xlab=\"GCcontent (%)\")\n";
166 | print $ofh "axis(1,at=xes,labels=1:100,col=\"white\")\n";
167 | print $ofh "dev.off()\n";
168 | close($ofh);
169 | 
170 | # Gene body coverage: one column of coverage values per sample, drawn as one line each.
171 | @files = glob("$ARGV[0]/RSEQC_*.geneBodyCoverage.r");
172 | my $plotcmd = "matplot(data,type='l', xlab=\"Gene body percentile (5'->3')\", ylab=\"Coverage\",lwd=0.8,col=colours)\n";
173 | my @datacmds = ();
174 | $pdfcmd="pdf(\"$ARGV[0]/RSEQC_geneBodyCoverage_plot_Combined.pdf\")\n";
175 | foreach my $file (@files) {
176 | 	if (sample_is_excluded($file)) {next;}
177 | 
178 | 	open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
179 | 	my $data = <$ifh>;
180 | 	close($ifh);
181 | 	if (defined($data) && $data =~ /(c\(.+\))/) {
182 | 		push(@datacmds, $1);
183 | 	} else {
184 | 		warn "No coverage vector found in $file, skipping it\n";
185 | 	}
186 | }
187 | # One colour per plotted sample: excluded or unparsable files must not be counted.
188 | my $colourcmd = "colours=colorRampPalette(c(\"#7fc97f\",\"#beaed4\",\"#fdc086\",\"#ffff99\",\"#386cb0\",\"#f0027f\"))(".scalar(@datacmds).")\n";
189 | open($ofh, ">", "$ARGV[0]/RSEQC_geneBodyCoverage_plot_Combined.r") or die $!;
190 | print $ofh $pdfcmd;
191 | print $ofh "data=cbind(".join(",",@datacmds).")\n";
192 | print $ofh $colourcmd;
193 | print $ofh $plotcmd;
194 | print $ofh "dev.off()\n";
195 | close($ofh);
196 | 
197 | # Use average at each point for each line over all samples!
198 | @files = glob("$ARGV[0]/RSEQC_*.junctionSaturation_plot.r");
199 | my $xcmd = "x=c(5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100)";
200 | $pdfcmd="pdf(\"$ARGV[0]/RSEQC_junctionSaturation_plot_Combined.pdf\")";
201 | my $legendcmd = "legend(5,40, legend=c(\"All junctions\",\"known junctions\", \"novel junctions\"),col=c(\"blue\",\"red\",\"green\"),lwd=1,pch=1)";
202 | my %data = ("y" => [], "z" => [], "w" => []);
203 | my $nsamples = 0; # files actually summed, i.e. not excluded
204 | foreach my $file (@files) {
205 | 	if (sample_is_excluded($file)) {next;}
206 | 
207 | 	open (my $ifh, "<", $file) or die "Cannot open $file: $!\n";
208 | 	<$ifh>;<$ifh>; #pdf cmd, xes
209 | 	# The next three lines are "y=c(...)", "z=c(...)" and "w=c(...)", in that order.
210 | 	foreach my $name ("y", "z", "w") {
211 | 		my $line = <$ifh>;
212 | 		if (!defined($line)) {die "$file is truncated: no $name= line\n";}
213 | 		chomp($line);
214 | 		$line =~ s/$name=c\(//; $line =~ s/\)//;
215 | 		my @counts = split(/,/,$line);
216 | 		for (my $i = 0; $i < scalar(@counts); $i++) {$data{$name}->[$i] += $counts[$i];}
217 | 	}
218 | 	close($ifh);
219 | 	$nsamples++;
220 | }
221 | #plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(n,m))
222 | #points(x,y/1000,type='o',col='red')
223 | #points(x,w/1000,type='o',col='green')
224 | 
225 | if ($nsamples == 0) {
226 | 	warn "No junction saturation files to combine: RSEQC_junctionSaturation_plot_Combined.r not written\n";
227 | } else {
228 | 	open($ofh, ">", "$ARGV[0]/RSEQC_junctionSaturation_plot_Combined.r") or die $!;
229 | 	print $ofh $xcmd."\n";
230 | 	# Divide by the number of files that were summed, not by the number of files found.
231 | 	print $ofh "y=c(".join(",", @{$data{"y"}}).")/".$nsamples."\n";
232 | 	print $ofh "z=c(".join(",", @{$data{"z"}}).")/".$nsamples."\n";
233 | 	print $ofh "w=c(".join(",", @{$data{"w"}}).")/".$nsamples."\n";
234 | 	print $ofh "m=max(y,z,w)/1000\nn=min(y,z,w)/1000\n";
235 | 	print $ofh $pdfcmd."\n";
236 | 	print $ofh "plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(n,m))\npoints(x,y/1000,type='o',col='red')\npoints(x,w/1000,type='o',col='green')\n";
237 | 	print $ofh $legendcmd."\n";
238 | 
239 | 	print $ofh "dev.off()\n";
240 | 	close($ofh);
241 | }
242 | 


--------------------------------------------------------------------------------
/4_RSeQC_Multiple.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run the RSeQC quality-control scripts on one sorted BAM file.
 3 | # Arguments:
 4 | # $1 = Organism under consideration
 5 | # $2 = input bam
 6 | # $3 = output prefix
 7 | # $4 = full analysis? [0/1], 1=do all six analyses, 0=only do basic stats & rRNA content
 8 | 
 9 | ORGANISM=$1
10 | INPUTBAM=$2
11 | OUTPREFIX=$3
12 | FULLANALYSIS=$4
13 | MAP_QUALITY=30 #default=30 on Phred scale
14 | PIPELINEDIR=/nfs/users/nfs_t/ta6/RNASeqPipeline
15 | RSEQCDIR=$PIPELINEDIR/software/RSeQC-2.6.1/scripts
16 | 
17 | if [ -z "$ORGANISM" ] ; then
18 |   echo "Please set organism for reference annotations (ARG 1/4)"
19 |   exit 1
20 | fi
21 | 
22 | if [ -z "$INPUTBAM" ] || [ ! -f "$INPUTBAM" ] ; then
23 |   echo "$INPUTBAM does not exist. Please provide existing sorted BAM file (ARG 2/4)"
24 |   exit 1
25 | fi
26 | 
27 | if [ -z "$OUTPREFIX" ] ; then
28 |   echo "Please set a prefix for the output files (ARG 3/4)"
29 |   exit 1
30 | fi
31 | 
32 | if [ -z "$FULLANALYSIS" ] ; then
33 |   echo "Please set type of analysis: 0 = basic stats & rRNA content only, 1 = full analysis (ARG 4/4)"
34 |   exit 1
35 | fi
36 | 
37 | # Check relevant annotation/gene model files exist and have been converted to BED format
38 | # This code is duplicated in 4_DO_RSeQC_Multiple.sh
39 | MASKgtf=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf
40 | MASKbed="${MASKgtf%.gtf}.bed"
41 | # NOTE(review): the gene model is the mouse annotation whatever $ORGANISM is - confirm this is intended.
42 | REFGENOME="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf"
43 | REFGENOMEbed="${REFGENOME%.gtf}.bed"
44 | 
45 | # Make sure the BED file $2 exists and is non-empty, building it from the GTF file $1 if needed.
46 | # If the GTF is missing as well, 5_Cufflinks_wrapper.sh is run first to try to produce it.
47 | # Exits the whole script with status 1 when the BED file cannot be produced.
48 | ensure_bed() {
49 |   local gtf=$1
50 |   local bed=$2
51 |   if [ -s "$bed" ] ; then
52 |     return 0
53 |   fi
54 |   echo "Cannot find $bed: attempting to make it."
55 |   if [ ! -s "$gtf" ] ; then
56 |     "$PIPELINEDIR/5_Cufflinks_wrapper.sh" "$ORGANISM" 0
57 |   fi
58 |   if [ ! -s "$gtf" ] ; then
59 |     echo "Cannot find or make $gtf"
60 |     exit 1
61 |   fi
62 |   # Convert to bed format
63 |   perl "$PIPELINEDIR/4_Convert_GTF2BED_customized_for_Ensembl.pl" "$gtf" > "$bed"
64 |   if [ ! -s "$bed" ] ; then
65 |     echo "Failed to make $bed"
66 |     exit 1
67 |   fi
68 | }
69 | 
70 | ensure_bed "$MASKgtf" "$MASKbed"
71 | ensure_bed "$REFGENOME" "$REFGENOMEbed"
72 | 
73 | 
74 | # RUN RSeQC analysis
75 | # A bare "bash" call used to sit here ("get python path"). A child shell cannot change this script's
76 | # environment, and it blocks waiting for input when run from a terminal, so it has been removed.
77 | 
78 | echo "$INPUTBAM"
79 | python "$RSEQCDIR/split_bam.py" -i "$INPUTBAM" -r "$MASKbed" -o "$OUTPREFIX"
80 | python "$RSEQCDIR/bam_stat.py" -i "$INPUTBAM" -q "$MAP_QUALITY"
81 | if [ "$FULLANALYSIS" -gt 0 ] ; then
82 |   if [ ! -f "$INPUTBAM.bai" ] ; then
83 |     samtools index "$INPUTBAM"
84 |   fi
85 |   python "$RSEQCDIR/geneBody_coverage.py" -i "$INPUTBAM" -r "$REFGENOMEbed" -o "$OUTPREFIX" #requires BAM indexing file *.bam.bai -> index using samtools
86 |   # -m 20 = minimum intron size (keeping consistent with STAR ENCODE parameters)
87 |   python "$RSEQCDIR/junction_saturation.py" -i "$INPUTBAM" -r "$REFGENOMEbed" -o "$OUTPREFIX" -m 20 -q "$MAP_QUALITY"
88 |   python "$RSEQCDIR/read_GC.py" -i "$INPUTBAM" -o "$OUTPREFIX" -q "$MAP_QUALITY"
89 | #  python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/RNA_fragment_size.py -i $INPUTBAM -r $REFGENOMEbed -q $MAP_QUALITY  #requires BAM indexing file *.bam.bai, output takes up lot of memory & similar to what I will probably get from fragment counting software so not very important
90 | fi
91 | 
92 | # Remove the BAM files written under the output prefix (presumably the ones split_bam.py created above).
93 | rm -- "$OUTPREFIX"*.bam
94 | 


--------------------------------------------------------------------------------
/5.0_Summarize_Known_Transcriptome.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | #Things to filter: 
 5 | #	single exon non-reference transcripts (class 'u' 'i'  & single exon), 
 6 | #	transcripts with retained introns (class 'e'), 
 7 | #	polymerase read-though (class 'p'), 
 8 | #	class code 's' (likely read mapping error)
 9 | # helpful info: http://seqanswers.com/forums/showthread.php?t=3518
10 | # Stats I would like to have: (1) % reference transcripts recovered (# transcripts class '=' vs # transcripts genome, (2) # novel intergenic multi-exonic transcripts, (3) # novel alternatively spliced transcripts
11 | 
12 | # Usage: perl 5.0_Summarize_Known_Transcriptome.pl reference.gtf
13 | # Prints the number of distinct transcript_ids that have at least one exon_number attribute.
14 | if (@ARGV < 1) {die "Please provide reference GTF\n";}
15 | 
16 | # Highest exon_number seen so far for each transcript_id.
17 | my %transcriptid2numexons = ();
18 | open (my $ifh, "<", $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
19 | while (my $line = <$ifh>) {
20 | 	chomp($line);
21 | 	# Lines without a transcript_id are skipped: an unchecked match would leave $1
22 | 	# holding whatever the last successful match captured.
23 | 	if ($line !~ /transcript_id "(.+?)";/) {next;}
24 | 	my $tid = $1;
25 | 	if ($line =~ /exon_number "(\d+)"/) {
26 | 		if (!exists($transcriptid2numexons{$tid}) || $transcriptid2numexons{$tid} < $1) {
27 | 			$transcriptid2numexons{$tid} = $1;
28 | 		}
29 | 	}
30 | } close($ifh);
31 | 
32 | print "Number of Transcripts: ".scalar(keys(%transcriptid2numexons))."\n";
33 | 


--------------------------------------------------------------------------------
/5_Cufflinks_wrapper.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Note: this may be called by 4_RSeQC_Multiple.sh, 4_DO_RSeQC_Multiple.sh
  3 | # Arguments:
  4 | # $1 = organism: either Mmus or Hsap 
  5 | # $2 = number of threads to run on
  6 | # $3 = directory of input BAMs, the LSF job index selects which one to run on (only required if number of threads > 0)
  7 | # $4 = outputdir (optional, default = /lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified).
  8 | # $5 = gtf file. -> if provided allows faux reads, if not provided gets genome one and does not use faux-reads
  9 | 
 10 | CUFFLINKS=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cufflinks
 11 | ORGANISM=$1
 12 | NUMTHREADS=$2
 13 | INPUTDIR=$3
 14 | OUTDIR=$4
 15 | ANNOTATIONgtf=$5
 16 | TEMPDIR=/lustre/scratch108/compgen/team218/TA/TemporaryFileDir
 17 | GENOMEDIR=/lustre/scratch108/compgen/team218/TA/genomebuilding
 18 | 
 19 | # Without an LSF job index fall back to 7. This must happen before the index is used
 20 | # to pick the input BAM, so that the BAM, the seed and the JOB<n> output directory
 21 | # all refer to the same index.
 22 | if [ -z "$LSB_JOBINDEX" ] ; then
 23 |   LSB_JOBINDEX=7
 24 | fi
 25 | FILEStoMAP=("$INPUTDIR"/*.bam)
 26 | ARRAYINDEX=$((LSB_JOBINDEX-1))
 27 | INPUTBAM=${FILEStoMAP[$ARRAYINDEX]}
 28 | echo "Inputfile: $INPUTBAM"
 29 | 
 30 | if [ ! -f "$CUFFLINKS" ] ; then
 31 |   echo "Sorry Cufflinks not available"
 32 |   exit 1
 33 | fi
 34 | 
 35 | if [ -z "$ORGANISM" ] ; then
 36 |   echo "Please set organism for reference annotations (ARG 1/4)"
 37 |   exit 1
 38 | fi
 39 | 
 40 | if [ -z "$NUMTHREADS" ] ; then
 41 |   echo "Please set number of threads to run on, setting = 0 will get genome & rRNA gtf but not run cufflinks (ARG 2/4)"
 42 |   exit 1
 43 | fi
 44 | 
 45 | if [ "$NUMTHREADS" -gt 0 ] ; then
 46 |   if [ -z "$INPUTBAM" ] || [ ! -f "$INPUTBAM" ] ; then
 47 |     echo "$INPUTBAM, jobindex $LSB_JOBINDEX, array index $ARRAYINDEX of $INPUTDIR does not exist. Please provide a directory containing BAMfiles (ARG 3/4)"
 48 |     exit 1
 49 |   fi
 50 | fi
 51 | 
 52 | if [ -z "$OUTDIR" ] ; then
 53 |   OUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified
 54 | fi
 55 | 
 56 | SEED=$((100+LSB_JOBINDEX))
 57 | echo "rgenerator seed: $SEED"
 58 | 
 59 | FAUXREADS=""
 60 | 
 61 | # Get stuff for cufflinks:
 62 | # gtf if not already present, get genome fasta if not already present (basically run Build genome without actually running STAR.
 63 | if [ -z "$ANNOTATIONgtf" ] ; then
 64 |   echo "Using mapping-genome annotations"
 65 |   # These two hold glob patterns, not file names: an assignment does not expand them,
 66 |   # so they are expanded only where the variables are used unquoted further down.
 67 |   GENOMEfa=$GENOMEDIR/*.fa
 68 |   GENOMEgtf=$GENOMEDIR/*.gtf
 69 |   ANNOTATIONgtf=$GENOMEgtf
 70 |   FAUXREADS="--no-faux-reads"
 71 | 
 72 | #  if [ ! -s $GENOMEfa ] ; then
 73 | #    /nfs/users/nfs_t/ta6/RNASeqPipeline/0_BuildGenome.sh $GENOMEDIR $TEMPDIR 0 125 $ORGANISM /nfs/users/nfs_t/ta6/Collaborations/Bergiers_Italy
 74 | #  fi
 75 | #  if [ ! -s $GENOMEgtf ] ; then
 76 | #    /nfs/users/nfs_t/ta6/RNASeqPipeline/0_BuildGenome.sh $GENOMEDIR $TEMPDIR 0 125 $ORGANISM /nfs/users/nfs_t/ta6/Collaborations/Bergiers_Italy
 77 | #  fi
 78 | fi
 79 |   # get rRNA, mitochondial transcripts, tRNAs to mask -> hummmmm....... how best to do this? -> use grep to select the relevant lines from the existing .gtf is fast and ensures compatibility between the two gtf files and with the .fa file.
 80 | MASKgtf=$TEMPDIR/$ORGANISM-rRNAtRNAmtmRNAs-mask.gtf
 81 | if [ ! -f "$MASKgtf" ] ; then
 82 |   # -h: never prefix a match with its file name, which grep does as soon as more
 83 |   # than one .gtf matches and which would corrupt every line of the mask.
 84 |   grep -h -E 'rRNA|tRNA|^MT' "$GENOMEDIR"/*.gtf > "$MASKgtf"
 85 | fi
 86 | 
 87 | 
 88 | # GENOMEfa and GENOMEgtf are only set when the mapping-genome annotations are in use.
 89 | # They are left unquoted on purpose so that the glob patterns expand.
 90 | if [ -n "$GENOMEfa" ] && [ ! -s $GENOMEfa ] ; then
 91 |   echo "Failed to find or make $GENOMEfa"
 92 |   exit 1;
 93 | fi
 94 | if [ -n "$GENOMEgtf" ] && [ ! -s $GENOMEgtf ] ; then
 95 |   echo "Failed to find or make $GENOMEgtf"
 96 |   exit 1;
 97 | fi
 98 | if [ ! -f "$MASKgtf" ] ; then
 99 |   echo "Failed to find or make $MASKgtf"
100 |   exit 1;
101 | fi
102 | if [ ! -s "$MASKgtf" ] ; then
103 |   echo "Warning: Mask ($MASKgtf) is empty, continuing anyway..."
104 | fi
105 | 
106 | 
107 | 
108 | # Cufflinks options:
109 | # --GTF-guide <reference_annotation.gtf>
110 | # --mask-file <mask.gtf>
111 | # --frag-bias-correct <genome.fa>
112 | # --multi-read-correct
113 | # --quiet
114 | # --no-update-check
115 | # -o <outputdirectory>
116 | # --num-threads <number of threads used during analysis>
117 | # --seed <random # generator seed>
118 | # --max-intron-length 1000000 #keep consistent with STAR parameters
119 | # --min-intron-length 20 #keep consistent with STAR parameters
120 | # --max-multiread-fraction <maximum fraction of allowed multireads per transcript> #default is 0.75
121 | # --library-type <one of supported types> #default is fr-unstranded
122 | 
123 | #To fix failed jobs
124 | #if [ -d "$OUTDIR/JOB$LSB_JOBINDEX" ]; then
125 | #-----------------
126 | 
127 | if [ "$NUMTHREADS" -gt 0 ] ; then
128 |   OUTDIR=$OUTDIR/JOB$LSB_JOBINDEX
129 |   mkdir -p "$OUTDIR"
130 | 
131 |   # Get rid of S thing from STAR.
132 |   TMPSAM=Temp$LSB_JOBINDEX.out.sam
133 |   samtools view -h -o "$TEMPDIR/$TMPSAM" "$INPUTBAM"
134 | 
135 |   awk 'BEGIN {OFS="\t"} {split($6,C,/[0-9]*/); split($6,L,/[SMDIN]/); if (C[2]=="S") {$10=substr($10,L[1]+1); $11=substr($11,L[1]+1)}; if (C[length(C)]=="S") {L1=length($10)-L[length(L)-1]; $10=substr($10,1,L1); $11=substr($11,1,L1); }; gsub(/[0-9]*S/,"",$6); print}' "$TEMPDIR/$TMPSAM" > "$TEMPDIR/noS.$TMPSAM"
136 | 
137 |   NEWINPUTBAM=$TEMPDIR/noS.Temp$LSB_JOBINDEX.out.bam
138 |   samtools view -bS "$TEMPDIR/noS.$TMPSAM" > "$NEWINPUTBAM"
139 |   rm "$TEMPDIR/$TMPSAM"
140 |   rm "$TEMPDIR/noS.$TMPSAM"
141 | 
142 |   # de novo assembly command
143 | #  $CUFFLINKS --GTF-guide $ANNOTATIONgtf --frag-bias-correct $GENOMEfa --mask-file $MASKgtf --multi-read-correct --max-intron-length 1000000 --min-intron-length 20 -o $OUTDIR --quiet --no-update-check --no-faux-reads --seed $SEED --num-threads $NUMTHREADS $NEWINPUTBAM
144 |   # $ANNOTATIONgtf may hold a glob pattern and $FAUXREADS may be empty: both must stay unquoted.
145 |   "$CUFFLINKS" --GTF-guide $ANNOTATIONgtf --mask-file "$MASKgtf" --multi-read-correct --max-intron-length 1000000 --min-intron-length 20 -o "$OUTDIR" --quiet --no-update-check $FAUXREADS --seed "$SEED" --num-threads "$NUMTHREADS" "$NEWINPUTBAM"
146 |   rm "$NEWINPUTBAM"
147 |   perl  /nfs/users/nfs_t/ta6/RNASeqPipeline/5_TidyCufflinks.pl "$OUTDIR" "$INPUTBAM"
148 | fi
149 | #To fix failed jobs
150 | #fi
151 | #-----------------
152 | 


--------------------------------------------------------------------------------
/5_Cuffmerge_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Merge a list of GTF files with cuffmerge (5_DO_Cuffmerge.sh submits this script).
 3 | # Arguments:
 4 | # $1 = number of threads to run on
 5 | # $2 = file of files to merge
 6 | # $3 = reference gtf (optional)
 7 | # $4 = reference fasta (optional)
 8 | 
 9 | CUFFMERGE=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cuffmerge
10 | NUMTHREADS=$1
11 | INPUTFILE=$2
12 | REFgtf=$3
13 | REFfasta=$4
14 | 
15 | # Add gtf_to_sam and other accessory cufflinks scripts to my path
16 | export PATH=$PATH:/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/
17 | 
18 | if [ ! -f "$CUFFMERGE" ] ; then
19 |   echo "Sorry Cuffmerge not available"
20 |   exit 1
21 | fi
22 | 
23 | if [ -z "$NUMTHREADS" ] ; then
24 |   echo "Please set number of threads to run on (ARG 1/4)"
25 |   exit 1
26 | fi
27 | 
28 | # Anything containing a non-digit would make the numeric test below error out
29 | # and fall through, so reject it here.
30 | case $NUMTHREADS in
31 |   *[!0-9]*)
32 |     echo "Number of threads must be at least 1."
33 |     exit 1
34 |     ;;
35 | esac
36 | 
37 | if [ "$NUMTHREADS" -lt 1 ] ; then
38 |   echo "Number of threads must be at least 1."
39 |   exit 1
40 | fi
41 | 
42 | if [ -z "$INPUTFILE" ] ; then
43 |   echo "Please set provide a file with a list of gtf files to merge (ARG 2/4)"
44 |   exit 1
45 | fi
46 | 
47 | if [ ! -s "$INPUTFILE" ] ; then
48 |   echo "$INPUTFILE is empty or does not exist (ARG 2/4)"
49 |   exit 1
50 | fi
51 | 
52 | # Optional arguments are collected in an array so that each one reaches cuffmerge
53 | # as a separate word, and nothing at all is passed when the array is empty.
54 | MERGEARGS=()
55 | if [ -n "$REFgtf" ] ; then
56 |   if [ -s "$REFgtf" ] ; then
57 |     MERGEARGS+=(-g "$REFgtf")
58 |   else
59 |     echo "Reference GTF is empty or does not exist, will not be used";
60 |   fi
61 | fi
62 | 
63 | if [ -n "$REFfasta" ] ; then
64 |   if [ -s "$REFfasta" ] ; then
65 |     MERGEARGS+=(-s "$REFfasta")
66 |   else
67 |     echo "Reference FASTA is empty of does not exist, will not be used";
68 |   fi
69 | fi
70 | 
71 | 
72 | # Cuffmerge options:
73 | # -o outprefix->redirects stdout
74 | # -g ref-gtf
75 | # -p number of threads
76 | # -s ref-sequence
77 | "$CUFFMERGE" "${MERGEARGS[@]}" --num-threads "$NUMTHREADS" "$INPUTFILE"
78 | 


--------------------------------------------------------------------------------
/5_DO_Cufflinks.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Cufflinks
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_Tophat/Beuttner_Tophat2_dedup/Deduplicated
 7 | INPUTFILES=($INPUTDIR/*.bam)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES))
10 | 
11 | bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%40" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 $INPUTDIR $OUTPUTDIR /lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
12 | 


--------------------------------------------------------------------------------
/5_DO_Cufflinks_denovo_Transcripts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This is just a copy of commands in 00_LIST_OF_BSUB_COMMANDS.sh so that I can run it separately from the rest of the pipeline.
 3 | 
 4 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/DeNovoTranscripts
 5 | mkdir -p $OUTPUTDIR
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated
 7 | INPUTFILES=($INPUTDIR/*.bam)
 8 | NUMFILES=${#INPUTFILES[@]}
 9 | MAXJOBS=$(($NUMFILES))
10 | 
11 | bsub -J"mappingwithstararrayjob[48-$MAXJOBS]%40" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 $INPUTDIR $OUTPUTDIR
12 | 
13 | 


--------------------------------------------------------------------------------
/5_DO_Cuffmerge.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/DeNovoTranscripts
 4 | mkdir -p $OUTPUTDIR
 5 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified
 6 | INPUTFILE=$OUTPUTDIR/List_of_GTFs_to_merge.txt
 7 | REFgtf=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
 8 | REFfa=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
 9 | 
10 | ls $INPUTDIR/*_transcripts.gtf > $INPUTFILE
11 | 
12 | cd $OUTPUTDIR
13 | 
14 | readarray -t array < $INPUTFILE
15 | for file in ${array[@]} ; do
16 |   cat $file | sed "s/TNeo CDS/TNeoCDS/" > tempfile.tmp
17 |   mv tempfile.tmp $file
18 | done
19 | 
20 | bsub -R"select[mem>10000] rusage[mem=10000]" -M10000 -q normal -o output.%J -e error.%J /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cuffmerge_wrapper.sh 1 $INPUTFILE $REFgtf $REFfa
21 | 
22 | 


--------------------------------------------------------------------------------
/5_DO_Quantification_X2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # NOT TESTED
 3 | # Submits 5_featureCounts_wrapper.sh as an LSF job array, one element per Bergiers_Vivo BAM file.
 4 | 
 5 | NEWANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf"
 6 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Bergiers_Vivo/Deduplicated
 7 | OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Bergiers_Vivo
 8 | mkdir -p $OUTPUTDIR
 9 | 
10 | # The number of matching BAM files is the upper bound of the job array.
11 | INPUTFILES=($INPUTDIR/Bergiers_Vivo*.bam)
12 | MAXJOBS=${#INPUTFILES[@]}
13 | 
14 | #bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%20" -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal -o output.%J.%I -e error.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_Cufflinks_wrapper.sh Mmus 1 $INPUTDIR $OUTPUTDIR $NEWANNOTATIONgtf
15 | 
16 | 
17 | #Only run these one at a time because they create a huge amount of temporary files
18 | JOBNAME="mappingwithstararrayjob[1-$MAXJOBS]%1"
19 | MEMORY="select[mem>10000] rusage[mem=10000]"
20 | bsub -J"$JOBNAME" -R"$MEMORY" -M10000 -q normal -o FCountoutput.%J.%I -e FCounterror.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_featureCounts_wrapper.sh $NEWANNOTATIONgtf 1 $INPUTDIR
21 | 
22 | 


--------------------------------------------------------------------------------
/5_DO_featureCounts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Submits 5_featureCounts_wrapper.sh as an LSF job array, one element per BAM file in INPUTDIR.
 3 | 
 4 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDeDupped
 5 | # With nullglob a directory without BAMs gives an empty array, not a one-element
 6 | # array holding the literal pattern.
 7 | shopt -s nullglob
 8 | INPUTFILES=("$INPUTDIR"/*.bam)
 9 | shopt -u nullglob
10 | NUMFILES=${#INPUTFILES[@]}
11 | MAXJOBS=$NUMFILES
12 | OUTDIR=/lustre/scratch108/compgen/team218/TA/Bergiers_Wafergen/BigFilesMappedDedupedCounted
13 | # NOTE(review): TMPDIR is also the conventional temp-directory environment variable; if it is already exported, this changes it for the submitted jobs too - confirm that is wanted.
14 | TMPDIR=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/FeatureCounts
15 | mkdir -p "$OUTDIR"
16 | ANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Bergiers_Transcripts.gtf"
17 | featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts
18 | NUMTHREADS=1
19 | 
20 | if [ "$NUMFILES" -lt 1 ] ; then
21 |   echo "No BAM files found in $INPUTDIR: nothing to submit"
22 |   exit 1
23 | fi
24 | 
25 | if [ ! -f "$featureCOUNT" ] ; then
26 |   echo "Sorry featureCounts not available"
27 |   exit 1
28 | fi
29 | 
30 | if [ -z "$ANNOTATIONgtf" ] || [ ! -f "$ANNOTATIONgtf" ] ; then
31 |   echo "Please provide an annotation GTF file (ARG 1/3)"
32 |   exit 1
33 | fi
34 | 
35 | if [ -z "$NUMTHREADS" ] ; then
36 |   echo "Please set number of threads to run on (ARG 2/3)"
37 |   exit 1
38 | fi
39 | 
40 | if [ ! "$NUMTHREADS" -gt 0 ] ; then
41 |   echo "Error: number of threads must be > 0"
42 |   exit 1
43 | fi
44 | 
45 | # featureCounts options:
46 | # -t 'string' : specify the feature type to count reads for, default='exon'
47 | # -g 'string' : specify the attribute used to group features into meta-features, default='gene_id'
48 | # -f : read summarization performed at the feature level instead of the meta-feature level
49 | # -O : reads can match more than one feature/metafeature
50 | # -M : multi-mapping can be counted multiple times (once for each of their mapping locations
51 | #    alternatively
52 | # --primary : only primary alignments will be counted
53 | # -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC)
54 | # -T ## : number of threads to run on (default =1)
55 | # -R : output read counting assignments of each read into a .featureCounts file
56 | # --ignoreDup : ignores any reads marked as duplicates
57 | # -p : fragments rather than reads counted for paired-end data.
58 | # -d ## : minimum fragment/template legnth (default: 50) -> only if using -P parameter too
59 | # -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too
60 | # -B : only reads with both ends mapping considered
61 | # -C : reads with ends mapping to different Chrs excluded
62 | 
63 | bsub -J"featurecountsjobarray[1-$MAXJOBS]%100" -R"select[mem>5000] rusage[mem=5000]" -M5000 -q normal -o FCoutput.%J.%I /nfs/users/nfs_t/ta6/RNASeqPipeline/5_featureCounts_wrapper.sh "$ANNOTATIONgtf" "$NUMTHREADS" "$INPUTDIR" "$TMPDIR" "$OUTDIR"
64 | 


--------------------------------------------------------------------------------
/5_DO_featureCounts_locally.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # NOT TESTED
 3 | # Runs featureCounts on every BAM file in INPUTDIR, one after another, on the local machine.
 4 | 
 5 | INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Beuttner_STAR/Beuttner_STAR_dedup/Deduplicated/
 6 | OUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_FeatureCounts
 7 | mkdir -p "$OUTDIR"
 8 | ANNOTATIONgtf="/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf"
 9 | featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts
10 | NUMTHREADS=1
11 | 
12 | if [ ! -f "$featureCOUNT" ] ; then
13 |   echo "Sorry featureCounts not available"
14 |   exit 1
15 | fi
16 | 
17 | if [ -z "$ANNOTATIONgtf" ] || [ ! -f "$ANNOTATIONgtf" ] ; then
18 |   echo "Please provide an annotation GTF file (ARG 1/3)"
19 |   exit 1
20 | fi
21 | 
22 | if [ -z "$NUMTHREADS" ] ; then
23 |   echo "Please set number of threads to run on (ARG 2/3)"
24 |   exit 1
25 | fi
26 | 
27 | if [ ! "$NUMTHREADS" -gt 0 ] ; then
28 |   echo "Error: number of threads must be > 0"
29 |   exit 1
30 | fi
31 | 
32 | # featureCounts options:
33 | # -t 'string' : specify the feature type to count reads for, default='exon'
34 | # -g 'string' : specify the attribute used to group features into meta-features, default='gene_id'
35 | # -f : read summarization performed at the feature level instead of the meta-feature level
36 | # -O : reads can match more than one feature/metafeature
37 | # -M : multi-mapping can be counted multiple times (once for each of their mapping locations
38 | #    alternatively
39 | # --primary : only primary alignments will be counted
40 | # -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC)
41 | # -T ## : number of threads to run on (default =1)
42 | # -R : output read counting assignments of each read into a .featureCounts file
43 | # --ignoreDup : ignores any reads marked as duplicates
44 | # -p : fragments rather than reads counted for paired-end data.
45 | # -d ## : minimum fragment/template legnth (default: 50) -> only if using -P parameter too
46 | # -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too
47 | # -B : only reads with both ends mapping considered
48 | # -C : reads with ends mapping to different Chrs excluded
49 | 
50 | 
51 | for INPUTBAM in "$INPUTDIR"/*.bam ; do
52 |   # An unmatched pattern is left as the literal string: skip it instead of
53 |   # running featureCounts on a file that does not exist.
54 |   [ -e "$INPUTBAM" ] || continue
55 |   OUTPUTFILE=$(basename "${INPUTBAM%.bam}.fragmentcounts")
56 |   "$featureCOUNT" -O -M -Q 30 -T "$NUMTHREADS" -p -a "$ANNOTATIONgtf" -o "$OUTDIR/$OUTPUTFILE" "$INPUTBAM" #allow multimap
57 |   #$featureCOUNT -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #No multimap
58 |   # Caution: this removes every file named temp* in the current directory, whoever created it.
59 |   rm -f -- temp*
60 | done
61 | 


--------------------------------------------------------------------------------
/5_Fix_Transcriptome_for_featureCounts.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Usage: perl 5_Fix_Transcriptome_for_featureCounts.pl annotation.gtf
 5 | # Writes Transcripts_featureCounts.gtf in the current directory. Every record that has a gene_id
 6 | # is kept: exon records get their transcript_id replaced by the gene_id, every other record
 7 | # becomes an exon whose attributes are rebuilt from the gene_id alone.
 8 | if (@ARGV < 1) {die "5_Fix_Transcriptome_for_featureCounts.pl .gtf\n";}
 9 | 
10 | open (my $gtf_out, ">","Transcripts_featureCounts.gtf") or die $!;
11 | open (my $gtf, "<", $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
12 | while (my $gtf_line = <$gtf>) {
13 | 	if ($gtf_line =~ /^#/) {
14 | 		next;
15 | 	} # ignore headers
16 | 	if ($gtf_line !~ /gene_id "(.+?)";/) {
17 | 		next;
18 | 	} # get gene id
19 | 	my $geneid = $1;
20 | 	my @record = split(/\t/, $gtf_line);
21 | 	if ($record[2] eq "exon") {
22 | 		# Exons keep all their attributes apart from the transcript_id.
23 | 		$gtf_line =~ s/transcript_id "(.+?)"/transcript_id "$geneid"/;
24 | 		print $gtf_out $gtf_line;
25 | 	} else {
26 | 		# Any other feature type is relabelled as exon number 1 of its gene.
27 | 		$record[2] = "exon";
28 | 		$record[-1] = "gene_id \"$geneid\"; transcript_id \"$geneid\"; exon_number \"1\"; gene_name \"$geneid\"\n";
29 | 		print $gtf_out join("\t", @record);
30 | 	}
31 | }
32 | close($gtf);
33 | # A failed close on the output means the GTF may be incomplete (e.g. disk full).
34 | close($gtf_out) or die "Cannot finish writing Transcripts_featureCounts.gtf: $!\n";
35 | 


--------------------------------------------------------------------------------
/5_RSEM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Filter a BAM file down to properly paired reads, sort it by read name, and print
 3 | # the rsem-calculate-expression command for it (the command is only echoed, not run).
 4 | # Arguments:
 5 | # $1 = BAM file to map
 6 | # $2 = is paired end?
 7 | # $3 = number of threads to run on (default = 1)
 8 | RSEMdir=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26/
 9 | TEMPdir=/lustre/scratch108/compgen/team218/TA/Pipeline_RunningDir/RSEM/TMP
10 | BAMfile=$1
11 | paired=$2
12 | REFname=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/GRCm38
13 | BASEname=${BAMfile##*/}
14 | PREFIX=${BASEname%%.*}
15 | BAMfileOut=$TEMPdir/Out$BASEname
16 | BAMfiltered=$TEMPdir/Filtered$BASEname
17 | BAMfixed=$TEMPdir/Fixed$BASEname
18 | BAMsorted=$TEMPdir/Sorted$BASEname
19 | THREADS=$3
20 | 
21 | if [ -z "$BAMfile" ] || [ ! -f "$BAMfile" ] ; then
22 |   echo "$BAMfile does not exist. Please provide a BAM file (ARG 1/3)"
23 |   exit 1
24 | fi
25 | 
26 | if [ -z "$THREADS" ] ; then
27 |   THREADS=1
28 | fi
29 | 
30 | mkdir -p "$TEMPdir/$PREFIX"
31 | 
32 | # Stop at the first failing step: the next one would only work on a missing or partial file.
33 | samtools view -b -f 2 "$BAMfile" > "$BAMfiltered" || exit 1 # read mapped in proper pair
34 | # The second argument is an output prefix: the command echoed below reads $BAMsorted.bam.
35 | samtools sort -n "$BAMfiltered" "$BAMsorted" || exit 1
36 | 
37 | #/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/utilities/subtools -i $BAMsorted.bam -o $BAMsorted --informat BAM --outformat BAM --sort byname
38 | 
39 | #$RSEMdir/convert-sam-for-rsem $BAMsorted $BAMfixed
40 | 
41 | #echo "validate file"
42 | #$RSEMdir/rsem-sam-validator $BAMsorted 
43 | 
44 | #$RSEMdir/convert-sam-for-rsem $BAMsorted.bam $BAMfileOut -T $TEMPdir/$PREFIX
45 | 
46 | 
47 | #if [ $paired ] ; then
48 | echo "$RSEMdir/rsem-calculate-expression --bam --paired-end --num-threads $THREADS --single-cell-prior --temporary-folder $TEMPdir --no-bam-output $BAMsorted.bam $REFname $PREFIX"
49 | #else
50 | #  $RSEMdir/rsem-calculate-expression --bam --num-threads $THREADS --single-cell-prior --temporary-folder $TEMPdir --no-bam-output $BAMfileTmp $REFname $PREFIX
51 | #fi
52 | 
53 | #rm $BAMfileTmp
54 | 


--------------------------------------------------------------------------------
/5_RSEM_build_refrence.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Prepares the GRCm38 RSEM reference under $OUTDIR/$REFname twice: first with
 3 | # the --bowtie2 option, then with the --star option.
 4 | RSEMdir=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSEM-1.2.26
 5 | OUTDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES/RSEM/
 6 | REFname=GRCm38
 7 | GenomeGTF=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf
 8 | GenomeFASTA=/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa
 9 | BOWTIEpath=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/bowtie2-2.2.6/
10 | STARpath=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/STAR-STAR_2.4.0j/bin/Linux_x86_64_static/
11 | 
12 | # Pieces shared by both invocations.
13 | PREPARE=$RSEMdir/rsem-prepare-reference
14 | TARGET=$OUTDIR/$REFname
15 | 
16 | $PREPARE --gtf $GenomeGTF --bowtie2 --bowtie2-path $BOWTIEpath $GenomeFASTA $TARGET
17 | $PREPARE --gtf $GenomeGTF --star --star-path $STARpath $GenomeFASTA $TARGET
18 | 


--------------------------------------------------------------------------------
/5_Summarize_Filter_Merged_Transcriptome.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Filters a cuffmerge GTF by class code and summarises what was kept.
 5 | # Argument 1: the cuffmerge output GTF.
 6 | # Retained transcripts are written to New_Transcriptome.gtf in the current
 7 | # directory; per-class-code transcript counts and totals go to STDOUT.
 8 | #
 9 | #Things to filter: 
10 | #	single exon non-reference transcripts (class 'u' 'i'  & single exon), 
11 | #	transcripts with retained introns (class 'e'), 
12 | #	polymerase read-though (class 'p'), 
13 | #	class code 's' (likely read mapping error)
14 | #	class code 'r' (also skipped by the filter below)
15 | # helpful info: http://seqanswers.com/forums/showthread.php?t=3518
16 | # Stats I would like to have: (1) % reference transcripts recovered (# transcripts class '=' vs # transcripts genome, (2) # novel intergenic multi-exonic transcripts, (3) # novel alternatively spliced transcripts
17 | 
18 | if (@ARGV < 1) {die "Please provide cuffmerge outputfile\n";}
19 | 
20 | my %transcriptid2lines =();
21 | my %transcriptid2code=();
22 | my %transcriptid2numexons = ();
23 | open (my $ifh, $ARGV[0]) or die $!;
24 | while (my $line = <$ifh>) {
25 | 	chomp($line);
26 | 	# Captures are taken in list context and checked: after a failed match $1
27 | 	# would still hold the previous successful capture and mislabel the line.
28 | 	my ($tid) = $line =~ /transcript_id "([^"]+)"/;
29 | 	if (!defined($tid)) {die "No transcript_id on line $.\n";}
30 | 	my ($code) = $line =~ /class_code "([^"]+)"/;
31 | 	if (!defined($code)) {die "No class_code on line $.\n";}
32 | 	my ($exon) = $line =~ /exon_number "(\d+)"/;
33 | 	if (!defined($exon)) {die "exon_num not match\n";}
34 | 
35 | 	push(@{$transcriptid2lines{$tid}}, $line);
36 | 	if (exists($transcriptid2code{$tid}) && $transcriptid2code{$tid} ne $code) {die "Contradicting codes\n";}
37 | 	$transcriptid2code{$tid}=$code;
38 | 	# The highest exon number seen is used as the transcript's exon count.
39 | 	if (!exists($transcriptid2numexons{$tid}) || $transcriptid2numexons{$tid} < $exon) {
40 | 		$transcriptid2numexons{$tid} = $exon;
41 | 	}
42 | }
43 | close($ifh);
44 | 
45 | my $Nrecovered = 0;
46 | my $Nremoved = 0;
47 | my %code2count = ();	# transcripts (not GTF lines) per class code
48 | open(my $ofh, ">", "New_Transcriptome.gtf") or die $!;
49 | foreach my $tid (sort(keys(%transcriptid2lines))) {
50 | 	my $code = $transcriptid2code{$tid};
51 | 	my $exons = $transcriptid2numexons{$tid};
52 | 	$code2count{$code}++;
53 | 	my $filtered = ($exons == 1 && ($code eq "u" || $code eq "i"))
54 | 		|| $code eq "e" || $code eq "p" || $code eq "s" || $code eq "r";
55 | 	if ($filtered) {
56 | 		$Nremoved++;
57 | 		next;
58 | 	}
59 | 	if ($code eq "=") {$Nrecovered++;}
60 | 
61 | 	foreach my $line (@{$transcriptid2lines{$tid}}) {
62 | 		print $ofh $line."\n";
63 | 	}
64 | }
65 | close($ofh) or die $!;
66 | 
67 | foreach my $code (sort(keys(%code2count))) {
68 | 	print "$code : $code2count{$code}\n";
69 | }
70 | print "transcripts recovered: $Nrecovered\n";
71 | print "transcripts removed: $Nremoved\n";
72 | 


--------------------------------------------------------------------------------
/5_TidyCufflinks.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | use Cwd qw(getcwd);
 4 | use File::Copy qw(move);
 5 | 
 6 | # Moves everything out of a cufflinks output directory into its parent
 7 | # directory, prefixing each name with a sample tag, then removes the directory.
 8 | # Arguments:
 9 | #   1: directory of cufflinks output
10 | #   2: name of the file the output was produced from; the tag is the
11 | #      "<name>_Cell<two digits>" part that follows an underscore in it
12 | 
13 | if (@ARGV < 2) {die "Provide: a directory of cufflinks output, and the file it was produced from\n";}
14 | 
15 | my $tag="ERR";
16 | my $origfile = $ARGV[1];
17 | if ($origfile =~ /_([^_]+_Cell\d\d)/) {
18 |         $tag = $1;
19 | } else {
20 |         die "$origfile does not match\n";
21 | }
22 | 
23 | # Remember the starting directory: the directory argument may be relative to it,
24 | # and it has to be removed from there once it is empty.
25 | my $startdir = getcwd();
26 | if( chdir($ARGV[0])) {
27 | 	foreach my $file (glob("*")) {
28 | 		# move() needs no shell, so any file name is handled safely
29 | 		move($file, "../$tag\_$file") or print STDERR "Could not move $file: $!\n";
30 | 	}
31 | 	chdir($startdir) or die "error changing directory to $startdir";
32 | 	rmdir($ARGV[0]) or print STDERR "Could not remove $ARGV[0]: $!\n";
33 | } else {die "error changing directory to $ARGV[0]";}
34 | 


--------------------------------------------------------------------------------
/5_featureCounts_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs featureCounts on one BAM file, chosen by the LSF job-array index.
 3 | # Arguments:
 4 | # $1 = Annotation GTF file
 5 | # $2 = number of threads to run on (must be > 0)
 6 | # $3 = directory of input BAMs; file number $LSB_JOBINDEX (1-based, glob order) is counted
 7 | # $4 = workingdir (a subdirectory named $LSB_JOBINDEX is created, used and removed)
 8 | # $5 = outputdir
 9 | # The script changes into the working directory before counting, so relative
10 | # paths in $1, $3 and $5 are resolved from there.
11 | # This runs fast & efficiently, not on cluster took < 10 minutes to count one of the dedupped merged files.
12 | # But should run this on the complete annotations after cufflinks de novo transcript assembly.
13 | 
14 | featureCOUNT=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/subread-1.4.6-p2-Linux-x86_64/bin/featureCounts
15 | ANNOTATIONgtf=$1
16 | NUMTHREADS=$2
17 | INPUTDIR=$3
18 | WORKINGDIR=$4/$LSB_JOBINDEX
19 | OUTDIR=$5
20 | FILEStoMAP=("$INPUTDIR"/*.bam)
21 | # NOTE(review): with LSB_JOBINDEX unset or 0 this index is -1; confirm that case is intended.
22 | ARRAYINDEX=$(($LSB_JOBINDEX-1))
23 | INPUTBAM=${FILEStoMAP[$ARRAYINDEX]}
24 | echo "Inputfile: $INPUTBAM"
25 | 
26 | if [ ! -f "$featureCOUNT" ] ; then
27 |   echo "Sorry featureCounts not available"
28 |   exit 1
29 | fi
30 | 
31 | if [ -z "$ANNOTATIONgtf" ] || [ ! -f "$ANNOTATIONgtf" ] ; then
32 |   echo "Please provide an annotation GTF file (ARG 1/3)"
33 |   exit 1
34 | fi
35 | 
36 | if [ -z "$NUMTHREADS" ] ; then
37 |   echo "Please set number of threads to run on (ARG 2/3)"
38 |   exit 1
39 | fi
40 | 
41 | # Anything containing a non-digit is rejected here, before it can reach the
42 | # numeric test below.
43 | case "$NUMTHREADS" in
44 |   *[!0-9]*)
45 |     echo "Error: number of threads must be > 0"
46 |     exit 1
47 |     ;;
48 | esac
49 | 
50 | if [ "$NUMTHREADS" -gt 0 ] ; then
51 |   if [ -z "$INPUTBAM" ] || [ ! -f "$INPUTBAM" ] ; then
52 |     echo "$INPUTBAM does not exist. Please provide existing sorted BAM file (ARG 3/3)"
53 |     exit 1
54 |   fi
55 | else
56 |   echo "Error: number of threads must be > 0"
57 |   exit 1
58 | fi
59 | 
60 | # featureCounts options:
61 | # -t 'string' : specify the feature type to count reads for, default='exon'
62 | # -g 'string' : specify the attribute used to group features into meta-features, default='gene_id'
63 | # -f : read summarization performed at the feature level instead of the meta-feature level
64 | # -O : reads can match more than one feature/metafeature
65 | # -M : multi-mapping can be counted multiple times (once for each of their mapping locations
66 | #    alternatively
67 | # --primary : only primary alignments will be counted
68 | # -Q ## : minimum mapping quality (default = 0, 30 = consistent with RSeQC)
69 | # -T ## : number of threads to run on (default =1)
70 | # -R : output read counting assignments of each read into a .featureCounts file
71 | # --ignoreDup : ignores any reads marked as duplicates
72 | # -p : fragments rather than reads counted for paired-end data.
73 | # -d ## : minimum fragment/template legnth (default: 50) -> only if using -P parameter too
74 | # -D ## : maximum fragment/template length (default 600) -> only if using -P parameter too
75 | # -B : only reads with both ends mapping considered
76 | # -C : reads with ends mapping to different Chrs excluded
77 | mkdir -p "$WORKINGDIR" || exit 1
78 | # Abort if the directory cannot be entered: the cleanup below deletes temp*
79 | # from whatever the current directory is.
80 | cd "$WORKINGDIR" || exit 1
81 | # Absolute path, so the rmdir at the end still works after "cd .."
82 | WORKINGDIR=$(pwd)
83 | 
84 | OUTPUTFILE=$(basename "${INPUTBAM%.bam}.fragmentcounts")
85 | "$featureCOUNT" -O -M -T "$NUMTHREADS" -a "$ANNOTATIONgtf" -o "$OUTDIR/$OUTPUTFILE" "$INPUTBAM" #yes multimap, single end, no quality threshold
86 | STATUS=$? # remembered so a failed count is still reported after the cleanup
87 | #$featureCOUNT -O -M -T $NUMTHREADS -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #allow multimap, single end, no quality threshold
88 | #$featureCOUNT -O -M -Q 30 -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #allow multimap
89 | #$featureCOUNT -Q 30 -T $NUMTHREADS -p -a $ANNOTATIONgtf -o $OUTDIR/$OUTPUTFILE $INPUTBAM #no multimap
90 | rm -f -- temp*
91 | cd ..
92 | rmdir "$WORKINGDIR" || exit 1
93 | exit $STATUS
94 | 


--------------------------------------------------------------------------------
/6.1_Get_Expression_Kallisto.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use warnings;
  3 | 
  4 | # Combines kallisto abundance tables into two tab-separated matrices (rows:
  5 | # transcripts or genes, columns: samples): one of counts and one of TPM.
  6 | # Arguments:
  7 | #   1: output prefix -> <prefix>_<suffix>_counts.txt and <prefix>_<suffix>_tpm.txt
  8 | #   2: GTF annotation supplying the transcript id -> gene id mapping
  9 | #   3: kallisto output directory; files matching *.abundance*.tsv are read
 10 | #   4: 0/1 - sum the transcripts of each gene?
 11 | # A feature missing from a sample is written as NA.
 12 | 
 13 | if (@ARGV != 4) {die "Usage: perl 6.1_Get_Expression_Kallisto.pl /path/outprefix /path/genome.gtf /path/kallisto_output_directory [0/1 - aggregate by gene?].\n";}
 14 | 
 15 | my $outprefix = $ARGV[0];
 16 | my $gtf_file = $ARGV[1];
 17 | my $abundance_dir = $ARGV[2];
 18 | my $agg_by_gene = $ARGV[3];
 19 | 
 20 | # The glob pattern is plain concatenation, so a directory given without its
 21 | # trailing slash would be searched beside rather than inside; add the slash.
 22 | if (-d $abundance_dir && $abundance_dir !~ /\/$/) {
 23 | 	$abundance_dir .= "/";
 24 | }
 25 | my $abundance_string = $abundance_dir."*.abundance*.tsv";
 26 | 
 27 | my $suffix = "kallisto_trans";
 28 | if ($agg_by_gene) {
 29 | 	$suffix = "kallisto_gene";
 30 | }
 31 | 
 32 | my %Gene2ID2FragCount = ();
 33 | my %Gene2ID2TPM = ();
 34 | my @IDs = ();
 35 | 
 36 | # Transcript -> gene mapping, taken from annotation lines that are neither gene
 37 | # features nor carry an exon_id.
 38 | my %transcript2gene = ();
 39 | open (my $gtf_fh, "$gtf_file") or die $!;
 40 | while (<$gtf_fh>) {
 41 | 	if ($_ =~ /^#/) {next;}
 42 | 	my $geneid = "ERROR";
 43 | 	if ($_ =~ /gene_id "(.+?)"/) {
 44 | 		$geneid=$1;
 45 | 	} else {
 46 | 		die "No gene id!\n";
 47 | 	}
 48 | 
 49 | 	my @record = split(/\t/);
 50 | 	if ($record[2] =~ /gene/i) {
 51 | 		#gene
 52 | 	} elsif ($_ =~ /exon_id "(.+?)"/) {
 53 | 		#exon
 54 | 	} elsif ($_ =~ /transcript_id "(.+?)"/) {
 55 | 		#transcript
 56 | 		$transcript2gene{$1}=$geneid;
 57 | 	}
 58 | }
 59 | close ($gtf_fh);
 60 | print STDERR "Done reading Annotations\n";
 61 | 
 62 | my @abundance_files = glob("$abundance_string");
 63 | if (!@abundance_files) {
 64 | 	print STDERR "Warning: no files match $abundance_string\n";
 65 | }
 66 | foreach my $file (@abundance_files) {
 67 | 	# Sample id = file name up to ".abundance"
 68 | 	my $ID = "ERROR";
 69 | 	if ($file =~ /([^\/]+)\.abundance/) {
 70 | 		$ID = $1;
 71 | 	} else {die "$file did not match!\n";}
 72 | 	push(@IDs,$ID);
 73 | 	open(my $ifh, $file) or die $!;
 74 | 	while (<$ifh>) {
 75 | 		chomp;
 76 | 		if ($_ =~ /^#/ || $_ =~ /^target_id/) {next;} #skip header & comments
 77 | 		my @record=split(/\t/);
 78 | 		my $gene = $record[0]; $gene =~ s/\s+//g;
 79 | 		if (exists($transcript2gene{$gene}) && $agg_by_gene) {
 80 | 			$gene = $transcript2gene{$gene};
 81 | 		}
 82 | 		# Column 4 is stored as the count and column 5 as the TPM. The first value
 83 | 		# is stored verbatim; later ones (same row, same sample) are added to it.
 84 | 		if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
 85 | 			$Gene2ID2FragCount{$gene}->{$ID} += $record[3];
 86 | 			$Gene2ID2TPM{$gene}->{$ID} += $record[4];
 87 | 		} else {
 88 | 			$Gene2ID2FragCount{$gene}->{$ID} = $record[3];
 89 | 			$Gene2ID2TPM{$gene}->{$ID} = $record[4];
 90 | 		}
 91 | 	}
 92 | 	close ($ifh);
 93 | }
 94 | 
 95 | open(my $ofh1, ">", "$outprefix\_$suffix\_counts.txt") or die $!;
 96 | open(my $ofh2, ">", "$outprefix\_$suffix\_tpm.txt") or die $!;
 97 | print $ofh1 "Gene\t".join("\t",@IDs)."\n";
 98 | print $ofh2 "Gene\t".join("\t",@IDs)."\n";
 99 | foreach my $gene (sort(keys(%Gene2ID2FragCount))) {
100 | 	print $ofh1 "$gene";
101 | 	print $ofh2 "$gene";
102 | 	foreach my $ID (@IDs) {
103 | 		my $count = exists($Gene2ID2FragCount{$gene}->{$ID}) ? $Gene2ID2FragCount{$gene}->{$ID} : "NA";
104 | 		my $tpm = exists($Gene2ID2TPM{$gene}->{$ID}) ? $Gene2ID2TPM{$gene}->{$ID} : "NA";
105 | 		print $ofh1 "\t".$count;
106 | 		print $ofh2 "\t".$tpm;
107 | 	}
108 | 	print $ofh1 "\n";
109 | 	print $ofh2 "\n";
110 | }
111 | close($ofh1) or die $!;
112 | close($ofh2) or die $!;
113 | 


--------------------------------------------------------------------------------
/6_Get_Construct_Expression_Cufflinks.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Prints the FPKM and locus of every A2lox-TRE entry found in the
 5 | # *genes.fpkm_tracking files of a directory.
 6 | # Argument 1 (optional): directory to search; defaults to the location below.
 7 | # Output (STDOUT, tab-separated): sample id, first column of the entry, FPKM,
 8 | # locus start, locus end.
 9 | 
10 | my $dir = "/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified";
11 | if (@ARGV > 0) {$dir = $ARGV[0];}
12 | 
13 | my %ID2Things = ();
14 | my %ID2Thingsloci = ();
15 | 
16 | my $fpkmcol = 9;
17 | 
18 | foreach my $file (glob("$dir/*genes.fpkm_tracking")) {
19 | 	# The sample id is the first run of 5+ A/C/G/T letters in the file name. The
20 | 	# match is checked because a failed match leaves $1 unset or stale.
21 | 	my ($name) = $file =~ /([^\/]+)$/;
22 | 	my ($ID) = $name =~ /([ATCG]{5,})/;
23 | 	if (!defined($ID)) {
24 | 		print STDERR "Skipping $file: no [ATCG]{5,} id in its name\n";
25 | 		next;
26 | 	}
27 | 	open(my $ifh, $file) or die $!;
28 | 	while (<$ifh>) {
29 | 		chomp;
30 | 		if ($_ =~ /A2lox-TRE/) {
31 | 			my @record=split(/\t/);
32 | 			$ID2Things{$ID}->{$record[0]} = $record[$fpkmcol];
33 | 			# Column 7 holds the locus; keep its "start-end" part as two fields.
34 | 			my $locus = "NA\tNA";
35 | 			if (defined($record[6]) && $record[6] =~ /(\d+)-(\d+)/) {
36 | 				$locus = "$1\t$2";
37 | 			}
38 | 			$ID2Thingsloci{$ID}->{$record[0]} = $locus;
39 | 		}
40 | 	}
41 | 	close ($ifh);
42 | }
43 | 
44 | foreach my $id (sort(keys(%ID2Things))) {
45 | 	foreach my $thing (sort(keys(%{$ID2Things{$id}}))) {
46 | 		print "$id\t$thing\t".$ID2Things{$id}->{$thing}."\t".$ID2Thingsloci{$id}->{$thing}."\n";
47 | 	}
48 | }
49 | 


--------------------------------------------------------------------------------
/6_Get_Expression_featureCounts.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Merges featureCounts output into one table on STDOUT: a header of sample ids
 5 | # (with no label for the gene column), one row of counts per gene, and a final
 6 | # Unassigned_Various row that sums, per sample, every row after the first two
 7 | # lines of the matching .summary file.
 8 | # Argument 1: directory holding *.fragmentcounts and *.fragmentcounts.summary.
 9 | 
10 | if (@ARGV < 1) {die "Please provide directory of featurecounts output\n";}
11 | 
12 | my $dir = $ARGV[0];
13 | 
14 | my %Gene2ID2FragCount = ();
15 | my @IDs = ();
16 | 
17 | foreach my $file (glob("$dir/*.fragmentcounts")) {
18 | 	my $ID = "ERR";
19 | 	if ($file =~ /([ATCG]{5,})A/) {
20 | #	if ($file =~ /_([^_]+_Cell\d\d)/) {
21 | 		$ID = $1;
22 | 	} else {
23 | 		die "$file does not match\n";
24 | 	}
25 | 	push(@IDs,$ID);
26 | 	open(my $ifh, $file) or die $!;
27 | 	while (<$ifh>) {
28 | 		chomp;
29 | 		if ($_ =~ /^#/ || $_ =~ /^Geneid/) {next;} #skip header & comments
30 | 		my @record=split(/\t/);
31 | 		my $gene = $record[0]; $gene =~ s/\s+//g;
32 | 		$Gene2ID2FragCount{$gene}->{$ID} = $record[6]; # 7th column is kept as the count
33 | 	}
34 | 	close ($ifh);
35 | }
36 | 
37 | print join("\t",@IDs)."\n";
38 | foreach my $gene (sort(keys(%Gene2ID2FragCount))) {
39 | 	print "$gene";
40 | 	foreach my $ID (@IDs) {
41 | 		# genes absent from a sample are reported as 0
42 | 		my $count = exists($Gene2ID2FragCount{$gene}->{$ID}) ? $Gene2ID2FragCount{$gene}->{$ID} : "0";
43 | 		print "\t".$count;
44 | 	}
45 | 	print "\n";
46 | }
47 | 
48 | my %ID2Unassigned = ();
49 | foreach my $file (glob("$dir/*.fragmentcounts.summary")) {
50 | 	my $ID = "ERR";
51 | 	#if ($file =~ /_([^_]+_Cell\d\d)/) {
52 | 	if ($file =~ /([ATCG]{5,})A/) {
53 | 		$ID = $1;
54 | 	} else {
55 | 		die "$file does not match\n";
56 | 	}
57 | 	open(my $ifh, $file) or die $!;
58 | 	<$ifh>; # header
59 | 	<$ifh>; #Assigned
60 | 	while (<$ifh>) {
61 | 		chomp;
62 | 		my @record=split(/\t/);
63 | 		if (@record < 2) {next;} # blank or malformed line: nothing to add
64 | 		$ID2Unassigned{$ID} += $record[1];
65 | 	}
66 | 	close ($ifh);
67 | }
68 | 
69 | print "Unassigned_Various";
70 | foreach my $ID (@IDs) {
71 | 	# a sample without summary data gets NA instead of an undefined value
72 | 	print "\t".(exists($ID2Unassigned{$ID}) ? $ID2Unassigned{$ID} : "NA");
73 | }
74 | print "\n";
75 | 


--------------------------------------------------------------------------------
/6_Get_Kallisto.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | # Sums kallisto transcript abundances per gene and writes two tab-separated
 4 | # matrices (rows: genes, columns: samples): <prefix>.tpm and <prefix>.counts.
 5 | # Transcripts missing from the annotation keep their own id as the row name.
 6 | # Features absent from a sample are written as 0 (changed from NA on Feb 9 2016).
 7 | # Arguments:
 8 | #   1: directory of kallisto output (*.abundances.tsv)
 9 | #   2: prefix for the output files
10 | #   3: (optional) GTF annotation; defaults to the GRCm38.79 file below
11 | 
12 | if (@ARGV < 2) {die "Please supply a directory of Kallisto Output and a prefix for output\n";}
13 | 
14 | my $dir = $ARGV[0];
15 | my $outprefix = $ARGV[1];
16 | my $gtf_file = "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf";
17 | if (@ARGV > 2) {$gtf_file = $ARGV[2];}
18 | 
19 | # Now read in genome annotations
20 | my %transcript2gene = ();
21 | open (my $gtf_fh, $gtf_file) or die $!;
22 | while (<$gtf_fh>) {
23 | 	if ($_ =~ /^#/) {next;}
24 | 
25 | 	if ($_ =~ /transcript_id "(.+?)"/) {
26 | 		my $transid = $1;
27 | 		if ($_ =~ /gene_id "(.+?)"/) {
28 | 			$transcript2gene{$transid}=$1;
29 | 		} else {
30 | 			die "No gene id!\n";
31 | 		}
32 | 	}
33 | }
34 | close ($gtf_fh);
35 | print STDERR "Done reading Annotations\n";
36 | 
37 | my %AllSamples = ();
38 | my %Gene2Sample2TPM = ();
39 | my %Gene2Sample2Counts=();
40 | 
41 | my @files = glob("$dir/*.abundances.tsv");
42 | foreach my $file (@files) {
43 | 	# Sample id: taken from the file name only, so that directory names cannot
44 | 	# leak into (or be mistaken for) the id.
45 | 	my ($name) = $file =~ /([^\/]+)$/;
46 | 	my $ID = "ERR";
47 | 	if ($name =~ /([^_]+_Cell\d\d)/) {
48 | 		$ID = $1;
49 | 	} else {
50 | 		die "$file does not match!";
51 | 	}
52 | 	$AllSamples{$ID}=1;
53 | 	open(my $ifh, $file) or die $!;
54 | 	<$ifh>; # header
55 | 	while (<$ifh>) {
56 | 		chomp;
57 | 		my @record = split(/\t/);
58 | 		my $trans = $record[0];
59 | 		my $count = $record[3];
60 | 		my $tpm = $record[4];
61 | 		my $gene = $trans;
62 | 		if (exists($transcript2gene{$trans})) {
63 | 			$gene = $transcript2gene{$trans};
64 | 		}
65 | 		# Test the gene *and* the sample so the first value of every sample is
66 | 		# stored as read and only later transcripts are added to it.
67 | 		if (exists($Gene2Sample2TPM{$gene}->{$ID})) {
68 | 			$Gene2Sample2TPM{$gene}->{$ID}+=$tpm;
69 | 			$Gene2Sample2Counts{$gene}->{$ID}+=$count;
70 | 		} else {
71 | 			$Gene2Sample2TPM{$gene}->{$ID}=$tpm;
72 | 			$Gene2Sample2Counts{$gene}->{$ID}=$count;
73 | 		}
74 | 	}
75 | 	close ($ifh);
76 | }
77 | 
78 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!;
79 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!;
80 | my @IDs = sort(keys(%AllSamples));
81 | print $ofhtpm "Gene\t".join("\t",@IDs)."\n";
82 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n";
83 | 
84 | foreach my $gene (sort(keys(%Gene2Sample2TPM))) {
85 | 	print $ofhtpm "$gene";
86 | 	print $ofhcounts "$gene";
87 | 	foreach my $ID (@IDs) {
88 | 		my $tpm = exists($Gene2Sample2TPM{$gene}->{$ID}) ? $Gene2Sample2TPM{$gene}->{$ID} : "0";
89 | 		my $count = exists($Gene2Sample2Counts{$gene}->{$ID}) ? $Gene2Sample2Counts{$gene}->{$ID} : "0";
90 | 		print $ofhcounts "\t".$count;
91 | 		print $ofhtpm "\t".$tpm;
92 | 	}
93 | 	print $ofhcounts "\n";
94 | 	print $ofhtpm "\n";
95 | }
96 | close($ofhcounts);
97 | close($ofhtpm);
98 | 


--------------------------------------------------------------------------------
/6_Get_RSEM_Expression.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Collects RSEM gene-level results into two tab-separated matrices (rows: genes,
 5 | # columns: numeric sample ids): <prefix>.fpkm and <prefix>.tpm.
 6 | # Genes absent from a sample are written as 0.
 7 | # Arguments:
 8 | #   1: directory holding *.genes.results files named like RSEM-<number>-...
 9 | #   2: prefix for the output files
10 | 
11 | if (@ARGV < 2) {die "Please supply a directory of RSEM Output and a prefix for outputfiles\n";}
12 | 
13 | my $dir = $ARGV[0];
14 | my $outprefix = $ARGV[1];
15 | 
16 | my %AllSamples = ();
17 | my %Gene2Sample2FPKM = ();
18 | my %Gene2Sample2TPM = ();
19 | 
20 | my @files = glob("$dir/*.genes.results");
21 | foreach my $file (@files) {
22 | 	# Take the id from the file name alone: the pattern is unanchored, so a
23 | 	# directory called e.g. RSEM-2016-run would otherwise supply the id of every file.
24 | 	my ($name) = $file =~ /([^\/]+)$/;
25 | 	my $ID = "ERR";
26 | 	if ($name =~ /RSEM-(\d+)-/) {
27 | 		$ID = $1;
28 | 	} else {
29 | 		die "$file does not match!";
30 | 	}
31 | 	$AllSamples{$ID}=1;
32 | 	open(my $ifh, $file) or die $!;
33 | 	<$ifh>; # header
34 | 	while (<$ifh>) {
35 | 		chomp;
36 | 		my @record = split(/\t/);
37 | 		my $gene = $record[0];
38 | 		$Gene2Sample2FPKM{$gene}->{$ID} = $record[6]; # 7th column is kept as FPKM
39 | 		$Gene2Sample2TPM{$gene}->{$ID} = $record[5]; # 6th column is kept as TPM
40 | 	}
41 | 	close ($ifh);
42 | }
43 | 
44 | print STDERR "Done reading FPKMs\n";
45 | 
46 | open (my $ofhfpkm, ">", "$outprefix.fpkm") or die $!;
47 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!;
48 | my @IDs = sort{$a<=>$b} keys(%AllSamples);
49 | print $ofhfpkm "Gene\t".join("\t",@IDs)."\n";
50 | print $ofhtpm "Gene\t".join("\t",@IDs)."\n";
51 | 
52 | foreach my $gene (sort(keys(%Gene2Sample2FPKM))) {
53 | 	print $ofhfpkm "$gene";
54 | 	print $ofhtpm "$gene";
55 | 	foreach my $ID (@IDs) {
56 | 		my $fpkm = exists($Gene2Sample2FPKM{$gene}->{$ID}) ? $Gene2Sample2FPKM{$gene}->{$ID} : "0";
57 | 		my $tpm = exists($Gene2Sample2TPM{$gene}->{$ID}) ? $Gene2Sample2TPM{$gene}->{$ID} : "0";
58 | 		print $ofhfpkm "\t".$fpkm;
59 | 		print $ofhtpm "\t".$tpm;
60 | 	}
61 | 	print $ofhfpkm "\n";
62 | 	print $ofhtpm "\n";
63 | }
64 | close($ofhtpm);
65 | close($ofhfpkm);
66 | 


--------------------------------------------------------------------------------
/6_Get_Salmon_Expression.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | # Collects Salmon quantifications into two tab-separated matrices (rows: genes
 4 | # or transcripts, columns: samples): <prefix>.tpm and <prefix>.counts.
 5 | # Features absent from a sample are written as 0 (changed from NA on Feb 9 2016).
 6 | 
 7 | # All three arguments are read below, so all three are required.
 8 | if (@ARGV < 3) {die "Usage: 6_Get_Salmon_Expression.pl [directory of Salmon output] [gene|transcript] [output prefix]\n";}
 9 | 
10 | my $dir = $ARGV[0];
11 | my $feature = $ARGV[1];
12 | my $outprefix = $ARGV[2];
13 | 
14 | # Process arguments: gene-level and transcript-level results are read from different files
15 | my @files = ();
16 | if ($feature =~ /gene/i) {
17 | 	@files = glob("$dir/*.quant.genes.sf");
18 | } else {
19 | 	@files = glob("$dir/*.quant.sf");
20 | }
21 | # TPM (4th column) and read counts (5th column) are collected in the same pass.
22 | 
23 | my %AllSamples = ();
24 | my %Gene2Sample2TPM = ();
25 | my %Gene2Sample2Counts=();
26 | 
27 | foreach my $file (@files) {
28 | 	# Regular expressions to extract sample name from file name (the directory
29 | 	# part is stripped first so it cannot become part of the name)
30 | 	my ($name) = $file =~ /([^\/]+)$/;
31 | 	my $ID = "ERR";
32 | 	if ($name =~ /([^_]+_Cell\d\d)/) {
33 | 		$ID = $1;
34 | 	} else {
35 | 		die "$file does not match!";
36 | 	}
37 | 	#####
38 | 	$AllSamples{$ID}=1;
39 | 	open(my $ifh, $file) or die $!;
40 | 	<$ifh>; # header
41 | 	while (<$ifh>) {
42 | 		chomp;
43 | 		my @record = split(/\t/);
44 | 		my $rowname = $record[0]; # named so it no longer shadows $feature above
45 | 		my $count = $record[4];
46 | 		my $tpm = $record[3];
47 | 		# Test the row *and* the sample so the first value of every sample is
48 | 		# stored as read and only repeated rows are added to it.
49 | 		if (exists($Gene2Sample2TPM{$rowname}->{$ID})) {
50 | 			$Gene2Sample2TPM{$rowname}->{$ID}+=$tpm;
51 | 			$Gene2Sample2Counts{$rowname}->{$ID}+=$count;
52 | 		} else {
53 | 			$Gene2Sample2TPM{$rowname}->{$ID}=$tpm;
54 | 			$Gene2Sample2Counts{$rowname}->{$ID}=$count;
55 | 		}
56 | 	}
57 | 	close ($ifh);
58 | }
59 | 
60 | open (my $ofhtpm, ">", "$outprefix.tpm") or die $!;
61 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!;
62 | my @IDs = sort(keys(%AllSamples));
63 | print $ofhtpm "Gene\t".join("\t",@IDs)."\n";
64 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n";
65 | 
66 | foreach my $gene (sort(keys(%Gene2Sample2TPM))) {
67 | 	print $ofhtpm "$gene";
68 | 	print $ofhcounts "$gene";
69 | 	foreach my $ID (@IDs) {
70 | 		my $tpm = exists($Gene2Sample2TPM{$gene}->{$ID}) ? $Gene2Sample2TPM{$gene}->{$ID} : "0";
71 | 		my $count = exists($Gene2Sample2Counts{$gene}->{$ID}) ? $Gene2Sample2Counts{$gene}->{$ID} : "0";
72 | 		print $ofhcounts "\t".$count;
73 | 		print $ofhtpm "\t".$tpm;
74 | 	}
75 | 	print $ofhcounts "\n";
76 | 	print $ofhtpm "\n";
77 | }
78 | close($ofhcounts);
79 | close($ofhtpm);
80 | 


--------------------------------------------------------------------------------
/99_Check_Barcodes.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Counts how often each UMI occurs in a FASTQ file of barcode reads and prints
 5 | # "UMI count" lines sorted by increasing count.
 6 | # A read is counted when it starts with 11 cell-id bases followed by a 10-base
 7 | # UMI; reads that do not fit this layout are ignored.
 8 | # Only the first argument is read; further arguments are accepted and ignored.
 9 | 
10 | if (@ARGV < 1) { die "Usage: 99_Check_Barcodes.pl INPUT1 [INPUT2 ProjectName]\n";}
11 | my $infile1 = $ARGV[0];
12 | 
13 | my %Barcodes = ();
14 | open (my $ifh1, $infile1) or die $!;
15 | # Read whole four-line records. Looking for lines starting with '@' is not
16 | # enough: '@' is also a legal first character of a quality line.
17 | while (my $header = <$ifh1>) {
18 | 	my $barcodes = <$ifh1>;
19 | 	my $plus = <$ifh1>;
20 | 	my $quality = <$ifh1>;
21 | 	if (!defined($quality)) {last;} # truncated final record
22 | 	if ($header !~ /^@/) {die "$infile1 is not a four-line FASTQ file (line ".($.-3).")\n";}
23 | 	if ($barcodes =~ /^([ATCGNUKMRYSWBVHDX]{11})([ATCGNUKMRYSWBVHDX]{10})/){
24 | 		my $UMI = $2;
25 | 		$Barcodes{$UMI}++;
26 | 	}
27 | }
28 | close($ifh1);
29 | 
30 | my @codes = sort{$Barcodes{$a}<=>$Barcodes{$b}} keys(%Barcodes);
31 | foreach my $code (@codes) {
32 | 	print "$code ".$Barcodes{$code}."\n";
33 | }
34 | 


--------------------------------------------------------------------------------
/99_Check_RSEM_Output.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Prints, one per line and in numeric order, the distinct sample numbers found
 5 | # in the paths of the bowtie2*.genes.results files of a directory.
 6 | 
 7 | if (@ARGV < 1) {die "Please supply a directory of RSEM Output\n";}
 8 | 
 9 | my $dir = $ARGV[0];
10 | 
11 | # Sample numbers seen so far; paths without the bowtie2_RSEM-<number> pattern are skipped.
12 | my %seen = ();
13 | foreach my $path (glob("$dir/bowtie2*.genes.results")) {
14 | 	next unless ($path =~ /bowtie2_RSEM-(\d+)/);
15 | 	$seen{$1} = 1;
16 | }
17 | 
18 | print(join("\n", sort {$a <=> $b} keys(%seen)));
19 | 


--------------------------------------------------------------------------------
/99_Check_Results.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Summarises LSF output files given as arguments: number of files, number of
 5 | # "Successfully completed" lines, the largest "Max Memory" value (MB) and the
 6 | # summed "CPU time" (sec). CPU and memory are only read from lines that follow
 7 | # a success line. Every file containing a success line is deleted.
 8 | 
 9 | if (@ARGV < 1) {die "requires one or more LSF output files\n";}
10 | 
11 | my $CPU = 0;
12 | my $Mem = 0;
13 | my $Count = 0;
14 | 
15 | foreach my $file (@ARGV) {
16 | 	open(my $ifh, $file) or die $!;
17 | 	my $success = 0;
18 | 	while(<$ifh>) {
19 | 		if ($_ =~ /Successfully completed/) {
20 | 			$success = 1;
21 | 			$Count++;
22 | 		}
23 | 		if ($success && $_ =~ /CPU time :\s+([\d\.]+) sec/) {
24 | 			$CPU += $1;
25 | 		}
26 | 		if ($success && $_ =~ /Max Memory :\s+([\d\.]+) MB/) {
27 | 			my $m = $1;
28 | 			if ($m > $Mem) {$Mem = $m;}
29 | 		}
30 | 	}
31 | 	close($ifh);
32 | 	if ($success) {
33 | 		# unlink instead of a shell "rm": works for any file name and reports failures
34 | 		unlink($file) or print STDERR "Could not remove $file: $!\n";
35 | 	}
36 | }
37 | print "\"Total :\" ".scalar(@ARGV)."\n\"Success:\" $Count\n\"Max Mem:\" $Mem\n\"Total CPU:\" $CPU\n";
38 | 


--------------------------------------------------------------------------------
/99_NotesForImprovement:
--------------------------------------------------------------------------------
1 | Is it better to use SAMtools to sort the BAM file after mapping, or to have STAR sort the BAMs before writing them? (Probably the latter, since it involves less I/O.) -> Then just merge the sorted files using samtools (merge expects sorted BAMs anyway).
2 | 
3 | Streaming FASTQC made Java crash, but this didn't halt the job, which is problematic (the job was eventually killed because its time limit was exceeded). Need to increase efficiency.
4 | -> worked fine when increased memory to 10 Gb and ran on each end of each lane.
5 | 
6 | Samtools sort on largest BAM took 104 sec and 200MB of memory on the cluster. Need to do some finicky stuff to get it to run as a job array though (or should I just send them as individual jobs? only 192 of them...). 
7 | 


--------------------------------------------------------------------------------
/99_get_order_chr_in_SAM.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Prints the values of the third column of a SAM file in order of appearance,
 5 | # collapsing consecutive repeats; lines starting with @ are skipped.
 6 | 
 7 | if (@ARGV < 1) {die "Arguments: same file\n";}
 8 | 
 9 | my @order = ();
10 | my $previous = "NULL"; # placeholder compared against the first record
11 | open (my $sam, $ARGV[0]) or die $!;
12 | while (my $line = <$sam>) {
13 | 	next if ($line =~ /^@/);
14 | 	my $chr = (split(/\t/, $line))[2];
15 | 	next if ($chr eq $previous);
16 | 	push(@order, $chr);
17 | 	$previous = $chr;
18 | }
19 | close($sam);
20 | 
21 | print join("\n", @order);
22 | 
23 | 


--------------------------------------------------------------------------------
/ERCC_Controls/ERCC_Controls_Analysis.txt:
--------------------------------------------------------------------------------
 1 | #Re-sort ID	ERCC ID	subgroup	concentration in Mix 1 (attomoles/ul)	concentration in Mix 2 (attomoles/ul)	expected fold-change ratio	log2(Mix 1/Mix 2)
 2 | 1	ERCC-00130	A	30000	7500	4	2
 3 | 2	ERCC-00004	A	7500	1875	4	2
 4 | 3	ERCC-00136	A	1875	468.75	4	2
 5 | 4	ERCC-00108	A	937.5	234.375	4	2
 6 | 5	ERCC-00116	A	468.75	117.1875	4	2
 7 | 6	ERCC-00092	A	234.375	58.59375	4	2
 8 | 7	ERCC-00095	A	117.1875	29.296875	4	2
 9 | 8	ERCC-00131	A	117.1875	29.296875	4	2
10 | 9	ERCC-00062	A	58.59375	14.6484375	4	2
11 | 10	ERCC-00019	A	29.296875	7.32421875	4	2
12 | 11	ERCC-00144	A	29.296875	7.32421875	4	2
13 | 12	ERCC-00170	A	14.6484375	3.66210938	4	2
14 | 13	ERCC-00154	A	7.32421875	1.83105469	4	2
15 | 14	ERCC-00085	A	7.32421875	1.83105469	4	2
16 | 15	ERCC-00028	A	3.66210938	0.91552734	4	2
17 | 16	ERCC-00033	A	1.83105469	0.45776367	4	2
18 | 17	ERCC-00134	A	1.83105469	0.45776367	4	2
19 | 18	ERCC-00147	A	0.91552734	0.22888184	4	2
20 | 19	ERCC-00097	A	0.45776367	0.11444092	4	2
21 | 20	ERCC-00156	A	0.45776367	0.11444092	4	2
22 | 21	ERCC-00123	A	0.22888184	0.05722046	4	2
23 | 22	ERCC-00017	A	0.11444092	0.02861023	4	2
24 | 23	ERCC-00083	A	0.02861023	0.00715256	4	2
25 | 24	ERCC-00096	B	15000	15000	1	0
26 | 25	ERCC-00171	B	3750	3750	1	0
27 | 26	ERCC-00009	B	937.5	937.5	1	0
28 | 27	ERCC-00042	B	468.75	468.75	1	0
29 | 28	ERCC-00060	B	234.375	234.375	1	0
30 | 29	ERCC-00035	B	117.1875	117.1875	1	0
31 | 30	ERCC-00025	B	58.59375	58.59375	1	0
32 | 31	ERCC-00051	B	58.59375	58.59375	1	0
33 | 32	ERCC-00053	B	29.296875	29.296875	1	0
34 | 33	ERCC-00148	B	14.6484375	14.6484375	1	0
35 | 34	ERCC-00126	B	14.6484375	14.6484375	1	0
36 | 35	ERCC-00034	B	7.32421875	7.32421875	1	0
37 | 36	ERCC-00150	B	3.66210938	3.66210938	1	0
38 | 37	ERCC-00067	B	3.66210938	3.66210938	1	0
39 | 38	ERCC-00031	B	1.83105469	1.83105469	1	0
40 | 39	ERCC-00109	B	0.91552734	0.91552734	1	0
41 | 40	ERCC-00073	B	0.91552734	0.91552734	1	0
42 | 41	ERCC-00158	B	0.45776367	0.45776367	1	0
43 | 42	ERCC-00104	B	0.22888184	0.22888184	1	0
44 | 43	ERCC-00142	B	0.22888184	0.22888184	1	0
45 | 44	ERCC-00138	B	0.11444092	0.11444092	1	0
46 | 45	ERCC-00117	B	0.05722046	0.05722046	1	0
47 | 46	ERCC-00075	B	0.01430512	0.01430512	1	0
48 | 47	ERCC-00074	C	15000	22500	0.67	-0.58
49 | 48	ERCC-00113	C	3750	5625	0.67	-0.58
50 | 49	ERCC-00145	C	937.5	1406.25	0.67	-0.58
51 | 50	ERCC-00111	C	468.75	703.125	0.67	-0.58
52 | 51	ERCC-00076	C	234.375	351.5625	0.67	-0.58
53 | 52	ERCC-00044	C	117.1875	175.78125	0.67	-0.58
54 | 53	ERCC-00162	C	58.59375	87.890625	0.67	-0.58
55 | 54	ERCC-00071	C	58.59375	87.890625	0.67	-0.58
56 | 55	ERCC-00084	C	29.296875	43.9453125	0.67	-0.58
57 | 56	ERCC-00099	C	14.6484375	21.9726563	0.67	-0.58
58 | 57	ERCC-00054	C	14.6484375	21.9726563	0.67	-0.58
59 | 58	ERCC-00157	C	7.32421875	10.9863281	0.67	-0.58
60 | 59	ERCC-00143	C	3.66210938	5.49316406	0.67	-0.58
61 | 60	ERCC-00039	C	3.66210938	5.49316406	0.67	-0.58
62 | 61	ERCC-00058	C	1.83105469	2.74658203	0.67	-0.58
63 | 62	ERCC-00120	C	0.91552734	1.37329102	0.67	-0.58
64 | 63	ERCC-00040	C	0.91552734	1.37329102	0.67	-0.58
65 | 64	ERCC-00164	C	0.45776367	0.68664551	0.67	-0.58
66 | 65	ERCC-00024	C	0.22888184	0.34332275	0.67	-0.58
67 | 66	ERCC-00016	C	0.22888184	0.34332275	0.67	-0.58
68 | 67	ERCC-00012	C	0.11444092	0.17166138	0.67	-0.58
69 | 68	ERCC-00098	C	0.05722046	0.08583069	0.67	-0.58
70 | 69	ERCC-00057	C	0.01430512	0.02145767	0.67	-0.58
71 | 70	ERCC-00002	D	15000	30000	0.5	-1
72 | 71	ERCC-00046	D	3750	7500	0.5	-1
73 | 72	ERCC-00003	D	937.5	1875	0.5	-1
74 | 73	ERCC-00043	D	468.75	937.5	0.5	-1
75 | 74	ERCC-00022	D	234.375	468.75	0.5	-1
76 | 75	ERCC-00112	D	117.1875	234.375	0.5	-1
77 | 76	ERCC-00165	D	58.59375	117.1875	0.5	-1
78 | 77	ERCC-00079	D	58.59375	117.1875	0.5	-1
79 | 78	ERCC-00078	D	29.296875	58.59375	0.5	-1
80 | 79	ERCC-00163	D	14.6484375	29.296875	0.5	-1
81 | 80	ERCC-00059	D	14.6484375	29.296875	0.5	-1
82 | 81	ERCC-00160	D	7.32421875	14.6484375	0.5	-1
83 | 82	ERCC-00014	D	3.66210938	7.32421875	0.5	-1
84 | 83	ERCC-00077	D	3.66210938	7.32421875	0.5	-1
85 | 84	ERCC-00069	D	1.83105469	3.66210938	0.5	-1
86 | 85	ERCC-00137	D	0.91552734	1.83105469	0.5	-1
87 | 86	ERCC-00013	D	0.91552734	1.83105469	0.5	-1
88 | 87	ERCC-00168	D	0.45776367	0.91552734	0.5	-1
89 | 88	ERCC-00041	D	0.22888184	0.45776367	0.5	-1
90 | 89	ERCC-00081	D	0.22888184	0.45776367	0.5	-1
91 | 90	ERCC-00086	D	0.11444092	0.22888184	0.5	-1
92 | 91	ERCC-00061	D	0.05722046	0.11444092	0.5	-1
93 | 92	ERCC-00048	D	0.01430512	0.02861023	0.5	-1
94 | 


--------------------------------------------------------------------------------
/ERCC_Controls/ERCC_Controls_README.txt:
--------------------------------------------------------------------------------
 1 | Mapping Reads to ERCC Control Sequences with BioScope 1.2.1 
 2 | ------------------------------------------------------------
 3 | 
 4 | This document describes how to use BioScope 1.2.1 to map the results
 5 | of a SOLiD run containing ERCC control sequences against the ERCC
 6 | reference using the BioScope Whole Transcriptome Pipeline.
 7 | 
 8 | Once a SOLiD run containing ERCC control sequences is finished, the
 9 | results must be mapped and counted against the ERCC reference files.
10 | This can be accomplished in two ways using BioScope 1.2.1, by
11 | combining the ERCC references with the genome references or by mapping
12 | directly to the ERCC references.  
13 | 
14 | Both methods are described below.
15 | 
16 | 
17 | Prerequisites
18 | -------------
19 | 
20 | BioScope 1.2.1 is required to map against the ERCC references.  If you
21 | have an older version of BioScope, upgrade to 1.2.1 before proceeding.  
22 | 
23 | Two ERCC references are required for mapping and counting:
24 | 
25 | ERCC92.fa 
26 |   This multi-fasta file contains the reference sequences and IDs for
27 |   each ERCC control sequence.   
28 | 
29 | ERCC92.gtf 
30 |   This feature file contains feature entries for each ERCC control
31 |   sequence.  This is used as the exon reference in the Whole
32 |   Transcriptome pipeline. 
33 | 
34 | Both ERCC reference files can be downloaded from:
35 |   www.appliedbiosystems.com.
36 | 
37 | 
38 | Method 1: Mapping ERCCs Directly to the ERCC References
39 | --------------------------------------------------------
40 | 
41 | 1. Run BioScope 1.2.1 whole transcriptome analysis as directed in the 
42 |    BioScope documentation.  Use ERCC92.fa for the genome reference and
43 |    ERCC92.gtf for the exon reference.
44 | 
45 | 2. When the BioScope run completes, you will find the ERCC counts in
46 |    the last 92 lines of the countagresult.txt file. 
47 | 
48 | 
49 | Method 2: Combining the ERCC References with the Genomic References 
50 | -------------------------------------------------------------------
51 | 
52 | Combining the references allows you to map to both the ERCCs and the
53 | genome reference at the same time.  This is accomplished by appending
54 | the ERCC references to the genome references. 
55 | 
56 | Follow these steps to combine the references:
57 | 
58 | 1. Prepare your genome and feature references for use with BioScope
59 |    1.2.1.  For human references, you might have two reference files:
60 | 
61 |    human.fa - the multi-fasta file containing the human reference genome 
62 |    refGene.gtf - the exon reference file for each exon in refseq   
63 | 
64 | 2. Append the ERCC references to the genome and feature references.
65 |    If your genome reference is human.fa and your exon reference is
66 |    refGene.gtf then you could use the following UNIX commands from a
67 |    Bash shell to append the files (note that $ is the command prompt):    
68 | 
69 |   $ cat ERCC92.fa >> human.fa 
70 |   $ cat ERCC92.gtf >> refGene.gtf
71 | 
72 | 3. Run BioScope 1.2.1 whole transcriptome analysis as directed in the
73 |    BioScope documentation.
74 | 
75 | 4. When the BioScope run completes, you will find the ERCC counts in
76 |    the last 92 lines of the countagresult.txt file. 
77 | 
78 | 
79 | Post-Processing: Extracting Counts to a Tab-delimited File
80 | ----------------------------------------------------------
81 | 
82 | You can use the following UNIX commands from a Bash shell to parse the
83 | results into a tab delimited table (ERCC.counts) of ERCC name, raw
84 | read count and RPKM:
85 | 
86 |   $ tail -n 92 countagresult.txt | cut -f9 | cut -d';' -f1 | sed 's/gene_id\|"\| //g' > gene_id
87 |   $ tail -n 92 countagresult.txt | cut -f6 > raw_count
88 |   $ tail -n 92 countagresult.txt | cut -f9 | cut -d';' -f3 | sed 's/RPKM\|"\| //g' > RPKM
89 |   $ paste gene_id raw_count RPKM > ERCC.counts
90 |   $ rm gene_id raw_count RPKM
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/ERCC_Controls/Make_FASTA_GTF_from_Annotation.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | # Converts the Annotation file from https://www.thermofisher.com/order/catalog/product/4456740 into fasta and gtf files that can be added to the end of an existing genome fasta/gtf
 4 | 
 5 | my @FASTAlines = ();
 6 | my @GTFlines = ();
 7 | open (my $ifh, "ERCC_Controls_Annotation.txt") or die $!;
 8 | <$ifh>; #header
 9 | while (<$ifh>) {
10 | 	# Do all the important stuff
11 | 	chomp;
12 | 	my @record = split(/\t/);
13 | 	my $sequence = $record[4];
14 | 	$sequence =~ s/\s+//g; # get rid of any preceeding/tailing white space
15 | 	$sequence = $sequence."NNNN"; # add some buffer to the end of the sequence
16 | 	my $name = $record[0];
17 | 	my $genbank = $record[1];
18 | 	push(@FASTAlines, ">$name\n$sequence\n");
19 | # is GTF 1 indexed or 0 indexed? -> it is 1 indexed
20 | # + or - strand?
21 | 	push(@GTFlines, "$name\tERCC\tgene\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n");
22 | 	push(@GTFlines, "$name\tERCC\texon\t1\t".(length($sequence)-2)."\t.\t+\t.\tgene_id \"$name-$genbank\"; transcript_id \"$name-$genbank\"; exon_number \"1\"; gene_name \"ERCC $name-$genbank\"\n");
23 | } close($ifh);
24 | 
25 | # Write output
26 | open(my $ofh, ">", "ERCC_Controls.fa") or die $!;
27 | foreach my $line (@FASTAlines) {
28 | 	print $ofh $line;
29 | } close ($ofh);
30 | 
31 | open($ofh, ">", "ERCC_Controls.gtf") or die $!;
32 | foreach my $line (@GTFlines) {
33 | 	print $ofh $line;
34 | } close ($ofh);
35 | 


--------------------------------------------------------------------------------
/ERCC_Controls/Note:
--------------------------------------------------------------------------------
1 | Downloaded from: https://www.lifetechnologies.com/order/catalog/product/4456740
2 | Date: 10 April 2015
3 | 
4 | Files:
5 | ERCC_Controls_Analysis.txt
6 | ERCC_Controls_Annotation.txt
7 | ERCC_Controls_README.txt
8 | 


--------------------------------------------------------------------------------
/Extract_PlateID_and_WellID_from_headers.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | 
 5 | if (@ARGV < 1) {die "requires at least one headerfile\n";}
 6 | my %CellID2WellID = ();
 7 | 
 8 | foreach my $file (@ARGV) {
 9 | 	#Extract cell ID
10 | 	my $cellid = "";
11 | 	if ($file =~ /_(\d_\d+)\./) {
12 | 		$cellid = $1;
13 | 	} else {
14 | 		die "$file does not match\n";
15 | 	}
16 | 	open (my $ifh, $file) or die $!;
17 | 	while (<$ifh>) {
18 | 		if ($_ =~ /^\@RG/) {
19 | 			# Match the plate-well ID
20 | 			my $wellid = "";
21 | 			if ($_ =~ /SM:SCGC--(\w+)/) {
22 | 				$wellid = $1;
23 | 			} else {
24 | 				die "$_ does not match";
25 | 			}
26 | 			$CellID2WellID{$cellid} = $wellid;
27 | 			last;
28 | 		} else {
29 | 			next;
30 | 		}
31 | 	} close($ifh);
32 | }
33 | 
34 | foreach my $cell (sort(keys(%CellID2WellID))) {
35 | 	print $cell."\t".$CellID2WellID{$cell}."\n";
36 | }
37 | 


--------------------------------------------------------------------------------
/Kallisto_Build_Index.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Build Kallisto Index from a reference fasta and gtf.
 3 | USAGE="Usage: Kallisto_Build_Index.sh ref.fa ref.gtf outdir\n
 4 | 	\tArguments:\n
 5 | 	\t ref.fa = reference fasta file\n
 6 | 	\t ref.gtf = reference GTF file\n
 7 | 	\t outdir = directory for output\n"
 8 | 
 9 | # Locations of required software
10 | GFFREAD=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread
11 | KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto
12 | 
13 | # Raw genome fasta and annotation gtf
14 | FA=$1
15 | GTF=$2
16 | 
17 | # Location for output files
18 | OUTDIR=$3
19 | 
20 | # Checks
21 | if [ ! -f $GFFREAD ] ; then
22 |   echo "Error: gffread not available"
23 |   exit 1
24 | fi
25 | 
26 | if [ ! -f $KALLISTO ] ; then
27 |   echo "Error: kallisto not available"
28 |   exit 1
29 | fi
30 | 
31 | if [ -z $FA ] || [ ! -f $FA ] ; then
32 |   echo -e $USAGE
33 |   exit 1
34 | fi
35 | 
36 | if [ -z $GTF ] || [ ! -f $GTF ] ; then
37 |   echo -e $USAGE
38 |   exit 1
39 | fi
40 | 
41 | 
42 | if [ -z $OUTDIR ] ; then
43 |   OUTDIR=./
44 | fi
45 | 
46 | # Extract transcriptome fasta using gffread
47 | $GFFREAD $GTF -g $FA -w $OUTDIR/Transcripts.fasta
48 | 
49 | # Index the extracted transcriptome
50 | $KALLISTO index -i $OUTDIR/kallisto_index.idx $OUTDIR/Transcripts.fasta
51 | 
52 | 


--------------------------------------------------------------------------------
/Kallisto_Make_ExpMat.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | use warnings;
  4 | 
  5 | if (@ARGV != 4) {die "Usage: Kallisto_Make_ExpMat.pl kallisto_dir ref.gtf [gene|trans] out_prefix\n
  6 | Arguments:\n
  7 | kalliso_dir = directory of kallisto output files
  8 | ref.gtf = reference GTF file
  9 | [gene|trans] = whether to aggregate expression at gene or trans[script] level
 10 | out_prefix = prefix for output files";}
 11 | 
 12 | my $dir = $ARGV[0];
 13 | my $gtf = $ARGV[1];
 14 | my $type = $ARGV[2];
 15 | my $outprefix = $ARGV[3];
 16 | 
 17 | # Now read in genome annotations
 18 | my %transcript2gene = ();
 19 | open (my $ifh, $gtf) or die $!;
 20 | while (<$ifh>) {
 21 | 	if ($_ =~ /^#/) {next;}
 22 | 	
 23 | 	if ($_ =~ /transcript_id "(.+?)"/) {
 24 | 		my $transid = $1;
 25 | 		#transcript
 26 | 		my $geneid = "ERROR";
 27 | 		if ($_ =~ /gene_id "(.+?)"/) {
 28 | 			$geneid=$1;
 29 | 		} else {
 30 | 			die "No gene id!\n";
 31 | 		}
 32 | 		$transcript2gene{$transid}=$geneid;
 33 | 	}
 34 | 	
 35 | } close ($ifh);
 36 | print STDERR "Done reading Annotations\n";
 37 | 
 38 | # First get expression for all genes & store details for all Cuff-Genes
 39 | my %AllGenes = (); my %AllSamples = ();
 40 | my %Gene2Sample2TPM = ();
 41 | my %Gene2Sample2Counts=();
 42 | 
 43 | my @files = glob("$dir/*.kallisto.abundances.tsv");	
 44 | foreach my $file (@files) {
 45 | #	$file =~ /([ATCG]{5,})/;
 46 | 	my $ID = "ERR";
 47 | #	if ($file =~ /([^_]+_Cell\d\d)/) { 
 48 | 	if ($file =~ /(.+)\.kallisto\.abundances\.tsv/) { # Extract sample ID from file name -> must be customized for each dataset.
 49 | 		$ID = $1;
 50 | 	} else {
 51 | 		die "$file does not match!";
 52 | 	}
 53 | 	$AllSamples{$ID}=1;
 54 | 	open(my $ifh, $file) or die $!;
 55 | 	<$ifh>; # header
 56 | 	while (<$ifh>) {
 57 | 		chomp;
 58 | 		my @record = split(/\t/);
 59 | 		my $trans = $record[0];
 60 | 		my $count = $record[3];
 61 | 		my $tpm = $record[4];
 62 | 		my $gene = $trans;
 63 | 		if ($type eq "gene") { #Aggregate by gene
 64 | 			if (exists($transcript2gene{$trans})) { 
 65 | 				$gene = $transcript2gene{$trans}; 
 66 | 			}
 67 | 		} else {
 68 | 			next;
 69 | 		}
 70 | 		if (exists($Gene2Sample2TPM{$gene})) {
 71 | 			$Gene2Sample2TPM{$gene}->{$ID}+=$tpm;
 72 | 			$Gene2Sample2Counts{$gene}->{$ID}+=$count;
 73 | 		} else {
 74 | 			$Gene2Sample2TPM{$gene}->{$ID}=$tpm;
 75 | 			$Gene2Sample2Counts{$gene}->{$ID}=$count;
 76 | 		}
 77 | 	} close ($ifh);
 78 | } 
 79 | 
 80 | #open (my $ofhtpm, ">", "$outprefix.tpm") or die $!;
 81 | open (my $ofhcounts, ">", "$outprefix.counts") or die $!;
 82 | my @IDs = sort(keys(%AllSamples));
 83 | #print $ofhtpm "Gene\t".join("\t",@IDs)."\n";
 84 | print $ofhcounts "Gene\t".join("\t",@IDs)."\n";
 85 | 
 86 | foreach my $gene (keys(%Gene2Sample2TPM)) {
 87 | #	print $ofhtpm "$gene";
 88 | 	print $ofhcounts "$gene";
 89 | 	foreach my $ID (@IDs) {
 90 |                  my $tpm = "NA";
 91 |                  if (exists($Gene2Sample2TPM{$gene}->{$ID})) {
 92 |                          $tpm = $Gene2Sample2TPM{$gene}->{$ID};
 93 |                  } else {
 94 |                          $tpm = "0";
 95 |                  }
 96 |                  my $count = "NA";
 97 |                  if (exists($Gene2Sample2Counts{$gene}->{$ID})) {
 98 |                          $count = $Gene2Sample2Counts{$gene}->{$ID};
 99 |                  } else {
100 |                          $count = "0";
101 |                  }
102 |                  print $ofhcounts "\t".$count;
103 | #                 print $ofhtpm "\t".$tpm;
104 |          }
105 |          print $ofhcounts "\n";
106 | #         print $ofhtpm "\n";
107 | }
108 | close($ofhcounts);
109 | #close($ofhtpm);
110 | 


--------------------------------------------------------------------------------
/Kallisto_Quantification_Wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Note job array requires indexing to start at 1 but array indexing starts at 0
 3 | # Maps paired reads only!
 4 | 
 5 | USAGE="Usage: Kallisto_Quantification_Wrapper.sh index threads file1 file2 outdir\n
 6 | 	\tArguments:\n
 7 | 	\tfile1 = either fastq for read1 or if running in jobarray directory of fasta files\n
 8 | 	\tfile2 = either fastq for read2 or if running in jobarray \"NULL\" or for single-end\n
 9 | 	\tindex = kallisto index (see: Kallisto_Build_Index.sh)\n
10 | 	\tthreads = number of cpus to use\n
11 | 	\toutdir = directory for output (default: current working directory)\n"
12 | 
13 | KALLISTO=/nfs/users/nfs_t/ta6/RNASeqPipeline/software/kallisto_linux-v0.42.4/kallisto
14 | JOB_INDEX=$LSB_JOBINDEX # for array jobs, index starts at 1.
15 | 
16 | FILE1=$1
17 | FILE2=$2
18 | KALLISTO_INDEX=$3
19 | NUMTHREADS=$4
20 | OUTDIR=$5
21 | 
22 | #Check appropriate arguments
23 | if [ ! -f "$KALLISTO" ] ; then
24 |   echo "Error: kallisto not available "
25 |   exit 1
26 | fi
27 | 
28 | if [ -z "$NUMTHREADS" ] ; then
29 |   echo -e $USAGE
30 |   exit 1
31 | fi
32 | 
33 | if [ -z "$KALLISTO_INDEX" ] ; then
34 |   echo -e $USAGE
35 |   exit 1
36 | fi
37 | 
38 | if [ -z "$FILE1" ] ; then
39 |   echo -e $USAGE
40 |   exit 1
41 | fi
42 | 
43 | # allow running in unpaired mode
44 | #if [ -z "$FILE2" ] ; then 
45 | #  echo -e $USAGE
46 | #  exit 1
47 | #fi
48 | 
49 | if [ -z "$OUTDIR" ] ; then
50 |   OUTDIR=./
51 | fi
52 | 
53 | # Set-up for either array job or for loop
54 | if [ $FILE2 == "NULL" ] && [ $JOB_INDEX -gt 0 ]; then
55 | 	echo "ArrayJob"
56 | 	echo $JOB_INDEX
57 | 	FILEStoMAP=($FILE1/*)
58 | 	ARRAYINDEX=$((($JOB_INDEX-1)*2))
59 | 	FILE1=${FILEStoMAP[$ARRAYINDEX]} #Bash array indicies start at 0
60 | 	FILE2=${FILEStoMAP[$ARRAYINDEX+1]} #Bash array indicies start at 0 
61 | fi
62 | 
63 | if [ ! -d "$OUTDIR" ] ; then
64 |   mkdir -p $OUTDIR
65 | fi
66 | 
67 | if [ -z "$FILE1" ] || [ ! -f "$FILE1" ] ; then
68 |   echo "$FILE1 does not exist."
69 |   exit 1
70 | fi
71 | 
72 | # allow running in unpaired mode
73 | #if [ -z "$FILE2" ] || [ ! -f "$FILE2" ] ; then
74 | #  echo "$FILE2 does not exist."
75 | #  exit 1
76 | #fi
77 | 
78 | NAME=${FILE1##*/}
79 | NAME=${NAME%.*}
80 | WORKDIR=$OUTDIR/$NAME
81 | 
82 | # Make directory for temporary output
83 | if [ ! -d "$WORKDIR" ] ; then
84 |   mkdir -p $WORKDIR
85 | fi
86 | 
87 | # Run KALLISTO 
88 | if [ -f $FILE2 ] ; then
89 |   $KALLISTO quant --bias --plaintext --threads=$NUMTHREADS -i $KALLISTO_INDEX -o $WORKDIR $FILE1 $FILE2
90 | else 
91 |   $KALLISTO quant --single --fragment-length=100 --sd=20 --bias --plaintext --threads=$NUMTHREADS -i $KALLISTO_INDEX -o $WORKDIR $FILE1
92 | fi
93 | mv $WORKDIR/abundance.tsv $OUTDIR/$NAME.kallisto.abundances.tsv
94 | rm $WORKDIR/run_info.json
95 | rmdir $WORKDIR
96 | 
97 | 


--------------------------------------------------------------------------------
/Parse_GTF_biotype.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | if (@ARGV < 1) {die "Required input: gtf file\n";}
 5 | 
 6 | my %GeneID2Stuff = ();
 7 | my %TranscriptID2GeneID = ();
 8 | open (my $ifh, $ARGV[0]) or die $!;
 9 | while (<$ifh>) {
10 | 	if ($_ =~ /^#/) {next;}
11 | 	my $geneid = "ERROR";
12 | 	if ($_ =~ /gene_id "(.+?)"/) {
13 | 		$geneid=$1;
14 | 	} else {
15 | 		die "No gene id!\n";
16 | 	}
17 | 	
18 | 	if ($_ =~ /transcript_id "(.+?)"/) {
19 | 		$GeneID2Stuff{$geneid}->{"transcript_ids"}->{$1} = 1;
20 | 		$TranscriptID2GeneID{$1}=$geneid;
21 | 	}
22 | 	if ($_ =~ /gene_name "(.+?)"/) {
23 | 		$GeneID2Stuff{$geneid}->{"gene_name"} = $1;
24 | 	}
25 | 	if ($_ =~ /gene_biotype "(.+?)"/) {
26 | 		$GeneID2Stuff{$geneid}->{"gene_biotype"} = $1;
27 | 	}
28 | 	my @record = split(/\t/);
29 | 	my $length = $record[4]-$record[3];
30 | 	if (!exists($GeneID2Stuff{$geneid}->{"length"}) || $GeneID2Stuff{$geneid}->{"length"} < $length) {
31 | 		$GeneID2Stuff{$geneid}->{"length"} = $length;
32 | 	}
33 | } close ($ifh);
34 | 	
35 | foreach my $gene (sort(keys(%GeneID2Stuff))) {
36 | 	print $gene."\t".$GeneID2Stuff{$gene}->{"gene_biotype"}."\n";
37 | }
38 | 	
39 | 


--------------------------------------------------------------------------------
/Parse_GTF_splicing.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | 
 4 | # Now read in genome annotations
 5 | my %chr2exon2locus = ();
 6 | my %exon2gene = ();
 7 | my %exon2transcript = ();
 8 | my %exon2size = ();
 9 | my %chr2gene2locus = ();
10 | my %transcript2gene = ();
11 | my %gene2trans = ();
12 | my %trans2exon = ();
13 | my %gene2exon = ();
14 | my %Addedchr = ();
15 | open (my $ifh, "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf") or die $!;
16 | while (<$ifh>) {
17 | 	my $transcriptID = "ERROR";
18 | 	if ($_ =~ /^#/) {next;}
19 | 	my $geneid = "ERROR";
20 | 	if ($_ =~ /gene_id "(.+?)"/) {
21 | 		$geneid=$1;
22 | 	} else {
23 | 		die "No gene id!\n";
24 | 	}
25 | 	if ($_ =~ /transcript_id "(.+?)"/) {
26 | 		#transcript
27 | 		$transcriptID = $1;
28 | 		$transcript2gene{$transcriptID}=$geneid;
29 | 		$gene2trans{$geneid}->{$transcriptID} = 1;
30 | 	}
31 | 	
32 | 	my @record = split(/\t/);
33 | 	my $size = $record[4] - $record[3];
34 | 	if ($size < 0) {
35 | 		$size = $record[3] - $record[4];
36 | 	}
37 | 
38 | 	if ($record[2] =~ /gene/i) {
39 | 		#gene
40 | 		#$chr2gene2locus{$locus->[0]}->{$geneid} = $locus;
41 | 	} elsif ($_ =~ /exon_id "(.+?)"/) {
42 | 		#exon
43 | #		$chr2exon2locus{$locus->[0]}->{$1} = $locus;
44 | 		my $exonID = "$geneid $record[3] $record[4]";
45 | 		$exon2gene{$exonID}=$geneid;
46 | 		$exon2size{$exonID} = $size;
47 | 		$exon2transcript{$exonID}->{$transcriptID} = 1;
48 | 		$trans2exon{$transcriptID}->{$exonID} = 1;
49 | 		$gene2exon{$geneid}->{$exonID} = 1;
50 | 	}
51 | 	
52 | } close ($ifh);
53 | 
54 | foreach my $g (keys(%gene2trans)) {
55 | 	my $total_trans = scalar(keys(%{$gene2trans{$g}}));
56 | 	my $Perc_diff = 0;
57 | 	if ($total_trans > 1) {
58 | 		my $total_size = 0;
59 | 		my $var_size = 0;
60 | 		foreach my $e (keys(%{$gene2exon{$g}})) {
61 | 			$total_size += $exon2size{$e};
62 | 			if (scalar(keys(%{$exon2transcript{$e}})) > 1) {
63 | 				$var_size += $exon2size{$e};
64 | 			}
65 | 		}
66 | 		$Perc_diff = $var_size/$total_size;
67 | 	}
68 | 	print "$g $total_trans $Perc_diff\n";
69 | }
70 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This is a collection of scripts I use (or have used in the past) to process scRNASeq data. They are free to use by anyone else for any purpose, but come with no assurances or guarantees of correctness or functionality. The general workflow is as follows:
 2 | 
 3 | 0 : Create the appropriate genome for the dataset, and obtain the read files & initial QC
 4 | 	- Building mapping indexes generally requires ~30Gb of memory for a mouse-sized genome
 5 | 1 : Split the files by well (cell), Trim reads as appropriate based on QC
 6 | 2 : Map the reads to the genome
 7 | 3 : Clean up mapping output & remove duplicates
 8 | 4 : Mapping QC
 9 | 5 : Quantify expression
10 | 6 : Assemble expression matrix
11 | 
12 | Finished Pipelines:
13 | 00_Kallisto_For_SmartSeq.readme = Smartseq2 + Kallisto (no UMIs) 
14 | 
15 | 
16 | Brief Descriptions of Useful files:
17 | 0_Extract_barcodes_from_BAM.sh : open the first line of each BAM file and find the barcode (tagged with BC:) - for matching up metadata.
18 | 
19 | 


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | Integrate with ESAT: End Sequencing Analysis Toolkit
2 | Improve Transcriptome Extraction to reduce duplicate sequences.
3 | 


--------------------------------------------------------------------------------