├── .gitignore ├── README.md ├── functions-r └── plotPCAWithSampleNames.R ├── notes ├── converting-files.md ├── files-chrom-sizes.md ├── files-fasta.md ├── files-fastq.md ├── files-gtf.md ├── files-vcf.md ├── heatmaps.md ├── pathways.md ├── r.md ├── rna-seq-strand.md ├── rna-seq.md └── single-cell-rna-seq.md ├── scripts-bigpurple ├── assembly-10x-supernova.sh ├── cnvs-wgs-freec.sh ├── jupyter.sh ├── scrna-10x-cellranger-aggr.sh ├── scrna-10x-cellranger-count-features.sh ├── scrna-10x-cellranger-count.sh └── scrna-10x-cellranger-multi.sh ├── scripts-phoenix ├── assembly-10x-supernova.sh ├── bcl2fastq-sample-sheet-fix.sh ├── join-all.sh ├── scrna-10x-cellranger-aggr.sh ├── scrna-10x-cellranger-count.sh └── wgs-10x-longranger.sh ├── scripts ├── cnv-freec-genome-plot.R ├── cnv-freec-heatmap.R ├── csv-clean.sh ├── fastq-merge.pl ├── fastq-quality-bars.sh ├── gtf-remove-overlapping.pl ├── hdcyto-1-import-fcs.qmd ├── hdcyto-2-prepare-sce.qmd ├── hdcyto-3-analyze-sce.qmd ├── meth-minfi.R ├── mut-mhc-binding.pl ├── scrna-10x-seurat-1.R ├── scrna-10x-seurat-2.R ├── scrna-10x-seurat-3.R ├── scrna-decontaminate-soupx.R ├── scrna-doublets-scdblfinder.R └── snvs-cnvs-mutect-strelka-freec-pyclone.R └── workflows ├── draft-genome-init.md ├── gatk-mouse-mm10.md ├── microarray.md ├── nanopore-init.md ├── ref-genome-gfp.md ├── ref-genome-init.md ├── rna-seq-diff-exp.md └── rrna-ref.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### macOS ### 3 | # General 4 | .DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | # Icon must end with two \r 8 | Icon 9 | # Thumbnails 10 | ._* 11 | # Files that might appear in the root of a volume 12 | .DocumentRevisions-V100 13 | .fseventsd 14 | .Spotlight-V100 15 | .TemporaryItems 16 | .Trashes 17 | .VolumeIcon.icns 18 | .com.apple.timemachine.donotpresent 19 | # Directories potentially created on remote AFP share 20 | .AppleDB 21 | .AppleDesktop 22 | Network Trash Folder 23 | Temporary Items 24 | .apdisk 25 | 26 | ### Windows ### 27 | # Windows thumbnail cache files 28 | Thumbs.db 29 | Thumbs.db:encryptable 30 | ehthumbs.db 31 | ehthumbs_vista.db 32 | # Dump file 33 | *.stackdump 34 | # Folder config file 35 | [Dd]esktop.ini 36 | # Recycle Bin used on file shares 37 | $RECYCLE.BIN/ 38 | # Windows Installer files 39 | *.cab 40 | *.msi 41 | *.msix 42 | *.msm 43 | *.msp 44 | # Windows shortcuts 45 | *.lnk 46 | 47 | ### Linux ### 48 | *~ 49 | # temporary files which can be created if a process still has a handle open of a deleted file 50 | .fuse_hidden* 51 | # KDE directory preferences 52 | .directory 53 | # Linux trash folder which might appear on any partition or disk 54 | .Trash-* 55 | # .nfs files are created when an open file is removed but is still being accessed 56 | .nfs* 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | computational genomics resources 2 | 3 | contents: 4 | 5 | * `notes`: notes and simple computational tasks 6 | * `workflows`: multi-step protocols 7 | * `scripts`: complete scripts for specific tasks 8 | * `scripts-bigpurple` and `scripts-phoenix`: scripts optimized for a specific cluster (include references to data and system variables) 9 | 10 | related repositories: 11 | 12 | * [sns/scripts](https://github.com/igordot/sns/blob/master/scripts): scripts from the `sns` pipeline 13 | * [tutorials](https://github.com/igordot/tutorials): tutorials 14 | 
-------------------------------------------------------------------------------- /functions-r/plotPCAWithSampleNames.R: -------------------------------------------------------------------------------- 1 | # Modified DESeq2 plotPCA function with sample names and proportion of variance added. 2 | # Sample names will be shown underneath each dot. 3 | # The axis will display proportion of variance for each principal component. 4 | # Tested using DESeq2 1.2.8, 1.6.2, and 1.8.1. 5 | # The native DESeq2 plotPCA function switched from lattice to ggplot2 in version 1.5.11. 6 | 7 | plotPCAWithSampleNames = function(x, intgroup="condition", ntop=500) 8 | { 9 | library(RColorBrewer) 10 | library(genefilter) 11 | library(lattice) 12 | 13 | # pca 14 | rv = rowVars(assay(x)) 15 | select = order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] 16 | pca = prcomp(t(assay(x)[select,])) 17 | 18 | # proportion of variance 19 | variance = pca$sdev^2 / sum(pca$sdev^2) 20 | variance = round(variance, 3) * 100 21 | 22 | # sample names 23 | names = colnames(x) 24 | 25 | # factor of groups 26 | fac = factor(apply(as.data.frame(colData(x)[, intgroup, drop=FALSE]), 1, paste, collapse=" : ")) 27 | 28 | # colors 29 | if( nlevels(fac) >= 10 ) 30 | colors = rainbow(nlevels(fac)) 31 | else if( nlevels(fac) >= 3 ) 32 | colors = brewer.pal(nlevels(fac), "Set1") 33 | else 34 | colors = c( "dodgerblue3", "firebrick3" ) 35 | 36 | # plot 37 | xyplot( 38 | PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=1.5, 39 | aspect = "fill", 40 | col = colors, 41 | xlab = list(paste("PC1 (", variance[1], "%)", sep=""), cex=0.8), 42 | ylab = list(paste("PC2 (", variance[2], "%)", sep=""), cex=0.8), 43 | panel = function(x, y, ...) { 44 | panel.xyplot(x, y, ...); 45 | ltext(x=x, y=y, labels=names, pos=1, offset=0.8, cex=0.7) 46 | }, 47 | main = draw.key( 48 | key = list( 49 | rect = list(col = colors), 50 | text = list(levels(fac)), 51 | rep = FALSE 52 | ) 53 | ) 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /notes/converting-files.md: -------------------------------------------------------------------------------- 1 | # Converting Files 2 | 3 | ## GFF to GTF 4 | Convert from GFF to GTF: 5 | ``` 6 | gffread in.gff -T -o out.gtf 7 | ``` 8 | gffread program is part of the Cufflinks package 9 | 10 | *** 11 | 12 | ## GTF to refFlat 13 | Convert gene annotations from GTF to genePred refFlat format: 14 | ``` 15 | gtfToGenePred -genePredExt -geneNameAsName2 genes.gtf refFlat.tmp.txt 16 | paste <(cut -f 12 refFlat.tmp.txt) <(cut -f 1-10 refFlat.tmp.txt) > refFlat.txt 17 | rm refFlat.tmp.txt 18 | gzip refFlat.txt 19 | ``` 20 | Tested with Ensembl GTF file and used for Picard CollectRnaSeqMetrics (must be gzipped). 
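For reference, a minimal sketch of how the gzipped refFlat might then be passed to Picard (file names here are placeholders):
```
java -jar picard.jar CollectRnaSeqMetrics \
  REF_FLAT=refFlat.txt.gz \
  STRAND_SPECIFICITY=NONE \
  INPUT=in.bam \
  OUTPUT=rna_metrics.txt
```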
21 | gtfToGenePred obtained from http://hgdownload.cse.ucsc.edu/admin/exe/ 22 | 23 | *** 24 | 25 | ## SRA to FASTQ 26 | Convert SRA (Sequence Read Archives) files to FASTQs: 27 | ``` 28 | /path/sratoolkit.2.3.4/bin/fastq-dump -v --split-files --gzip file.sra 29 | ``` 30 | SRA run ID to FASTQ (will download the SRA file and put it in a temp directory): 31 | ``` 32 | /path/sratoolkit.2.3.4/bin/fastq-dump -v --split-files --gzip SRR0000000 33 | ``` 34 | NCBI SRA Toolkit obtained from http://eutils.ncbi.nih.gov/Traces/sra/?view=software 35 | 36 | *** 37 | 38 | ## DOT to SVG/PNG 39 | Convert DOT database schema file to SVG or PNG: 40 | ``` 41 | dot -Tsvg file.dot > file.svg 42 | dot -Tpng file.dot > file.png 43 | ``` 44 | 45 | *** 46 | 47 | ## Between Various Image Formats 48 | ImageMagick: 49 | ``` 50 | convert in.png out.pdf 51 | ``` 52 | Ghostscript (seems to perform better): 53 | ``` 54 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -sOutputFile=out.pdf in.pdf 55 | ``` 56 | Both can do wildcard inputs to merge multiple files. 57 | -------------------------------------------------------------------------------- /notes/files-chrom-sizes.md: -------------------------------------------------------------------------------- 1 | # Working with chrom.sizes Files 2 | 3 | Generate a chrom.sizes file (from an indexed FASTA file): 4 | ``` 5 | cut -f 1,2 genome.fa.fai > chrom.sizes 6 | ``` 7 | 8 | Alternatives: 9 | * fetchChromSizes (http://hgdownload.soe.ucsc.edu/admin/exe/) 10 | -------------------------------------------------------------------------------- /notes/files-fasta.md: -------------------------------------------------------------------------------- 1 | # Working with FASTA Files 2 | 3 | 4 | Index FASTA: 5 | ```bash 6 | samtools faidx genome.fa 7 | ``` 8 | 9 | *** 10 | 11 | Remove empty records (description without sequence): 12 | ```bash 13 | awk '$2{print RS}$2' FS='\n' RS=\> ORS= in.fasta > out.fasta 14 | ``` 15 | 16 | *** 17 | 18 | Remove blank lines: 19 | ```bash 20 | sed -i '/^$/d' in.fasta 21 | ``` 22 | 23 | *** 24 | 25 | Remove problematic characters (they may cause issues with some tools): 26 | ```bash 27 | sed -i -e "s/[ ,\(\)\.\/\|:=]/_/g" in.fasta 28 | sed -i 's/___/__/g' in.fasta 29 | ``` 30 | 31 | *** 32 | 33 | Filter FASTA file by sequence length. 
34 | 35 | Using `awk`: 36 | ```bash 37 | # if applicable, convert multi-line FASTA to single-line FASTA 38 | # using awk: 39 | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' file.fa > file.nowrap.fa 40 | # using FASTX-Toolkit: 41 | fasta_formatter -w 0 -i file.fa -o file.nowrap.fa 42 | 43 | # filter by sequence size (1000 in this example) 44 | awk 'BEGIN {RS = ">" ; ORS = ""} length($2) >= 1000 {print ">"$0}' file.nowrap.fa > file.1000.fa 45 | ``` 46 | Using `faFilter`: 47 | ```bash 48 | faFilter -minSize=N -maxSize=N in.fa out.fa 49 | ``` 50 | `faFilter` obtained from http://hgdownload.soe.ucsc.edu/admin/exe/ 51 | -------------------------------------------------------------------------------- /notes/files-fastq.md: -------------------------------------------------------------------------------- 1 | # Working with FASTQ Files 2 | 3 | 4 | Subset a FASTQ (where NNN is the desired number of reads): 5 | ```bash 6 | seqtk sample -s 100 in.fastq.gz NNN | gzip > out.fastq.gz 7 | ``` 8 | Seqtk obtained from https://github.com/lh3/seqtk 9 | 10 | Alternatives: 11 | * fastq-tools fastq-sample (http://homes.cs.washington.edu/~dcjones/fastq-tools/fastq-sample.html) 12 | * sample_fastq.py (https://github.com/mojones/random_scripts/blob/master/sample_fastq.py) 13 | -------------------------------------------------------------------------------- /notes/files-gtf.md: -------------------------------------------------------------------------------- 1 | # Working with GTF Files 2 | 3 | 4 | Verify the GTF file format and confirm that the genes specified do not violate the rules of gene structure: 5 | ```bash 6 | validate_gtf.pl genes.gtf 7 | ``` 8 | `validate_gtf.pl` (by Evan Keibler) obtained from http://mblab.wustl.edu/software.html (there is also a different version included in the Eval package) 9 | 10 | *** 11 | 12 | Add gene names to GTF gene IDs to make them more readable (merge `gene_id` and `gene_name`): 13 | ```bash 14 | cat genes.gtf \ 15 | | grep "transcript_id" \ 16 | | perl -pe 's/(gene_id "(.+?)"; )(.*)(gene_name "(.+?)"; )/gene_id "\5:\2"; \3 \4/g' \ 17 | > genes.name-id.gtf 18 | ``` 19 | 20 | *** 21 | 22 | Some viral and bacterial GTF files only contain `CDS` features (column 3), but some tools require `exon` features. 23 | Convert `CDS` to `exon`, which would be equivalent for that purpose: 24 | ```bash 25 | cat in.gtf | perl -pe 's/\tCDS\t/\texon\t/g' > out.gtf 26 | ``` 27 | -------------------------------------------------------------------------------- /notes/files-vcf.md: -------------------------------------------------------------------------------- 1 | # Working with VCF Files 2 | 3 | Sort VCF file: 4 | ``` 5 | vcfsorter.pl genome.dict in.vcf > out.vcf 6 | ``` 7 | The output is compatible with Genome Analysis Toolkit (GATK). 
8 | vcfsorter.pl (by German Gaston Leparc) obtained from https://code.google.com/p/vcfsorter/ 9 | Alternative: Picard SortVcf 10 | -------------------------------------------------------------------------------- /notes/heatmaps.md: -------------------------------------------------------------------------------- 1 | # Heatmap Generation Tools 2 | 3 | 4 | ## R functions (static) 5 | 6 | * [heatmap (stats package)](https://www.rdocumentation.org/packages/stats/topics/heatmap) 7 | * [heatmap.2 (gplots package)](https://www.rdocumentation.org/packages/gplots/topics/heatmap.2) 8 | * [heatmap.3 (GMD package)](https://www.rdocumentation.org/packages/GMD/topics/heatmap.3) 9 | * [aheatmap (NMF package)](https://www.rdocumentation.org/packages/NMF/topics/aheatmap) 10 | * [pheatmap](https://github.com/raivokolde/pheatmap) 11 | * [ComplexHeatmap](https://github.com/jokergoo/ComplexHeatmap) 12 | * [heatmap3](https://www.rdocumentation.org/packages/heatmap3/topics/heatmap3) 13 | * [heatmap1 (NeatMap package)](https://www.rdocumentation.org/packages/NeatMap/topics/heatmap1) 14 | * [superheat](https://rlbarter.github.io/superheat) 15 | * [heatmap.bp (vcfR package)](https://www.rdocumentation.org/packages/vcfR/topics/heatmap.bp) 16 | * [geom_tile (ggplot2 package)](https://www.rdocumentation.org/packages/ggplot2/topics/geom_tile) 17 | * [gapmap](https://cran.r-project.org/package=gapmap) 18 | 19 | 20 | ## R functions (interactive) 21 | 22 | * [d3heatmap](https://github.com/rstudio/d3heatmap) (not actively maintained) 23 | * [heatmaply](https://github.com/talgalili/heatmaply) 24 | * [annHeatmap (Heatplus package)](https://www.rdocumentation.org/packages/Heatplus/topics/annHeatmap) 25 | * [iheatmapr](https://github.com/AliciaSchep/iheatmapr) 26 | * [iheatmap (qtlcharts package)](https://www.rdocumentation.org/packages/qtlcharts/topics/iheatmap) 27 | 28 | 29 | ## Microsoft Excel 30 | 31 | tutorial 1: http://policyviz.com/create-a-heatmap-in-excel/ 32 | 33 | ![image](https://cloud.githubusercontent.com/assets/6363505/20320165/e6508b20-ab3e-11e6-869a-f7652a1130b1.png) 34 | 35 | tutorial 2: http://peltiertech.com/heat-map-excel-conditional-formatting/ 36 | 37 | ![image](https://cloud.githubusercontent.com/assets/6363505/20320201/066ca0a6-ab3f-11e6-82be-85da3b87df7b.png) 38 | 39 | 40 | ## Matrix2png 41 | 42 | http://www.chibi.ubc.ca/matrix2png/ 43 | 44 | > This program converts tab-delmited matrix files into png images. It is implemented in ANSI C and utilized Tom Boutell's gd library (which in turn uses libpng and zlib). It is designed to be called from the command line or within a script. 45 | 46 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319394/deeac36c-ab3b-11e6-8aea-8a2b38f646ab.png) 47 | 48 | 49 | ## Heatmap from HIV sequence database 50 | 51 | http://www.hiv.lanl.gov/content/sequence/HEATMAP/heatmap.html 52 | 53 | > A heatmap is a graphical way of displaying a table of numbers by using colors to represent the numerical values. The clustering algorithm groups related rows and/or columns together by similarity. 
54 | 
55 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319475/43bde904-ab3c-11e6-92a0-dcf5cfd2f443.png) 
56 | 
57 | 
58 | ## jHeatmap 
59 | 
60 | http://jheatmap.github.io/jheatmap/ 
61 | 
62 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319614/d74cac50-ab3c-11e6-86ee-2596160fa02d.png) 
63 | 
64 | 
65 | ## MicroScope 
66 | 
67 | http://microscopebioinformatics.org/ 
68 | 
69 | > We propose a user-friendly ChIP-seq and RNA-seq software suite for the interactive visualization and analysis of genomic data, including integrated features to support differential expression analysis, interactive heatmap production, principal component analysis, gene ontology analysis, and dynamic network visualization. 
70 | 
71 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319787/73c6ff18-ab3d-11e6-9c00-f60b5a132f44.png) 
72 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319823/8de008d6-ab3d-11e6-88ab-52ba98293609.png) 
73 | 
74 | 
75 | ## HeatmapGenerator 
76 | 
77 | https://github.com/Bohdan-Khomtchouk/HeatmapGenerator 
78 | 
79 | > a graphical user interface software program written in C++, R, and OpenGL 
80 | 
81 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319693/1a5f8062-ab3d-11e6-8008-7949f81236b3.png) 
82 | 
83 | 
84 | ## GENE-E 
85 | 
86 | http://www.broadinstitute.org/cancer/software/GENE-E/ 
87 | 
88 | > GENE-E is a matrix visualization and analysis platform designed to support visual data exploration. It includes heat map, clustering, filtering, charting, marker selection, and many other tools. In addition to supporting generic matrices, GENE-E also contains tools that are designed specifically for genomics data. 
89 | 
90 | 
91 | ## Morpheus 
92 | 
93 | https://software.broadinstitute.org/morpheus/ 
94 | 
95 | > JavaScript matrix visualization and analysis 
96 | 
97 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319970/2dc5276e-ab3e-11e6-812a-84f2db7d57a6.png) 
98 | 
99 | 
100 | ## MeV: MultiExperiment Viewer 
101 | 
102 | http://www.tm4.org/mev.html 
103 | 
104 | 
105 | ## Web MEV 
106 | 
107 | http://mev.tm4.org/ 
108 | 
109 | > WebMeV (Multiple Experiment Viewer) is a cloud-based application supporting analysis, visualization, and stratification of large genomic data, particularly for RNASeq and microarray data. 
110 | 
111 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319258/73762a5e-ab3b-11e6-829f-ad84cb26407d.png) 
112 | 
113 | 
114 | ## Next Generation Clustered Heat Map Tool (NG-CHM) 
115 | 
116 | http://bioinformatics.mdanderson.org/testchm/ 
117 | 
118 | > Next-Generation (Clustered) Heat Maps are interactive heat maps that enable the user to zoom and pan across the heatmap, alter its color scheme, generate production quality PDFs, and link out from rows, columns, and individual heatmap entries to related statistics, databases and other information. 
119 | 
120 | ![image](https://user-images.githubusercontent.com/6363505/31510128-cb6de678-af51-11e7-96ed-dd8d3db94316.png) 
121 | 
122 | 
123 | ## Clustergrammer 
124 | 
125 | http://amp.pharm.mssm.edu/clustergrammer/ 
126 | 
127 | > Clustergrammer is a web-based tool for visualizing and analyzing high-dimensional data as interactive and shareable hierarchically clustered heatmaps.
Clustergrammer enables intuitive exploration of high-dimensional data and has several optional biology-specific features. 128 | 129 | ![image](https://user-images.githubusercontent.com/6363505/31510322-592e408e-af52-11e7-857a-33e747b63e06.png) 130 | -------------------------------------------------------------------------------- /notes/r.md: -------------------------------------------------------------------------------- 1 | # R 2 | 3 | ## General R 4 | 5 | Vignette: 6 | ```r 7 | # show vignettes for a package 8 | browseVignettes(package = "package") 9 | # get vignette 10 | vignette("topic") 11 | ``` 12 | 13 | Get version of package: 14 | ```r 15 | packageVersion("package") 16 | ``` 17 | 18 | Check that all installed packages can be loaded: 19 | ```r 20 | for (p in rownames(installed.packages())) { 21 | message(p) 22 | suppressPackageStartupMessages(library(p, character.only = TRUE)) 23 | } 24 | ``` 25 | 26 | Working with methods: 27 | ```r 28 | # prints source code for method 29 | getMethod(method, "class") 30 | # find method 31 | selectMethod(method, "class") 32 | # show all methods for class 33 | showMethods(classes = "class") 34 | methods(class = "class") 35 | # method help for S3 objects 36 | ?"method.class" 37 | ``` 38 | 39 | *** 40 | 41 | ## GenomicRanges 42 | 43 | The GenomicRanges package defines general purpose containers for storing and manipulating genomic intervals and variables defined along a genome. 44 | 45 | Basics: 46 | ```r 47 | library(GenomicRanges) 48 | # create a new GRanges object with one range 49 | g = GRanges("chr1", IRanges(10001, 10100), strand = "+") 50 | # get basic info for ranges 51 | start(g) 52 | end(g) 53 | width(g) 54 | strand(g) 55 | # get metadata columns (additional optional information for ranges) 56 | mcols(g) 57 | # get IRanges 58 | ranges(g) 59 | # get chromosomes for each range 60 | seqnames(g) 61 | # get all chromosomes 62 | seqlevels(g) 63 | ``` 64 | 65 | Intra-range methods (modify each range independently): 66 | * `shift`: move the ranges by a specific number of base pairs 67 | * `resize`: resizes to width, keeping start for + and end for - 68 | * `narrow`: narrows by relative position within range 69 | * `flank`: returns flanking ranges upstream 70 | * `promoters`: similar to flank 71 | * `restrict`: restricts ranges to a start and end position 72 | * `trim`: trims out of bound ranges 73 | * `+/-`: add or subtract a fixed amount 74 | * `?"intra-range-methods"`: summarize all intra-range methods 75 | 76 | Inter-range methods (comparisons between ranges): 77 | * `reduce`: merge overlapping ranges to produce a simplified set 78 | * `gaps`: get gaps between the ranges 79 | * `disjoin`: break into discrete non-overlapping ranges based on original starts/ends 80 | * `?"inter-range-methods"`: summarize all inter-range methods 81 | 82 | Distance methods (compare each range in `x` to `subject`): 83 | * `nearest`: get an integer vector containing the index of the nearest neighbor range in subject 84 | * `precede`: get the index of the range in subject that is directly preceded by the range in x 85 | * `follow`: get the index of the range in subject that is directly followed by the range in x 86 | * `distanceToNearest`: get the distances to the nearest neighbor in subject (Hits object) 87 | * `distance`: get the distances to the nearest neighbor (integer vector) 88 | 89 | Overlaps: 90 | ```r 91 | # vector of which x ranges overlap y ranges 92 | x %over% y 93 | # overlaps Hits object 94 | o = findOverlaps(x, y) 95 | # relative to x 96 | queryHits(o) 97 
| # relative to y 98 | subjectHits(o) 99 | ``` 100 | 101 | *** 102 | 103 | Add chr to chromosome names RangedData data structure (from NCBI/Ensembl to UCSC style). 104 | ```r 105 | ann 106 | # RangedData with 38293 rows and 2 value columns across 51 spaces 107 | # space ranges | strand 108 | # | 109 | # ENSMUSG00000090025 1 [3054233, 3054733] | 1 110 | # ENSMUSG00000064842 1 [3102016, 3102125] | 1 111 | # ENSMUSG00000051951 1 [3205901, 3671498] | -1 112 | 113 | names(ann) = paste("chr", names(ann), sep="") 114 | 115 | ann 116 | # RangedData with 38293 rows and 2 value columns across 51 spaces 117 | # space ranges | strand 118 | # | 119 | # ENSMUSG00000090025 chr1 [3054233, 3054733] | 1 120 | # ENSMUSG00000064842 chr1 [3102016, 3102125] | 1 121 | # ENSMUSG00000051951 chr1 [3205901, 3671498] | -1 122 | ``` 123 | -------------------------------------------------------------------------------- /notes/rna-seq-strand.md: -------------------------------------------------------------------------------- 1 | # RNA-Seq Strand 2 | 3 | | | forward (transcript) | reverse (rev comp of transcript) | 4 | |:-------------------------------------|:-------------------------------------|:-------------------------------------| 5 | | TopHat/Cufflinks `--library-type` | `fr-secondstrand` | `fr-firststrand` | 6 | | STAR | 1st read strand | 2nd read strand | 7 | | Picard CollectRnaSeqMetrics `STRAND_SPECIFICITY` | `FIRST_READ_TRANSCRIPTION_STRAND` | `SECOND_READ_TRANSCRIPTION_STRAND` | 8 | | htseq-count `-s/--stranded` | `yes` | `reverse` | 9 | | subread featureCounts `-s` | `1` | `2` | 10 | | RSEM `--forward-prob` | `1` | `0` | 11 | | Salmon/Sailfish `--libType` | `SF`/`ISF` | `SR`/`ISR` | 12 | | HISAT2 `--rna-strandness` | `FR` (`F` for single-end) | `RF` (`R` for single-end) | 13 | | Library Kit | Illumina ScriptSeq | Illumina TruSeq Stranded Total RNA | 14 | 15 | *** 16 | 17 | Strand-specific protocols 18 | ([ref](http://onetipperday.sterding.com/2012/07/how-to-tell-which-library-type-to-use.html)): 19 | ![](https://3.bp.blogspot.com/-BkupUsIrnXk/UBbmmmx6T8I/AAAAAAAAAUU/_rcrd_ahT48/s1600/strand.png) 20 | 21 | *** 22 | 23 | Three widely used protocols for strand-specific RNA sequencing 24 | ([ref](http://www.nature.com/neuro/journal/v17/n11/full/nn.3814.html)): 25 | ![](https://images.nature.com/full/nature-assets/neuro/journal/v17/n11/images/nn.3814-F3.jpg) 26 | 27 | *** 28 | 29 | Illumina TruSeq Stranded Total RNA Kit 30 | ([ref](https://www.abmgood.com/marketing/knowledge_base/next_generation_sequencing_experimental_design.php)): 31 | ![](https://www.abmgood.com/marketing/knowledge_base/img/NGS/Next_Generation_Sequencing_NGS_TruSeq_Stranded_Total_RNA.png) 32 | -------------------------------------------------------------------------------- /notes/rna-seq.md: -------------------------------------------------------------------------------- 1 | # RNA-Seq Analysis 2 | 3 | 4 | Some basic RNA-seq analysis resources. 5 | 6 | ## Web Resources 7 | 8 | [RNA-seqlopedia](http://rnaseq.uoregon.edu/) 9 | > The RNA-seqlopedia provides an overview of RNA-seq and of the choices necessary to carry out a successful RNA-seq experiment. 10 | 11 | [Thinking About RNA Seq Experimental Design for Measuring Differential Gene Expression: The Basics](http://gkno2.tumblr.com/post/24629975632/thinking-about-rna-seq-experimental-design-for) 12 | > RNA-Seq is a powerful tool that can be used effectively by a diverse community of people with different backgrounds. 
We expect that some of those who could benefit from RNA Seq do not yet have the background in sequencing and statistics that is necessary to make effective use of this technology. Much of the existing literature may be time-consuming to read without this background. Therefore, we put together this primer with the intention of helping scientists and students understand the basic statistical principles associated with measuring gene expression using RNA Seq. 13 | 14 | [RNA Sequence Analysis Training/Courses/Papers](https://www.biostars.org/p/174376/) 15 | > I did a little bit of research and started to take free online courses and read papers. Could anybody recommend me more up-to-date online trainings or courses? Here are the resources that I found so far. 16 | 17 | ## Publications 18 | 19 | [RNA-Seq workflow: gene-level exploratory analysis and differential expression](http://f1000research.com/articles/4-1070/v1) (10/2015) 20 | > Here we walk through an end-to-end gene-level RNA-Seq differential expression workflow using Bioconductor packages. We will start from the FASTQ files, show how these were aligned to the reference genome, and prepare a count matrix which tallies the number of RNA-seq reads/fragments within each gene for each sample. We will perform exploratory data analysis (EDA) for quality assessment and to explore the relationship between samples, perform differential gene expression analysis, and visually explore the results. 21 | 22 | [A survey of best practices for RNA-seq data analysis](http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0881-8) (1/2016) 23 | > We review all of the major steps in RNA-seq data analysis, including experimental design, quality control, read alignment, quantification of gene and transcript levels, visualization, differential gene expression, alternative splicing, functional analysis, gene fusion detection and eQTL mapping. We highlight the challenges associated with each step. We discuss the analysis of small RNAs and the integration of RNA-seq with other functional genomics techniques. Finally, we discuss the outlook for novel technologies that are changing the state of the art in transcriptomics. 24 | 25 | [How many biological replicates are needed in an RNA-seq experiment and which differential expression tool should you use?](http://rnajournal.cshlp.org/content/22/6/839.long) (6/2016) 26 | > For future RNA-seq experiments, these results suggest that at least six biological replicates should be used, rising to at least 12 when it is important to identify SDE genes for all fold changes. If fewer than 12 replicates are used, a superior combination of true positive and false positive performances makes edgeR and DESeq2 the leading tools. For higher replicate numbers, minimizing false positives is more important and DESeq marginally outperforms the other tools. 
27 | -------------------------------------------------------------------------------- /notes/single-cell-rna-seq.md: -------------------------------------------------------------------------------- 1 | # Single-Cell RNA-Seq 2 | 3 | |tool |comment | 4 | |:----------|:----------| 5 | |[Monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html)|| 6 | |scLVM|| 7 | |SCDE|| 8 | |[scDD](https://github.com/kdkorthauer/scDD)|| 9 | |[MAST](https://github.com/RGLab/MAST)|| 10 | |[Sincera](https://research.cchmc.org/pbge/sincera.html)|| 11 | |[sincell](http://bioconductor.org/packages/devel/bioc/html/sincell.html)|| 12 | |[cellTree](http://bioconductor.org/packages/devel/bioc/html/cellTree.html)|| 13 | |[RaceID](https://github.com/dgrun/RaceID)|| 14 | |[SIMLR](https://github.com/BatzoglouLabSU/SIMLR)|| 15 | |[scater](https://github.com/davismcc/scater)|| 16 | 17 | This weak attempt to compile a collection of scRNA-seq tools has been rendered useless by [scRNA-tools](https://www.scrna-tools.org/) and [awesome-single-cell](https://github.com/seandavi/awesome-single-cell). 18 | -------------------------------------------------------------------------------- /scripts-bigpurple/assembly-10x-supernova.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova. 6 | ## 7 | ## Usage: 8 | ## sbatch --job-name=supernova --nodes=1 --ntasks=1 --cpus-per-task=17 --mem-per-cpu=32G --time=15-00 \ 9 | ## --partition=fn_long --mail-user=${USER}@nyulangone.org --mail-type=END,FAIL,REQUEUE --export=NONE \ 10 | ## --wrap="bash /gpfs/data/igorlab/public/genomics/scripts-bigpurple/assembly-10x-supernova.sh fastq_dir [max_reads]" 11 | ## 12 | ## optimal number of reads: 56x raw coverage (Supernova will estimate the genome size) 13 | ## number of reads calculator: (genome size) x 56 / 150, assuming reads are 150bp (default is 1,200M for human genome) 14 | ## https://support.10xgenomics.com/de-novo-assembly/guidance/doc/achieving-success-with-de-novo-assembly 15 | ## 16 | 17 | 18 | ######################### 19 | 20 | 21 | # system-specific settings 22 | 23 | # supernova directory 24 | supernova_version="2.1.1" 25 | supernova_dir="/gpfs/data/igorlab/software/supernova/supernova-${supernova_version}" 26 | 27 | # prepare environment 28 | module purge 29 | module add default-environment 30 | 31 | 32 | ######################### 33 | 34 | 35 | # check for correct number of arguments 36 | if [ $# -lt 1 ] ; then 37 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 38 | echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2 39 | exit 1 40 | fi 41 | 42 | # arguments 43 | fastq_dir=$(readlink -f "$1") 44 | max_reads="$2" 45 | 46 | # check that input exists 47 | if [ ! 
-d "$fastq_dir" ] ; then 48 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 49 | exit 1 50 | fi 51 | 52 | 53 | ######################### 54 | 55 | 56 | # step 1: assembly (supernova run) 57 | 58 | # million reads cutoff (default is 1200M) 59 | # set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads 60 | # coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset 61 | # default value is 1.2B, which only makes sense for ~3.2 Gb genomes 62 | if [ -n "$max_reads" ] ; then 63 | max_reads_m="$max_reads" 64 | else 65 | max_reads_m="1200" 66 | fi 67 | 68 | # system load settings (reserve an extra thread for overhead) 69 | threads=$SLURM_CPUS_PER_TASK 70 | threads=$(( threads - 1 )) 71 | mem=$(echo "$threads * 32" | bc) 72 | 73 | # run name (used to name output folder) 74 | supernova_version_nodot=$(echo "$supernova_version" | sed 's/\.//g') 75 | run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M" 76 | 77 | # display settingse 78 | echo 79 | echo " * fastq dir: $fastq_dir " 80 | echo " * supernova bin dir: $supernova_dir " 81 | echo " * reads cutoff (million): $max_reads_m " 82 | echo " * threads: $threads " 83 | echo " * mem: $mem " 84 | echo " * run name (output dir): $run_id " 85 | echo 86 | 87 | echo -e "\n assembly started: $(date) \n" >&2 88 | 89 | # supernova assembly command 90 | 91 | supernova_cmd=" 92 | ${supernova_dir}/supernova run \ 93 | --maxreads ${max_reads_m}000000 \ 94 | --localcores ${threads} \ 95 | --localmem ${mem} \ 96 | --id ${run_id} \ 97 | --fastqs ${fastq_dir} 98 | " 99 | echo -e "\n CMD: $supernova_cmd \n" 100 | $supernova_cmd 101 | 102 | echo -e "\n assembly ended: $(date) \n" >&2 103 | 104 | # check that output generated 105 | supernova_out_dir=$(readlink -f "$(pwd)/${run_id}") 106 | if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then 107 | echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2 108 | exit 1 109 | fi 110 | 111 | 112 | ######################### 113 | 114 | 115 | # step 2: generate fasta file (supernova mkoutput) 116 | 117 | # display settings 118 | echo 119 | echo " * assembly dir: ${supernova_out_dir}/outs/assembly " 120 | echo " * fasta prefix: ${supernova_out_dir}/assembly " 121 | echo 122 | 123 | # generate different style fasta files 124 | styles="raw megabubbles pseudohap pseudohap2" 125 | for s in $styles; do 126 | 127 | echo -e "\n generate fasta: style $s \n" >&2 128 | 129 | # supernova mkoutput command 130 | ${supernova_dir}/supernova mkoutput \ 131 | --asmdir "${supernova_out_dir}/outs/assembly" \ 132 | --outprefix "${supernova_out_dir}/assembly.${s}" \ 133 | --style "${s}" 134 | 135 | done 136 | 137 | # check that output generated 138 | styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2" 139 | for s in $styles_out; do 140 | 141 | # check that output generated 142 | if [ ! 
-e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then 143 | echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2 144 | exit 1 145 | fi 146 | 147 | done 148 | 149 | 150 | ######################### 151 | 152 | 153 | # cleanup 154 | 155 | # check file size before cleanup 156 | echo 157 | echo "file size before cleanup" 158 | du -sh "$run_id" 159 | echo 160 | 161 | # delete large assembly files (keep small ones just in case) 162 | rm -rf ${run_id}/outs/assembly/a* 163 | rm -rf ${run_id}/outs/assembly/closures* 164 | rm -rf ${run_id}/outs/assembly/data 165 | # delete temp files 166 | rm -rf ${run_id}/ASSEMBLER_CS 167 | 168 | # check file size after cleanup 169 | echo 170 | echo "file size after cleanup" 171 | du -sh "$run_id" 172 | echo 173 | 174 | 175 | ######################### 176 | 177 | 178 | 179 | # end 180 | -------------------------------------------------------------------------------- /scripts-bigpurple/cnvs-wgs-freec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## WGS copy number variant analysis using Control-FREEC (with optional matched normal). 6 | ## Based on SNS WES version. 7 | ## 8 | ## Usage: 9 | ## sbatch --job-name=cnvs-wgs-${sample} --nodes=1 --ntasks=1 --cpus-per-task=5 --mem=100G --time=5:00:00 \ 10 | ## --mail-user=${USER}@nyulangone.org --mail-type=FAIL,REQUEUE --export=NONE \ 11 | ## --wrap="bash ./cnvs-wgs-freec.sh project_dir genome_build sample_name bam [control_bam] [window_size]" 12 | ## 13 | 14 | 15 | ######################### 16 | 17 | 18 | # script filename 19 | script_path="${BASH_SOURCE[0]}" 20 | script_name=$(basename "$script_path") 21 | segment_name=${script_name/%.sh/} 22 | echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 23 | 24 | # check for correct number of arguments 25 | if [ $# -lt 4 ] ; then 26 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 27 | echo -e "\n USAGE: $script_name project_dir genome_build sample_name bam [control_bam] [window_size] \n" >&2 28 | exit 1 29 | fi 30 | 31 | # arguments 32 | proj_dir=$(readlink -f "$1") 33 | genome_build="$2" 34 | sample_t="$3" 35 | bam_t=$(readlink -f "$4") 36 | bam_n="$5" 37 | win_size="$6" 38 | 39 | 40 | ######################### 41 | 42 | 43 | # check if control sample and/or window size are specified 44 | 45 | if [ -n "$win_size" ] ; then 46 | # both control sample and window size are specified 47 | bam_n=$(readlink -f "$bam_n") 48 | win_size_label="paired-${win_size}" 49 | elif [ -n "$bam_n" ] ; then 50 | # either control sample or window size are specified 51 | if [ -e "$bam_n" ] ; then 52 | bam_n=$(readlink -f "$bam_n") 53 | win_size="" 54 | win_size_label="paired-auto" 55 | else 56 | win_size="$bam_n" 57 | bam_n="" 58 | win_size_label="$win_size" 59 | fi 60 | else 61 | # no control sample or window size are specified 62 | win_size_label="auto" 63 | fi 64 | 65 | # check that inputs exist 66 | 67 | if [ ! -d "$proj_dir" ] ; then 68 | echo -e "\n $script_name ERROR: PROJ DIR $proj_dir DOES NOT EXIST \n" >&2 69 | exit 1 70 | fi 71 | 72 | if [ ! -s "$bam_t" ] ; then 73 | echo -e "\n $script_name ERROR: BAM $bam_t DOES NOT EXIST \n" >&2 74 | exit 1 75 | fi 76 | 77 | if [ -n "$bam_n" ] && [ ! 
-s "$bam_n" ] ; then 78 | echo -e "\n $script_name ERROR: CONTROL BAM $bam_n DOES NOT EXIST \n" >&2 79 | exit 1 80 | fi 81 | 82 | 83 | ######################### 84 | 85 | 86 | # settings and files 87 | 88 | sample="${sample_t}" 89 | 90 | cnv_freec_dir="${proj_dir}/CNV-FREEC-${win_size_label}" 91 | mkdir -p "$cnv_freec_dir" 92 | 93 | # need a separate directory for each sample since some auto-generated files have identical filenames 94 | sample_freec_logs_base_dir="${proj_dir}/logs-${segment_name}-${win_size_label}" 95 | sample_freec_logs_dir="${sample_freec_logs_base_dir}/${sample_t}" 96 | mkdir -p "$sample_freec_logs_dir" 97 | 98 | summary_csv="${sample_freec_logs_base_dir}/${sample}.${segment_name}.csv" 99 | 100 | config_txt="${sample_freec_logs_dir}/config.txt" 101 | 102 | out_base_sample="${sample_freec_logs_dir}/$(basename $bam_t)" 103 | # out_base_control="${sample_freec_logs_dir}/$(basename $bam_n)" 104 | fixed_base="${cnv_freec_dir}/${sample_t}" 105 | 106 | cpn_sample="${out_base_sample}_sample.cpn" 107 | cpn_control="${out_base_control}_control.cpn" 108 | 109 | cnvs_original="${out_base_sample}_CNVs" 110 | ratio_original="${out_base_sample}_ratio.txt" 111 | info_original="${out_base_sample}_info.txt" 112 | 113 | # repeated later again based on resolution 114 | cnvs_fixed="${fixed_base}.CNVs.txt" 115 | ratio_fixed="${fixed_base}.ratio.txt" 116 | graph_fixed="${fixed_base}.png" 117 | 118 | # unload all loaded modulefiles 119 | module purge 120 | module add default-environment 121 | 122 | 123 | ######################### 124 | 125 | 126 | # genome-specific settings 127 | 128 | if [[ "$genome_build" == "hg19" ]] ; then 129 | chr_files_dir="/gpfs/data/igorlab/ref/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/" 130 | chr_len_file="/gpfs/data/igorlab/ref/hg19/genome.fa.fai" 131 | gem="/gpfs/data/igorlab/ref/hg19/FREEC/out100m2_hg19.gem" 132 | elif [[ "$genome_build" == "hg38" ]] ; then 133 | chr_files_dir="/gpfs/data/igorlab/ref/hg38/chromosomes/" 134 | chr_len_file="/gpfs/data/igorlab/ref/hg38/genome.fa.fai" 135 | gem="/gpfs/data/igorlab/ref/hg38/genome.len100.mm2.mappability" 136 | elif [[ "$genome_build" == "mm10" ]] ; then 137 | chr_files_dir="/gpfs/data/igorlab/ref/iGenomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/" 138 | chr_len_file="/gpfs/data/igorlab/ref/mm10/genome.fa.fai" 139 | gem="/gpfs/data/igorlab/ref/mm10/FREEC/out100m4_mm10.gem" 140 | else 141 | echo -e "\n $script_name ERROR: UNSUPPORTED GENOME \n" >&2 142 | exit 1 143 | fi 144 | 145 | if [ ! -s "$chr_len_file" ] ; then 146 | echo -e "\n $script_name ERROR: CHROM LENGTHS $chr_len_file DOES NOT EXIST \n" >&2 147 | exit 1 148 | fi 149 | 150 | if [ ! 
-s "$gem" ] ; then 151 | echo -e "\n $script_name ERROR: GEM $gem DOES NOT EXIST \n" >&2 152 | exit 1 153 | fi 154 | 155 | 156 | ######################### 157 | 158 | 159 | # skip if output exits already 160 | 161 | if [ -s "$cpn_sample" ] || [ -s "$cnvs_original" ] || [ -s "$cnvs_fixed" ] ; then 162 | echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 163 | exit 0 164 | fi 165 | 166 | 167 | ######################### 168 | 169 | 170 | # create config 171 | 172 | # either coefficientOfVariation or window must be specified for whole genome sequencing data 173 | if [ -n "$win_size" ] ; then 174 | win_size_config="window = $win_size" 175 | else 176 | win_size_config="coefficientOfVariation = 0.05" 177 | fi 178 | 179 | # config for control dataset 180 | 181 | if [ -n "$bam_n" ] ; then 182 | control_config=" 183 | [control] 184 | mateFile = $bam_n 185 | inputFormat = BAM 186 | mateOrientation = FR 187 | " 188 | else 189 | control_config="" 190 | fi 191 | 192 | config_contents=" 193 | 194 | # whole genome sequencing config 195 | # based on: https://github.com/BoevaLab/FREEC/blob/master/data/config_WGS.txt 196 | 197 | 198 | [general] 199 | 200 | # output directory 201 | outputDir = . 202 | 203 | # number of threads 204 | maxThreads = 4 205 | 206 | # path to sambamba (used only to read .BAM files) 207 | sambamba = /gpfs/data/igorlab/software/sambamba/sambamba-0.7.0 208 | 209 | # file with chromosome lengths (fa.fai accepted starting from v9.3) 210 | chrLenFile = $chr_len_file 211 | 212 | # path to the directory with chromosomes fasta files 213 | # necessary to calculate a GC-content profile if a control dataset and GC-content profile are not available 214 | chrFiles = $chr_files_dir 215 | 216 | # information about mappable positions (GEM output) 217 | gemMappabilityFile = $gem 218 | 219 | # use a mappability profile to correct read counts (provided with gemMappabilityFile) 220 | uniqueMatch = TRUE 221 | 222 | # genome ploidy 223 | # you can set different values and Control-FREEC will select the one that explains most observed CNAs 224 | ploidy = 2 225 | 226 | # sample sex 227 | # sex=XY will not annotate one copy of chr X and Y as a losssex=XY 228 | sex = XY 229 | 230 | # either coefficientOfVariation or window must be specified for whole genome sequencing data 231 | # for whole exome sequencing: window=0 232 | $win_size_config 233 | 234 | # set to 1 or 2 to correct the Read Count (RC) for GC-content bias and low mappability 235 | # Default (WGS): 0 236 | # Default (WES): 1 (≥ v9.5) and 0 (< v9.5) 237 | # forceGCcontentNormalization = 1 238 | 239 | # degree of polynomial 240 | # Default: 3&4 (GC-content based normalization, WGS) or 1 (control-read-count-based normalization, WES) 241 | # degree = 1 242 | 243 | # desired behavior in the ambiguous regions 244 | # 4: make a separate fragment of this unknown region and do not assign any copy number to this region at all 245 | # breakPointType = 4 246 | 247 | # positive value of threshold for segmentation of normalized profiles 248 | # Default: 0.8 (for WGS) 249 | breakPointThreshold = 0.8 250 | 251 | # threshold on the minimal number of reads per window in the control sample 252 | # recommended value >=50 for for exome data 253 | # readCountThreshold = 10 254 | 255 | # additional output in BedGraph format for the UCSC genome browser 256 | BedGraphOutput = TRUE 257 | 258 | 259 | [sample] 260 | 261 | # file with mapped reads (can be single end reads, mate-pairs or paired-end reads) 262 | mateFile = $bam_t 263 | 264 | # format of reads (in mateFile) 265 
| # SAM, BAM, pileup, bowtie, eland, arachne, psl (BLAT), BED, Eland 266 | inputFormat = BAM 267 | 268 | # format of reads (in mateFile) 269 | # 0 (for single ends), RF (Illumina mate-pairs), FR (Illumina paired-ends), FF (SOLiD mate-pairs) 270 | mateOrientation = FR 271 | 272 | $control_config 273 | 274 | " 275 | 276 | echo "$config_contents" > "$config_txt" 277 | 278 | 279 | ######################### 280 | 281 | 282 | # Control-FREEC 283 | 284 | # FREEC compiled with GCC 6.1.0 (same GCC must be in the environment when running) 285 | module add gcc/6.1.0 286 | # bedtools to create .pileup files for WES data 287 | module add bedtools/2.27.1 288 | # samtools to create .pileup files (for BAF) (even with sambamba enabled) 289 | module add samtools/1.3 290 | 291 | cd "$sample_freec_logs_dir" 292 | 293 | freec_dir="/gpfs/data/igorlab/software/FREEC/FREEC-11.6" 294 | freec_bin="${freec_dir}/src/freec" 295 | 296 | echo 297 | echo " * FREEC: $(readlink -f $freec_bin) " 298 | echo " * sample T : $sample_t " 299 | echo " * BAM T : $bam_t " 300 | echo " * BAM N : $bam_n " 301 | echo " * window size : $win_size " 302 | echo " * CNVs original: $cnvs_original " 303 | echo " * CNVs fixed: $cnvs_fixed " 304 | echo " * ratio original: $ratio_original " 305 | echo " * ratio fixed: $ratio_fixed " 306 | echo 307 | 308 | freec_cmd="$freec_bin -conf $config_txt" 309 | echo -e "\n CMD: $freec_cmd \n" 310 | ($freec_cmd) 311 | 312 | sleep 30 313 | 314 | 315 | ######################### 316 | 317 | 318 | # check that output generated 319 | 320 | if [ ! -s "$cnvs_original" ] ; then 321 | echo -e "\n $script_name ERROR: $cnvs_original NOT GENERATED \n" >&2 322 | exit 1 323 | fi 324 | 325 | if [ ! -s "$ratio_original" ] ; then 326 | echo -e "\n $script_name ERROR: $ratio_original NOT GENERATED \n" >&2 327 | exit 1 328 | fi 329 | 330 | 331 | ######################### 332 | 333 | 334 | # clean up 335 | 336 | # delete raw copy number profiles 337 | rm -v "$cpn_sample" 338 | 339 | if [ -s "$cpn_control" ] ; then 340 | rm -v "$cpn_control" 341 | fi 342 | 343 | 344 | ######################### 345 | 346 | 347 | # get resolution and add to output names 348 | 349 | # get resolution 350 | res=$(cat "$info_original" | grep "Window" | cut -f 2) 351 | 352 | # adjust output file names 353 | cnvs_fixed="${fixed_base}.${res}.CNVs.txt" 354 | ratio_fixed="${fixed_base}.${res}.ratio.txt" 355 | graph_fixed="${fixed_base}.${res}.png" 356 | 357 | echo 358 | echo " * res: $res " 359 | echo " * CNVs fixed: $cnvs_fixed " 360 | echo " * ratio fixed: $ratio_fixed " 361 | echo " * graph fixed: $graph_fixed " 362 | echo 363 | 364 | 365 | ######################### 366 | 367 | 368 | # post-processing 369 | 370 | module add r/3.6.1 371 | 372 | echo 373 | echo " * R: $(readlink -f $(which R)) " 374 | echo " * R version: $(R --version | head -1) " 375 | echo " * Rscript: $(readlink -f $(which Rscript)) " 376 | echo " * Rscript version: $(Rscript --version 2>&1) " 377 | echo 378 | 379 | # required libraries: rtracklayer 380 | 381 | # add p-value to the predicted CNVs 382 | # add Wilcoxon test and Kolmogorov-Smirnov test p-values to _CNVs file (also add headers to the columns) 383 | freec_asses_sig_cmd="cat ${freec_dir}/scripts/assess_significance.R | R --slave --args $cnvs_original $ratio_original" 384 | echo -e "\n CMD: $freec_asses_sig_cmd \n" 385 | eval "$freec_asses_sig_cmd" 386 | 387 | # add "chr" to CNV table chromosomes 388 | cat "${cnvs_original}.p.value.txt" | sed 's/^\([0-9XY]\)/chr\1/' | LC_ALL=C sort -k1,1 -k2,2n | uniq > "$cnvs_fixed" 389 | 
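# (note: the command above prepends "chr" to chromosome names that lack it, i.e. lines starting with 0-9/X/Y,
# then sorts and dedupes, so the CNV table matches the UCSC-style chromosome naming used elsewhere in this script)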
390 | # visualize normalized copy number profile with predicted CNAs as well as BAF profile by running makeGraph.R 
391 | freec_makegraph_cmd="cat ${freec_dir}/scripts/makeGraph.R | R --slave --args 2 $ratio_original" 
392 | echo -e "\n CMD: $freec_makegraph_cmd \n" 
393 | eval "$freec_makegraph_cmd" 
394 | 
395 | 
396 | ######################### 
397 | 
398 | 
399 | # fix some of the names 
400 | 
401 | mv -v "$ratio_original" "$ratio_fixed" 
402 | mv -v "${ratio_original}.png" "$graph_fixed" 
403 | 
404 | 
405 | ######################### 
406 | 
407 | 
408 | # check that output generated 
409 | 
410 | if [ ! -s "$cnvs_fixed" ] ; then 
411 | echo -e "\n $script_name ERROR: CNVs $cnvs_fixed NOT GENERATED \n" >&2 
412 | exit 1 
413 | fi 
414 | 
415 | 
416 | ######################### 
417 | 
418 | 
419 | # summary 
420 | 
421 | # ratios and predicted copy number alterations for each window 
422 | num_bins=$(cat "$ratio_fixed" | grep -v 'MedianRatio' | wc -l) 
423 | echo "num bins: $num_bins" 
424 | 
425 | # predicted copy number alterations 
426 | num_cnas=$(cat "$cnvs_fixed" | grep -v 'uncertainty' | wc -l) 
427 | echo "num CNAs: $num_cnas" 
428 | 
429 | # header for summary file 
430 | echo "#SAMPLE,res,bins,CNAs" > "$summary_csv" 
431 | 
432 | # summarize log file 
433 | echo "${sample},${res},${num_bins},${num_cnas}" >> "$summary_csv" 
434 | 
435 | sleep 30 
436 | 
437 | # combine all sample summaries 
438 | cat ${sample_freec_logs_base_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ 
439 | > "${proj_dir}/summary.${segment_name}-${win_size_label}.csv" 
440 | 
441 | 
442 | ######################### 
443 | 
444 | 
445 | # annotate 
446 | 
447 | annot_cmd="bash /gpfs/data/igorlab/public/sns/segments/annot-regions-annovar.sh $proj_dir $sample $cnvs_fixed" 
448 | echo -e "\n CMD: $annot_cmd \n" 
449 | ($annot_cmd) 
450 | 
451 | 
452 | ######################### 
453 | 
454 | 
455 | 
456 | # end 
457 | 
--------------------------------------------------------------------------------
/scripts-bigpurple/jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | 
3 | 
4 | ## 
5 | ## Use a BigPurple compute node to run a Jupyter notebook and access it from your local machine. 
6 | ## Can be executed through sbatch or directly. 
7 | ## Run this script on the cluster to start a Jupyter notebook. 
8 | ## 
9 | ## Usage (direct): 
10 | ## bash ./jupyter.sh 
11 | ## Usage (via sbatch): 
12 | ## sbatch --job-name=jupyter --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=8G --time=8:00:00 ./jupyter.sh 
13 | ## 
14 | ## Further instructions will be printed after executing the script (check the log file if executed via sbatch). 
15 | ## 
16 | 
17 | 
18 | ######################### 
19 | 
20 | 
21 | # check that Jupyter notebook is available 
22 | 
23 | if [ ! -x "$(command -v jupyter)" ] ; then 
24 | echo -e "\n ERROR: 'jupyter' is not available \n" >&2 
25 | exit 1 
26 | fi 
27 | 
28 | if [ ! -x "$(command -v jupyter-notebook)" ] ; then 
29 | echo -e "\n ERROR: 'jupyter-notebook' is not available \n" >&2 
30 | exit 1 
31 | fi 
32 | 
33 | 
34 | ######################### 
35 | 
36 | 
37 | # https://docs.ycrc.yale.edu/clusters-at-yale/guides/jupyter/ 
38 | # originally modified for use on BigPurple by Paul Glick 
39 | 
40 | # get tunneling info 
41 | XDG_RUNTIME_DIR="" 
42 | user=$(whoami) 
43 | node=$(hostname -s) 
44 | # port=$(shuf -i 8000-9999 -n 1) 
45 | # generate a unique port for each user 
46 | port=$(shuf -i 8000-9999 -n 1 --random-source <(echo "$user")) 
47 | 
48 | echo -e " 
49 | 
50 | Two additional steps should be performed on a local machine.
51 | 
52 | (1) Create an SSH tunnel in a new terminal on a local machine (there is no output): 
53 | ssh -N -L ${port}:${node}:${port} ${user}@bigpurple.nyumc.org 
54 | 
55 | (2) Access Jupyter through a web browser at: 
56 | http://127.0.0.1:${port} (complete URL with token string will be shown below) 
57 | 
58 | " 
59 | 
60 | # clean up the environment and load modules or conda environments (should be a parameter) 
61 | # module purge 
62 | # module add default-environment 
63 | 
64 | # classic Jupyter Notebook 
65 | # jupyter-notebook --no-browser --port=${port} --ip=${node} 
66 | 
67 | # JupyterLab 
68 | jupyter lab --no-browser --port=${port} --ip=${node} 
69 | 
70 | 
71 | ######################### 
72 | 
73 | 
74 | 
75 | # end 
76 | 
--------------------------------------------------------------------------------
/scripts-bigpurple/scrna-10x-cellranger-aggr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | 
3 | 
4 | ## 
5 | ## 10X Cell Ranger 
6 | ## cellranger aggr - aggregates count data from multiple runs of 'cellranger count' 
7 | ## 
8 | 
9 | 
10 | # script filename 
11 | script_name=$(basename "${BASH_SOURCE[0]}") 
12 | 
13 | # check for correct number of arguments 
14 | if [ $# -lt 1 ] || [ $# -gt 2 ] ; then 
15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 
16 | echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2 
17 | exit 1 
18 | fi 
19 | 
20 | # arguments 
21 | sample_sheet=$1 
22 | analysis_name=$2 
23 | 
24 | # settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant) 
25 | threads="4" 
26 | mem="32" 
27 | 
28 | # make the output group-writeable 
29 | umask 007 
30 | 
31 | # output (add analysis name if provided) 
32 | sample_name="aggregated" 
33 | if [ -n "$analysis_name" ] ; then 
34 | sample_name="${sample_name}-${analysis_name}" 
35 | fi 
36 | web_summary_html="${sample_name}/outs/web_summary.html" 
37 | 
38 | # check that input exists 
39 | if [ ! -s "$sample_sheet" ] ; then 
40 | echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2 
41 | exit 1 
42 | fi 
43 | 
44 | echo -e "\n $(date) \n" >&2 
45 | 
46 | # check if output already exists 
47 | if [ -s "$web_summary_html" ]; then 
48 | echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2 
49 | exit 1 
50 | fi 
51 | 
52 | module unload gcc 
53 | module load cellranger/3.0.0 
54 | module load dos2unix/7.4.0 
55 | 
56 | # clean up sample sheet 
57 | dos2unix -q "$sample_sheet" 
58 | sed -i 's/"//g' "$sample_sheet" 
59 | sed -i -e '$a\' "$sample_sheet" 
60 | 
61 | # display settings 
62 | echo " * cellranger: $(which cellranger) " 
63 | echo " * sample sheet: $sample_sheet " 
64 | 
65 | # cellranger aggr command 
66 | 
67 | # id A unique run id, used to name output folder [a-zA-Z0-9_-]+. 
68 | # csv Path of CSV file enumerating 'cellranger count' outputs. 
69 | 
70 | cellranger_cmd=" 
71 | cellranger aggr \ 
72 | --jobmode local \ 
73 | --localcores $threads \ 
74 | --localmem $mem \ 
75 | --id $sample_name \ 
76 | --csv $sample_sheet 
77 | " 
78 | echo -e "\n CMD: $cellranger_cmd \n" 
79 | $cellranger_cmd 
80 | 
81 | sleep 15 
82 | 
83 | # check that output html summary (and probably everything else) exists 
84 | if [ !
-s "$web_summary_html" ] ; then 85 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 86 | exit 1 87 | fi 88 | 89 | # copy html summary to top level for easy navigation 90 | rsync -t "$web_summary_html" "./${sample_name}.html" 91 | 92 | # clean up (temp files) 93 | rm -rf "${sample_name}/SC_RNA_COUNTER_CS" 94 | 95 | echo -e "\n $(date) \n" 96 | 97 | 98 | 99 | # end 100 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-count-features.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## processes Chromium single cell RNA-seq output with expression and antibody libraries 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ ! $# == 4 ] ; then 15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name genome_name sample_name libraries_csv features_csv \n" >&2 17 | exit 1 18 | fi 19 | 20 | # arguments 21 | genome_name=$1 22 | sample_name=$2 23 | sample_name_out="count-$sample_name" 24 | libraries_csv=$(readlink -f "$3") 25 | features_csv=$(readlink -f "$4") 26 | 27 | # settings 28 | # threads=$NSLOTS 29 | # threads=$SLURM_NTASKS 30 | threads=16 31 | # mem=$(echo "$threads * 8" | bc) 32 | mem=128 33 | 34 | # make the output group-writeable 35 | umask 007 36 | 37 | if [[ "$genome_name" == "hg19" ]] ; then 38 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-3.0.0" 39 | elif [[ "$genome_name" == "hg38" ]] ; then 40 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" 41 | elif [[ "$genome_name" == "GRCh38" ]] ; then 42 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" 43 | elif [[ "$genome_name" == "mm10" ]] ; then 44 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-mm10-3.0.0" 45 | elif [[ "$genome_name" == "hg19_and_mm10" ]] ; then 46 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-and-mm10-3.0.0" 47 | else 48 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/ref/${genome_name}/cellranger" 49 | #transcriptome_dir="/gpfs/home/id460/ref/${genome_name}/cellranger" 50 | fi 51 | 52 | # unload all loaded modulefiles 53 | module purge 54 | 55 | # check that input exists 56 | if [ ! -s "$libraries_csv" ] ; then 57 | echo -e "\n ERROR: libraries csv $libraries_csv does not exist \n" >&2 58 | exit 1 59 | fi 60 | 61 | if [ ! -s "$features_csv" ] ; then 62 | echo -e "\n ERROR: features csv $features_csv does not exist \n" >&2 63 | exit 1 64 | fi 65 | 66 | if [ ! 
-d "$transcriptome_dir" ] ; then 67 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 68 | exit 1 69 | fi 70 | 71 | module load cellranger/3.1.0 72 | 73 | # display settings 74 | echo " * cellranger: $(which cellranger) " 75 | echo " * threads: $threads " 76 | echo " * mem: $mem " 77 | echo " * transcriptome dir: $transcriptome_dir " 78 | echo " * libraries csv: $libraries_csv " 79 | echo " * features csv: $features_csv " 80 | echo " * sample: $sample_name " 81 | echo " * out dir: $sample_name_out " 82 | 83 | echo -e "\n $(date) \n" >&2 84 | 85 | # cellranger run command 86 | 87 | # id A unique run id, used to name output folder 88 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq (must not be passed) 89 | # sample Prefix of the filenames of FASTQs to select 90 | # transcriptome Path of folder containing 10X-compatible transcriptome 91 | # libraries Path to a file declaring FASTQ paths and library types of input libraries 92 | # feature-ref Path to a file declaring the Feature Barcoding reagents 93 | 94 | cellranger_cmd=" 95 | cellranger count \ 96 | --localmem $mem \ 97 | --localcores $threads \ 98 | --transcriptome $transcriptome_dir \ 99 | --libraries $libraries_csv \ 100 | --feature-ref $features_csv \ 101 | --id $sample_name_out \ 102 | " 103 | echo -e "\n CMD: $cellranger_cmd \n" 104 | $cellranger_cmd 105 | 106 | sleep 15 107 | 108 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 109 | 110 | # check that output html summary (and probably everything else) exists 111 | if [ ! -s "$web_summary_html" ] ; then 112 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 113 | exit 1 114 | fi 115 | 116 | # copy html summary to top level for easy navigation 117 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 118 | 119 | # delete temp files 120 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 121 | 122 | echo -e "\n $(date) \n" 123 | 124 | 125 | 126 | # end 127 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## Processes 10x Genomics Chromium single-cell RNA-seq FASTQs with Cell Ranger (cellranger count). 6 | ## Provide a FASTQ directory for classic expression libraries. 7 | ## Provide tables of libraries and features for expression and antibody/hashtag libraries. 
8 | ## 9 | ## Usage: 10 | ## sbatch --job-name=cellranger-${sample} --ntasks=1 --cpus-per-task=17 --mem=128G --time=10:00:00 \ 11 | ##   --mail-user=${USER}@nyumc.org --mail-type=FAIL,END --export=NONE \ 12 | ##   --wrap="bash ./scrna-10x-cellranger-count.sh module_version genome_name sample_name fastq_dir" 13 | ## 14 | 15 | 16 | # script filename 17 | script_name=$(basename "${BASH_SOURCE[0]}") 18 | 19 | # check for correct number of arguments 20 | if [ $# -lt 4 ] || [ $# -gt 5 ] ; then 21 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 22 | echo -e " Usage:" >&2 23 | echo -e "" >&2 24 | echo -e " RNA only: ./${script_name} module_version genome_name sample_name fastq_dir" >&2 25 | echo -e " RNA and ADT/HTO: ./${script_name} module_version genome_name sample_name libraries_csv features_csv" >&2 26 | echo -e "" >&2 27 | echo -e " RNA example: ./${script_name} 9.0.0 hg38 my_sample /gpfs/data/fastq" >&2 28 | echo -e "" >&2 29 | if [ $# -gt 0 ] ; then echo -e " Provided arguments: $* \n" >&2 ; fi 30 | exit 1 31 | fi 32 | 33 | # arguments 34 | module_version=$1 35 | genome_name=$2 36 | sample_name_fastq=$3 37 | sample_name_out="count-$sample_name_fastq" 38 | if [ $# -eq 4 ] ; then 39 | fastq_dir=$(readlink -f "$4") 40 | fi 41 | if [ $# -eq 5 ] ; then 42 | libraries_csv=$(readlink -f "$4") 43 | features_csv=$(readlink -f "$5") 44 | fi 45 | 46 | # settings (16 threads and 64G does not finish within the 10h time limit) 47 | threads=16 48 | mem=128 49 | 50 | # make the output group-writeable 51 | umask 007 52 | 53 | if [[ "$genome_name" == "hg19" ]] ; then 54 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-3.0.0" 55 | elif [[ "$genome_name" == "hg38" ]] ; then 56 | # transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2020-A" 57 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2024-A" 58 | elif [[ "$genome_name" == "GRCh38" ]] ; then 59 | # transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2020-A" 60 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2024-A" 61 | elif [[ "$genome_name" == "mm10" ]] ; then 62 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-mm10-2020-A" 63 | elif [[ "$genome_name" == "mm39" ]] ; then 64 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCm39-2024-A" 65 | elif [[ "$genome_name" == "GRCm39" ]] ; then 66 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCm39-2024-A" 67 | elif [[ "$genome_name" == "GRCh38_and_mm10" ]] ; then 68 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-and-mm10-2020-A" 69 | elif [[ "$genome_name" == "GRCh38_and_GRCm39" ]] ; then 70 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38_and_GRCm39-2024-A" 71 | else 72 | # additional custom genomes 73 | transcriptome_dir="/gpfs/data/igorlab/ref/${genome_name}/cellranger" 74 | fi 75 | 76 | # check that input exists 77 | 78 | if [ ! -d "$transcriptome_dir" ] ; then 79 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 80 | exit 1 81 | fi 82 | 83 | if [ -n "$fastq_dir" ] ; then 84 | # RNA-only command 85 | if [ ! -d "$fastq_dir" ] ; then 86 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 87 | exit 1 88 | fi 89 | else 90 | # RNA and ADT command 91 | if [ ! -s "$libraries_csv" ] ; then 92 | echo -e "\n ERROR: libraries csv $libraries_csv does not exist \n" >&2 93 | exit 1 94 | fi 95 | if [ !
-s "$features_csv" ] ; then 96 | echo -e "\n ERROR: features csv $features_csv does not exist \n" >&2 97 | exit 1 98 | fi 99 | fi 100 | 101 | # clean up the environment 102 | module purge 103 | module add default-environment 104 | module add cellranger/${module_version} 105 | 106 | # display settings 107 | echo " * cellranger: $(which cellranger) " 108 | echo " * threads: $threads " 109 | echo " * mem: $mem " 110 | echo " * transcriptome dir: $transcriptome_dir " 111 | echo " * out dir: $sample_name_out " 112 | if [ -n "$fastq_dir" ] ; then 113 | echo " * sample name: $sample_name_fastq " 114 | echo " * fastq dir: $fastq_dir " 115 | extra_args="--sample $sample_name_fastq --fastqs $fastq_dir" 116 | else 117 | echo " * libraries csv: $libraries_csv " 118 | echo " * features csv: $features_csv " 119 | extra_args="--libraries $libraries_csv --feature-ref $features_csv" 120 | fi 121 | 122 | echo -e "\n $(date) \n" >&2 123 | 124 | # cellranger count command 125 | 126 | # transcriptome Path of folder containing 10x-compatible transcriptome reference 127 | # id A unique run id and output folder name [a-zA-Z0-9_-]+ 128 | # sample Prefix of the filenames of FASTQs to select 129 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq 130 | # libraries CSV file declaring input library data sources 131 | # feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes 132 | 133 | cellranger_cmd=" 134 | cellranger count \ 135 | --create-bam true \ 136 | --localmem $mem \ 137 | --localcores $threads \ 138 | --transcriptome $transcriptome_dir \ 139 | --id $sample_name_out \ 140 | $extra_args \ 141 | --disable-ui \ 142 | " 143 | echo -e "\n CMD: $cellranger_cmd \n" 144 | eval "$cellranger_cmd" 145 | 146 | sleep 15 147 | 148 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 149 | 150 | # check that output html summary (and probably everything else) exists 151 | if [ ! -s "$web_summary_html" ] ; then 152 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 153 | exit 1 154 | fi 155 | 156 | # copy html summary to top level for easy navigation 157 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 158 | 159 | # delete temp files 160 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 161 | 162 | echo -e "\n $(date) \n" 163 | 164 | 165 | 166 | # end 167 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## Processes Chromium single-cell V(D)J and Gene Expression output (cellranger multi) 7 | ## Enables the analysis of multiple library types together (compared to using cellranger vdj and cellranger count separately) 8 | ## 9 | ## Usage: 10 | ## sbatch --job-name=cellranger-${sample} --ntasks=1 --cpus-per-task=17 --mem=128G --time=10:00:00 \ 11 | ## --mail-user=${USER}@nyumc.org --mail-type=FAIL,END --export=NONE \ 12 | ## --wrap="bash ./scrna-10x-cellranger-multi.sh module_version sample_name config_csv" 13 | ## 14 | 15 | 16 | # script filename 17 | script_name=$(basename "${BASH_SOURCE[0]}") 18 | 19 | # check for correct number of arguments 20 | if [ ! 
$# == 3 ] ; then 21 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 22 | echo -e "\n USAGE: $script_name module_version sample_name config_csv \n" >&2 23 | exit 1 24 | fi 25 | 26 | # arguments 27 | module_version=$1 28 | sample_name=$2 29 | config_csv=$(readlink -f "$3") 30 | 31 | # settings (16 threads and 64G does not finish within the 10h time limit) 32 | threads=16 33 | mem=128 34 | 35 | # make the output group-writeable 36 | umask 007 37 | 38 | # check that input exists 39 | if [ ! -s "$config_csv" ] ; then 40 | echo -e "\n ERROR: config $config_csv does not exist \n" >&2 41 | exit 1 42 | fi 43 | 44 | module purge 45 | module add default-environment 46 | module add cellranger/${module_version} 47 | 48 | # display settings 49 | echo " * cellranger: $(which cellranger) " 50 | echo " * threads: $threads " 51 | echo " * mem: $mem " 52 | echo " * out dir: $sample_name " 53 | 54 | echo -e "\n $(date) \n" >&2 55 | 56 | # cellranger multi command 57 | 58 | # id       A unique run id and output folder name [a-zA-Z0-9_-]+ 59 | # csv      Path of CSV file enumerating input libraries and analysis parameters 60 | # sample   Prefix of the filenames of FASTQs to select 61 | 62 | # The multi config CSV contains both the library definitions and experiment configuration variables. 63 | # It is composed of up to four sections: [gene-expression], [feature], [vdj], and [libraries]. 64 | # Template: https://support.10xgenomics.com/multi-config-template.csv 65 | 66 | cellranger_cmd=" 67 | cellranger multi \ 68 | --localmem $mem \ 69 | --localcores $threads \ 70 | --csv $config_csv \ 71 | --id $sample_name \ 72 | --disable-ui \ 73 | " 74 | echo -e "\n CMD: $cellranger_cmd \n" 75 | $cellranger_cmd 76 | 77 | sleep 15 78 | 79 | web_summary_html="./${sample_name}/outs/per_sample_outs/${sample_name}/web_summary.html" 80 | 81 | # check that output html summary (and probably everything else) exists 82 | if [ ! -s "$web_summary_html" ] ; then 83 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 84 | exit 1 85 | fi 86 | 87 | # copy html summary to top level for easy navigation 88 | rsync -tv "$web_summary_html" "./${sample_name}.html" 89 | 90 | # delete temp files 91 | rm -rf "./${sample_name}/SC_MULTI_CS" 92 | 93 | echo -e "\n $(date) \n" 94 | 95 | 96 | 97 | # end 98 | -------------------------------------------------------------------------------- /scripts-phoenix/assembly-10x-supernova.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova. 6 | ## 7 | ## Usage: 8 | ## qsub -N supernova -M ${USER}@nyumc.org -m ae -j y -cwd -pe threaded 16 -b y \ 9 | ##   -hard -l mem_free=512G -l mem_token=32G \ 10 | ##   bash /ifs/home/id460/public/genomics/scripts-phoenix/assembly-10x-supernova.sh fastq_dir [max_reads] 11 | ## 12 | 13 | 14 | ######################### 15 | 16 | 17 | # system-specific settings 18 | 19 | # supernova directory 20 | supernova_version="2.1.1" 21 | supernova_dir="/ifs/home/id460/software/supernova/supernova-${supernova_version}" 22 | 23 | 24 | ######################### 25 | 26 | 27 | # check for correct number of arguments 28 | if [ $# -lt 1 ] ; then 29 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 30 | echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2 31 | exit 1 32 | fi 33 | 34 | # arguments 35 | fastq_dir=$(readlink -f "$1") 36 | max_reads="$2" 37 | 38 | # check that input exists 39 | if [ !
-d "$fastq_dir" ] ; then 40 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 41 | exit 1 42 | fi 43 | 44 | 45 | ######################### 46 | 47 | 48 | # step 1: assembly (supernova run) 49 | 50 | # million reads cutoff (default is 1200M) 51 | # set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads 52 | # coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset 53 | # default value is 1.2B, which only makes sense for ~3.2 Gb genomes 54 | if [ -n "$max_reads" ] ; then 55 | max_reads_m="$max_reads" 56 | else 57 | max_reads_m="1200" 58 | fi 59 | 60 | # system load settings (leave extra room for memory) 61 | threads=$NSLOTS 62 | mem=$(echo "$threads * 30" | bc) 63 | 64 | # run name (used to name output folder) 65 | supernova_version_nodot=$(echo "$supernova_version" | sed 's/\.//g') 66 | run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M" 67 | 68 | # display settings 69 | echo 70 | echo " * fastq dir: $fastq_dir " 71 | echo " * supernova bin dir: $supernova_dir " 72 | echo " * reads cutoff (million): $max_reads_m " 73 | echo " * threads: $threads " 74 | echo " * mem: $mem " 75 | echo " * run name (output dir): $run_id " 76 | echo 77 | 78 | echo -e "\n assembly started: $(date) \n" >&2 79 | 80 | # supernova assembly command 81 | 82 | supernova_cmd=" 83 | ${supernova_dir}/supernova run \ 84 | --maxreads ${max_reads_m}000000 \ 85 | --localcores ${threads} \ 86 | --localmem ${mem} \ 87 | --id ${run_id} \ 88 | --fastqs ${fastq_dir} 89 | " 90 | echo -e "\n CMD: $supernova_cmd \n" 91 | $supernova_cmd 92 | 93 | echo -e "\n assembly ended: $(date) \n" >&2 94 | 95 | # check that output generated 96 | supernova_out_dir=$(readlink -f "$(pwd)/${run_id}") 97 | if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then 98 | echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2 99 | exit 1 100 | fi 101 | 102 | 103 | ######################### 104 | 105 | 106 | # step 2: generate fasta file (supernova mkoutput) 107 | 108 | # display settings 109 | echo 110 | echo " * assembly dir: ${supernova_out_dir}/outs/assembly " 111 | echo " * fasta prefix: ${supernova_out_dir}/assembly " 112 | echo 113 | 114 | # generate different style fasta files 115 | styles="raw megabubbles pseudohap pseudohap2" 116 | for s in $styles; do 117 | 118 | echo -e "\n generate fasta: style $s \n" >&2 119 | 120 | # supernova mkoutput command 121 | ${supernova_dir}/supernova mkoutput \ 122 | --asmdir "${supernova_out_dir}/outs/assembly" \ 123 | --outprefix "${supernova_out_dir}/assembly.${s}" \ 124 | --style "${s}" 125 | 126 | done 127 | 128 | # check that output generated 129 | styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2" 130 | for s in $styles_out; do 131 | 132 | # check that output generated 133 | if [ !
-e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then 134 | echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2 135 | exit 1 136 | fi 137 | 138 | done 139 | 140 | 141 | ######################### 142 | 143 | 144 | # cleanup 145 | 146 | # check file size before cleanup 147 | du -sh "$run_id" 148 | 149 | # delete large assembly files (keep small ones just in case) 150 | rm -rf ${run_id}/outs/assembly/a* 151 | rm -rf ${run_id}/outs/assembly/closures* 152 | rm -rf ${run_id}/outs/assembly/data 153 | # delete temp files 154 | rm -rf ${run_id}/ASSEMBLER_CS 155 | 156 | # check file size after cleanup 157 | du -sh "$run_id" 158 | 159 | 160 | ######################### 161 | 162 | 163 | 164 | # end 165 | -------------------------------------------------------------------------------- /scripts-phoenix/bcl2fastq-sample-sheet-fix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## fix bcl2fastq demultiplexing sample sheet to get rid of problematic characters 6 | ## 7 | 8 | 9 | # input 10 | proj=$1 11 | 12 | if [ -z "$1" ] 13 | then 14 | echo "ERROR! NO ARGUMENT SUPPLIED." 15 | exit 1 16 | fi 17 | 18 | basecalls_dir="/ifs/data/sequence/Illumina/production/${proj}/Data/Intensities/BaseCalls" 19 | ss=${basecalls_dir}/SampleSheet.csv 20 | 21 | printf "\n\n FIX SAMPLE SHEET $ss \n\n" 22 | 23 | # make the output group-writeable 24 | umask 007 25 | 26 | # fix and show sample sheet 27 | if [ -s $ss ] 28 | then 29 | 30 | # fix newlines 31 | dos2unix --quiet $ss 32 | mac2unix --quiet $ss 33 | 34 | # replace commas inside quoted fields with dashes 35 | awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $ss > ${ss}.tmp && mv ${ss}.tmp $ss 36 | # replace periods, parentheses, hashes, colons, slashes, quotes, and blanks in sample names with dashes 37 | awk -F ',' 'BEGIN { OFS="," } { gsub(/\.|\(|\)|\#|\:|\/|\047|[[:blank:]]/, "-", $3); print }' $ss > ${ss}.tmp && mv ${ss}.tmp $ss 38 | # collapse multiple consecutive dashes into one 39 | sed -i 's/--*/-/g' $ss 40 | # remove dashes at the beginning of the field 41 | sed -i 's/,-/,/g' $ss 42 | # remove dashes at the end of the field 43 | sed -i 's/-,/,/g' $ss 44 | # remove lines missing values 45 | sed -i '/^,,,,,/d' $ss 46 | # add newline to end of file if one does not exist (some scripts may complain) 47 | sed -i -e '$a\' $ss 48 | 49 | # check for extra columns in sample sheet 50 | max_comma_count=0 51 | while read i 52 | do 53 | comma_count=$(echo $i | tr -d -c "," | wc -c) 54 | if [ $comma_count -gt $max_comma_count ] 55 | then 56 | max_comma_count=$comma_count 57 | fi 58 | done < $ss 59 | 60 | # if too many commas, remove trailing commas 61 | if [ $max_comma_count -gt 9 ] 62 | then 63 | sed -i 's/,,*$//g' $ss 64 | fi 65 | 66 | # display sample sheet for easy review 67 | column -s "," -t $ss 68 | 69 | else 70 | 71 | printf "\n\n NO SAMPLE SHEET FOUND AT $ss \n\n" 72 | sleep 5 73 | exit 1 74 | 75 | fi 76 | 77 | 78 | 79 | # end 80 | -------------------------------------------------------------------------------- /scripts-phoenix/join-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## merge any number of tab or comma-separated files (coreutils join can only do 2 at a time) 6 | ## for tab field separator, use $'\t' 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ $# -lt 3 ] ; then 15 | echo -e "\n
$script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name field_separator missing_field_char in1.txt [in2.txt in3.txt ...] > merged.txt \n" >&2 17 | exit 1 18 | fi 19 | 20 | # load recent coreutils ("-o auto" support added in release 8.12) 21 | module load coreutils/8.24 22 | 23 | # arguments 24 | separator="$1" 25 | shift 26 | empty_char="$1" 27 | shift 28 | 29 | # check if at least the first file exists 30 | if [ ! -s "$1" ] ; then 31 | echo -e "\n $script_name ERROR: file $1 does not exist \n" >&2 32 | exit 1 33 | fi 34 | 35 | # recursive join function 36 | function rjoin { 37 | if [[ $# -gt 1 ]]; then 38 | LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") | rjoin "${@:2}" 39 | else 40 | LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") 41 | fi 42 | } 43 | 44 | rjoin "${@:2}" < "$1" 45 | 46 | 47 | 48 | # end 49 | -------------------------------------------------------------------------------- /scripts-phoenix/scrna-10x-cellranger-aggr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## cellranger aggr - aggregates count data from multiple runs of 'cellranger count' 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ $# -lt 1 ] || [ $# -gt 2 ] ; then 15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2 17 | exit 1 18 | fi 19 | 20 | # arguments 21 | sample_sheet=$1 22 | analysis_name=$2 23 | 24 | # settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant) 25 | threads="4" 26 | mem="32" 27 | 28 | # output (add analysis name if provided) 29 | sample_name="aggregated" 30 | if [ -n "$analysis_name" ] ; then 31 | sample_name="${sample_name}-${analysis_name}" 32 | fi 33 | web_summary_html="${sample_name}/outs/web_summary.html" 34 | 35 | # check that input exists 36 | if [ ! -s "$sample_sheet" ] ; then 37 | echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2 38 | exit 1 39 | fi 40 | 41 | # delete empty .po files to keep directory clean 42 | rm -rf cellranger*.po* 43 | 44 | echo -e "\n $(date) \n" >&2 45 | 46 | # check if output already exists 47 | if [ -s "$web_summary_html" ]; then 48 | echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2 49 | exit 1 50 | fi 51 | 52 | # clean up sample sheet 53 | dos2unix -q "$sample_sheet" 54 | sed -i 's/"//g' "$sample_sheet" 55 | sed -i -e '$a\' "$sample_sheet" 56 | 57 | module unload gcc 58 | module load cellranger/2.1.0 59 | 60 | # display settings 61 | echo " * cellranger: $(which cellranger) " 62 | echo " * sample sheet: $sample_sheet " 63 | 64 | # cellranger aggr command 65 | 66 | # id    A unique run id, used to name output folder [a-zA-Z0-9_-]+. 67 | # csv   Path of CSV file enumerating 'cellranger count' outputs. 68 | 69 | cellranger_cmd=" 70 | cellranger aggr \ 71 | --jobmode local \ 72 | --localcores $threads \ 73 | --localmem $mem \ 74 | --id $sample_name \ 75 | --csv $sample_sheet 76 | " 77 | echo -e "\n CMD: $cellranger_cmd \n" 78 | $cellranger_cmd 79 | 80 | sleep 15 81 | 82 | # check that output html summary (and probably everything else) exists 83 | if [ !
-s "$web_summary_html" ] ; then 84 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 85 | exit 1 86 | fi 87 | 88 | # copy html summary to top level for easy navigation 89 | rsync -t "$web_summary_html" "./${sample_name}.html" 90 | 91 | # clean up (temp files) 92 | rm -rf "${sample_name}/SC_RNA_COUNTER_CS" 93 | 94 | echo -e "\n $(date) \n" 95 | 96 | 97 | 98 | # end 99 | -------------------------------------------------------------------------------- /scripts-phoenix/scrna-10x-cellranger-count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger - processes Chromium single cell RNA-seq output 6 | ## 7 | 8 | 9 | # script filename 10 | script_name=$(basename "${BASH_SOURCE[0]}") 11 | 12 | # check for correct number of arguments 13 | if [ ! $# == 3 ] ; then 14 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 15 | echo -e "\n USAGE: $script_name genome_name sample_name fastq_dir \n" >&2 16 | exit 1 17 | fi 18 | 19 | # arguments 20 | genome_name=$1 21 | sample_name_fastq=$2 22 | sample_name_out="count-$sample_name_fastq" 23 | fastq_dir=$(readlink -f "$3") 24 | 25 | # settings 26 | threads=$NSLOTS 27 | mem=$(echo "$threads * 8" | bc) 28 | transcriptome_dir="/ifs/data/cellranger-refdata/refdata-cellranger-${genome_name}-1.2.0" 29 | alt_transcriptome_dir="/ifs/home/id460/ref/${genome_name}/cellranger" 30 | 31 | # unload all loaded modulefiles 32 | module purge 33 | module load local 34 | 35 | # check that input exists 36 | if [ ! -d "$fastq_dir" ] ; then 37 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 38 | exit 1 39 | fi 40 | 41 | if [ ! -d "$transcriptome_dir" ] ; then 42 | echo -e "\n WARNING: genome dir $transcriptome_dir does not exist \n" >&2 43 | echo -e "\n setting genome dir to $alt_transcriptome_dir \n" >&2 44 | transcriptome_dir="${alt_transcriptome_dir}" 45 | fi 46 | 47 | if [ ! -d "$transcriptome_dir" ] ; then 48 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 49 | exit 1 50 | fi 51 | 52 | # delete empty .po files to keep directory clean 53 | rm -rf cellranger*.po* 54 | 55 | module load cellranger/2.1.0 56 | 57 | # display settings 58 | echo " * cellranger: $(which cellranger) " 59 | echo " * threads: $threads " 60 | echo " * mem: $mem " 61 | echo " * transcriptome dir: $transcriptome_dir " 62 | echo " * fastq dir: $fastq_dir " 63 | echo " * sample: $sample_name_fastq " 64 | echo " * out dir: $sample_name_out " 65 | 66 | echo -e "\n $(date) \n" >&2 67 | 68 | # cellranger run command 69 | 70 | # id A unique run id, used to name output folder 71 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq 72 | # sample Prefix of the filenames of FASTQs to select 73 | # transcriptome Path of folder containing 10X-compatible transcriptome 74 | 75 | cellranger_cmd=" 76 | cellranger count \ 77 | --localmem $mem \ 78 | --localcores $threads \ 79 | --transcriptome $transcriptome_dir \ 80 | --fastqs $fastq_dir \ 81 | --sample $sample_name_fastq \ 82 | --id $sample_name_out \ 83 | " 84 | echo -e "\n CMD: $cellranger_cmd \n" 85 | $cellranger_cmd 86 | 87 | sleep 15 88 | 89 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 90 | 91 | # check that output html summary (and probably everything else) exists 92 | if [ ! 
-s "$web_summary_html" ] ; then 93 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 94 | exit 1 95 | fi 96 | 97 | # copy html summary to top level for easy navigation 98 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 99 | 100 | # delete temp files 101 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 102 | 103 | # delete empty .po files to keep directory clean 104 | rm -rf cellranger*.po* 105 | 106 | echo -e "\n $(date) \n" 107 | 108 | 109 | 110 | # end 111 | -------------------------------------------------------------------------------- /scripts-phoenix/wgs-10x-longranger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## Whole Genome Phasing and SV Calling from 10x Genomics Chromium Linked-Reads using Long Ranger. 6 | ## 7 | ## Usage: 8 | ## qsub -N longranger -M ${USER}@nyumc.org -m ae -j y -cwd -pe threaded 16 -b y \ 9 | ## -hard -l mem_free=128G -l mem_token=8G \ 10 | ## bash /ifs/home/id460/public/genomics/scripts-phoenix/wgs-10x-longranger.sh 11 | ## 12 | 13 | 14 | ######################### 15 | 16 | 17 | # check for correct number of arguments 18 | if [ ! $# == 2 ] ; then 19 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 20 | echo -e "\n USAGE: bash wgs-10x-longranger.sh sample fastq_dir \n" >&2 21 | exit 1 22 | fi 23 | 24 | # arguments 25 | sample="$1" 26 | fastq_dir=$(readlink -f "$2") 27 | 28 | # check that input exists 29 | if [ ! -d "$fastq_dir" ] ; then 30 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 31 | exit 1 32 | fi 33 | 34 | 35 | ######################### 36 | 37 | 38 | # system-specific settings 39 | 40 | # Long Ranger directory 41 | longranger_version="2.2.2" 42 | longranger_dir="/ifs/home/id460/software/longranger/longranger-${longranger_version}" 43 | 44 | # Long Ranger reference directory 45 | longranger_ref_dir="/ifs/home/id460/ref/hg38/longranger-2.1.0" 46 | 47 | # GATK path (Long Ranger 2.2 compatible with versions 3.3-4.0, excluding 3.6) 48 | gatk_jar="/ifs/home/id460/software/GenomeAnalysisTK/gatk-4.0.4.0/gatk-package-4.0.4.0-local.jar" 49 | 50 | # unload all loaded modulefiles 51 | module purge 52 | module load local 53 | 54 | # load java (for GATK) 55 | module load java/1.8 56 | 57 | 58 | ######################### 59 | 60 | 61 | # system load settings 62 | threads=$NSLOTS 63 | mem=$(echo "$threads * 8" | bc) 64 | 65 | # output settings 66 | run_id="longranger-${sample}" 67 | 68 | # display settings 69 | echo 70 | echo " * sample: $sample " 71 | echo " * FASTQ dir: $fastq_dir " 72 | echo " * Long Ranger bin dir: $longranger_dir " 73 | echo " * GATK jar file: $gatk_jar " 74 | echo " * threads: $threads " 75 | echo " * mem: $mem " 76 | echo " * run output: $run_id " 77 | echo 78 | 79 | echo -e "\n analysis started: $(date) \n" >&2 80 | 81 | longranger_cmd=" 82 | ${longranger_dir}/longranger wgs \ 83 | --fastqs ${fastq_dir} \ 84 | --sample ${sample} \ 85 | --id ${run_id} \ 86 | --reference ${longranger_ref_dir} \ 87 | --vcmode=gatk:${gatk_jar} \ 88 | --localcores ${threads} \ 89 | --localmem ${mem} \ 90 | " 91 | echo -e "\n CMD: $longranger_cmd \n" 92 | $longranger_cmd 93 | 94 | longranger_out_dir=$(readlink -f "$(pwd)/${run_id}") 95 | 96 | echo -e "\n analysis ended: $(date) \n" >&2 97 | 98 | 99 | ######################### 100 | 101 | 102 | # check that output generated 103 | 104 | if [ ! 
-e "${longranger_out_dir}/outs/summary.csv" ] ; then 105 | echo -e "\n ERROR: output ${longranger_out_dir}/outs/summary.csv does not exist \n" >&2 106 | exit 1 107 | fi 108 | 109 | if [ ! -e "${longranger_out_dir}/outs/loupe.loupe" ] ; then 110 | echo -e "\n ERROR: output ${longranger_out_dir}/outs/loupe.loupe does not exist \n" >&2 111 | exit 1 112 | fi 113 | 114 | 115 | ######################### 116 | 117 | 118 | # cleanup 119 | 120 | # check file size before cleanup 121 | du -sh "$run_id" 122 | 123 | # delete temp files 124 | rm -rf "${run_id}/PHASER_SVCALLER_CS" 125 | 126 | # check file size after cleanup 127 | du -sh "$run_id" 128 | 129 | 130 | ######################### 131 | 132 | 133 | 134 | # end 135 | -------------------------------------------------------------------------------- /scripts/cnv-freec-genome-plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Plot genome-wide Control-FREEC copy number analysis results with proportional chromosomes in a single line. 7 | 8 | Usage: 9 | cnv-freec-genome-plot.R <genome> <sample_name> <ratio_txt> <out_png> 10 | 11 | Arguments: 12 | <genome>       genome build (UCSC-style such as "hg19" or "mm10") 13 | <sample_name>  sample name 14 | <ratio_txt>    Control-FREEC "_ratio.txt" file with ratios and predicted copy number alterations for each window 15 | <out_png>      output png image 16 | 17 | Options: 18 | -h, --help   show this screen 19 | ' -> doc 20 | 21 | 22 | # output width 23 | options(width = 120) 24 | # print warnings as they occur 25 | options(warn = 1) 26 | 27 | # retrieve the command-line arguments 28 | suppressPackageStartupMessages(library(docopt)) 29 | opts = docopt(doc) 30 | 31 | # relevant arguments 32 | genome = opts$genome 33 | sample_name = opts$sample_name 34 | ratio_txt = opts$ratio_txt 35 | ratio_png = opts$out_png 36 | 37 | # check that input file exists 38 | if (!file.exists(ratio_txt)) stop("file does not exist: ", ratio_txt) 39 | 40 | # load libraries 41 | suppressPackageStartupMessages({ 42 | library(dplyr) 43 | library(readr) 44 | library(karyoploteR) 45 | library(scales) 46 | }) 47 | 48 | # ploidy 49 | ploidy = 2 50 | # maximum copy number level to plot (to avoid very high values) 51 | max_cn = 6 52 | 53 | # import ratio table, remove uncertain regions, and cap copy numbers at defined maximum value 54 | ratio = 55 | read_tsv(ratio_txt, guess_max = 999999, show_col_types = FALSE, progress = FALSE) %>% 56 | dplyr::filter(CopyNumber >= 0) %>% 57 | dplyr::mutate(chr = Chromosome, start = Start, end = Start) %>% 58 | dplyr::mutate(Ratio = Ratio * ploidy) %>% 59 | dplyr::mutate(CopyNumber = ifelse(CopyNumber > max_cn, max_cn, CopyNumber)) %>% 60 | dplyr::mutate(Ratio = ifelse(Ratio > max_cn, max_cn, Ratio)) %>% 61 | dplyr::select(chr, start, end, Ratio, CopyNumber) 62 | 63 | # convert ratio table to GRanges 64 | ratio_gr = GRanges(ratio) 65 | seqlevelsStyle(ratio_gr) = "UCSC" 66 | 67 | # separate ratios based on amplifications/deletions 68 | ratio_filtered = ratio_gr[ratio_gr$Ratio > 0] 69 | ratio_norm = ratio_filtered[ratio_filtered$CopyNumber == ploidy] 70 | ratio_amp = ratio_filtered[ratio_filtered$CopyNumber > ploidy] 71 | ratio_del = ratio_filtered[ratio_filtered$CopyNumber < ploidy] 72 | 73 | # plot 74 | png(ratio_png, res = 300, width = 15, height = 3, units = "in") 75 | pp = getDefaultPlotParams(plot.type = 4) 76 | pp$data1inmargin = 0 77 | pp$bottommargin = 50 78 | pp$ideogramheight = 20 79 | kp = plotKaryotype(genome = genome, plot.type = 4, ideogram.plotter = NULL, labels.plotter = NULL, plot.params = pp, main = 
sample_name) 80 | kp = kpAxis(kp, ymin = 0, ymax = max_cn, tick.pos = 0:max_cn) 81 | kp = kpAddCytobandsAsLine(kp) 82 | kp = kpAddChromosomeNames(kp, srt = 45) 83 | kp = kpPoints(kp, data = ratio_norm, y = ratio_norm$Ratio, 84 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("darkolivegreen2", 0.3)) 85 | if (length(ratio_amp) > 0) { 86 | kp = kpPoints(kp, data = ratio_amp, y = ratio_amp$Ratio, 87 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("firebrick2", 0.3)) 88 | } 89 | if (length(ratio_del) > 0) { 90 | kp = kpPoints(kp, data = ratio_del, y = ratio_del$Ratio, 91 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("royalblue4", 0.3)) 92 | } 93 | kp = kpPoints(kp, data = ratio_gr, y = ratio_gr$CopyNumber, 94 | cex = 0.5, ymin = 0, ymax = max_cn, col = "gray20") 95 | dev.off() 96 | 97 | 98 | 99 | # end 100 | -------------------------------------------------------------------------------- /scripts/cnv-freec-heatmap.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Generate a color-coded gain/loss plot (heatmap-style) from Control-FREEC output. 7 | 8 | Usage: 9 | cnv-freec-heatmap.R <genome> <cnvs_txt> <out_png> 10 | 11 | Arguments: 12 | <genome>     genome build (UCSC-style such as "hg19" or "mm10") 13 | <cnvs_txt>   Control-FREEC "_CNVs" file with copy number alterations and p-values added by assess_significance.R 14 | <out_png>    output png image 15 | 16 | Options: 17 | -h, --help   show this screen 18 | ' -> doc 19 | 20 | 21 | # print warnings as they occur 22 | options(warn = 1) 23 | 24 | # retrieve the command-line arguments 25 | suppressPackageStartupMessages(library(docopt)) 26 | opts = docopt(doc) 27 | 28 | # relevant arguments 29 | genome = opts$genome 30 | cnvs_txt = opts$cnvs_txt 31 | cnvs_png = opts$out_png 32 | 33 | # check that input file exists 34 | if (!file.exists(cnvs_txt)) stop("file does not exist: ", cnvs_txt) 35 | 36 | # load libraries 37 | suppressPackageStartupMessages(library(magrittr)) 38 | suppressPackageStartupMessages(library(tidyverse)) 39 | suppressPackageStartupMessages(library(karyoploteR)) 40 | 41 | # import CNVs table and filter by Wilcoxon p-value 42 | cnvs = read_tsv(cnvs_txt, guess_max = 999999) %>% 43 | filter(WilcoxonRankSumTestPvalue < 0.05) 44 | 45 | # convert CNVs table to GRanges 46 | cnvs_gr = GRanges(cnvs) 47 | seqlevelsStyle(cnvs_gr) = "UCSC" 48 | 49 | # separate CNVs based on gain/loss status 50 | cnvs_gain = cnvs_gr[cnvs_gr$status == "gain"] 51 | cnvs_loss = cnvs_gr[cnvs_gr$status == "loss"] 52 | 53 | # plot 54 | png(cnvs_png, res = 300, width = 15, height = 2, units = "in") 55 | pp = getDefaultPlotParams(plot.type = 4) 56 | pp$data1inmargin = 0 57 | pp$bottommargin = 80 58 | pp$ideogramheight = 20 59 | kp = plotKaryotype(genome = genome, plot.type = 4, ideogram.plotter = NULL, labels.plotter = NULL, plot.params = pp) %>% 60 | kpAddCytobandsAsLine() %>% 61 | kpAddChromosomeNames(srt = 90) %>% 62 | kpRect(data = cnvs_gain, y0 = 0, y1 = 1, col = "firebrick2", border = NA) %>% 63 | kpRect(data = cnvs_loss, y0 = 0, y1 = 1, col = "royalblue4", border = NA) 64 | dev.off() 65 | 66 | 67 | 68 | # end 69 | -------------------------------------------------------------------------------- /scripts/csv-clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## make csv file more compatible with various tools by removing problematic characters 6 | ## 7 | 8 | 9 | # script filename 10 | script_name=$(basename "${BASH_SOURCE[0]}") 11 | 12 | # check for
correct number of arguments 13 | if [ ! $# == 1 ] ; then 14 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 15 | echo -e "\n USAGE: $script_name file.csv \n" >&2 16 | exit 1 17 | fi 18 | 19 | # arguments 20 | csv=$1 21 | 22 | # check that input exists 23 | if [ ! -s "$csv" ] ; then 24 | echo -e "\n $script_name ERROR: file $csv does not exist \n" >&2 25 | exit 1 26 | fi 27 | 28 | # fix newlines 29 | dos2unix --quiet $csv 30 | mac2unix --quiet $csv 31 | 32 | # replace commas inside quoted fields with dashes 33 | awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $csv > ${csv}.tmp && mv ${csv}.tmp $csv 34 | 35 | # remove quotes 36 | sed -i 's/\"//g' $csv 37 | 38 | # remove lines missing any values (only commas present) 39 | sed -i '/^,,*$/d' $csv 40 | 41 | # add newline to end of file if one does not exist (some scripts may complain) 42 | sed -i -e '$a\' $csv 43 | 44 | 45 | 46 | # end 47 | -------------------------------------------------------------------------------- /scripts/fastq-merge.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $HELP = <>" instead of ">") 103 | print STDERR " MERGED R1 : $fastq_r1_merged \n"; 104 | my $merge_cmd = "cat $fastq_r1 >> $fastq_r1_merged"; 105 | print STDERR " CMD : $merge_cmd \n"; 106 | system($merge_cmd); 107 | 108 | # repeat merge for R2 if present 109 | if ( -e $fastq_r2 ) { 110 | print STDERR " MERGED R2 : $fastq_r2_merged \n"; 111 | $merge_cmd = "cat $fastq_r2 >> $fastq_r2_merged"; 112 | print STDERR " CMD : $merge_cmd \n"; 113 | system($merge_cmd); 114 | } 115 | 116 | sleep(1); 117 | 118 | } 119 | 120 | } 121 | 122 | 123 | 124 | # end 125 | -------------------------------------------------------------------------------- /scripts/fastq-quality-bars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Visualize FASTQ quality using bars and "animate" them by looping through the individual reads. 
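# Each record's quality string (every fourth line of the FASTQ) is transliterated from
# Phred+33 characters into nine bins (0-8) and drawn as one column of block characters per base.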
4 | # Demo: https://asciinema.org/a/194133 5 | # Usage: cat file.fastq | ./fastq-quality-bars.sh 6 | # 7 | 8 | 9 | # check if a stdin pipe exists 10 | if [ -p /dev/stdin ]; then 11 | 12 | # blank line for cleaner output 13 | echo "" 14 | 15 | # initiate line counter 16 | n=0 17 | 18 | while read -r line; do 19 | 20 | # update line counter 21 | n=$(($n+1)) 22 | 23 | # use only every fourth line (quality scores) 24 | if [ "$n" -eq 4 ]; then 25 | 26 | # clear screen 27 | printf "\033c" 28 | 29 | # bin quality scores 30 | qual8=$(echo "$line" | sed -e 'y/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKL/00001111122222333334444455555666667777788888/') 31 | 32 | # convert binned quality scores to vertical bars 33 | awk -v q="$qual8" 'BEGIN { 34 | # number of bases 35 | len = split(q, qarr, ""); 36 | # height of bars (plus an extra row on top and bottom) 37 | h = 8 + 2; 38 | # matrix for output characters 39 | for (i = 1; i <= len; i++) { 40 | # top border 41 | a[i,h] = "▁"; 42 | # bottom border 43 | a[i,1] = "▔"; 44 | # add quality bars 45 | for (j = 1; j <= qarr[i]; j++) { 46 | a[i,j+1] = "▊"; 47 | } 48 | } 49 | # transpose matrix and print from top 50 | for (j = h; j >= 1; j--) { 51 | out = ""; 52 | for (i = 1; i <= len; i++) { 53 | if (a[i,j] == "") a[i,j] = " "; 54 | out = out a[i,j]; 55 | } 56 | print out; 57 | } 58 | }' 59 | 60 | # pause 61 | sleep 0.2 62 | 63 | # reset line counter 64 | n=0 65 | 66 | fi 67 | 68 | done 69 | 70 | echo "fastq ended" 71 | 72 | else 73 | 74 | # show usage if nothing was piped in 75 | echo -e "\n Usage: cat file.fastq | ./fastq-quality-bars.sh \n" 76 | 77 | fi 78 | 79 | 80 | 81 | # end 82 | -------------------------------------------------------------------------------- /scripts/gtf-remove-overlapping.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my $HELP = <) { 46 | chomp; 47 | my ($chr, $src, $feat, $pos0, $pos1, $score, $strand, $frame, $attr) = split(/\t/); 48 | 49 | # extract just the transcript id from attributes column 50 | my $transcript = $attr; 51 | $transcript =~ s/.*transcript_id\s"(.+?)".*/$1/; 52 | 53 | # check if current region is same chr as previous region but transcript is different 54 | if ( ( $chr eq $prev_chr ) && ( $transcript ne $prev_transcript ) ) { 55 | # check if previous end pos is larger than current start pos 56 | if ( $pos0 < $prev_pos1 ) { 57 | # flag previous transcript if not flagged already 58 | unless ($prev_transcript ~~ @bad_transcripts) { 59 | push (@bad_transcripts, $prev_transcript); 60 | #print "$prev_transcript \n"; 61 | } 62 | # flag current transcript if not flagged already 63 | unless ($transcript ~~ @bad_transcripts) { 64 | push (@bad_transcripts, $transcript); 65 | #print "$transcript \n"; 66 | } 67 | } 68 | } 69 | 70 | #print " $i $gtf_array[$i][0] $gtf_array[$i][3] $gtf_array[$i][4] \n"; 71 | 72 | # update info of previous entry 73 | $prev_chr = $chr; 74 | $prev_pos0 = $pos0; 75 | $prev_pos1 = $pos1; 76 | $prev_transcript = $transcript; 77 | 78 | 79 | #for my $row (@gtf_array) { 80 | # print "@$row[0]\t@$row[1]\t@$row[2]\n"; 81 | #} 82 | } 83 | close(SORTED); 84 | 85 | # delete sorted cds-only gtf created at the beginning 86 | system("rm -f $gtf_sorted"); 87 | 88 | # count number of overlapping transcripts 89 | my $bad_transcript_count = scalar(@bad_transcripts); 90 | print " overlapping transcripts: $bad_transcript_count \n"; 91 | 92 | # create new gtf for overlapping and non-overlapping transcripts 93 | open(UNIQUE, ">", "${gtf}.unique.gtf"); 94 |
open(OVERLAPPING, ">", "${gtf}.overlapping.gtf"); 95 | 96 | # process original gtf and split entries based on whether the transcript is one of the overlapping transcripts 97 | open(GTF, "<", $gtf); 98 | while (<GTF>) { 99 | chomp; 100 | my ($chr, $src, $feat, $pos0, $pos1, $score, $strand, $frame, $attr) = split(/\t/); 101 | 102 | # extract just the transcript id from attributes column 103 | my $transcript = $attr; 104 | $transcript =~ s/.*transcript_id\s"(.+?)".*/$1/; 105 | 106 | # check if transcript is one of overlapping transcripts 107 | if ($transcript ~~ @bad_transcripts) { 108 | print OVERLAPPING "$_\n"; 109 | } 110 | else { 111 | print UNIQUE "$_\n"; 112 | } 113 | } 114 | close(GTF); 115 | 116 | close(UNIQUE); 117 | close(OVERLAPPING); 118 | 119 | } 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /scripts/hdcyto-1-import-fcs.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Import FCS files and generate a flowSet object" 3 | subtitle: "HDCyto workflow step 1" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the output directory 15 | project_name: "project" 16 | # path to the FCS files 17 | fcs_dir: "/path/to/fcs" 18 | # subset large samples to a maximum number of cells 19 | max_cells_per_sample: 500000 20 | # quarto render hdcyto-1-import-fcs.qmd -P project_name:? -P fcs_dir:? -P max_cells_per_sample:? 21 | --- 22 | 23 | This script generates a flowSet object from FCS files. 24 | The Quarto format allows both interactive and command-line execution. 25 | It additionally generates an HTML report that can be used for recordkeeping and troubleshooting. 26 | It is structured to facilitate the import and quality control of FCS files in a reproducible and organized manner. 27 | It searches for FCS files in the specified directory, imports the files into a flowSet object, downsamples large samples to avoid extremely large objects, and generates summary tables and plots. 28 | It generates `input/metadata-files.csv` and `input/metadata-channels.csv` metadata tables, which should be manually edited to clean up labels, define sample groups, and specify the relevant samples and markers/antibodies used for the next step (`hdcyto-2-prepare-sce.qmd`).
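After editing, a minimal `metadata-files.csv` could look like the following (illustrative file names, groups, and paths; the columns match what `CATALYST::prepData()` expects in the later steps):

```
file_name,sample_id,patient_id,condition,full_path
sample1.fcs,sample1,patient1,control,/path/to/fcs/sample1.fcs
sample2.fcs,sample2,patient2,treated,/path/to/fcs/sample2.fcs
```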
29 | 30 | # Settings 31 | 32 | ```{r packages} 33 | #| message: false 34 | #| warning: false 35 | library(tidyverse) 36 | library(glue) 37 | library(cowplot) 38 | library(qs2) 39 | library(flowCore) 40 | library(ncdfFlow) 41 | library(ggsci) 42 | ``` 43 | 44 | Check parameters 45 | 46 | ```{r params} 47 | params 48 | ``` 49 | 50 | Define inputs 51 | 52 | ```{r settings-inputs} 53 | if (!dir.exists(params$fcs_dir)) stop("FCS dir does not exist: ", params$fcs_dir) 54 | ``` 55 | 56 | Define outputs 57 | 58 | ```{r settings-outputs} 59 | out_dir <- glue("./out-{params$project_name}") 60 | input_dir <- glue("{out_dir}/input") 61 | data_dir <- glue("{out_dir}/r-data") 62 | qc_dir <- glue("{out_dir}/qc") 63 | 64 | dir.create(out_dir, showWarnings = FALSE) 65 | dir.create(input_dir, showWarnings = FALSE) 66 | dir.create(data_dir, showWarnings = FALSE) 67 | dir.create(qc_dir, showWarnings = FALSE) 68 | 69 | files_csv <- glue("{input_dir}/metadata-files.csv") 70 | channels_csv <- glue("{input_dir}/metadata-channels.csv") 71 | fs_qs2 <- glue("{data_dir}/flowset.qs2") 72 | nfs_cdf <- glue("{data_dir}/ncdfflowset.cdf") 73 | ``` 74 | 75 | Stop if the output files that need to be edited already exist (to prevent overwriting) 76 | 77 | ```{r} 78 | if (file.exists(files_csv)) stop("files metadata table already exists: ", files_csv) 79 | if (file.exists(channels_csv)) stop("channels metadata table already exists: ", channels_csv) 80 | ``` 81 | 82 | # Determine input files 83 | 84 | Find FCS files 85 | 86 | ```{r find-fcs-files} 87 | fcs_dir <- params$fcs_dir 88 | fcs_files <- list.files(path = fcs_dir, pattern = "\\.fcs$", full.names = TRUE, recursive = TRUE) 89 | ``` 90 | 91 | Generate a files/samples metadata table 92 | 93 | `CATALYST::prepData()` expects `file_name`, `sample_id`, `patient_id`, and `condition` columns 94 | 95 | ```{r files_df} 96 | files_df <- 97 | data.frame( 98 | file_name = basename(fcs_files), 99 | sample_id = str_remove(basename(fcs_files), "\\.fcs$"), 100 | patient_id = "?", 101 | condition = "?", 102 | full_path = fcs_files, 103 | row.names = basename(fcs_files) 104 | ) 105 | files_df 106 | ``` 107 | 108 | ```{r} 109 | write_csv(files_df, files_csv) 110 | ``` 111 | 112 | Generate an AnnotatedDataFrame 113 | 114 | ```{r files_adf} 115 | files_adf <- new("AnnotatedDataFrame", data = files_df) 116 | ``` 117 | 118 | # Generate flowSet 119 | 120 | Import FCS files 121 | 122 | ```{r read.flowSet} 123 | # fs <- read.flowSet(files, alter.names = TRUE) 124 | # fs <- read.flowSet(files, transformation = "scale", alter.names = TRUE) 125 | # fs <- suppressWarnings(read.flowSet(path = fcs_dir, alter.names = TRUE, transformation = FALSE, phenoData = samples_adf)) 126 | ``` 127 | 128 | The flowCore flowSet represents a set of FCS files and requires the data elements to remain in memory. The ncdfFlowSet inherits most of its data structures from flowSet, but stores event-level data on disk and only keeps the file handler and metadata in memory.
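As a rough sketch of that difference (illustrative only, not evaluated here since the objects are created below), the in-memory footprints of the two representations can be compared directly:

```{r}
# object.size(nfs) # ncdfFlowSet: small, events stay on disk
# object.size(ncdfFlow::as.flowSet(nfs)) # flowSet: large, all event-level data in memory
```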
129 | 130 | `read.ncdfFlowSet()` will load only common channels if there are discrepancies between files (`read.flowSet()` requires identical channels) 131 | 132 | ```{r read.ncdfFlowSet} 133 | nfs <- read.ncdfFlowSet(files = files_df$full_path, ncdfFile = nfs_cdf, alter.names = TRUE, transformation = FALSE, phenoData = files_adf, compress = 5, mc.cores = 4) 134 | ``` 135 | 136 | Check which files were imported 137 | 138 | ```{r} 139 | pData(nfs) 140 | ``` 141 | 142 | Save phenotypic data 143 | 144 | ```{r} 145 | write_csv(pData(nfs), glue("{qc_dir}/flowset-pdata.csv")) 146 | ``` 147 | 148 | Save a table of the number of cells in each imported file 149 | 150 | ```{r flowSet-num-cells} 151 | fsApply(nfs, nrow) |> 152 | as.data.frame() |> 153 | as_tibble(rownames = "filename") |> 154 | dplyr::rename(num_cells = V1) |> 155 | arrange(filename) |> 156 | write_csv(glue("{qc_dir}/fcs-num-cells.csv")) 157 | ``` 158 | 159 | Generate a channels/antibodies metadata table 160 | 161 | `CATALYST::prepData()` expects `fcs_colname`, `antigen`, and optional `marker_class` 162 | 163 | `CATALYST::guessPanel()` guesses the marker class based on the channel name and outputs CATALYST-compatible column names (some columns may need to be coerced to character from list) 164 | 165 | ```{r channels_df} 166 | channels_df <- 167 | CATALYST::guessPanel(nfs[[1]]) |> 168 | mutate(marker_class = ifelse(use_channel, "state", "none")) |> 169 | mutate(across(where(is.list), ~ map_chr(., toString))) 170 | channels_df 171 | ``` 172 | 173 | ```{r} 174 | write_csv(channels_df, channels_csv) 175 | ``` 176 | 177 | Randomly subsample ncdfFlowSet to avoid extremely large objects 178 | 179 | ```{r subsample} 180 | set.seed(99) 181 | fres <- filter(nfs, filter = sampleFilter(size = params$max_cells_per_sample)) 182 | # summary(fres) 183 | nfs <- Subset(nfs, fres) 184 | ``` 185 | 186 | Save a table of the number of cells in each file after subsampling 187 | 188 | ```{r} 189 | fsApply(nfs, nrow) |> 190 | as.data.frame() |> 191 | as_tibble(rownames = "filename") |> 192 | dplyr::rename(num_cells = V1) |> 193 | arrange(filename) |> 194 | write_csv(glue("{qc_dir}/flowset-num-cells.csv")) 195 | ``` 196 | 197 | Convert ncdfFlowSet to flowSet 198 | 199 | ```{r as.flowSet} 200 | fs <- ncdfFlow::as.flowSet(nfs) 201 | fs 202 | ``` 203 | 204 | Check that the conversion did not lose data 205 | 206 | ```{r} 207 | if (!identical(pData(fs), pData(nfs))) stop("pData does not match") 208 | if (!identical(exprs(fs[[1]]), exprs(nfs[[1]]))) stop("exprs does not match") 209 | ``` 210 | 211 | Save flowSet object 212 | 213 | ```{r save-flowSet} 214 | qs2::qs_save(object = fs, file = fs_qs2) 215 | unlink(nfs_cdf) 216 | ``` 217 | 218 | # Plot expression density histograms 219 | 220 | Subset to random samples to make plots more readable 221 | 222 | ```{r plot-subset-samples} 223 | samples_subset <- sampleNames(fs) 224 | if (length(samples_subset) > 15) { 225 | set.seed(99) 226 | samples_subset <- sort(sample(sampleNames(fs), 10)) 227 | } 228 | fs_min <- subset(fs, sampleNames(fs) %in% samples_subset) 229 | length(sampleNames(fs_min)) 230 | ``` 231 | 232 | Subset to random cells to speed up plot generation 233 | 234 | ```{r plot-subset-cells} 235 | set.seed(99) 236 | fres <- filter(fs_min, filter = sampleFilter(size = 5000)) 237 | fs_min <- Subset(fs_min, fres) 238 | sum(fsApply(fs_min, nrow)) 239 | ``` 240 | 241 | Generate a ggplot-friendly expression table with extreme outliers removed 242 | 243 | ```{r tidy-exprs-tbl} 244 | exprs_tbl <- 245 | bind_rows( 246 | 
lapply( 247 | 1:length(fs_min), 248 | function(i) { 249 | data.frame(sample_id = pData(fs_min)[i, "sample_id"], exprs(fs_min[[i]])) 250 | } 251 | ) 252 | ) |> 253 | pivot_longer(!sample_id, names_to = "channel", values_to = "exprs") |> 254 | left_join(channels_df, by = c("channel" = "fcs_colname")) |> 255 | dplyr::mutate(desc0 = if_else(is.na(desc0), channel, desc0)) |> 256 | group_by(channel) |> 257 | dplyr::mutate(min_cutoff = quantile(exprs, 0.01), max_cutoff = quantile(exprs, 0.99)) |> 258 | dplyr::mutate(zscore = scale(exprs)) |> 259 | ungroup() |> 260 | dplyr::filter(exprs >= min_cutoff, exprs <= max_cutoff) 261 | # dplyr::filter(zscore > -3, zscore < 3) 262 | dim(exprs_tbl) 263 | ``` 264 | 265 | Generate a density plot for the original unmodified values 266 | 267 | ```{r plot-density-original} 268 | dens_plot <- 269 | exprs_tbl |> 270 | ggplot(aes(x = exprs, color = sample_id)) + 271 | geom_density() + 272 | facet_wrap(vars(desc0), scales = "free") + 273 | theme_minimal() + 274 | theme( 275 | plot.background = element_rect(fill = "white"), 276 | panel.grid.minor = element_blank() 277 | ) + 278 | scale_color_igv() 279 | ggsave(glue("{qc_dir}/flowset-exprs-density-raw.png"), dens_plot, width = 16, height = 12) 280 | ``` 281 | 282 | Generate a density plot for the log-transformed values 283 | 284 | ```{r plot-density-log} 285 | if (min(exprs_tbl$exprs) >= 0) { 286 | dens_plot <- 287 | exprs_tbl |> 288 | mutate(exprs_log10 = log10(exprs + 1)) |> 289 | ggplot(aes(x = exprs_log10, color = sample_id)) + 290 | geom_density() + 291 | facet_wrap(vars(desc0), scales = "free") + 292 | theme_minimal() + 293 | theme( 294 | plot.background = element_rect(fill = "white"), 295 | panel.grid.minor = element_blank() 296 | ) + 297 | scale_color_igv() 298 | ggsave(glue("{qc_dir}/flowset-exprs-density-log.png"), dens_plot, width = 16, height = 12) 299 | } else { 300 | warning("negative expression values present") 301 | } 302 | ``` 303 | 304 | # Session info 305 | 306 | ```{r} 307 | sessionInfo() 308 | ``` 309 | -------------------------------------------------------------------------------- /scripts/hdcyto-2-prepare-sce.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Convert flowSet to SingleCellExperiment with sample annotations" 3 | subtitle: "HDCyto workflow step 2" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the input/output directory 15 | project_name: "project" 16 | # cofactor for arcsinh transformation (generally 5 for CyTOF and 150 for flow cytometry) 17 | arcsinh_cofactor: 150 18 | # quarto render hdcyto-2-prepare-sce.qmd -P project_name:? -P arcsinh_cofactor:? 19 | --- 20 | 21 | This script generates a SingleCellExperiment object with sample annotations. 22 | The previous step (`hdcyto-1-import-fcs.qmd`) saves an unfiltered flowSet object. 23 | It generates `input/metadata-files.csv` and `input/metadata-channels.csv` tables used to identify the relevant samples and markers/antibodies. 24 | Those files should be edited to clean up labels, define sample groups, and exclude problematic samples or markers/antibodies. 25 | The generated SingleCellExperiment object can then be used for CATALYST-based downstream analysis in the next step (`hdcyto-3-analyze-sce.qmd`). 
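For example (illustrative edits): deleting a sample's row from `metadata-files.csv` excludes it, since file names missing from the table are filtered out of the SingleCellExperiment below, while `metadata-channels.csv` controls marker labels and usage, e.g.:

```
fcs_colname,antigen,marker_class
FL1.A,CD3,type
FL2.A,CD4,type
Time,Time,none
```

Here `antigen` sets the marker name carried into the analysis and `marker_class` ("type", "state", or "none") indicates how CATALYST treats the marker downstream.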
26 | 27 | # Settings 28 | 29 | ```{r packages} 30 | #| message: false 31 | #| warning: false 32 | library(tidyverse) 33 | library(glue) 34 | library(cowplot) 35 | library(qs2) 36 | library(flowCore) 37 | library(CATALYST) 38 | ``` 39 | 40 | Check parameters 41 | 42 | ```{r params} 43 | params 44 | ``` 45 | 46 | Define inputs 47 | 48 | ```{r settings-inputs} 49 | out_dir <- glue("./out-{params$project_name}") 50 | input_dir <- glue("{out_dir}/input") 51 | data_dir <- glue("{out_dir}/r-data") 52 | qc_dir <- glue("{out_dir}/qc") 53 | if (!dir.exists(out_dir)) stop("output dir does not exist: ", out_dir) 54 | 55 | fs_qs2 <- glue("{data_dir}/flowset.qs2") 56 | if (!file.exists(fs_qs2)) stop("flowSet does not exist: ", fs_qs2) 57 | 58 | files_csv <- glue("{input_dir}/metadata-files.csv") 59 | if (!file.exists(files_csv)) stop("files metadata table does not exist: ", files_csv) 60 | 61 | channels_csv <- glue("{input_dir}/metadata-channels.csv") 62 | if (!file.exists(channels_csv)) stop("channels metadata table does not exist: ", channels_csv) 63 | ``` 64 | 65 | Define outputs 66 | 67 | ```{r settings-outputs} 68 | sce_qs2 <- glue("{data_dir}/sce.qs2") 69 | ``` 70 | 71 | # Import data 72 | 73 | Import files metadata table 74 | 75 | ```{r import-files-csv} 76 | files_tbl <- read_csv(files_csv, show_col_types = FALSE) 77 | files_tbl 78 | ``` 79 | 80 | Validate files metadata table 81 | 82 | ```{r check-files-table} 83 | if (!"file_name" %in% names(files_tbl)) { 84 | stop("files metadata table should have 'file_name' column") 85 | } 86 | if (!"sample_id" %in% names(files_tbl)) { 87 | stop("files metadata table should have 'sample_id' column (for CATALYST)") 88 | } 89 | if (!"patient_id" %in% names(files_tbl)) { 90 | stop("files metadata table should have 'patient_id' column (for CATALYST)") 91 | } 92 | if (!"condition" %in% names(files_tbl)) { 93 | stop("files metadata table should have 'condition' column (for CATALYST)") 94 | } 95 | if (anyNA(files_tbl$sample_id)) { 96 | stop("files metadata table 'sample_id' column has NAs") 97 | } 98 | if (anyNA(files_tbl$condition)) { 99 | stop("files metadata table 'condition' column has NAs") 100 | } 101 | if (n_distinct(files_tbl$condition) == 1) { 102 | stop("files metadata table should have multiple conditions") 103 | } 104 | if (!all(sapply(files_tbl, "class") %in% c("factor", "character"))) { 105 | stop("files metadata table column contents should be discrete") 106 | } 107 | ``` 108 | 109 | Import channels metadata table 110 | 111 | ```{r import-channels-csv} 112 | channels_tbl <- read_csv(channels_csv, show_col_types = FALSE) 113 | channels_tbl 114 | ``` 115 | 116 | Validate channels metadata table 117 | 118 | ```{r check-channels-table} 119 | if (!"fcs_colname" %in% names(channels_tbl)) { 120 | stop("channels metadata table should have 'fcs_colname' column") 121 | } 122 | if (!"antigen" %in% names(channels_tbl)) { 123 | stop("channels metadata table should have 'antigen' column (for CATALYST)") 124 | } 125 | ``` 126 | 127 | Import flowSet 128 | 129 | ```{r import-flowSet} 130 | fs <- qs2::qs_read(fs_qs2, validate_checksum = TRUE) 131 | fs 132 | ``` 133 | 134 | Check that the parameter names are not all identical 135 | 136 | ```{r check-flowset-parameters} 137 | if (n_distinct(fs[[1]]@parameters@data$name) == 1) stop("parameter names are all identical") 138 | if (n_distinct(fs[[1]]@parameters@data$desc) == 1) stop("parameter descriptions are all identical") 139 | ``` 140 | 141 | Check if any files were removed from the files table 142 | 143 | ```{r} 144 | 
removed_samples <- setdiff(rownames(pData(fs)), files_tbl$file_name) 145 | removed_samples 146 | ``` 147 | 148 | Add removed files back to the table (will be removed after SingleCellExperiment is generated) 149 | 150 | ```{r} 151 | if (length(removed_samples) > 0) { 152 | files_tbl <- bind_rows(files_tbl, data.frame(file_name = removed_samples)) 153 | } 154 | ``` 155 | 156 | ```{r} 157 | # remove rownames (causes an error with diffcyt) 158 | # rownames(files_tbl) <- NULL 159 | # rownames(channels_tbl) <- NULL 160 | ``` 161 | 162 | # Generate a SingleCellExperiment 163 | 164 | Convert a flowSet into a SingleCellExperiment 165 | 166 | ```{r prepData} 167 | arcsinh_cofactor <- params$arcsinh_cofactor 168 | 169 | # transform: arcsinh-transformation should be performed 170 | # FACS: keep all channels as assay data 171 | sce <- 172 | prepData( 173 | fs, 174 | panel = channels_tbl, 175 | md = files_tbl, 176 | transform = TRUE, 177 | cofactor = arcsinh_cofactor, 178 | FACS = TRUE 179 | ) 180 | sce 181 | ``` 182 | 183 | Remove files not found in the files table 184 | 185 | ```{r filter-samples} 186 | sce <- filterSCE(sce, !is.na(sample_id)) 187 | sce 188 | ``` 189 | 190 | Check the contents of the SingleCellExperiment object 191 | 192 | ```{r} 193 | rowData(sce) 194 | ``` 195 | 196 | ```{r save-rowdata} 197 | write_csv(as_tibble(rowData(sce), rownames = "feature_id"), glue("{qc_dir}/sce-rowdata.csv")) 198 | ``` 199 | 200 | `CATALYST::prepData()` does not keep all files metadata columns, so they need to be added 201 | 202 | ```{r clean-coldata} 203 | colData(sce) <- colData(sce)[, "sample_id", drop = FALSE] 204 | 205 | coldata_full_df <- left_join(as.data.frame(colData(sce)), files_tbl, by = c("sample_id")) 206 | if (!all(coldata_full_df$sample_id == colData(sce)$sample_id)) stop("colData order mismatch") 207 | coldata_full_df <- dplyr::select(coldata_full_df, !sample_id) 208 | coldata_full_df <- dplyr::select(coldata_full_df, !file_name) 209 | coldata_full_df <- dplyr::select(coldata_full_df, !full_path) 210 | 211 | colData(sce) <- cbind(colData(sce), coldata_full_df) 212 | colData(sce) 213 | ``` 214 | 215 | Add rownames to column metadata 216 | 217 | ```{r add-coldata-rownames} 218 | colnames(sce) <- make.names(colData(sce)$sample_id, unique = TRUE) 219 | ``` 220 | 221 | Extract the experimental design table 222 | 223 | ```{r ei} 224 | ei(sce) 225 | ``` 226 | 227 | ```{r save-ei} 228 | write_csv(as_tibble(ei(sce)), glue("{qc_dir}/sce-exp-design.csv")) 229 | ``` 230 | 231 | Plot the number of cells per sample 232 | 233 | ```{r plot-num-cells} 234 | # plotCounts(sce, group_by = "sample_id", color_by = NULL) 235 | # plotCounts(sce, group_by = "sample_id", color_by = "condition") + scale_fill_igv() 236 | num_cells_plot <- plotCounts(sce, group_by = "sample_id", color_by = NULL) 237 | ggsave(glue("{qc_dir}/sce-num-cells.png"), num_cells_plot, width = 15, height = 5) 238 | ``` 239 | 240 | Check expression 241 | 242 | ```{r check-assay-counts} 243 | quantile(assay(sce, "counts")[, sample(1:ncol(sce), 10000)]) 244 | ``` 245 | 246 | ```{r check-assay-exprs} 247 | quantile(assay(sce, "exprs")[, sample(1:ncol(sce), 10000)]) 248 | ``` 249 | 250 | ```{r} 251 | # p = plotExprs(sce, color_by = "condition") 252 | # p = plotExprs(sce, color_by = "sample_id") 253 | # p$facet$params$ncol <- 6 254 | # p 255 | ``` 256 | 257 | Save SingleCellExperiment 258 | 259 | ```{r save-sce} 260 | qs2::qs_save(object = sce, file = sce_qs2) 261 | ``` 262 | 263 | Delete Rplots.pdf 264 | 265 | ```{r} 266 | if 
(file.exists("Rplots.pdf")) { 267 | file.remove("Rplots.pdf") 268 | } 269 | ``` 270 | 271 | # Session info 272 | 273 | ```{r session-info} 274 | sessionInfo() 275 | ``` 276 | -------------------------------------------------------------------------------- /scripts/hdcyto-3-analyze-sce.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Perform CATALYST-based analysis" 3 | subtitle: "HDCyto workflow step 3" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the input/output directory 15 | project_name: "project" 16 | # quarto render hdcyto-3-analyze-sce.qmd -P project_name:? 17 | --- 18 | 19 | This script performs CATALYST-based analysis, including dimensionality reduction and clustering. 20 | It starts with a SingleCellExperiment object generated in the previous step (`hdcyto-2-prepare-sce.qmd`). 21 | 22 | # Settings 23 | 24 | ```{r packages} 25 | #| message: false 26 | #| warning: false 27 | library(tidyverse) 28 | library(glue) 29 | library(cowplot) 30 | library(qs2) 31 | library(RColorBrewer) 32 | library(ggsci) 33 | library(CATALYST) 34 | ``` 35 | 36 | Check parameters 37 | 38 | ```{r params} 39 | params 40 | ``` 41 | 42 | Define inputs 43 | 44 | ```{r settings-inputs} 45 | out_dir <- glue("./out-{params$project_name}") 46 | input_dir <- glue("{out_dir}/input") 47 | data_dir <- glue("{out_dir}/r-data") 48 | qc_dir <- glue("{out_dir}/qc") 49 | if (!dir.exists(out_dir)) stop("output dir does not exist: ", out_dir) 50 | 51 | sce_qs2 <- glue("{data_dir}/sce.qs2") 52 | if (!file.exists(sce_qs2)) stop("SingleCellExperiment does not exist: ", sce_qs2) 53 | ``` 54 | 55 | Define outputs 56 | 57 | ```{r settings-outputs} 58 | phono_dir <- glue("{out_dir}/phenotypes") 59 | exprs_dir <- glue("{out_dir}/expression") 60 | clust_dir <- glue("{out_dir}/clusters") 61 | 62 | dir.create(phono_dir, showWarnings = FALSE) 63 | dir.create(exprs_dir, showWarnings = FALSE) 64 | dir.create(clust_dir, showWarnings = FALSE) 65 | ``` 66 | 67 | # Import data 68 | 69 | Import SingleCellExperiment 70 | 71 | ```{r import-sce} 72 | sce <- qs2::qs_read(sce_qs2, validate_checksum = TRUE) 73 | sce 74 | ``` 75 | 76 | Check the contents of the SingleCellExperiment object 77 | 78 | ```{r} 79 | ei(sce) 80 | ``` 81 | 82 | ```{r} 83 | write_csv(as_tibble(rowData(sce), rownames = "feature_id"), glue("{qc_dir}/sce-rowdata.csv")) 84 | ``` 85 | 86 | ```{r} 87 | write_csv(as_tibble(ei(sce)), glue("{qc_dir}/sce-exp-design.csv")) 88 | ``` 89 | 90 | Define the color scheme, accounting for many groups 91 | 92 | ```{r color-scheme} 93 | color_scheme <- c(pal_igv("default")(51), pal_igv(alpha = 0.6)(51), pal_igv(alpha = 0.3)(51)) 94 | ``` 95 | 96 | Pseudobulk-level MDS plot (computed on median marker expressions in each sample) 97 | 98 | ```{r plot-pb-mds} 99 | for (p in names(colData(sce))) { 100 | mds_plot <- 101 | pbMDS(sce, by = "sample_id", color_by = p, pal = color_scheme) + 102 | theme_classic() + 103 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank()) 104 | ggsave(glue("{qc_dir}/dr-mds-pseudobulk-{p}.png"), mds_plot, width = 10, height = 6) 105 | } 106 | ``` 107 | 108 | # Plot expression patterns 109 | 110 | Subset to random samples to make plots more readable 111 | 112 | ```{r plot-subset-samples} 113 | samples_subset <- levels(sce$sample_id) 114 | if 
(length(samples_subset) > 15) {
115 | set.seed(99)
116 | samples_subset <- sort(sample(samples_subset, 10))
117 | }
118 | sce_rand <- sce[, sce$sample_id %in% samples_subset]
119 | ```
120 |
121 | Subset to random cells to speed up plot generation
122 |
123 | ```{r plot-subset-cells}
124 | if (ncol(sce_rand) > 100000) {
125 | set.seed(99)
126 | sce_rand <- sce_rand[, sample(colnames(sce_rand), 100000)]
127 | }
128 | ```
129 |
130 | Create a function to convert an expression matrix to a tidy data frame for ggplot2
131 |
132 | ```{r tidy_expression}
133 | tidy_expression <- function(sce, assay_name) {
134 | t(assay(sce, assay_name)) |>
135 | as_tibble(rownames = "cell_id") |>
136 | pivot_longer(!cell_id, names_to = "channel", values_to = "exprs") |>
137 | left_join(as_tibble(colData(sce), rownames = "cell_id"), by = "cell_id") |>
138 | group_by(channel) |>
139 | dplyr::mutate(min_cutoff = quantile(exprs, 0.01), max_cutoff = quantile(exprs, 0.99)) |>
140 | dplyr::mutate(zscore = as.vector(scale(exprs))) |>
141 | ungroup() |>
142 | dplyr::filter(exprs >= min_cutoff, exprs <= max_cutoff)
143 | # dplyr::filter(zscore > -3, zscore < 3)
144 | }
145 | ```
146 |
147 | Create a function to generate expression density plots
148 |
149 | ```{r}
150 | plot_expression_density <- function(x, values_col) {
151 | ggplot(x, aes(x = .data[[values_col]], color = sample_id)) +
152 | geom_density() +
153 | facet_wrap(vars(channel), scales = "free") +
154 | theme_minimal() +
155 | theme(
156 | plot.background = element_rect(fill = "white"),
157 | panel.grid.minor = element_blank()
158 | ) +
159 | scale_color_manual(values = color_scheme)
160 | }
161 | ```
162 |
163 | Generate a density plot for the original unmodified values
164 |
165 | ```{r plot-density-original}
166 | exprs_tbl <- tidy_expression(sce = sce_rand, assay_name = "counts")
167 | density_plot <-
168 | exprs_tbl |>
169 | plot_expression_density(values_col = "exprs")
170 | ggsave(glue("{qc_dir}/sce-exprs-density-raw.png"), density_plot, width = 16, height = 12)
171 | ```
172 |
173 | Generate a density plot for the log-transformed original values
174 |
175 | ```{r plot-density-log}
176 | if (min(exprs_tbl$exprs) >= 0) {
177 | density_plot <-
178 | exprs_tbl |>
179 | mutate(exprs_log10 = log10(exprs + 1)) |>
180 | plot_expression_density(values_col = "exprs_log10")
181 | ggsave(glue("{qc_dir}/sce-exprs-density-log.png"), density_plot, width = 16, height = 12)
182 | }
183 | ```
184 |
185 | Generate a density plot for the arcsinh-transformed values
186 |
187 | ```{r plot-density-arcsinh}
188 | density_plot <-
189 | tidy_expression(sce = sce_rand, assay_name = "exprs") |>
190 | plot_expression_density(values_col = "exprs")
191 | ggsave(glue("{qc_dir}/sce-exprs-density-arcsinh.png"), density_plot, width = 16, height = 12)
192 | ```
193 |
194 | Generate correlation plots for specific markers
195 |
196 | ```{r}
197 | if (all(c("CD4", "CD8") %in% rownames(sce))) {
198 | cor_plot <-
199 | t(assay(sce_rand, "exprs")) |>
200 | as_tibble(rownames = "cell_id") |>
201 | # dplyr::filter(CD3 > 4) |>
202 | left_join(as_tibble(colData(sce_rand), rownames = "cell_id"), by = "cell_id") |>
203 | ggplot(aes(x = CD4, y = CD8)) +
204 | # geom_point(size = 0.1, alpha = 0.2) +
205 | # geom_density_2d(color = "darkred", alpha = 0.8) +
206 | geom_density_2d_filled(contour_var = "ndensity") +
207 | facet_wrap(vars(sample_id), scales = "free") +
208 | theme_minimal() +
209 | theme(
210 | plot.background = element_rect(fill = "white"),
211 | panel.grid.minor = element_blank(), 212 
| aspect.ratio = 1 213 | ) + 214 | scale_fill_viridis_d(option = "plasma") 215 | save_plot(glue("{qc_dir}/expr-marker-cor-CD4-CD8.png"), cor_plot, base_width = 12, base_height = 12) 216 | } 217 | 218 | if (all(c("CD3", "CD19") %in% rownames(sce))) { 219 | cor_plot <- 220 | t(assay(sce_rand, "exprs")) |> 221 | as_tibble(rownames = "cell_id") |> 222 | left_join(as_tibble(colData(sce_rand), rownames = "cell_id"), by = "cell_id") |> 223 | ggplot(aes(x = CD3, y = CD19)) + 224 | # geom_point(size = 0.1, alpha = 0.2) + 225 | # geom_density_2d(color = "darkred", alpha = 0.8) + 226 | geom_density_2d_filled(contour_var = "ndensity") + 227 | facet_wrap(vars(sample_id), scales = "free") + 228 | theme_minimal() + 229 | theme( 230 | plot.background = element_rect(fill = "white"), 231 | panel.grid.minor = element_blank(), 232 | aspect.ratio = 1 233 | ) + 234 | scale_fill_viridis_d(option = "plasma") 235 | save_plot(glue("{qc_dir}/expr-marker-cor-CD3-CD19.png"), cor_plot, base_width = 12, base_height = 12) 236 | } 237 | ``` 238 | 239 | Plot median expression of all markers per sample 240 | 241 | ```{r plot-median-expression} 242 | pb_plot <- plotPbExprs(sce, features = NULL) + scale_color_manual(values = color_scheme) 243 | save_plot(glue("{exprs_dir}/exprs-markers-samples-boxplot.png"), pb_plot, base_width = 12, base_height = 6) 244 | ``` 245 | 246 | Modified `CATALYST::.anno_factors()` function 247 | 248 | ```{r} 249 | .anno_factors <- function(x, ids, which, type = c("row", "column")) { 250 | type <- match.arg(type) 251 | # get non-numeric cell metadata variables 252 | cd <- colData(x) 253 | df <- data.frame(cd, check.names = FALSE) 254 | df <- select_if(df, ~ !is.numeric(.)) 255 | df <- mutate_all(df, ~ droplevels(factor(.x))) 256 | 257 | # store sample matching 258 | m <- match(ids, df$sample_id) 259 | 260 | # get number of matches per variable 261 | ns <- split(df, df$sample_id) |> 262 | lapply(mutate_all, droplevels) |> 263 | lapply(summarize_all, nlevels) |> 264 | do.call(what = "rbind") 265 | 266 | # keep only uniquely mapable factors included in 'which' 267 | keep <- names(which(colMeans(ns) == 1)) 268 | keep <- setdiff(keep, c("sample_id", "cluster_id")) 269 | if (is.character(which)) { 270 | keep <- intersect(keep, which) 271 | } 272 | if (length(keep) == 0) { 273 | return(NULL) 274 | } 275 | df <- df[m, keep, drop = FALSE] 276 | 277 | # get list of colors for each annotation 278 | lvls <- lapply(as.list(df), levels) 279 | nlvls <- vapply(lvls, length, numeric(1)) 280 | pal <- pal_igv("default")(51) 281 | if (any(nlvls > length(pal))) { 282 | pal <- colorRampPalette(pal)(max(nlvls)) 283 | } 284 | names(is) <- is <- colnames(df) 285 | cols <- lapply(is, function(i) { 286 | u <- pal[seq_len(nlvls[i])] 287 | names(u) <- lvls[[i]] 288 | u 289 | }) 290 | 291 | ComplexHeatmap::HeatmapAnnotation( 292 | which = type, df = df, 293 | col = cols, gp = grid::gpar(col = "white") 294 | ) 295 | } 296 | ``` 297 | 298 | Modified `CATALYST::plotExprHeatmap()` function 299 | 300 | ```{r} 301 | plotExprHeatmap_ed <- function(x, features = NULL, 302 | by = c("sample_id", "cluster_id", "both"), k = "meta20", m = NULL, 303 | assay = "exprs", fun = c("median", "mean", "sum"), 304 | scale = c("first", "last", "never"), q = 0.01, 305 | row_anno = TRUE, col_anno = TRUE, 306 | row_clust = TRUE, col_clust = TRUE, 307 | row_dend = TRUE, col_dend = TRUE, 308 | bars = FALSE, perc = FALSE, bin_anno = FALSE, 309 | hm_pal = rev(RColorBrewer::brewer.pal(11, "RdYlBu")), 310 | k_pal = CATALYST:::.cluster_cols, m_pal = k_pal, 311 | 
distance = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"), 312 | linkage = c("average", "ward.D", "single", "complete", "mcquitty", "median", "centroid", "ward.D2")) { 313 | # check validity of input arguments 314 | args <- as.list(environment()) 315 | CATALYST:::.check_args_plotExprHeatmap(args) 316 | distance <- match.arg(distance) 317 | linkage <- match.arg(linkage) 318 | scale <- match.arg(scale) 319 | fun <- match.arg(fun) 320 | by <- match.arg(by) 321 | 322 | # subset features of interest 323 | x <- x[unique(CATALYST:::.get_features(x, features)), ] 324 | 325 | # get specified cluster IDs 326 | if (by != "sample_id") { 327 | CATALYST:::.check_k(x, k) 328 | x$cluster_id <- cluster_ids(x, k) 329 | } 330 | if (by == "both") { 331 | by <- c("cluster_id", "sample_id") 332 | } 333 | 334 | # aggregate to pseudobulks by sample/cluster/both 335 | # using 'assay' data & 'fun' as summary statistic 336 | .do_agg <- function() { 337 | z <- CATALYST:::.agg(x, by, fun, assay) 338 | if (length(by) == 1) { 339 | return(z) 340 | } 341 | set_rownames( 342 | do.call("rbind", z), 343 | levels(x$cluster_id) 344 | ) 345 | } 346 | # do 0-1 scaling for each marker trimming 347 | # lower ('q'%) & upper (1-'q'%) quantiles 348 | .do_scale <- function() { 349 | if (scale == "first") { 350 | z <- assay(x, assay) 351 | z <- CATALYST:::.scale_exprs(z, 1, q) 352 | assay(x, assay, FALSE) <- z 353 | return(x) 354 | } else { 355 | CATALYST:::.scale_exprs(z, 1, q) 356 | } 357 | } 358 | 359 | # apply one of... 360 | # - scale & trim then aggregate 361 | # - aggregate then scale & trim 362 | # - aggregate only 363 | z <- switch(scale, 364 | first = { 365 | x <- .do_scale() 366 | .do_agg() 367 | }, 368 | last = { 369 | z <- .do_agg() 370 | .do_scale() 371 | }, 372 | never = { 373 | .do_agg() 374 | } 375 | ) 376 | if (length(by) == 1) z <- t(z) 377 | 378 | if (scale != "never" && !(assay == "counts" && fun == "sum")) { 379 | qs <- round(quantile(z, c(0.01, 0.99)) * 5) / 5 380 | lgd_aes <- list(at = seq(qs[1], qs[2], 0.2)) 381 | } else { 382 | lgd_aes <- list() 383 | } 384 | lgd_aes$title_gp <- grid::gpar( 385 | fontsize = 10, 386 | fontface = "bold", 387 | lineheight = 0.8 388 | ) 389 | 390 | # left-hand side heatmap annotation: 391 | # non-numeric cell metadata variables 392 | if (!isFALSE(row_anno)) { 393 | left_anno <- switch(by[1], 394 | sample_id = .anno_factors(x, levels(x$sample_id), row_anno, "row"), 395 | CATALYST:::.anno_clusters(x, k, m, k_pal, m_pal) 396 | ) 397 | } else { 398 | left_anno <- NULL 399 | } 400 | if (!isFALSE(col_anno) && length(by) == 2) { 401 | top_anno <- .anno_factors(x, levels(x$sample_id), col_anno, "colum") 402 | } else { 403 | top_anno <- NULL 404 | } 405 | 406 | # right-hand side heatmap annotation: 407 | # labeled barplot of event counts by sample 408 | if (bars) { 409 | right_anno <- .anno_counts(x[[by[1]]], perc) 410 | } else { 411 | right_anno <- NULL 412 | } 413 | 414 | # get bin annotation 415 | if (bin_anno) { 416 | cell_fun <- function(j, i, x, y, ...) 
{ 417 | grid.text( 418 | gp = gpar(fontsize = 8), 419 | sprintf("%.2f", z[i, j]), x, y 420 | ) 421 | } 422 | } else { 423 | cell_fun <- NULL 424 | } 425 | 426 | a <- ifelse(assay == "exprs", "expression", assay) 427 | f <- switch(fun, 428 | "median" = "med", 429 | fun 430 | ) 431 | hm_title <- switch(scale, 432 | first = sprintf("%s %s\n%s", fun, "scaled", a), 433 | last = sprintf("%s %s\n%s", "scaled", fun, a), 434 | never = paste(fun, a, sep = "\n") 435 | ) 436 | if (length(by) == 2) { 437 | col_title <- features 438 | } else if (length(features) == 1 && 439 | features %in% c("type", "state")) { 440 | col_title <- paste0(features, "_markers") 441 | } else { 442 | col_title <- "" 443 | } 444 | 445 | ComplexHeatmap::Heatmap( 446 | matrix = z, 447 | name = hm_title, 448 | col = circlize::colorRamp2( 449 | seq(min(z), max(z), l = n <- 100), 450 | colorRampPalette(hm_pal)(n) 451 | ), 452 | column_title = col_title, 453 | column_title_side = ifelse(length(by) == 2, "top", "bottom"), 454 | cell_fun = cell_fun, 455 | cluster_rows = row_clust, 456 | cluster_columns = col_clust, 457 | show_row_dend = row_dend, 458 | show_column_dend = col_dend, 459 | clustering_distance_rows = distance, 460 | clustering_method_rows = linkage, 461 | clustering_distance_columns = distance, 462 | clustering_method_columns = linkage, 463 | show_row_names = ( 464 | is.null(left_anno) || 465 | isTRUE(by == "sample_id")) && !perc, 466 | row_names_side = ifelse( 467 | by[1] == "cluster_id" || 468 | isFALSE(row_anno) && !row_dend || 469 | isFALSE(row_clust), 470 | "left", "right" 471 | ), 472 | top_annotation = top_anno, 473 | left_annotation = left_anno, 474 | right_annotation = right_anno, 475 | rect_gp = grid::gpar(col = "white"), 476 | heatmap_legend_param = lgd_aes 477 | ) 478 | } 479 | ``` 480 | 481 | ```{r plot-markers-samples-heatmap} 482 | num_samples <- n_distinct(sce$sample_id) 483 | plot_height <- 4 + (num_samples / 8) 484 | plot_height <- round(plot_height, 1) 485 | # hm_plot <- plotExprHeatmap(sce, k_pal = pal_igv()(51), m_pal = pal_igv()(51), scale = "last") 486 | hm_plot <- plotExprHeatmap_ed(sce, scale = "last") 487 | png(glue("{exprs_dir}/exprs-markers-samples-heatmap.png"), width = 15, height = plot_height, units = "in", res = 300) 488 | print(hm_plot) 489 | dev.off() 490 | ``` 491 | 492 | Aggregate expression the same way as `CATALYST::plotExprHeatmap()` function 493 | 494 | ```{r} 495 | agg_exprs <- function(x, assay = "exprs", fun = c("median", "mean", "sum"), 496 | scale = c("first", "last", "never"), q = 0.01) { 497 | # check validity of input arguments 498 | scale <- match.arg(scale) 499 | fun <- match.arg(fun) 500 | by <- "sample_id" 501 | 502 | # aggregate to pseudobulks by sample/cluster/both 503 | # using 'assay' data & 'fun' as summary statistic 504 | .do_agg <- function() { 505 | z <- CATALYST:::.agg(x, by, fun, assay) 506 | if (length(by) == 1) { 507 | return(z) 508 | } 509 | set_rownames( 510 | do.call("rbind", z), 511 | levels(x$cluster_id) 512 | ) 513 | } 514 | # do 0-1 scaling for each marker trimming 515 | # lower ('q'%) & upper (1-'q'%) quantiles 516 | .do_scale <- function() { 517 | if (scale == "first") { 518 | z <- assay(x, assay) 519 | z <- CATALYST:::.scale_exprs(z, 1, q) 520 | assay(x, assay, FALSE) <- z 521 | return(x) 522 | } else { 523 | CATALYST:::.scale_exprs(z, 1, q) 524 | } 525 | } 526 | 527 | # apply one of... 
528 | # - scale & trim then aggregate 529 | # - aggregate then scale & trim 530 | # - aggregate only 531 | z <- switch(scale, 532 | first = { 533 | x <- .do_scale() 534 | .do_agg() 535 | }, 536 | last = { 537 | z <- .do_agg() 538 | .do_scale() 539 | }, 540 | never = { 541 | .do_agg() 542 | } 543 | ) 544 | if (length(by) == 1) z <- t(z) 545 | 546 | if (scale != "never" && !(assay == "counts" && fun == "sum")) { 547 | qs <- round(quantile(z, c(0.01, 0.99)) * 5) / 5 548 | lgd_aes <- list(at = seq(qs[1], qs[2], 0.2)) 549 | } else { 550 | lgd_aes <- list() 551 | } 552 | lgd_aes$title_gp <- grid::gpar( 553 | fontsize = 10, 554 | fontface = "bold", 555 | lineheight = 0.8 556 | ) 557 | 558 | as_tibble(t(z), rownames = "marker") 559 | } 560 | ``` 561 | 562 | Aggregate expression 563 | 564 | ```{r aggregate-expression} 565 | # aggregate assay data is just (no scaling) 566 | agg_exprs(sce, scale = "never") |> write_csv(glue("{exprs_dir}/exprs-samples-median.csv")) 567 | # aggregate assay data first and scale subsequently (range of each marker will be 0-1) 568 | agg_exprs(sce, scale = "last") |> write_csv(glue("{exprs_dir}/exprs-samples-median-scaled.csv")) 569 | # scale and trim then aggregate 570 | # agg_exprs(sce, scale = "first") |> write_csv(glue("{proj_dir}/expr-samples-scale-agg.csv")) 571 | ``` 572 | 573 | # Generate UMAPs 574 | 575 | Run UMAP 576 | 577 | ```{r run-umap} 578 | # cells: maximal number of cells per sample 579 | # features: "type"/"state" for type/state_markers(x) or NULL to use all features 580 | set.seed(99) 581 | sce <- runDR(sce, dr = "UMAP", cells = 1000, n_neighbors = 50, features = NULL, min_dist = 0.3, n_epochs = 500) 582 | sce 583 | ``` 584 | 585 | Get only the cells with UMAP coordinates 586 | 587 | ```{r} 588 | # umap_cells <- reducedDims(sce)$UMAP[,1] 589 | # umap_cells <- umap_cells[!is.na(umap_cells)] 590 | ``` 591 | 592 | Randomize cell order for plotting 593 | 594 | ```{r} 595 | set.seed(99) 596 | sce_rand <- sce[, sample(colnames(sce))] 597 | ``` 598 | 599 | Plot phenotypes overlaid on a UMAP 600 | 601 | ```{r umap-phenotypes} 602 | for (p in names(colData(sce))) { 603 | if (n_distinct(colData(sce)[[p]]) < length(color_scheme)) { 604 | umap_pheno <- 605 | plotDR(sce_rand, "UMAP", color_by = p) + 606 | theme_classic() + 607 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 608 | scale_color_manual(values = color_scheme) 609 | ggsave(glue("{phono_dir}/dr-umap-pheno-{p}.png"), umap_pheno, width = 10, height = 6) 610 | } 611 | } 612 | ``` 613 | 614 | Plot samples overlaid on a UMAP, split by patient/condition when there are a lot of samples 615 | 616 | ```{r umap-samples-subsets} 617 | if (n_distinct(sce$sample_id) > 15) { 618 | for (sub_p in sort(unique(sce$patient_id))) { 619 | umap_pheno <- 620 | plotDR(sce_rand[, sce_rand$patient_id == sub_p], "UMAP", color_by = "sample_id") + 621 | theme_classic() + 622 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 623 | scale_color_manual(values = color_scheme) 624 | ggsave(glue("{phono_dir}/dr-umap-pheno-subset-sample_id-patient_id-{sub_p}.png"), umap_pheno, width = 10, height = 6) 625 | } 626 | for (sub_c in sort(unique(sce$condition))) { 627 | umap_pheno <- 628 | plotDR(sce_rand[, sce_rand$condition == sub_c], "UMAP", color_by = "sample_id") + 629 | theme_classic() + 630 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 631 | scale_color_manual(values = color_scheme) 632 | 
ggsave(glue("{phono_dir}/dr-umap-pheno-subset-sample_id-condition-{sub_c}.png"), umap_pheno, width = 10, height = 6)
633 | }
634 | }
635 | ```
636 |
637 | Plot markers overlaid on a UMAP
638 |
639 | ```{r umap-markers}
640 | marker_colors <- rev(RColorBrewer::brewer.pal(11, "RdYlBu"))
641 | # hcl.colors(10, "reds", rev = TRUE)
642 | for (m in sort(rownames(sce))) {
643 | marker_plot <-
644 | plotDR(sce_rand, "UMAP", color_by = m, assay = "exprs", a_pal = marker_colors) +
645 | theme_cowplot() +
646 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank())
647 | ggsave(glue("{exprs_dir}/dr-umap-exprs-{m}.png"), marker_plot, width = 8, height = 6)
648 | }
649 | ```
650 |
651 | # Perform clustering
652 |
653 | Cluster
654 |
655 | ```{r cluster}
656 | sce <- cluster(sce, features = NULL, xdim = 15, ydim = 15, maxK = 50, seed = 99)
657 | ```
658 |
659 | Randomize cell order for plotting
660 |
661 | ```{r}
662 | set.seed(99)
663 | sce_rand <- sce[, sample(colnames(sce))]
664 | ```
665 |
666 | Generate cluster-related plots (UMAP, heatmap, abundance)
667 |
668 | ```{r plot-clusters}
669 | for (clust in c("meta5", "meta8", "meta10", "meta15", "meta20", "meta25", "meta30", "meta50")) {
670 | umap_clust <-
671 | plotDR(sce_rand, "UMAP", color_by = clust) +
672 | theme_classic() +
673 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank()) +
674 | scale_color_manual(values = color_scheme)
675 | ggsave(glue("{clust_dir}/dr-umap-clusters-{clust}.png"), umap_clust, width = 8, height = 6)
676 |
677 | hm_h <- n_distinct(cluster_ids(sce, clust))
678 | hm_h <- (hm_h / 5) + 4
679 | hm_plot <- plotExprHeatmap(sce, scale = "last", k = clust, by = "cluster_id", bars = TRUE, perc = TRUE, k_pal = pal_igv()(51))
680 | png(glue("{clust_dir}/exprs-markers-clusters-heatmap-{clust}.png"), width = 15, height = hm_h, units = "in", res = 300)
681 | print(hm_plot)
682 | dev.off()
683 |
684 | plot_bar <- plotAbundances(sce, k = clust, by = "sample_id", group_by = "condition", k_pal = pal_igv()(51))
685 | ggsave(glue("{clust_dir}/abundance-bar-{clust}.png"), plot_bar, width = 9, height = 9)
686 |
687 | plot_box <- plotAbundances(sce, k = clust, by = "cluster_id", group_by = "condition", k_pal = pal_igv()(51))
688 | ggsave(glue("{clust_dir}/abundance-box-{clust}.png"), plot_box, width = 9, height = 6)
689 | }
690 | ```
691 |
692 | Generate a complete metadata table (meta-clusters are not stored in colData)
693 |
694 | ```{r combine-coldata-clusters}
695 | # confirm that cluster_id and som225 (from the 15x15 SOM grid used above) refer to the same clusters
696 | # names(cluster_codes(sce))
697 | if (identical(table(colData(sce)$cluster_id), table(cluster_ids(sce, "som225")))) {
698 | metadata_tbl <-
699 | left_join(
700 | as_tibble(colData(sce), rownames = "cell_id"),
701 | cluster_codes(sce),
702 | by = c("cluster_id" = "som225")
703 | )
704 | } else {
705 | stop("cluster_id mismatch")
706 | }
707 | dim(metadata_tbl)
708 | ```
709 |
710 | ```{r save-metadata}
711 | write_csv(metadata_tbl, glue("{qc_dir}/sce-metadata.csv.gz"))
712 | ```
713 |
714 | # Finalize analysis
715 |
716 | Save SingleCellExperiment
717 |
718 | ```{r save-sce}
719 | qs2::qs_save(object = sce, file = sce_qs2)
720 | ```
721 |
722 | Delete Rplots.pdf
723 |
724 | ```{r}
725 | if (file.exists("Rplots.pdf")) {
726 | file.remove("Rplots.pdf")
727 | }
728 | ```
729 |
730 | # Session info
731 |
732 | ```{r session-info}
733 | sessionInfo()
734 | 
``` 735 | -------------------------------------------------------------------------------- /scripts/meth-minfi.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## minfi wrapper functions to streamline the analysis of methylation microarrays 3 | ## 4 | 5 | 6 | # output width 7 | options(width = 120) 8 | # print warnings as they occur 9 | options(warn = 1) 10 | # default type for the bitmap devices such as png (should default to "cairo") 11 | options(bitmapType = "cairo") 12 | 13 | # dependencies 14 | suppressPackageStartupMessages({ 15 | library(tidyverse) 16 | library(minfi) 17 | library(RColorBrewer) 18 | library(ggsci) 19 | }) 20 | 21 | # color scale for plots 22 | plot_colors = c(brewer.pal(5, "Set1"), brewer.pal(8, "Dark2"), pal_igv("default")(51)) 23 | 24 | # import data and generate some qc plots 25 | load_data = function(sample_sheet) { 26 | 27 | # check if sample sheet exists 28 | if (!file.exists(sample_sheet)) stop("sample sheet ", sample_sheet, " does not exist") 29 | 30 | # import sample sheet 31 | samples_tbl = read_csv(sample_sheet) 32 | 33 | # sample sheet needs to have "Basename" and "Sentrix_ID" for minfi 34 | if (!("Basename" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Basename\" column") 35 | if (!("Sentrix_ID" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Sentrix_ID\" column") 36 | 37 | # sample sheet needs to have "Sample" and "Condition" for this workflow 38 | if (!("Sample" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Sample\" column") 39 | if (!("Condition" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Condition\" column") 40 | 41 | # add array ID based on basename (to check for batch effects, for example) 42 | samples_tbl$Array = gsub(".*/([0-9]*)_R[0-9][0-9]C[0-9][0-9]", "\\1", samples_tbl$Basename) 43 | 44 | message("\n\n ===== minfi::read.metharray.exp() ===== \n\n") 45 | 46 | # red and green channel measurements of the samples (combine() combines two sets of samples) 47 | raw_set = read.metharray.exp(targets = samples_tbl, recursive = TRUE, verbose = FALSE) 48 | 49 | # check that sample names and pData are in the same order (probably not necessary) 50 | if (!(identical(sampleNames(raw_set), sub(".*/", "", pData(raw_set)$Basename)))) stop("sample names not identical") 51 | 52 | # change sample identifier from "Basename" to "Sample" 53 | sampleNames(raw_set) = pData(raw_set)$Sample 54 | 55 | # show which array type and corresponding package are being used 56 | message("array: ", annotation(raw_set)[["array"]]) 57 | message("annotation: ", annotation(raw_set)[["annotation"]]) 58 | 59 | # show conditions 60 | message("samples per condition: ") 61 | raw_set$Condition %>% table(useNA = "ifany") %>% print() 62 | 63 | message("\n\n ===== minfi::read.qcReport() ===== \n\n") 64 | 65 | # PDF QC report of the most common plots 66 | qcReport(raw_set, sampGroups=pData(raw_set)$Condition, pdf="plot.qcreport.pdf") 67 | 68 | png("plot.density.raw.condition.png", width = 8, height = 5, units = "in", res = 300) 69 | densityPlot(raw_set, sampGroups = pData(raw_set)$Condition, pal = plot_colors) 70 | dev.off() 71 | 72 | png("plot.density.raw.array.png", width = 8, height = 5, units = "in", res = 300) 73 | densityPlot(raw_set, sampGroups = pData(raw_set)$Array, pal = plot_colors) 74 | dev.off() 75 | 76 | # delete Rplots.pdf 77 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 78 | 79 | message("\n\n ===== minfi::detectionP() ===== \n\n") 80 | 81 | # identify 
failed positions
82 | det_p = detectionP(raw_set)
83 |
84 | # save detection stats
85 | det_p_summary =
86 | tibble(
87 | sample = colnames(det_p),
88 | detected_positions = colSums(det_p < 0.01),
89 | failed_positions = colSums(det_p >= 0.01),
90 | failed_positions_pct = round(colMeans(det_p >= 0.01), digits = 3)
91 | ) %>%
92 | arrange(-failed_positions) %>%
93 | mutate(failed_positions_pct = failed_positions_pct * 100)
94 | write_csv(det_p_summary, "summary.detection.csv")
95 |
96 | return(raw_set)
97 |
98 | }
99 |
100 | # normalize raw data using functional normalization (FunNorm) and generate some qc plots
101 | normalize_data = function(raw_set) {
102 |
103 | # FunNorm: preprocessFunnorm() -> GenomicRatioSet
104 | # Noob: preprocessNoob() -> MethylSet -> mapToGenome() -> GenomicMethylSet -> ratioConvert() -> GenomicRatioSet
105 | # may not be necessary to convert to GenomicRatioSet, getBeta() works with both
106 |
107 | message("\n\n ===== minfi::preprocessRaw() ===== \n\n")
108 |
109 | mset = preprocessRaw(raw_set)
110 |
111 | # plot and save median intensity QC
112 | qc = getQC(mset)
113 | mset = addQC(mset, qc = qc)
114 | png("plot.medianintensity.png", width = 8, height = 8, units = "in", res = 300)
115 | plotQC(qc)
116 | dev.off()
117 |
118 | # worst samples (median intensity < 10.5 is failing by default)
119 | # qc[qc[,"mMed"] < 10.5 | qc[,"uMed"] < 10.5,]
120 |
121 | message("\n\n ===== minfi::preprocessFunnorm() ===== \n\n")
122 |
123 | # functional normalization (FunNorm) - produces GenomicRatioSet
124 | norm_set = preprocessFunnorm(raw_set, bgCorr = TRUE, dyeCorr = TRUE)
125 | # class(norm_set)
126 | write(paste0("total probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
127 |
128 | # identify failed positions
129 | det_p = detectionP(raw_set)
130 |
131 | # keep only probes that pass through preprocessFunnorm()
132 | det_p = det_p[intersect(rownames(det_p), rownames(norm_set)), ]
133 |
134 | # probes detected in at least 90% of the samples
135 | # normSet = normSet[rowSums(det_p < 0.01) > ncol(det_p) * 0.9, ]
136 | # probes detected in all samples
137 | norm_set = norm_set[rowSums(det_p < 0.01) == ncol(det_p), ]
138 | write(paste0("detected probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
139 |
140 | # drop the probes that contain either a SNP at the CpG interrogation or at the single nucleotide extension
141 | norm_set = addSnpInfo(norm_set)
142 | # head(granges(norm_set))
143 | norm_set = dropLociWithSnps(norm_set, snps = c("SBE","CpG"), maf = 0)
144 | write(paste0("non-SNP probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
145 |
146 | # sex prediction plot
147 | png("plot.sex.png", width = 8, height = 8, units = "in", res = 300)
148 | plotSex(getSex(norm_set), id = sampleNames(norm_set))
149 | dev.off()
150 |
151 | # annotation
152 | annot = getAnnotation(norm_set)
153 |
154 | # remove extra annotation columns and save
155 | annot_tbl = annot %>% as_tibble(rownames = "probe") %>% arrange(probe)
156 | remove_cols = c(
157 | "AddressA", "AddressB", "ProbeSeqA", "ProbeSeqB", "NextBase", "Color", "Forward_Sequence", "SourceSeq",
158 | "Probe_rs", "CpG_rs", "SBE_rs","Probe_maf", "CpG_maf", "SBE_maf", "Islands_Name", "UCSC_RefGene_Accession",
159 | "GencodeBasicV12_NAME", "GencodeBasicV12_Accession", "GencodeBasicV12_Group", "GencodeCompV12_Accession",
160 | "DNase_Hypersensitivity_NAME", "OpenChromatin_NAME", "Methyl27_Loci", "Methyl450_Loci", "Random_Loci")
161 | annot_tbl = annot_tbl %>% 
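# any_of() (rather than all_of()) silently skips columns that are missing,
# so the same remove_cols list works across array annotation versions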
dplyr::select(!any_of(remove_cols)) 162 | write_csv(head(annot_tbl, 100), "annot.head100.csv") 163 | write_csv(annot_tbl, "annot.csv.gz") 164 | 165 | # remove sex probes 166 | sex_probes = annot$Name[annot$chr %in% c("chrX", "chrY")] 167 | norm_set = norm_set[!(rownames(norm_set) %in% sex_probes), ] 168 | write(paste0("non-sex probes: ", nrow(norm_set)), file = "norm.log", append = TRUE) 169 | 170 | beta = getBeta(norm_set) 171 | 172 | png("plot.density.norm.fnorm.png", width = 8, height = 5, units = "in", res = 300) 173 | densityPlot(beta, sampGroups = pData(norm_set)$Condition, pal = plot_colors) 174 | dev.off() 175 | 176 | # MDS plots (must be an 'RGChannelSet', a 'MethylSet' or matrix) 177 | 178 | png("plot.mds.raw.condition.png", width = 8, height = 8, units = "in", res = 300) 179 | mdsPlot(raw_set, numPositions = 10000, sampNames = sampleNames(raw_set), sampGroups = pData(raw_set)$Condition, 180 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 181 | dev.off() 182 | 183 | png("plot.mds.norm.fnorm.condition.png", width = 8, height = 8, units = "in", res = 300) 184 | mdsPlot(beta, numPositions = 10000, sampNames = sampleNames(norm_set), sampGroups = pData(norm_set)$Condition, 185 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 186 | dev.off() 187 | 188 | png("plot.mds.norm.fnorm.array.png", width = 8, height = 8, units = "in", res = 300) 189 | mdsPlot(beta, numPositions = 10000, sampNames = sampleNames(norm_set), sampGroups = pData(norm_set)$Array, 190 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 191 | dev.off() 192 | 193 | # save beta values 194 | beta = getBeta(norm_set) 195 | beta_tbl = beta %>% round(3) %>% as_tibble(rownames = "probe") %>% arrange(probe) 196 | write_csv(head(beta_tbl, 100), "beta.head100.csv") 197 | write_csv(beta_tbl, "beta.csv.gz") 198 | 199 | return(norm_set) 200 | 201 | } 202 | 203 | 204 | 205 | # end 206 | -------------------------------------------------------------------------------- /scripts/mut-mhc-binding.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use v5.10; 6 | use File::Basename; 7 | use Bio::SeqIO; 8 | 9 | my $HELP = <new(-file => $coding_change_fa, -format => 'fasta'); 204 | my $seqio_out = Bio::SeqIO->new(-file => ">$out_file", -format => 'fasta'); 205 | my $seq_num = 0; 206 | # tracking WT sequence info (listed before mutant) 207 | my ($seq_id_wt, $tx_name_wt, $sequence_wt, $seq_padded_wt) = ("ERR", "ERR", "ERR", "ERR"); 208 | while ( my $seq_in = $seqio_in->next_seq() ) { 209 | # extract relevant parts 210 | my $seq_id = $seq_in->id; 211 | my $seq_desc = $seq_in->desc; 212 | $seq_desc =~ s/\s+$//; 213 | my $sequence = $seq_in->seq; 214 | 215 | # extract mutation details and sequence around the WT sequence (listed before mutant) 216 | if ($seq_desc =~ m/(.+?)\sWILDTYPE/) { 217 | $seq_id_wt = $seq_id; 218 | $sequence_wt = $sequence; 219 | $tx_name_wt = $1; 220 | } 221 | 222 | # process only altered sequences and single amino acid substitutions 223 | if ($seq_desc !~ m/silent/ && $seq_desc =~ m/\sp\.\w\d+\w\s/) { 224 | my ($tx_name, $aa_num, $aa_from, $aa_to, $seq_padded) = ("ERR", "ERR", "ERR", "ERR", "ERR"); 225 | # extract mutation details and sequence around the altered amino acid 226 | if ($seq_desc =~ m/(.+?)\s.+?\(position\s(\d+)\schanged\sfrom\s(\w)\sto\s(\w)\)/) { 227 | $tx_name = $1; 228 | $aa_num = $2; 229 | $aa_from = $3; 230 | $aa_to = $4; 231 | # adjust offsets if amino acid is close to the 
beginning of sequence 232 | my $substr_start = $aa_num - 1 - $padding; 233 | $substr_start = $substr_start < 0 ? 0 : $substr_start; 234 | my $substr_length = $aa_num > $padding ? $padding + $padding + 1 : $aa_num + $padding; 235 | # get sequence (0-based) 236 | $seq_padded = substr($sequence, $substr_start, $substr_length); 237 | # remove trailing asterisk (stop codon) 238 | $seq_padded =~ s/\*$//; 239 | # say "${seq_id}\t${tx_name}\t${aa_from}${aa_num}${aa_to}\t${seq_padded}"; 240 | 241 | # repeat for WT (was processed as previous sequence) 242 | my $seq_padded_wt = substr($sequence_wt, $substr_start, $substr_length); 243 | $seq_padded_wt =~ s/\*$//; 244 | 245 | # create output FASTA record and write it (binding predictions will just assign consecutive numbers) 246 | $seq_num++; 247 | my $new_id = "${seq_num}|${seq_id}|${tx_name}|${aa_from}${aa_num}${aa_to}|${seq_padded}"; 248 | my $seq_out = Bio::Seq->new(-seq => $seq_padded, -id => $new_id); 249 | $seqio_out->write_seq($seq_out); 250 | 251 | # create WT output FASTA record and write it (binding predictions will just assign consecutive numbers) 252 | $seq_num++; 253 | my $new_id_wt = "${seq_num}|${seq_id_wt}|${tx_name_wt}|WILDTYPE|${seq_padded_wt}"; 254 | my $seq_out_wt = Bio::Seq->new(-seq => $seq_padded_wt, -id => $new_id_wt); 255 | $seqio_out->write_seq($seq_out_wt); 256 | } 257 | } 258 | } 259 | 260 | # confirm that padded mutations FASTA file generated 261 | unless ( -e $out_file ) { 262 | die "\n\n ERROR: $out_file DOES NOT EXIST \n\n"; 263 | } 264 | if ( -z $out_file ) { 265 | die "\n\n ERROR: $out_file IS EMPTY \n\n"; 266 | } 267 | 268 | say "created $out_file"; 269 | 270 | return $out_file; 271 | } 272 | 273 | # run IEDB MHC-I Binding Predictions 274 | sub predict_mhc_i_binding { 275 | my $base_name = $_[0]; 276 | my $mutpad_fa = $_[1]; 277 | my $peptide_length = $_[2]; 278 | my $predict_binding_py = $_[3]; 279 | 280 | my $raw_iedb_out_file = "${base_name}.iedb.txt"; 281 | 282 | # delete output if already exists 283 | if ( -e $raw_iedb_out_file ) { 284 | unlink $raw_iedb_out_file; 285 | } 286 | 287 | # a reference panel of 27 alleles (human HLA reference set with maximal population coverage) 288 | # http://help.iedb.org/hc/en-us/articles/114094151851 289 | # my @alleles = ('HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:03', 'HLA-A*02:06', 'HLA-A*03:01', 'HLA-A*11:01', 290 | # 'HLA-A*23:01', 'HLA-A*24:02', 'HLA-A*26:01', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01','HLA-A*32:01', 291 | # 'HLA-A*33:01', 'HLA-A*68:01', 'HLA-A*68:02', 'HLA-B*07:02', 'HLA-B*08:01', 'HLA-B*15:01', 'HLA-B*35:01', 292 | # 'HLA-B*40:01', 'HLA-B*44:02', 'HLA-B*44:03', 'HLA-B*51:01', 'HLA-B*53:01', 'HLA-B*57:01', 'HLA-B*58:01'); 293 | 294 | # mouse alleles 295 | my @alleles = ('H-2-Db', 'H-2-Dd', 'H-2-Kb', 'H-2-Kd', 'H-2-Kk', 'H-2-Ld'); 296 | 297 | # make predictions for each allele 298 | foreach (@alleles) { 299 | say "running predictions for allele $_"; 300 | 301 | # predict_binding.py command 302 | # ./src/predict_binding [method] [mhc] [peptide_length] [input_file] 303 | my $predict_binding_cmd = "$predict_binding_py consensus $_ $peptide_length $mutpad_fa >> $raw_iedb_out_file"; 304 | system $predict_binding_cmd; 305 | 306 | # confirm that binding predictions file generated 307 | unless ( -e $raw_iedb_out_file ) { 308 | die "\n\n ERROR: $raw_iedb_out_file DOES NOT EXIST \n\n"; 309 | } 310 | if ( -z $raw_iedb_out_file ) { 311 | die "\n\n ERROR: $raw_iedb_out_file IS EMPTY \n\n"; 312 | } 313 | } 314 | 315 | my $merged_output = 
merge_binding_predictions_input_output($base_name, $mutpad_fa, $raw_iedb_out_file); 316 | 317 | # clean up 318 | sleep(1); 319 | # unlink $mutpad_fa; 320 | 321 | say "created $merged_output"; 322 | 323 | return $merged_output; 324 | 325 | } 326 | 327 | # run IEDB MHC-II Binding Predictions 328 | sub predict_mhc_ii_binding { 329 | my $base_name = $_[0]; 330 | my $mutpad_fa = $_[1]; 331 | my $peptide_length = $_[2]; 332 | my $predict_binding_py = $_[3]; 333 | 334 | my $raw_iedb_out_file = "${base_name}.iedb.txt"; 335 | 336 | # delete output if already exists 337 | if ( -e $raw_iedb_out_file ) { 338 | unlink $raw_iedb_out_file; 339 | } 340 | 341 | # mouse alleles 342 | my @alleles = ('H2-IAb', 'H2-IAd'); 343 | 344 | # make predictions for each allele 345 | foreach (@alleles) { 346 | say "running predictions for allele $_"; 347 | 348 | # mhc_II_binding.py command 349 | # python mhc_II_binding.py prediction_method_name allele_name input_sequence_file_name 350 | # Example: python mhc_II_binding.py consensus3 HLA-DRB1*03:01 test.fasta 351 | my $predict_binding_cmd = "$predict_binding_py consensus3 $_ $mutpad_fa >> $raw_iedb_out_file"; 352 | system $predict_binding_cmd; 353 | say "$predict_binding_cmd"; 354 | 355 | # confirm that binding predictions file generated 356 | unless ( -e $raw_iedb_out_file ) { 357 | die "\n\n ERROR: $raw_iedb_out_file DOES NOT EXIST \n\n"; 358 | } 359 | if ( -z $raw_iedb_out_file ) { 360 | die "\n\n ERROR: $raw_iedb_out_file IS EMPTY \n\n"; 361 | } 362 | } 363 | 364 | my $merged_output = merge_binding_predictions_input_output($base_name, $mutpad_fa, $raw_iedb_out_file); 365 | 366 | # clean up 367 | sleep(1); 368 | # unlink $mutpad_fa; 369 | 370 | say "created $merged_output"; 371 | 372 | return $merged_output; 373 | 374 | } 375 | 376 | # combine binding predictions input FASTA and output table 377 | sub merge_binding_predictions_input_output { 378 | my $base_name = $_[0]; 379 | my $mutpad_fa = $_[1]; 380 | my $iedb_out_txt = $_[2]; 381 | 382 | my $out_file = "${base_name}.binding.txt"; 383 | 384 | # delete output if already exists 385 | if ( -e $out_file ) { 386 | unlink $out_file; 387 | } 388 | 389 | # $iedb_out_txt columns: allele, seq_num, start, end, peptide, ... 
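# an illustrative (made-up) $iedb_out_txt row, tab-separated:
#   H-2-Kb  3  1  9  SVGSDCTTI  ...
# the seq_num column (3 here) matches the numeric prefix assigned to the FASTA records above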
390 | # ${mutpad_fa}.txt columns: seq_num, line_id, transcript_id, mutation, peptide 391 | 392 | # header for the mutated and WT parts of the combined file 393 | my $bindpred_header = `cat $iedb_out_txt | head -1 | cut -f 1,3,5-11`; 394 | system "printf \"line_id\ttranscript_id\tp_change\taa_padded\t${bindpred_header}\" > ${out_file}.mut.txt"; 395 | $bindpred_header =~ s/\t/_wt\t/g; 396 | system "printf \"p_change_wt\taa_padded_wt\t${bindpred_header}\" > ${out_file}.wt.txt"; 397 | 398 | # clean up input files for joining 399 | system "cat $mutpad_fa | grep '^>' | cut -c 2- | tr '|' '\t' | LC_ALL=C sort -k1,1 > ${mutpad_fa}.txt"; 400 | system "cat $iedb_out_txt | grep -v '^allele' | LC_ALL=C sort -k2,2 -k1,1 -k3,3 | cut -f 1,2,3,5-11 > ${iedb_out_txt}.tmp"; 401 | 402 | system "cat ${mutpad_fa}.txt | grep -v 'WILDTYPE' > ${mutpad_fa}.mut.txt"; 403 | system "cat ${mutpad_fa}.txt | grep 'WILDTYPE' > ${mutpad_fa}.wt.txt"; 404 | 405 | # join by seq_num, remove seq_num col, sort by line_id and start 406 | my $join_mut_cmd = 'LC_ALL=C join -t $\'\t\' -a 1 -1 1 -2 2'; 407 | $join_mut_cmd .= " ${mutpad_fa}.mut.txt"; 408 | $join_mut_cmd .= " ${iedb_out_txt}.tmp"; 409 | $join_mut_cmd .= " | LC_ALL=C sort -k2,2 -k7,7"; 410 | $join_mut_cmd .= " | cut -f 2-"; 411 | $join_mut_cmd .= " >> ${out_file}.mut.txt"; 412 | system $join_mut_cmd; 413 | 414 | # join by seq_num 415 | my $join_wt_cmd = 'LC_ALL=C join -t $\'\t\' -a 1 -1 1 -2 2'; 416 | $join_wt_cmd .= " ${mutpad_fa}.wt.txt"; 417 | $join_wt_cmd .= " ${iedb_out_txt}.tmp"; 418 | $join_wt_cmd .= " | LC_ALL=C sort -k2,2 -k7,7"; 419 | $join_wt_cmd .= " | cut -f 4-"; 420 | $join_wt_cmd .= " >> ${out_file}.wt.txt"; 421 | system $join_wt_cmd; 422 | 423 | sleep(1); 424 | 425 | system "paste ${out_file}.mut.txt ${out_file}.wt.txt >> ${out_file}"; 426 | 427 | # confirm that binding predictions file generated 428 | unless ( -e $out_file ) { 429 | die "\n\n ERROR: $out_file DOES NOT EXIST \n\n"; 430 | } 431 | 432 | # clean up 433 | sleep(1); 434 | # unlink $mutpad_fa; 435 | unlink "${iedb_out_txt}.tmp"; 436 | unlink "${mutpad_fa}.mut.txt"; 437 | unlink "${mutpad_fa}.wt.txt"; 438 | unlink "${out_file}.mut.txt"; 439 | unlink "${out_file}.wt.txt"; 440 | 441 | return $out_file; 442 | 443 | } 444 | 445 | # combine binding predictions table with variant annotations 446 | sub annotate_binding_predictions { 447 | my $base_name = $_[0]; 448 | my $bindpred_txt = $_[1]; 449 | my $evf = $_[2]; 450 | 451 | my $out_file = "${base_name}.binding.annot.txt"; 452 | 453 | # header for the combined file 454 | my $bindpred_header = `cat $bindpred_txt | head -1 | cut -f 2-`; 455 | system "printf \"#MUT\taa_change\tchr\tpos\tref\talt\t${bindpred_header}\" > $out_file"; 456 | 457 | # clean up input files for joining 458 | system "cat $evf | LC_ALL=C sort -k1,1 | cut -f 1,3,4,5,7,8 > ${evf}.tmp"; 459 | system "cat $bindpred_txt | grep -v '^line_id' | LC_ALL=C sort -k1,1 > ${bindpred_txt}.tmp"; 460 | 461 | # join, remove seq col, add sample and mut cols, sort by consensus_percentile_rank 462 | my $join_cmd = 'LC_ALL=C join -t $\'\t\' -a 2'; 463 | $join_cmd .= " ${evf}.tmp"; 464 | $join_cmd .= " ${bindpred_txt}.tmp"; 465 | $join_cmd .= " | cut -f 2-"; 466 | $join_cmd .= ' | awk -F $\'\t\' \'BEGIN {OFS=FS} {print $2":"$3":"$4":"$5,$0}\''; 467 | $join_cmd .= " | LC_ALL=C sort -k13,13n -k14,14n"; 468 | $join_cmd .= " >> $out_file"; 469 | system $join_cmd; 470 | 471 | # clean up 472 | sleep(1); 473 | unlink "${evf}.tmp"; 474 | unlink "${bindpred_txt}.tmp"; 475 | 476 | return $out_file; 477 | } 478 
| 479 | 480 | 481 | # end 482 | -------------------------------------------------------------------------------- /scripts/scrna-decontaminate-soupx.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | '
4 | Description:
5 | Remove ambient RNA contamination from 10x Genomics Chromium single-cell RNA-seq data using SoupX.
6 | Input and output are in the format produced by the Cell Ranger software suite.
7 |
8 | Usage:
9 | scrna-decontaminate-soupx.R <in_dir> <out_dir>
10 |
11 | Arguments:
12 | <in_dir>   input directory
13 | <out_dir>  output directory (will contain "outs/filtered_feature_bc_matrix")
14 |
15 | Options:
16 | -h, --help show this screen
17 | ' -> doc
18 |
19 |
20 | # increase output width
21 | options(width = 120)
22 | # print warnings as they occur
23 | options(warn = 1)
24 |
25 | # retrieve the command-line arguments
26 | library(docopt)
27 | opts = docopt(doc)
28 |
29 | # relevant arguments
30 | in_dir = opts$in_dir
31 | out_dir = opts$out_dir
32 |
33 | # check if the input parameters are valid
34 | message("input dir: ", in_dir)
35 | if (!dir.exists(in_dir)) { stop("input dir does not exist") }
36 | message("output dir: ", out_dir)
37 | if (dir.exists(out_dir)) { stop("output dir already exists") }
38 |
39 | # load libraries
40 | suppressPackageStartupMessages({
41 | library(glue)
42 | library(Matrix)
43 | library(SoupX)
44 | library(DropletUtils)
45 | })
46 |
47 | # set output directory as working directory
48 | dir.create(out_dir)
49 | if (dir.exists(out_dir)) {
50 | setwd(out_dir)
51 | } else {
52 | stop(glue("output dir {out_dir} could not be created"))
53 | }
54 |
55 | # log to file
56 | write(glue("analysis: {out_dir}"), file = "create.log", append = TRUE)
57 | write(glue("soupx version: {packageVersion('SoupX')}"), file = "create.log", append = TRUE)
58 |
59 | # find "outs" dir (contains "raw_feature_bc_matrix")
60 | cr_outs_dir = list.files(path = in_dir, pattern = "raw_feature_bc_matrix$", full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
61 | cr_outs_dir = dirname(cr_outs_dir)
62 | if (length(cr_outs_dir) != 1) stop(glue("could not find a unique outs directory in {in_dir}"))
63 |
64 | # load data and estimate soup profile
65 | sc = SoupX::load10X(cr_outs_dir)
66 |
67 | # log the stats
68 | write(glue("counts matrix cells: {ncol(sc$toc)}"), file = "create.log", append = TRUE)
69 | write(glue("counts matrix genes: {nrow(sc$toc)}"), file = "create.log", append = TRUE)
70 | in_umis = Matrix::colSums(sc$toc)
71 | write(glue("unfiltered mean UMIs per cell: {round(mean(in_umis), 3)}"), file = "create.log", append = TRUE)
72 | write(glue("unfiltered median UMIs per cell: {median(in_umis)}"), file = "create.log", append = TRUE)
73 | in_detected_genes = Matrix::colSums(sc$toc > 0)
74 | write(glue("unfiltered min genes per cell: {min(in_detected_genes)}"), file = "create.log", append = TRUE)
75 | write(glue("unfiltered max genes per cell: {max(in_detected_genes)}"), file = "create.log", append = TRUE)
76 | write(glue("unfiltered mean genes per cell: {round(mean(in_detected_genes), 3)}"), file = "create.log", append = TRUE)
77 | write(glue("unfiltered median genes per cell: {median(in_detected_genes)}"), file = "create.log", append = TRUE)
78 |
79 | # estimate the level of background contamination (represented as rho)
80 | # creates a plot showing the density of estimates
81 | png("qc.soupx.estimates.png", res = 300, width = 8, height = 5, units = "in")
82 | sc = SoupX::autoEstCont(sc)
83 | dev.off()
84 |
85 | write(glue("estimated rho: {sc$fit$rhoEst}"), file = "create.log", append = TRUE)
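# rho is the contamination fraction: rho = 0.05 means an estimated ~5% of UMIs in each
# cell originate from the ambient soup rather than the cell itself
# if the automated estimate looks unreasonable, a manual value could be set instead,
# e.g. using SoupX::setContaminationFraction() (the 0.2 here is only an illustration):
# sc = SoupX::setContaminationFraction(sc, 0.2)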
"create.log", append = TRUE) 86 | 87 | # clean the data 88 | soupx_out = SoupX::adjustCounts(sc) 89 | dim(soupx_out) 90 | 91 | # log the stats 92 | out_umis = Matrix::colSums(soupx_out) 93 | write(glue("decontaminated mean UMIs per cell: {round(mean(out_umis), 3)}"), file = "create.log", append = TRUE) 94 | write(glue("decontaminated median UMIs per cell: {median(out_umis)}"), file = "create.log", append = TRUE) 95 | out_detected_genes = Matrix::colSums(soupx_out > 0) 96 | write(glue("decontaminated min genes per cell: {min(out_detected_genes)}"), file = "create.log", append = TRUE) 97 | write(glue("decontaminated max genes per cell: {max(out_detected_genes)}"), file = "create.log", append = TRUE) 98 | write(glue("decontaminated mean genes per cell: {round(mean(out_detected_genes), 3)}"), file = "create.log", append = TRUE) 99 | write(glue("decontaminated median genes per cell: {median(out_detected_genes)}"), file = "create.log", append = TRUE) 100 | 101 | # write count data in the 10x format (path must not exist) 102 | dir.create("./outs") 103 | DropletUtils::write10xCounts(x = soupx_out, path = "./outs/filtered_feature_bc_matrix", version = "3") 104 | 105 | # delete Rplots.pdf 106 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 107 | 108 | 109 | 110 | # end 111 | -------------------------------------------------------------------------------- /scripts/scrna-doublets-scdblfinder.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ' 4 | Description: 5 | Mark putative doublets in single-cell RNA-seq data stored as a Seurat object using scDblFinder. 6 | Input is a Seurat object stored as an RDS file. 7 | 8 | Usage: 9 | scrna-doublets-scdblfinder.R 10 | 11 | Arguments: 12 | input directory 13 | 14 | Options: 15 | -h, --help show this screen 16 | ' -> doc 17 | 18 | 19 | # increase output width 20 | options(width = 120) 21 | # print warnings as they occur 22 | options(warn = 1) 23 | 24 | # retrieve the command-line arguments 25 | library(docopt) 26 | opts = docopt(doc) 27 | 28 | # relevent arguments 29 | seurat_rds = opts$seurat_rds 30 | out_dir = dirname(seurat_rds) 31 | 32 | message("seurat object: ", seurat_rds) 33 | message("output dir: ", out_dir) 34 | 35 | # check if the input is valid 36 | if (!file.exists(seurat_rds)) { stop("object file does not exist") } 37 | if (!dir.exists(out_dir)) { stop("output dir does not exist") } 38 | 39 | # load libraries 40 | suppressPackageStartupMessages({ 41 | library(Seurat) 42 | library(tidyverse) 43 | library(cowplot) 44 | library(glue) 45 | library(scran) 46 | library(scDblFinder) 47 | }) 48 | 49 | # import seurat object 50 | seurat_obj = readRDS(seurat_rds) 51 | 52 | # set output directory as working directory 53 | setwd(out_dir) 54 | 55 | # check if output exists already 56 | if (file.exists("doublets.scDblFinder.csv.gz")) { stop("output already exists") } 57 | 58 | # log to file 59 | write(glue("scDblFinder version: {packageVersion('scDblFinder')}"), file = "create.log", append = TRUE) 60 | 61 | Idents(seurat_obj) = "orig.ident" 62 | sce = Seurat::as.SingleCellExperiment(seurat_obj, assay = "RNA") 63 | sce = scran::computeSumFactors(sce, BPPARAM = BiocParallel::MulticoreParam(4)) 64 | 65 | set.seed(99) 66 | if (("hash.ID" %in% names(seurat_obj@meta.data)) && ("HTO" %in% Seurat::Assays(seurat_obj))) { 67 | # hashed multi-sample experiment 68 | # `hash.ID` is created by `HTODemux`, `MULTI_ID` is the `MULTIseqDemux` equivalent and renamed `hash.ID` by scooter 69 | # samples 
are independent captures, not biological samples, if multiplexed using cell hashes 70 | if ("library" %in% names(seurat_obj@meta.data)) { 71 | if ("Doublet" %in% seurat_obj@meta.data$hash.ID) { 72 | message("library/batch: `library` with known doublets") 73 | known_doublets = sce$hash.ID == "Doublet" 74 | hto_doublet_rate = round(sum(known_doublets) / ncol(seurat_obj), 3) 75 | message(glue("hashtag doublet rate: {hto_doublet_rate}")) 76 | write(glue("hashtag doublet rate: {hto_doublet_rate}"), file = "create.log", append = TRUE) 77 | } else { 78 | message("library/batch: `library` without known doublets") 79 | known_doublets = NULL 80 | } 81 | doublet_tbl = 82 | scDblFinder( 83 | sce, samples = "library", knownDoublets = known_doublets, 84 | returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4) 85 | ) 86 | } else { 87 | stop("hashed multi-sample experiment should have a `library` metadata column") 88 | } 89 | } else if (n_distinct(seurat_obj@meta.data$orig.ident) > 1) { 90 | # multi-sample experiment 91 | message("library/batch: `orig.ident` without known doublets") 92 | doublet_tbl = scDblFinder(sce, samples = "orig.ident", returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4)) 93 | } else { 94 | message("library/batch: none") 95 | doublet_tbl = scDblFinder(sce, returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4)) 96 | } 97 | 98 | # using the "samples" parameter does not return a table (fixed in 1.11.4) 99 | if (class(doublet_tbl) == "SingleCellExperiment") { 100 | doublet_tbl = colData(doublet_tbl) %>% as.data.frame() %>% dplyr::select(starts_with("scDblFinder")) 101 | colnames(doublet_tbl) = stringr::str_remove(colnames(doublet_tbl), "scDblFinder.") 102 | doublet_tbl$type = "real" 103 | } 104 | doublet_tbl = doublet_tbl %>% as_tibble(rownames = "cell") %>% dplyr::filter(type == "real") %>% dplyr::arrange(cell) 105 | write_csv(doublet_tbl, "doublets.scDblFinder.csv.gz") 106 | 107 | if (nrow(doublet_tbl) != ncol(seurat_obj)) { stop("doublet table and seurat object are not the same size") } 108 | 109 | # add doublet stats to the seurat object 110 | doublet_tbl = doublet_tbl %>% select(cell, doublet_score_scDblFinder = score, doublet_class_scDblFinder = class) 111 | doublet_df = doublet_tbl %>% as.data.frame() %>% column_to_rownames("cell") %>% sample_frac() 112 | seurat_obj = AddMetaData(seurat_obj, doublet_df) 113 | seurat_obj@meta.data$doublet_class_scDblFinder = factor(seurat_obj@meta.data$doublet_class_scDblFinder) 114 | 115 | # check doublet rate 116 | num_doublets = table(seurat_obj@meta.data$doublet_class_scDblFinder)[["doublet"]] 117 | doublet_rate = round(num_doublets / ncol(seurat_obj), 3) 118 | message(glue("doublet rate: {doublet_rate}")) 119 | write(glue("num doublets: {num_doublets}"), file = "create.log", append = TRUE) 120 | write(glue("doublet rate: {doublet_rate}"), file = "create.log", append = TRUE) 121 | 122 | # dot size for plots 123 | num_cells = ncol(seurat_obj) 124 | pt_size = 1.8 125 | if (num_cells > 1000) pt_size = 1.4 126 | if (num_cells > 5000) pt_size = 1.0 127 | if (num_cells > 10000) pt_size = 0.6 128 | if (num_cells > 50000) pt_size = 0.2 129 | 130 | # plot doublet score 131 | featplot_colors = colorRampPalette(c("#d9cfcb", "#d49070", "#ca5528", "#b72600", "#981000", "#730000"))(100) 132 | random_cells = sample(colnames(seurat_obj)) 133 | plot_umap = 134 | FeaturePlot( 135 | seurat_obj, features = "doublet_score_scDblFinder", reduction = "umap", 136 | cells = random_cells, pt.size = pt_size, cols = featplot_colors 137 | ) + 
138 | theme_cowplot() + 139 | theme( 140 | plot.background = element_rect(fill = "white"), 141 | aspect.ratio = 1, plot.title = element_text(hjust = 0.5), 142 | axis.ticks = element_blank(), axis.text = element_blank() 143 | ) 144 | save_plot("dr.umap.doublet_score_scDblFinder.png", plot = plot_umap, base_height = 6.5, base_width = 8) 145 | Sys.sleep(1) 146 | save_plot("dr.umap.doublet_score_scDblFinder.pdf", plot = plot_umap, base_height = 6.5, base_width = 8) 147 | Sys.sleep(1) 148 | 149 | # plot doublet class 150 | plot_umap = 151 | DimPlot( 152 | seurat_obj, group.by = "doublet_class_scDblFinder", reduction = "umap", 153 | cells = random_cells, pt.size = pt_size, cols = c("#E41A1C", "#377EB8") 154 | ) + 155 | theme_cowplot() + 156 | theme( 157 | plot.background = element_rect(fill = "white"), 158 | aspect.ratio = 1, plot.title = element_text(hjust = 0.5), 159 | axis.ticks = element_blank(), axis.text = element_blank() 160 | ) 161 | save_plot("dr.umap.doublet_class_scDblFinder.png", plot = plot_umap, base_height = 6.5, base_width = 8) 162 | Sys.sleep(1) 163 | save_plot("dr.umap.doublet_class_scDblFinder.pdf", plot = plot_umap, base_height = 6.5, base_width = 8) 164 | Sys.sleep(1) 165 | 166 | # delete Rplots.pdf 167 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 168 | 169 | # save 170 | Idents(seurat_obj) = "orig.ident" 171 | saveRDS(seurat_obj, "seurat_obj.rds") 172 | 173 | 174 | 175 | # end 176 | -------------------------------------------------------------------------------- /scripts/snvs-cnvs-mutect-strelka-freec-pyclone.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Convert mutations from Mutect and Strelka VCFs and CNVs from Control-FREEC to PyClone input table. 
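The resulting table typically provides the columns PyClone expects: mutation_id, ref_counts, var_counts, normal_cn, minor_cn, and major_cn.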
7 |
8 | Usage:
9 | snvs-cnvs-mutect-strelka-freec-pyclone.R <sample_name> <mutect_vcf> <strelka_vcf> <cnvs_txt> <out_txt>
10 |
11 | Arguments:
12 | <sample_name>  sample name in the tumor:normal format, must match the VCF sample names
13 | <mutect_vcf>   Mutect (GATK4) VCF
14 | <strelka_vcf>  Strelka VCF
15 | <cnvs_txt>     Control-FREEC "ratio.txt" file with ratios, copy numbers, and genotypes for each window
16 | <out_txt>      output PyClone TSV
17 |
18 | Options:
19 | -h, --help show this screen
20 | ' -> doc
21 |
22 |
23 | # print warnings as they occur
24 | options(warn = 1)
25 |
26 | # retrieve the command-line arguments
27 | suppressPackageStartupMessages(library(docopt))
28 | opts = docopt(doc)
29 |
30 | # relevant arguments
31 | sample_name = opts$sample_name
32 | mutect_vcf = opts$mutect_vcf
33 | strelka_vcf = opts$strelka_vcf
34 | cnvs_txt = opts$cnvs_txt
35 | out_txt = opts$out_txt
36 |
37 | # check that input files exist
38 | if (!file.exists(mutect_vcf)) stop("file does not exist: ", mutect_vcf)
39 | if (!file.exists(strelka_vcf)) stop("file does not exist: ", strelka_vcf)
40 | if (!file.exists(cnvs_txt)) stop("file does not exist: ", cnvs_txt)
41 |
42 | # load libraries
43 | suppressPackageStartupMessages({
44 | library("tidyverse")
45 | library("glue")
46 | library("vcfR")
47 | library("GenomicRanges")
48 | library("stringr")
49 | })
50 |
51 | # Mutect 2.1 (GATK 4)
52 | parse_mutect21 = function(vcfr_obj, sample_T, sample_N) {
53 |
54 | # confirm mutect version
55 | if (!any(str_detect(vcfr_obj@meta, "##Mutect Version=2.1"))) {
56 | stop("version mismatch: expecting Mutect 2.1")
57 | }
58 |
59 | # confirm sample names
60 | if (!any(str_detect(vcfr_obj@meta, glue("##tumor_sample={sample_T}")))) {
61 | stop("sample mismatch")
62 | }
63 | if (!any(str_detect(vcfr_obj@meta, glue("##normal_sample={sample_N}")))) {
64 | stop("sample mismatch")
65 | }
66 |
67 | # if a read is considered uninformative, it is counted towards the DP, but not the AD
68 | # an uninformative read is not reported in the AD, it is still used in calculations for genotyping
69 | # if uninformative reads are the only reads, we report the potential variant allele, but keep the AD values 0
70 | # AD is the number of reads that more likely than not support an allele
71 | # if you have 10 reads, each with 0.6 probability of having a certain alt allele, you get an AD of 10, whereas you get essentially 0.6 x 10 = 6 reads for the purpose of AF
72 |
73 | ##INFO DP Approximate read depth; some reads may have been filtered
74 | ##INFO TLOD Tumor LOD score
75 | ##FORMAT AD Allelic depths for the ref and alt alleles in the order listed
76 | ##FORMAT AF Allele fractions of alternate alleles in the tumor
77 | ##FORMAT DP Approximate read depth (reads with MQ=255 or with bad mates are filtered)
78 |
79 | # convert vcfR object to a tibble
80 | muts_tbl = vcfR2tidy(vcfr_obj, single_frame = TRUE, verbose = FALSE,
81 | info_fields = c("DP", "TLOD"),
82 | format_fields = c("AD", "AF", "DP"))
83 | muts_tbl = muts_tbl$dat
84 |
85 | # extract relevant metrics
86 | muts_tbl =
87 | muts_tbl %>%
88 | # unique mutation identifier (for joining T and N)
89 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>%
90 | filter(FILTER == "PASS") %>%
91 | separate(gt_AD, into = c("ref_counts", "alt_counts"), sep = ",", convert = TRUE, extra = "drop") %>%
92 | # manual AF calculation for comparison
93 | mutate(myAF = alt_counts / (ref_counts + alt_counts)) %>%
94 | mutate(
95 | QUAL = round(as.numeric(TLOD), 1),
96 | T_DEPTH = ref_counts + alt_counts,
97 | T_FREQ = round(as.numeric(gt_AF), 3)
98 | )
99 |
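# a worked (illustrative) example: gt_AD = "12,8" gives ref_counts = 12 and alt_counts = 8,
# so T_DEPTH = 20 and myAF = 8 / 20 = 0.4, which can differ slightly from the model-based gt_AF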
format) 101 | snvs_n_tbl = 102 | muts_tbl %>% 103 | filter(Indiv == sample_N) %>% 104 | dplyr::rename(N_DEPTH = T_DEPTH, N_FREQ = T_FREQ) %>% 105 | dplyr::select(mutation_id, N_DEPTH, N_FREQ) 106 | muts_tbl = 107 | muts_tbl %>% 108 | filter(Indiv == sample_T) %>% 109 | inner_join(snvs_n_tbl, by = "mutation_id") 110 | 111 | # manual filtering (the filtered table is the function's return value) 112 | muts_tbl %>% filter((alt_counts >= 3) & (T_DEPTH >= 10) & (T_FREQ > 0.01) & (T_FREQ > (N_FREQ * 5))) 113 | 114 | } 115 | 116 | # Strelka 2 117 | parse_strelka2 = function(vcfr_obj, sample_T, sample_N) { 118 | 119 | # confirm strelka version 120 | if (!any(str_detect(vcfr_obj@meta, "##source_version=2"))) { 121 | stop("version mismatch: expecting Strelka 2") 122 | } 123 | 124 | ##INFO QSS Quality score for any somatic snv 125 | ##INFO SOMATIC Somatic mutation 126 | ##INFO QSI Quality score for the ALT haplotype to be present at a significantly different freq in the T and N 127 | ##FORMAT AU Number of 'A' alleles used in tiers 1,2 128 | ##FORMAT CU Number of 'C' alleles used in tiers 1,2 129 | ##FORMAT GU Number of 'G' alleles used in tiers 1,2 130 | ##FORMAT TU Number of 'T' alleles used in tiers 1,2 131 | ##FORMAT TAR Reads strongly supporting alternate allele for tiers 1,2 132 | ##FORMAT TIR Reads strongly supporting indel allele for tiers 1,2 133 | 134 | # convert vcfR object to a tibble 135 | muts_tbl = vcfR2tidy(vcfr_obj, 136 | info_fields = c("SOMATIC", "QSS", "QSI"), 137 | format_fields = c("DP", "AU", "CU", "GU", "TU", "TAR", "TIR"), 138 | single_frame = TRUE, verbose = FALSE) 139 | muts_tbl = muts_tbl$dat 140 | 141 | 142 | # extract relevant metrics 143 | muts_tbl = 144 | muts_tbl %>% 145 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>% 146 | filter(FILTER == "PASS") %>% 147 | # extract tier1 counts for each nucleotide 148 | separate(gt_AU, into = "A_counts", sep = ",", convert = TRUE, extra = "drop") %>% 149 | separate(gt_CU, into = "C_counts", sep = ",", convert = TRUE, extra = "drop") %>% 150 | separate(gt_GU, into = "G_counts", sep = ",", convert = TRUE, extra = "drop") %>% 151 | separate(gt_TU, into = "T_counts", sep = ",", convert = TRUE, extra = "drop") %>% 152 | separate(gt_TAR, into = "indel_ref_counts", sep = ",", convert = TRUE, extra = "drop") %>% 153 | separate(gt_TIR, into = "indel_alt_counts", sep = ",", convert = TRUE, extra = "drop") %>% 154 | # set ref/alt counts 155 | mutate( 156 | ref_counts = case_when( 157 | is.na(QSS) ~ indel_ref_counts, 158 | REF == "A" ~ A_counts, 159 | REF == "C" ~ C_counts, 160 | REF == "G" ~ G_counts, 161 | REF == "T" ~ T_counts 162 | ) 163 | ) %>% 164 | mutate( 165 | alt_counts = case_when( 166 | is.na(QSS) ~ indel_alt_counts, 167 | ALT == "A" ~ A_counts, 168 | ALT == "C" ~ C_counts, 169 | ALT == "G" ~ G_counts, 170 | ALT == "T" ~ T_counts 171 | ) 172 | ) %>% 173 | # extract quality 174 | mutate( 175 | QUAL = case_when( 176 | is.na(QSI) ~ QSS, 177 | is.na(QSS) ~ QSI 178 | ) 179 | ) %>% 180 | mutate(T_DEPTH = gt_DP) %>% 181 | mutate(T_FREQ = round(alt_counts / (ref_counts + alt_counts), 3)) 182 | 183 | # extract samples T and N to put side by side ("wide" format) 184 | snvs_n_tbl = 185 | muts_tbl %>% 186 | filter(Indiv == "NORMAL") %>% 187 | dplyr::rename(N_DEPTH = T_DEPTH, N_FREQ = T_FREQ) %>% 188 | dplyr::select(mutation_id, N_DEPTH, N_FREQ) 189 | muts_tbl = 190 | muts_tbl %>% 191 | filter(Indiv == "TUMOR") %>% 192 | inner_join(snvs_n_tbl, by = "mutation_id") 193 | 194 | # manual filtering (the filtered table is the function's return value) 195 | muts_tbl %>% filter((alt_counts >= 3) & (T_DEPTH
>= 10) & (T_FREQ > 0.01) & (T_FREQ > (N_FREQ * 5))) 196 | 197 | } 198 | 199 | # parse either vcf 200 | parse_vcf = function(sample_name, vcf) { 201 | 202 | # split sample name for somatic variants 203 | if (str_detect(sample_name, ":")) { 204 | sample_T = str_split_fixed(sample_name, ":", 2)[1] 205 | sample_N = str_split_fixed(sample_name, ":", 2)[2] 206 | } 207 | 208 | # import VCF as a vcfR object 209 | muts_vcfr = read.vcfR(vcf, verbose = FALSE) 210 | 211 | # check if there are any variants 212 | if (nrow(muts_vcfr@fix) == 0) stop("no variants in imported VCF") 213 | 214 | # determine variant caller based on VCF contents and parse accordingly 215 | if (any(str_detect(muts_vcfr@meta, "##source=Mutect2"))) { 216 | 217 | # Mutect 2.1 (GATK 4) 218 | message("parsing Mutect 2.1 VCF") 219 | caller_type = "somatic" 220 | vcf_type = "mutect21" 221 | vcf_tbl = parse_mutect21(vcfr_obj = muts_vcfr, sample_T = sample_T, sample_N = sample_N) 222 | 223 | } else if (any(str_detect(muts_vcfr@meta, "##source=strelka"))) { 224 | 225 | # Strelka 2 226 | message("parsing Strelka 2 VCF") 227 | caller_type = "somatic" 228 | vcf_type = "strelka2" 229 | vcf_tbl = parse_strelka2(vcfr_obj = muts_vcfr, sample_T = sample_T, sample_N = sample_N) 230 | 231 | } else { 232 | 233 | stop("unknown variant caller") 234 | 235 | } 236 | 237 | # check if table is empty 238 | if (nrow(vcf_tbl) == 0) stop("output table is empty after parsing") 239 | 240 | # create and sort by mutation_id 241 | vcf_tbl = 242 | vcf_tbl %>% 243 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>% 244 | arrange(mutation_id) %>% 245 | mutate( 246 | mut_type = case_when( 247 | str_length(REF) > str_length(ALT) ~ "del", 248 | str_length(ALT) > str_length(REF) ~ "ins", 249 | TRUE ~ "pt" 250 | ) 251 | ) %>% 252 | filter(mut_type == "pt") 253 | 254 | # output table columns 255 | vcf_tbl = vcf_tbl %>% mutate(SAMPLE_T = sample_T, SAMPLE_N = sample_N) 256 | vcf_tbl 257 | 258 | } 259 | 260 | # parse mutect VCF 261 | muts_mutect_tbl = parse_vcf(sample_name = sample_name, vcf = mutect_vcf) 262 | muts_mutect_tbl$variant_caller = "mutect" 263 | 264 | # parse strelka VCF 265 | muts_strelka_tbl = parse_vcf(sample_name = sample_name, vcf = strelka_vcf) 266 | muts_strelka_tbl$variant_caller = "strelka" 267 | 268 | # combine both tables 269 | muts_all = bind_rows(muts_mutect_tbl, muts_strelka_tbl) 270 | 271 | # determine number of callers (including duplicates) 272 | snvs_tbl = 273 | muts_all %>% 274 | add_count(mutation_id, SAMPLE_T, SAMPLE_N) %>% 275 | mutate(variant_caller = if_else(n == 2, "mutect+strelka", variant_caller)) %>% 276 | dplyr::select(-QUAL, -n) 277 | 278 | # keep one row if called by both callers 279 | # for mutations with entry for each caller, select entry with higher T_FREQ 280 | snvs_tbl = 281 | snvs_tbl %>% 282 | group_by(mutation_id, SAMPLE_T, SAMPLE_N) %>% 283 | arrange(-T_FREQ, -T_DEPTH) %>% 284 | dplyr::slice(1) %>% 285 | ungroup() 286 | 287 | # chrs to filter out 288 | chr_filter = c("chrX", "chrY", "chrM", "X", "Y", "MT", "M") 289 | 290 | # clean up and add PyClone columns 291 | snvs_tbl = snvs_tbl %>% 292 | filter(!CHROM %in% chr_filter) %>% 293 | dplyr::rename(chr = CHROM, variant_freq = T_FREQ, var_counts = alt_counts) %>% 294 | mutate(start = POS, end = POS) 295 | 296 | # convert SNVs to GRanges for overlapping 297 | snvs_gr = makeGRangesFromDataFrame(df = snvs_tbl, 298 | ignore.strand = TRUE, 299 | keep.extra.columns = TRUE, 300 | starts.in.df.are.0based = FALSE) 301 | names(snvs_gr) = snvs_gr$mutation_id 302 
| 303 | # import CNVs file 304 | cnvs_tbl = read_tsv(cnvs_txt, guess_max = 500000, progress = FALSE) 305 | 306 | # clean up 307 | cnvs_tbl = cnvs_tbl %>% 308 | transmute(chr = Chromosome, start = Start - 1, end = Start, genotype = Genotype) %>% 309 | dplyr::select(chr, start, end, genotype) %>% 310 | mutate(cnv_id = str_c(chr, ":", end)) %>% 311 | filter(!chr %in% chr_filter) 312 | 313 | # calculate window size (most common start site difference) 314 | win_size = cnvs_tbl %>% 315 | mutate(win_size = (start - lag(start))) %>% 316 | filter(win_size > 100) %>% 317 | na.omit() %>% 318 | pull(win_size) 319 | win_size = unique(win_size)[which.max(tabulate(match(win_size, unique(win_size))))] 320 | 321 | # update coordinates to reflect window size (0-based) 322 | # can be extracted from targeted sequencing results, but not WGS 323 | cnvs_tbl = cnvs_tbl %>% mutate(end = start + win_size) 324 | 325 | # add PyClone columns 326 | cnvs_tbl = cnvs_tbl %>% 327 | filter(genotype != "-", genotype != "") %>% 328 | mutate(normal_cn = 2, 329 | minor_cn = str_count(genotype, "B"), 330 | major_cn = str_count(genotype, "A")) 331 | 332 | # convert CNVs to GRanges for overlapping 333 | cnvs_gr = makeGRangesFromDataFrame(df = cnvs_tbl, 334 | ignore.strand = TRUE, 335 | keep.extra.columns = TRUE, 336 | starts.in.df.are.0based = TRUE) 337 | names(cnvs_gr) = cnvs_gr$cnv_id 338 | 339 | # set seqlevels in both GRanges to UCSC style 340 | seqlevelsStyle(snvs_gr) = "UCSC" 341 | seqlevelsStyle(cnvs_gr) = "UCSC" 342 | 343 | # get overlaps for SNVs and CNVs (consider adjacent windows as well) 344 | overlaps = distanceToNearest(x = snvs_gr, subject = cnvs_gr) 345 | overlaps = overlaps %>% as("data.frame") %>% as_tibble() %>% filter(distance < win_size * 2) 346 | 347 | # convert overlaps from identifiers to names 348 | overlaps$mutation_id = snvs_gr[overlaps$queryHits] %>% names() 349 | overlaps$cnv_id = cnvs_gr[overlaps$subjectHits] %>% names() 350 | overlaps = overlaps %>% dplyr::select(-queryHits, -subjectHits) 351 | 352 | # merge overlapping SNVs and CNVs 353 | overlaps = overlaps %>% 354 | inner_join(snvs_tbl, by = "mutation_id") %>% 355 | inner_join(cnvs_tbl, by = "cnv_id") 356 | 357 | # extract columns for PyClone 358 | pyclone_tbl = overlaps %>% 359 | dplyr::select(mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn, variant_freq, genotype) 360 | 361 | # save PyClone table 362 | write_tsv(pyclone_tbl, path = out_txt) 363 | 364 | 365 | 366 | # end 367 | -------------------------------------------------------------------------------- /workflows/draft-genome-init.md: -------------------------------------------------------------------------------- 1 | # Initializing draft genome directory 2 | 3 | 4 | The requirement for [setting up a reference genome directory](https://github.com/igordot/reference-genomes) is having two basic files: 5 | 6 | * `genome.fa` - genome sequence in FASTA format 7 | * `genes.gtf` - gene annotations in GTF format 8 | 9 | Properly formatted sequence and annotation files are readily available for commonly analyzed genomes. Less popular and 10 | draft genomes are less standardized and can be more difficult to work with. 11 | 12 | ## Saimiri boliviensis boliviensis (Bolivian squirrel monkey) example 13 | 14 | As of September 2016, the best reference for the Bolivian squirrel monkey is the first preliminary assembly SaiBol1.0 15 | (GCA_000235385.1), provided by the Broad Institute in October 2011. 
The assembly comprises 2685 top level sequences, all 16 | of which are unplaced scaffolds (from 151,413 contigs). 17 | 18 | When working with a new genome, Ensembl is usually a good place to start as it contains well-formatted reference files for 19 | many species. You can find the [SaiBol1 genome](http://pre.ensembl.org/Saimiri_boliviensis/Info/Index). 20 | 21 | The FASTA and GTF files are available and can be downloaded: 22 | 23 | ```bash 24 | wget -O genome.ensembl.pre.fa.gz ftp://ftp.ensembl.org/pub/pre/fasta/dna/saimiri_boliviensis/Saimiri_boliviensis.SaiBol1.0.dna_rm.toplevel.fa.gz 25 | wget -O genes.ensembl.pre.gtf.gz ftp://ftp.ensembl.org/pub/pre/gtf/saimiri_boliviensis/SaiBol1.0.genes.gtf.gz 26 | ``` 27 | 28 | Once you have the reference files, it's a good idea to spot-check them. 29 | 30 | ```bash 31 | zcat genome.ensembl.pre.fa.gz | grep ">" | head 32 | ``` 33 | 34 | Output: 35 | 36 | ``` 37 | >scaffold:SaiBol1.0:JH378105.1:1:72162052:1 scaffold JH378105.1 38 | >scaffold:SaiBol1.0:JH378106.1:1:71252344:1 scaffold JH378106.1 39 | >scaffold:SaiBol1.0:JH378107.1:1:58292249:1 scaffold JH378107.1 40 | >scaffold:SaiBol1.0:JH378108.1:1:54856640:1 scaffold JH378108.1 41 | >scaffold:SaiBol1.0:JH378109.1:1:50794693:1 scaffold JH378109.1 42 | >scaffold:SaiBol1.0:JH378110.1:1:49021937:1 scaffold JH378110.1 43 | >scaffold:SaiBol1.0:JH378111.1:1:46157118:1 scaffold JH378111.1 44 | >scaffold:SaiBol1.0:JH378112.1:1:45331107:1 scaffold JH378112.1 45 | >scaffold:SaiBol1.0:JH378113.1:1:44311105:1 scaffold JH378113.1 46 | >scaffold:SaiBol1.0:JH378114.1:1:44255708:1 scaffold JH378114.1 47 | ``` 48 | 49 | ```bash 50 | zcat genes.ensembl.pre.gtf.gz | head 51 | ``` 52 | 53 | Output: 54 | 55 | ``` 56 | JH378796.1 protein_coding exon 805 910 . + . gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "1"; gene_biotype "protein_coding"; 57 | JH378796.1 protein_coding CDS 805 910 . + 0 gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "1"; gene_biotype "protein_coding"; protein_id "ENSP00000271139_1"; 58 | JH378796.1 protein_coding exon 2580 3055 . + . gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "2"; gene_biotype "protein_coding"; 59 | JH378796.1 protein_coding CDS 2580 3055 . + 2 gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "2"; gene_biotype "protein_coding"; protein_id "ENSP00000271139_1"; 60 | JH378584.1 protein_coding exon 731 835 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "1"; gene_biotype "protein_coding"; 61 | JH378584.1 protein_coding CDS 731 835 . + 0 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "1"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 62 | JH378584.1 protein_coding exon 2441 2603 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "2"; gene_biotype "protein_coding"; 63 | JH378584.1 protein_coding CDS 2441 2603 . + 0 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "2"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 64 | JH378584.1 protein_coding exon 3155 3293 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "3"; gene_biotype "protein_coding"; 65 | JH378584.1 protein_coding CDS 3155 3293 . 
+ 2 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "3"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 66 | ``` 67 | 68 | There are several issues: 69 | 70 | * FASTA contig names have spaces. This is usually fine, but will cause errors with some tools. 71 | * GTF contig names (first column) do not match FASTA contig names (the first word after `>`). 72 | * GTF records have `gene_id`, but not `gene_name`. Most biologists will want to know the gene names. 73 | 74 | The first two problems can be solved with a few shell commands. The last one is more complicated. 75 | 76 | Let's try NCBI next. 77 | 78 | NCBI has a lot of databases, so it can be difficult to navigate. It also has the 79 | [SaiBol1 genome](https://www.ncbi.nlm.nih.gov/genome/6907), which is also based on GCA_000235385.1, like the Ensembl version. 80 | 81 | There is no GTF, but there is a GFF. 82 | 83 | ```bash 84 | wget -O genome.ncbi.fa.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/Saimiri_boliviensis/CHR_Un/39432_ref_SaiBol1.0_chrUn.fa.gz 85 | wget -O genes.ncbi.gff3.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/Saimiri_boliviensis/GFF/ref_SaiBol1.0_scaffolds.gff3.gz 86 | ``` 87 | 88 | There are actually two GFF files: `scaffolds.gff3.gz` and `top_level.gff3.gz`, but they are identical based on both file 89 | size and `md5sum`. 90 | 91 | ``` 92 | $ zcat genome.ncbi.fa.gz | grep ">" | head 93 | >gi|395726353|ref|NW_003943604.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00001, whole genome shotgun sequence 94 | >gi|395726233|ref|NW_003943605.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00002, whole genome shotgun sequence 95 | >gi|395726111|ref|NW_003943606.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00003, whole genome shotgun sequence 96 | >gi|395725977|ref|NW_003943607.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00004, whole genome shotgun sequence 97 | >gi|395725897|ref|NW_003943608.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00005, whole genome shotgun sequence 98 | >gi|395725895|ref|NW_003943609.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00006, whole genome shotgun sequence 99 | >gi|395725818|ref|NW_003943610.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00007, whole genome shotgun sequence 100 | >gi|395725816|ref|NW_003943611.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00008, whole genome shotgun sequence 101 | >gi|395725734|ref|NW_003943612.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00009, whole genome shotgun sequence 102 | >gi|395725652|ref|NW_003943613.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00010, whole genome shotgun sequence 103 | ``` 104 | 105 | ``` 106 | $ zcat genes.ncbi.gff3.gz | grep -v "#" | head 107 | NW_003943604.1 RefSeq region 1 72162052 . + . ID=id0;Dbxref=taxon:39432;Name=Unknown;chromosome=Unknown;gbkey=Src;genome=genomic;isolate=3227;mol_type=genomic DNA;sex=female;sub-species=boliviensis 108 | NW_003943604.1 Gnomon gene 8363 13782 . - . ID=gene0;Dbxref=GeneID:101049931;Name=LOC101049931;gbkey=Gene;gene=LOC101049931 109 | NW_003943604.1 Gnomon mRNA 8363 13782 . - .
ID=rna0;Parent=gene0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;Name=XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 110 | NW_003943604.1 Gnomon exon 13673 13782 . - . ID=id1;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 111 | NW_003943604.1 Gnomon exon 11227 11475 . - . ID=id2;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 112 | NW_003943604.1 Gnomon exon 8363 8619 . - . ID=id3;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 113 | NW_003943604.1 Gnomon CDS 13673 13739 . - 0 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 114 | NW_003943604.1 Gnomon CDS 11227 11475 . - 2 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 115 | NW_003943604.1 Gnomon CDS 8363 8619 . - 2 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 116 | NW_003943604.1 Gnomon gene 16666 24141 . + . ID=gene1;Dbxref=GeneID:101050261;Name=ZAR1L;gbkey=Gene;gene=ZAR1L 117 | ``` 118 | 119 | These reference files aren't perfect either, but at least the gene names are present. 120 | 121 | Fix the FASTA contig names so they match the GFF contig names (`NW_*` or `NC_*`): 122 | 123 | ```bash 124 | zcat genome.ncbi.fa.gz | perl -pe 's/gi\|.*\|(N._.+?)\|.*/\1/g' > genome.fa 125 | ``` 126 | 127 | Convert GFF to GTF using `gffread` (part of Cufflinks suite): 128 | 129 | ```bash 130 | zcat genes.ncbi.gff3.gz | gffread - -T -o genes.ncbi.gff3.gtf 131 | ``` 132 | 133 | A few (24 out of over 850,000) of the GTF entries do not contain `gene_id` or `gene_name`. Remove those: 134 | 135 | ```bash 136 | cat genes.ncbi.gff3.gtf | grep "gene_name" > genes.gtf 137 | ``` 138 | 139 | This leaves us with a clean `genome.fa` and `genes.gtf` for [setting up a reference genome directory](https://github.com/igordot/reference-genomes). 140 | -------------------------------------------------------------------------------- /workflows/gatk-mouse-mm10.md: -------------------------------------------------------------------------------- 1 | # Creating GATK mm10 resource bundle 2 | 3 | 4 | The GATK resource bundle is a collection of standard files for working with human resequencing data. 5 | It contains known SNPs and indels to be used for BaseRecalibrator, RealignerTargetCreator, and IndelRealigner. 6 | This is an attempt to recreate a similar bundle for the mouse genome (UCSC build mm10). 7 | 8 | For mouse SNPs, it's possible to use the dbSNP database, which should be comparable to the human version. 
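For context, the known-sites files produced below (`dbsnp.vcf` and `mgp.v5.indels.pass.chr.sort.vcf`) are the ones that would be passed to the GATK tools mentioned above. A minimal GATK3 BaseRecalibrator sketch (`sample.bam` and the output table name are placeholders):

```bash
# base recalibration using the mm10 known sites built in this workflow
java -jar ${gatk_path}/GenomeAnalysisTK.jar -T BaseRecalibrator \
    -R genome.fa \
    -I sample.bam \
    -knownSites dbsnp.vcf \
    -knownSites mgp.v5.indels.pass.chr.sort.vcf \
    -o sample.recal.table
```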
9 | 10 | Download dbSNP GRCm38 VCF files (each chromosome is in a separate file): 11 | 12 | ```bash 13 | wget --recursive --no-parent --no-directories \ 14 | --accept vcf*vcf.gz \ 15 | ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/ 16 | ``` 17 | 18 | Delete the non-primary chromosomes if they are not included in the reference FASTA file. 19 | 20 | Add "chr" to each chromosome (convert from GRCm38 to mm10 format): 21 | 22 | ```bash 23 | for vcf in $(ls -1 vcf_chr_*.vcf.gz) ; do 24 | vcf_new=${vcf/.vcf.gz/.vcf} 25 | echo $vcf 26 | zcat $vcf | sed 's/^\([0-9XY]\)/chr\1/' > $vcf_new 27 | rm -fv $vcf 28 | done 29 | ``` 30 | 31 | Combine all dbSNP VCF files into one: 32 | 33 | ```bash 34 | # generate parameter string containing all VCF files 35 | vcf_file_string="" 36 | for vcf in $(ls -1 vcf_chr_*.vcf) ; do 37 | vcf_file_string="$vcf_file_string -V $vcf" 38 | done 39 | echo $vcf_file_string 40 | 41 | # concatenate VCF files 42 | java -Xms16G -Xmx16G -cp ${gatk_path}/GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \ 43 | -R genome.fa $vcf_file_string -out dbsnp.vcf 44 | ``` 45 | 46 | More recent dbSNP releases include a merged `00-All.vcf.gz` file in addition to the separate chromosome files. 47 | Although it will not need to be concatenated, it will likely still need to be adjusted for GATK compatibility. 48 | 49 | For mouse indels, the Sanger Mouse Genetics Programme (Sanger MGP) is probably the best resource. 50 | 51 | Download all MGP indels (5/2015 release): 52 | 53 | ```bash 54 | wget ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz \ 55 | -O mgp.v5.indels.vcf.gz 56 | ``` 57 | 58 | Filter for passing variants with chr added: 59 | 60 | ```bash 61 | # adjust header 62 | zcat mgp.v5.indels.vcf.gz | head -1000 | grep "^#" | cut -f 1-8 \ 63 | | grep -v "#contig" | grep -v "#source" \ 64 | > mgp.v5.indels.pass.chr.vcf 65 | # keep only passing and adjust chromosome name 66 | zcat mgp.v5.indels.vcf.gz | grep -v "^#" | cut -f 1-8 \ 67 | | grep -w "PASS" | sed 's/^\([0-9MXY]\)/chr\1/' \ 68 | >> mgp.v5.indels.pass.chr.vcf 69 | ``` 70 | 71 | Sort VCF (automatically generated index has to be deleted due to a known bug): 72 | 73 | ```bash 74 | java -Xms16G -Xmx16G -jar ${PICARD_ROOT}/picard.jar SortVcf VERBOSITY=WARNING \ 75 | SD=genome.dict \ 76 | I=mgp.v5.indels.pass.chr.vcf \ 77 | O=mgp.v5.indels.pass.chr.sort.vcf 78 | rm -fv mgp.v5.indels.pass.chr.sort.vcf.idx 79 | ``` 80 | 81 | Additional info: 82 | 83 | * [GATK Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle) 84 | * [What should I use as known variants/sites for running tool X?](http://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x) 85 | -------------------------------------------------------------------------------- /workflows/microarray.md: -------------------------------------------------------------------------------- 1 | # Microarray differential expression analysis 2 | 3 | Basic microarray differential expression analysis in R using `limma`. 
4 | 5 | ```r 6 | library(affy) 7 | library(limma) 8 | sample_info = read.AnnotatedDataFrame("samples.csv") 9 | eset = justRMA(celfile.path = "/path/to/cel-files", phenoData = sample_info) 10 | design = model.matrix(~ group, pData(eset)) 11 | fit = lmFit(eset, design) 12 | efit = eBayes(fit) 13 | topTable(efit, coef = 2) 14 | ``` 15 | -------------------------------------------------------------------------------- /workflows/nanopore-init.md: -------------------------------------------------------------------------------- 1 | # Processing of Oxford Nanopore Technologies (ONT) data 2 | 3 | 4 | ## Sequencing 5 | 6 | There are multiple sequencing protocols. The basic one is 1D, which is analogous to Illumina's single-read option where each DNA fragment is sequenced once. For higher accuracy, there was a 2D workflow where each fragment generated both a template and a complement read (separated by a hairpin). In May 2017, ONT replaced the 2D system (the subject of a legal dispute, since Pacific Biosciences patented the hairpin approach) with 1D^2, or "1D squared". 7 | 8 | During a standard MinION run with a single unbarcoded sample, the MinKNOW software writes a FAST5 file for each DNA molecule in a local directory. These FAST5 files contain aggregated signal measurements and may be basecalled. 9 | 10 | FAST5 files overview from [Simpson Lab blog](http://simpsonlab.github.io/2017/09/06/nanopolish-v0.8.0/): 11 | > Oxford Nanopore’s sequencers measure the disruption in electric current caused by single-stranded DNA moving through the nanopore. The device samples the current six thousand times per second and writes the samples to a FAST5 file. We refer to these measurements as “the raw signal” or “the raw samples”, or simply “the raw”. For the past three years nanopore basecallers have converted the raw samples into segments called “events”, with the boundaries between events roughly corresponding to movements of DNA through the pore. After the samples are segmented into events, the basecaller predicts which k-mer was in the pore when the samples for each event were taken. The basecalling results are stored in a new FAST5 file that has a table containing every event and its k-mer label. 12 | 13 | ## Basecalling 14 | 15 | MinKNOW can output basecalled and non-basecalled FAST5 files, but MinKNOW basecalled files may cause issues downstream. ONT also offers [Albacore](https://community.nanoporetech.com/protocols/albacore-offline-basecalli/v/abec_2003_v1_revx_29nov2016) for local offline basecalling.
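Since FAST5 is an HDF5 container, you can quickly check whether a run's files already contain basecalls before re-basecalling them (a sketch using the standard HDF5 command-line tools; `read.fast5` is a placeholder name, and the exact group names vary between software versions):

```bash
# basecalled FAST5 files gain an /Analyses/Basecall_* group
h5ls -r read.fast5 | grep -i basecall
```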
16 | 17 | Albacore needs Python 3.4+ and can be installed using `pip` (it's probably best to use `virtualenv`): 18 | ```bash 19 | 20 | # create a new environment in your virtualenv directory 21 | cd /virtualenv_path/ 22 | pyvenv albacore 23 | 24 | # activate virtualenv 25 | source /virtualenv_path/albacore/bin/activate 26 | 27 | # upgrade pip 28 | pip install --upgrade pip 29 | 30 | # install albacore from .whl (from ONT website) 31 | pip install /path/ont_albacore-x.x.x.whl 32 | 33 | # deactivate virtualenv 34 | deactivate 35 | ``` 36 | 37 | Perform basecalling: 38 | ```bash 39 | # it may be necessary to unset PYTHONPATH if it was set 40 | unset PYTHONPATH 41 | 42 | # activate virtualenv 43 | source /virtualenv_path/albacore/bin/activate 44 | 45 | # check available flowcells and kits (must be specified for basecalling) 46 | read_fast5_basecaller.py --list_workflows 47 | 48 | # run basecaller (use & to run the process in the background) 49 | read_fast5_basecaller.py \ 50 | --worker_threads 8 \ 51 | --recursive \ 52 | --save_path ./albacore-out \ 53 | --output_format fast5 \ 54 | --input ./fast5 \ 55 | --flowcell FLO-MIN000 \ 56 | --kit SQK-NSK000 \ 57 | & 58 | 59 | # check the number of processed reads 60 | cat ./albacore-out/pipeline.log | grep "Finished" | wc -l 61 | 62 | # deactivate virtualenv 63 | deactivate 64 | ``` 65 | 66 | ## Data extraction 67 | 68 | Basecalled FAST5 files can be converted to the FASTQ format, which is more compatible with various downstream analysis tools. 69 | 70 | [Poretools](http://poretools.readthedocs.io/) is a popular tool for extracting data and information from FAST5 files. 71 | 72 | Create a variable for the basecalled reads directory: 73 | ```bash 74 | fast5_dir="/albacore_path/workspace" 75 | ``` 76 | 77 | Calculate overall stats (number of reads, mean read length, etc.): 78 | ```bash 79 | poretools stats --type 2D $fast5_dir > reads.2D.stats.txt 80 | ``` 81 | 82 | Determine nucleotide composition: 83 | ```bash 84 | poretools nucdist $fast5_dir > reads.2D.nucdist.txt 85 | ``` 86 | 87 | Generate gzipped FASTQ file: 88 | ```bash 89 | poretools fastq --min-length 500 --type 2D $fast5_dir | gzip > reads.2D.fastq.gz 90 | ``` 91 | 92 | Generate FASTA file: 93 | ```bash 94 | poretools fasta --min-length 500 --type 2D $fast5_dir > reads.2D.fasta 95 | ``` 96 | 97 | Generate a histogram of read lengths: 98 | ```bash 99 | poretools hist --theme-bw --min-length 0 --max-length 20000 --num-bins 39 --saveas reads.2D.hist.png $fast5_dir 100 | ``` 101 | 102 | Determine read lengths: 103 | ```bash 104 | samtools faidx reads.2D.fasta 105 | cat reads.2D.fasta.fai | grep "_Basecall_2D_2d" | cut -f 1,2 > reads.2D.length.txt 106 | ``` 107 | 108 | ## Assembly 109 | 110 | [Canu](https://canu.readthedocs.io/) is a common de novo assembler for Nanopore long reads (a minimal example run is sketched at the end of this page). It performs error correction, but additional polishing is helpful. [Nanopolish](https://github.com/jts/nanopolish) can calculate an improved consensus sequence for a draft genome assembly. 111 | 112 | If you have both Illumina and Nanopore data, then [SPAdes](http://bioinf.spbau.ru/spades) is a good option for hybrid assembly. SPAdes will use Nanopore reads for gap closure and repeat resolution. 113 | 114 | ## Additional info 115 | 116 | PoreCamp is a training bootcamp based around Oxford Nanopore MinION sequencing that provides great [tutorials](https://porecamp.github.io/2017/) for basic processing of the data.
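As mentioned in the assembly section above, Canu is a common starting point. A minimal sketch of a Canu 1.x run on the 2D reads extracted earlier (`genomeSize` is a required estimate; the 4.6m value here is just an example for a bacterial-sized genome):

```bash
# assemble long reads; Canu performs its own correction and trimming first
canu -p asm -d canu-out \
    genomeSize=4.6m \
    -nanopore-raw reads.2D.fastq.gz
```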
117 | -------------------------------------------------------------------------------- /workflows/ref-genome-gfp.md: -------------------------------------------------------------------------------- 1 | # Creating a reference genome with exogenous sequences such as GFP 2 | 3 | 4 | A standard reference genome is sufficient for most sequencing-based studies, but the experiment may be more complicated. 5 | Studies may involve knock-in or transgenic organisms where the genome sequence is altered. 6 | Since a FASTA file can contain multiple sequences, it's trivial to create a combined one if the exact position in the genome where the foreign sequence is introduced is not relevant. 7 | However, for RNA-seq, you additionally need to modify gene annotations, which is more involved. 8 | 9 | Green fluorescent protein (GFP) is a frequently introduced sequence. 10 | Searching for the exact GFP sequence yields many variants, since there are multiple source species of wild-type GFP and 11 | various engineered derivatives, such as yellow fluorescent protein (YFP) or TurboGFP. 12 | This example uses the "enhanced" or "eukaryotic" GFP (EGFP), commonly used in the mammalian expression vectors. 13 | 14 | This is the sequence that was used to create a GFP FASTA file `genome.EGFP.fa`: 15 | ``` 16 | >EGFP 17 | ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAA 18 | GTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGC 19 | TGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAG 20 | CAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTA 21 | CAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGG 22 | ACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAAC 23 | GGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACAC 24 | CCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACG 25 | AGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA 26 | ``` 27 | 28 | To create the corresponding GTF, it's necessary to find the sequence length: 29 | ```bash 30 | cat genome.EGFP.fa | grep -v "^>" | tr -d "\n" | wc -c 31 | ``` 32 | 33 | The length is 720. This was used to manually create `genes.EGFP.gtf`: 34 | ``` 35 | EGFP unknown gene 1 720 . + . gene_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 36 | EGFP unknown transcript 1 720 . + . gene_id "EGFP"; transcript_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 37 | EGFP unknown exon 1 720 . + . gene_id "EGFP"; transcript_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 38 | ``` 39 | 40 | Make sure that the sequence name (column 1) in the GTF matches the one in the FASTA file. 41 | 42 | Different tools expect different content from a GTF file. 43 | Using `gene`, `transcript`, and `exon` features seems to be sufficient. 44 | The `gene_biotype` attribute was only added to help with filtering. 45 | 46 | Finally, merge the standard genome FASTA and GTF files with the GFP ones: 47 | ```bash 48 | cat genome.mm10.fa genome.EGFP.fa > genome.fa 49 | cat genes.mm10.gtf genes.EGFP.gtf > genes.gtf 50 | ``` 51 | 52 | This produces the FASTA and GTF files for the combined reference genome that should be compatible with most GTF-based tools, such as STAR or Cell Ranger. 
53 | Pseudoaligners like Kallisto or Salmon build an index from a FASTA-formatted file of target sequences, so you can simply append `genome.EGFP.fa` to the cDNA FASTA file. 54 | -------------------------------------------------------------------------------- /workflows/ref-genome-init.md: -------------------------------------------------------------------------------- 1 | # Initializing reference genome directory 2 | 3 | 4 | The workflow is now in a [dedicated reference genomes repo](https://github.com/igordot/reference-genomes). 5 | -------------------------------------------------------------------------------- /workflows/rna-seq-diff-exp.md: -------------------------------------------------------------------------------- 1 | # RNA-seq differential expression analysis 2 | 3 | Basic RNA-seq differential expression analysis in R. 4 | 5 | ## DESeq2 6 | 7 | Load library: 8 | ```r 9 | library(DESeq2) 10 | ``` 11 | 12 | Import data: 13 | ```r 14 | # import raw counts matrix 15 | dds = DESeqDataSetFromMatrix(counts, coldata, ~ group) 16 | # import SummarizedExperiment 17 | dds = DESeqDataSet(se, ~ group) 18 | # import tximport 19 | dds = DESeqDataSetFromTximport(txi, coldata, ~ group) 20 | ``` 21 | 22 | Analysis: 23 | ```r 24 | dds = DESeq(dds) 25 | res = results(dds) 26 | ``` 27 | 28 | *** 29 | 30 | ## edgeR 31 | 32 | Load library: 33 | ```r 34 | library(edgeR) 35 | ``` 36 | 37 | Import data (assuming four RNA-seq libraries in two groups, with counts stored in a tab-delimited text file and gene symbols in a column named "gene"): 38 | ```r 39 | # import data 40 | counts = read.delim("counts.txt", row.names = "gene") 41 | group = factor(c(1,1,2,2)) 42 | # edgeR stores data in a list-based data object called a DGEList 43 | y = DGEList(counts = counts, group = group) 44 | y = calcNormFactors(y) 45 | design = model.matrix(~ group) 46 | y = estimateDisp(y, design) 47 | ``` 48 | 49 | Perform likelihood ratio tests: 50 | ```r 51 | fit = glmFit(y, design) 52 | lrt = glmLRT(fit) 53 | topTags(lrt) 54 | ``` 55 | 56 | *** 57 | 58 | ## limma-voom 59 | 60 | ```r 61 | library(limma) 62 | library(edgeR) # provides DGEList() and calcNormFactors() 63 | design = model.matrix(~ group) 64 | dgel = DGEList(counts) 65 | dgel = calcNormFactors(dgel) 66 | v = voom(dgel, design, plot = FALSE) 67 | fit = lmFit(v, design) 68 | fit = eBayes(fit) 69 | topTable(fit) 70 | ``` 71 | -------------------------------------------------------------------------------- /workflows/rrna-ref.md: -------------------------------------------------------------------------------- 1 | # Creating ribosomal RNA reference sequence 2 | 3 | 4 | RNA-seq libraries are typically prepared from total RNA using poly(A) enrichment of the mRNA to remove ribosomal RNAs, but this method fails to capture non-poly(A) transcripts or partially degraded mRNAs. 5 | As an alternative, there are total RNA-seq protocols that require a separate rRNA depletion step. 6 | To test the effectiveness of rRNA depletion, it's a good idea to check rRNA levels in the final RNA-seq library. 7 | 8 | Some rRNAs are annotated in the GTF file and show up along with other genes in the final output. 9 | However, rRNA abundance may be substantially underrepresented, since those sequences can fall in the repetitive regions of the genome and many tools filter out multi-mapping reads. 10 | Thus, it may be useful to create a separate set of rRNA sequences to align against.
11 | This would also be necessary for tools like [FastQ Screen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/), which check the composition of a library by screening it against a set of sequence databases. 12 | 13 | A good resource for rRNA sequences is [RNAcentral](https://rnacentral.org/), a database of non-coding RNA from multiple databases such as Rfam and RDP (Ribosomal Database Project). 14 | 15 | Download the RNAcentral FASTA file with species-specific IDs (release 12.0): 16 | 17 | ```bash 18 | wget ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/releases/12.0/sequences/rnacentral_species_specific_ids.fasta.gz 19 | ``` 20 | 21 | Convert multi-line sequences to single-line (using `fasta_formatter` from FASTX-Toolkit): 22 | 23 | ```bash 24 | gzip -cd rnacentral_species_specific_ids.fasta.gz \ 25 | | fasta_formatter -w 0 \ 26 | | gzip \ 27 | > rnacentral.nowrap.fasta.gz 28 | ``` 29 | 30 | Remove empty lines, replace spaces with underscores, and keep just ribosomal sequences: 31 | 32 | ```bash 33 | gzip -cd rnacentral.nowrap.fasta.gz \ 34 | | sed '/^$/d' \ 35 | | sed 's/\s/_/g' \ 36 | | grep -E -A 1 "ribosomal_RNA|rRNA" \ 37 | | grep -v "^--$" \ 38 | | gzip \ 39 | > rnacentral.ribosomal.nowrap.fasta.gz 40 | ``` 41 | 42 | Set a variable for the species of interest. For example: 43 | 44 | ```bash 45 | species="homo_sapiens" 46 | species="mus_musculus" 47 | species="drosophila_melanogaster" 48 | ``` 49 | 50 | Extract species-specific ribosomal sequences: 51 | 52 | ```bash 53 | zcat rnacentral.ribosomal.nowrap.fasta.gz \ 54 | | grep -A 1 -F -i "${species}" \ 55 | | grep -v "^--$" \ 56 | | fasta_formatter -w 80 \ 57 | > rRNA.${species}.fa 58 | ``` 59 | 60 | Index the species-specific FASTA file (not necessary, but will confirm that the FASTA file is valid): 61 | 62 | ```bash 63 | samtools faidx rRNA.${species}.fa 64 | ``` 65 | 66 | Check the number of sequences per species (there are currently around 6,000 for human and 600 for mouse in RNAcentral): 67 | 68 | ```bash 69 | wc -l *fai 70 | ``` 71 | 72 | Build the species-specific Bowtie2 index for tools like FastQ Screen: 73 | 74 | ```bash 75 | bowtie2-build rRNA.${species}.fa rRNA.${species} 76 | ```
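77 | 78 | The Bowtie2 index basename can then be referenced from the FastQ Screen configuration file, which takes one tab-delimited `DATABASE` entry (a label followed by the index path) per database to screen against. A minimal sketch, assuming a hypothetical index location: 79 | 80 | ``` 81 | DATABASE	rRNA_homo_sapiens	/path/to/rRNA.homo_sapiens 82 | ``` 83 | --------------------------------------------------------------------------------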