├── .gitignore ├── README.md ├── functions-r └── plotPCAWithSampleNames.R ├── notes ├── converting-files.md ├── files-chrom-sizes.md ├── files-fasta.md ├── files-fastq.md ├── files-gtf.md ├── files-vcf.md ├── heatmaps.md ├── pathways.md ├── r.md ├── rna-seq-strand.md ├── rna-seq.md └── single-cell-rna-seq.md ├── scripts-bigpurple ├── assembly-10x-supernova.sh ├── cnvs-wgs-freec.sh ├── jupyter.sh ├── scrna-10x-cellranger-aggr.sh ├── scrna-10x-cellranger-count-features.sh ├── scrna-10x-cellranger-count.sh └── scrna-10x-cellranger-multi.sh ├── scripts-phoenix ├── assembly-10x-supernova.sh ├── bcl2fastq-sample-sheet-fix.sh ├── join-all.sh ├── scrna-10x-cellranger-aggr.sh ├── scrna-10x-cellranger-count.sh └── wgs-10x-longranger.sh ├── scripts ├── cnv-freec-genome-plot.R ├── cnv-freec-heatmap.R ├── csv-clean.sh ├── fastq-merge.pl ├── fastq-quality-bars.sh ├── gtf-remove-overlapping.pl ├── hdcyto-1-import-fcs.qmd ├── hdcyto-2-prepare-sce.qmd ├── hdcyto-3-analyze-sce.qmd ├── meth-minfi.R ├── mut-mhc-binding.pl ├── scrna-10x-seurat-1.R ├── scrna-10x-seurat-2.R ├── scrna-10x-seurat-3.R ├── scrna-decontaminate-soupx.R ├── scrna-doublets-scdblfinder.R └── snvs-cnvs-mutect-strelka-freec-pyclone.R └── workflows ├── draft-genome-init.md ├── gatk-mouse-mm10.md ├── microarray.md ├── nanopore-init.md ├── ref-genome-gfp.md ├── ref-genome-init.md ├── rna-seq-diff-exp.md └── rrna-ref.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### macOS ### 3 | # General 4 | .DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | # Icon must end with two \r 8 | Icon 9 | # Thumbnails 10 | ._* 11 | # Files that might appear in the root of a volume 12 | .DocumentRevisions-V100 13 | .fseventsd 14 | .Spotlight-V100 15 | .TemporaryItems 16 | .Trashes 17 | .VolumeIcon.icns 18 | .com.apple.timemachine.donotpresent 19 | # Directories potentially created on remote AFP share 20 | .AppleDB 21 | .AppleDesktop 22 | Network Trash Folder 23 | Temporary Items 24 | .apdisk 25 | 26 | ### Windows ### 27 | # Windows thumbnail cache files 28 | Thumbs.db 29 | Thumbs.db:encryptable 30 | ehthumbs.db 31 | ehthumbs_vista.db 32 | # Dump file 33 | *.stackdump 34 | # Folder config file 35 | [Dd]esktop.ini 36 | # Recycle Bin used on file shares 37 | $RECYCLE.BIN/ 38 | # Windows Installer files 39 | *.cab 40 | *.msi 41 | *.msix 42 | *.msm 43 | *.msp 44 | # Windows shortcuts 45 | *.lnk 46 | 47 | ### Linux ### 48 | *~ 49 | # temporary files which can be created if a process still has a handle open of a deleted file 50 | .fuse_hidden* 51 | # KDE directory preferences 52 | .directory 53 | # Linux trash folder which might appear on any partition or disk 54 | .Trash-* 55 | # .nfs files are created when an open file is removed but is still being accessed 56 | .nfs* 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | computational genomics resources 2 | 3 | contents: 4 | 5 | * `notes`: notes and simple computational tasks 6 | * `workflows`: multi-step protocols 7 | * `scripts`: complete scripts for specific tasks 8 | * `scripts-bigpurple` and `scripts-phoenix`: scripts optimized for a specific cluster (include references to data and system variables) 9 | 10 | related repositories: 11 | 12 | * [sns/scripts](https://github.com/igordot/sns/blob/master/scripts): scripts from the `sns` pipeline 13 | * [tutorials](https://github.com/igordot/tutorials): tutorials 14 | 
-------------------------------------------------------------------------------- /functions-r/plotPCAWithSampleNames.R: -------------------------------------------------------------------------------- 1 | # Modified DESeq2 plotPCA function with sample names and proportion of variance added. 2 | # Sample names will be shown underneath each dot. 3 | # The axis will display proportion of variance for each principal component. 4 | # Tested using DESeq2 1.2.8, 1.6.2, and 1.8.1. 5 | # The native DESeq2 plotPCA function switched from lattice to ggplot2 in version 1.5.11. 6 | 7 | plotPCAWithSampleNames = function(x, intgroup="condition", ntop=500) 8 | { 9 | library(RColorBrewer) 10 | library(genefilter) 11 | library(lattice) 12 | 13 | # pca 14 | rv = rowVars(assay(x)) 15 | select = order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] 16 | pca = prcomp(t(assay(x)[select,])) 17 | 18 | # proportion of variance 19 | variance = pca$sdev^2 / sum(pca$sdev^2) 20 | variance = round(variance, 3) * 100 21 | 22 | # sample names 23 | names = colnames(x) 24 | 25 | # factor of groups 26 | fac = factor(apply(as.data.frame(colData(x)[, intgroup, drop=FALSE]), 1, paste, collapse=" : ")) 27 | 28 | # colors 29 | if( nlevels(fac) >= 10 ) 30 | colors = rainbow(nlevels(fac)) 31 | else if( nlevels(fac) >= 3 ) 32 | colors = brewer.pal(nlevels(fac), "Set1") 33 | else 34 | colors = c( "dodgerblue3", "firebrick3" ) 35 | 36 | # plot 37 | xyplot( 38 | PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=1.5, 39 | aspect = "fill", 40 | col = colors, 41 | xlab = list(paste("PC1 (", variance[1], "%)", sep=""), cex=0.8), 42 | ylab = list(paste("PC2 (", variance[2], "%)", sep=""), cex=0.8), 43 | panel = function(x, y, ...) { 44 | panel.xyplot(x, y, ...); 45 | ltext(x=x, y=y, labels=names, pos=1, offset=0.8, cex=0.7) 46 | }, 47 | main = draw.key( 48 | key = list( 49 | rect = list(col = colors), 50 | text = list(levels(fac)), 51 | rep = FALSE 52 | ) 53 | ) 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /notes/converting-files.md: -------------------------------------------------------------------------------- 1 | # Converting Files 2 | 3 | ## GFF to GTF 4 | Convert from GFF to GTF: 5 | ``` 6 | gffread in.gff -T -o out.gtf 7 | ``` 8 | gffread program is part of the Cufflinks package 9 | 10 | *** 11 | 12 | ## GTF to refFlat 13 | Convert gene annotations from GTF to genePred refFlat format: 14 | ``` 15 | gtfToGenePred -genePredExt -geneNameAsName2 genes.gtf refFlat.tmp.txt 16 | paste <(cut -f 12 refFlat.tmp.txt) <(cut -f 1-10 refFlat.tmp.txt) > refFlat.txt 17 | rm refFlat.tmp.txt 18 | gzip refFlat.txt 19 | ``` 20 | Tested with Ensembl GTF file and used for Picard CollectRnaSeqMetrics (must be gzipped). 
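For reference, a minimal sketch of how the gzipped refFlat might then be passed to Picard (file names here are placeholders):
```
java -jar picard.jar CollectRnaSeqMetrics \
  REF_FLAT=refFlat.txt.gz \
  STRAND_SPECIFICITY=NONE \
  INPUT=in.bam \
  OUTPUT=rna_metrics.txt
```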
21 | gtfToGenePred obtained from http://hgdownload.cse.ucsc.edu/admin/exe/ 22 | 23 | *** 24 | 25 | ## SRA to FASTQ 26 | Convert SRA (Sequence Read Archives) files to FASTQs: 27 | ``` 28 | /path/sratoolkit.2.3.4/bin/fastq-dump -v --split-files --gzip file.sra 29 | ``` 30 | SRA run ID to FASTQ (will download the SRA file and put it in a temp directory): 31 | ``` 32 | /path/sratoolkit.2.3.4/bin/fastq-dump -v --split-files --gzip SRR0000000 33 | ``` 34 | NCBI SRA Toolkit obtained from http://eutils.ncbi.nih.gov/Traces/sra/?view=software 35 | 36 | *** 37 | 38 | ## DOT to SVG/PNG 39 | Convert DOT database schema file to SVG or PNG: 40 | ``` 41 | dot -Tsvg file.dot > file.svg 42 | dot -Tpng file.dot > file.png 43 | ``` 44 | 45 | *** 46 | 47 | ## Between Various Image Formats 48 | ImageMagick: 49 | ``` 50 | convert in.png out.pdf 51 | ``` 52 | Ghostscript (seems to perform better): 53 | ``` 54 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -sOutputFile=out.pdf in.pdf 55 | ``` 56 | Both can do wildcard inputs to merge multiple files. 57 | -------------------------------------------------------------------------------- /notes/files-chrom-sizes.md: -------------------------------------------------------------------------------- 1 | # Working with chrom.sizes Files 2 | 3 | Generate a chrom.sizes file (from an indexed FASTA file): 4 | ``` 5 | cut -f 1,2 genome.fa.fai > chrom.sizes 6 | ``` 7 | 8 | Alternatives: 9 | * fetchChromSizes (http://hgdownload.soe.ucsc.edu/admin/exe/) 10 | -------------------------------------------------------------------------------- /notes/files-fasta.md: -------------------------------------------------------------------------------- 1 | # Working with FASTA Files 2 | 3 | 4 | Index FASTA: 5 | ```bash 6 | samtools faidx genome.fa 7 | ``` 8 | 9 | *** 10 | 11 | Remove empty records (description without sequence): 12 | ```bash 13 | awk '$2{print RS}$2' FS='\n' RS=\> ORS= in.fasta > out.fasta 14 | ``` 15 | 16 | *** 17 | 18 | Remove blank lines: 19 | ```bash 20 | sed -i '/^$/d' in.fasta 21 | ``` 22 | 23 | *** 24 | 25 | Remove problematic characters (they may cause issues with some tools): 26 | ```bash 27 | sed -i -e "s/[ ,\(\)\.\/\|:=]/_/g" in.fasta 28 | sed -i 's/___/__/g' in.fasta 29 | ``` 30 | 31 | *** 32 | 33 | Filter FASTA file by sequence length. 
34 | 35 | Using `awk`: 36 | ```bash 37 | # if applicable, convert multi-line FASTA to single-line FASTA 38 | # using awk: 39 | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' file.fa > file.nowrap.fa 40 | # using FASTX-Toolkit: 41 | fasta_formatter -w 0 -i file.fa -o file.nowrap.fa 42 | 43 | # filter by sequence size (1000 in this example) 44 | awk 'BEGIN {RS = ">" ; ORS = ""} length($2) >= 1000 {print ">"$0}' file.nowrap.fa > file.1000.fa 45 | ``` 46 | Using `faFilter`: 47 | ```bash 48 | faFilter -minSize=N -maxSize=N in.fa out.fa 49 | ``` 50 | `faFilter` obtained from http://hgdownload.soe.ucsc.edu/admin/exe/ 51 | -------------------------------------------------------------------------------- /notes/files-fastq.md: -------------------------------------------------------------------------------- 1 | # Working with FASTQ Files 2 | 3 | 4 | Subset a FASTQ (where NNN is the desired number of reads): 5 | ```bash 6 | seqtk sample -s 100 in.fastq.gz NNN | gzip > out.fastq.gz 7 | ``` 8 | Seqtk obtained from https://github.com/lh3/seqtk 9 | 10 | Alternatives: 11 | * fastq-tools fastq-sample (http://homes.cs.washington.edu/~dcjones/fastq-tools/fastq-sample.html) 12 | * sample_fastq.py (https://github.com/mojones/random_scripts/blob/master/sample_fastq.py) 13 | -------------------------------------------------------------------------------- /notes/files-gtf.md: -------------------------------------------------------------------------------- 1 | # Working with GTF Files 2 | 3 | 4 | Verify the GTF file format and confirm that the genes specified do not violate the rules of gene structure: 5 | ```bash 6 | validate_gtf.pl genes.gtf 7 | ``` 8 | `validate_gtf.pl` (by Evan Keibler) obtained from http://mblab.wustl.edu/software.html (there is also a different version included in the Eval package) 9 | 10 | *** 11 | 12 | Add gene names to GTF gene IDs to make them more readable (merge `gene_id` and `gene_name`): 13 | ```bash 14 | cat genes.gtf \ 15 | | grep "transcript_id" \ 16 | | perl -pe 's/(gene_id "(.+?)"; )(.*)(gene_name "(.+?)"; )/gene_id "\5:\2"; \3 \4/g' \ 17 | > genes.name-id.gtf 18 | ``` 19 | 20 | *** 21 | 22 | Some viral and bacterial GTF files only contain `CDS` features (column 3), but some tools require `exon` features. 23 | Convert `CDS` to `exon`, which would be equivalent for that purpose: 24 | ```bash 25 | cat in.gtf | perl -pe 's/\tCDS\t/\texon\t/g' > out.gtf 26 | ``` 27 | -------------------------------------------------------------------------------- /notes/files-vcf.md: -------------------------------------------------------------------------------- 1 | # Working with VCF Files 2 | 3 | Sort VCF file: 4 | ``` 5 | vcfsorter.pl genome.dict in.vcf > out.vcf 6 | ``` 7 | The output is compatible with Genome Analysis Toolkit (GATK). 
8 | vcfsorter.pl (by German Gaston Leparc) obtained from https://code.google.com/p/vcfsorter/ 9 | Alternative: Picard SortVcf 10 | -------------------------------------------------------------------------------- /notes/heatmaps.md: -------------------------------------------------------------------------------- 1 | # Heatmap Generation Tools 2 | 3 | 4 | ## R functions (static) 5 | 6 | * [heatmap (stats package)](https://www.rdocumentation.org/packages/stats/topics/heatmap) 7 | * [heatmap.2 (gplots package)](https://www.rdocumentation.org/packages/gplots/topics/heatmap.2) 8 | * [heatmap.3 (GMD package)](https://www.rdocumentation.org/packages/GMD/topics/heatmap.3) 9 | * [aheatmap (NMF package)](https://www.rdocumentation.org/packages/NMF/topics/aheatmap) 10 | * [pheatmap](https://github.com/raivokolde/pheatmap) 11 | * [ComplexHeatmap](https://github.com/jokergoo/ComplexHeatmap) 12 | * [heatmap3](https://www.rdocumentation.org/packages/heatmap3/topics/heatmap3) 13 | * [heatmap1 (NeatMap package)](https://www.rdocumentation.org/packages/NeatMap/topics/heatmap1) 14 | * [superheat](https://rlbarter.github.io/superheat) 15 | * [heatmap.bp (vcfR package)](https://www.rdocumentation.org/packages/vcfR/topics/heatmap.bp) 16 | * [geom_tile (ggplot2 package)](https://www.rdocumentation.org/packages/ggplot2/topics/geom_tile) 17 | * [gapmap](https://cran.r-project.org/package=gapmap) 18 | 19 | 20 | ## R functions (interactive) 21 | 22 | * [d3heatmap](https://github.com/rstudio/d3heatmap) (not actively maintained) 23 | * [heatmaply](https://github.com/talgalili/heatmaply) 24 | * [annHeatmap (Heatplus package)](https://www.rdocumentation.org/packages/Heatplus/topics/annHeatmap) 25 | * [iheatmapr](https://github.com/AliciaSchep/iheatmapr) 26 | * [iheatmap (qtlcharts package)](https://www.rdocumentation.org/packages/qtlcharts/topics/iheatmap) 27 | 28 | 29 | ## Microsoft Excel 30 | 31 | tutorial 1: http://policyviz.com/create-a-heatmap-in-excel/ 32 | 33 | ![image](https://cloud.githubusercontent.com/assets/6363505/20320165/e6508b20-ab3e-11e6-869a-f7652a1130b1.png) 34 | 35 | tutorial 2: http://peltiertech.com/heat-map-excel-conditional-formatting/ 36 | 37 | ![image](https://cloud.githubusercontent.com/assets/6363505/20320201/066ca0a6-ab3f-11e6-82be-85da3b87df7b.png) 38 | 39 | 40 | ## Matrix2png 41 | 42 | http://www.chibi.ubc.ca/matrix2png/ 43 | 44 | > This program converts tab-delmited matrix files into png images. It is implemented in ANSI C and utilized Tom Boutell's gd library (which in turn uses libpng and zlib). It is designed to be called from the command line or within a script. 45 | 46 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319394/deeac36c-ab3b-11e6-8aea-8a2b38f646ab.png) 47 | 48 | 49 | ## Heatmap from HIV sequence database 50 | 51 | http://www.hiv.lanl.gov/content/sequence/HEATMAP/heatmap.html 52 | 53 | > A heatmap is a graphical way of displaying a table of numbers by using colors to represent the numerical values. The clustering algorithm groups related rows and/or columns together by similarity. 
54 | 
55 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319475/43bde904-ab3c-11e6-92a0-dcf5cfd2f443.png) 
56 | 
57 | 
58 | ## jHeatmap 
59 | 
60 | http://jheatmap.github.io/jheatmap/ 
61 | 
62 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319614/d74cac50-ab3c-11e6-86ee-2596160fa02d.png) 
63 | 
64 | 
65 | ## MicroScope 
66 | 
67 | http://microscopebioinformatics.org/ 
68 | 
69 | > We propose a user-friendly ChIP-seq and RNA-seq software suite for the interactive visualization and analysis of genomic data, including integrated features to support differential expression analysis, interactive heatmap production, principal component analysis, gene ontology analysis, and dynamic network visualization. 
70 | 
71 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319787/73c6ff18-ab3d-11e6-9c00-f60b5a132f44.png) 
72 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319823/8de008d6-ab3d-11e6-88ab-52ba98293609.png) 
73 | 
74 | 
75 | ## HeatmapGenerator 
76 | 
77 | https://github.com/Bohdan-Khomtchouk/HeatmapGenerator 
78 | 
79 | > a graphical user interface software program written in C++, R, and OpenGL 
80 | 
81 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319693/1a5f8062-ab3d-11e6-8008-7949f81236b3.png) 
82 | 
83 | 
84 | ## GENE-E 
85 | 
86 | http://www.broadinstitute.org/cancer/software/GENE-E/ 
87 | 
88 | > GENE-E is a matrix visualization and analysis platform designed to support visual data exploration. It includes heat map, clustering, filtering, charting, marker selection, and many other tools. In addition to supporting generic matrices, GENE-E also contains tools that are designed specifically for genomics data. 
89 | 
90 | 
91 | ## Morpheus 
92 | 
93 | https://software.broadinstitute.org/morpheus/ 
94 | 
95 | > JavaScript matrix visualization and analysis 
96 | 
97 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319970/2dc5276e-ab3e-11e6-812a-84f2db7d57a6.png) 
98 | 
99 | 
100 | ## MeV: MultiExperiment Viewer 
101 | 
102 | http://www.tm4.org/mev.html 
103 | 
104 | 
105 | ## Web MEV 
106 | 
107 | http://mev.tm4.org/ 
108 | 
109 | > WebMeV (Multiple Experiment Viewer) is a cloud-based application supporting analysis, visualization, and stratification of large genomic data, particularly for RNASeq and microarray data. 
110 | 
111 | ![image](https://cloud.githubusercontent.com/assets/6363505/20319258/73762a5e-ab3b-11e6-829f-ad84cb26407d.png) 
112 | 
113 | 
114 | ## Next Generation Clustered Heat Map Tool (NG-CHM) 
115 | 
116 | http://bioinformatics.mdanderson.org/testchm/ 
117 | 
118 | > Next-Generation (Clustered) Heat Maps are interactive heat maps that enable the user to zoom and pan across the heatmap, alter its color scheme, generate production quality PDFs, and link out from rows, columns, and individual heatmap entries to related statistics, databases and other information. 
119 | 
120 | ![image](https://user-images.githubusercontent.com/6363505/31510128-cb6de678-af51-11e7-96ed-dd8d3db94316.png) 
121 | 
122 | 
123 | ## Clustergrammer 
124 | 
125 | http://amp.pharm.mssm.edu/clustergrammer/ 
126 | 
127 | > Clustergrammer is a web-based tool for visualizing and analyzing high-dimensional data as interactive and shareable hierarchically clustered heatmaps.
Clustergrammer enables intuitive exploration of high-dimensional data and has several optional biology-specific features. 128 | 129 | ![image](https://user-images.githubusercontent.com/6363505/31510322-592e408e-af52-11e7-857a-33e747b63e06.png) 130 | -------------------------------------------------------------------------------- /notes/r.md: -------------------------------------------------------------------------------- 1 | # R 2 | 3 | ## General R 4 | 5 | Vignette: 6 | ```r 7 | # show vignettes for a package 8 | browseVignettes(package = "package") 9 | # get vignette 10 | vignette("topic") 11 | ``` 12 | 13 | Get version of package: 14 | ```r 15 | packageVersion("package") 16 | ``` 17 | 18 | Check that all installed packages can be loaded: 19 | ```r 20 | for (p in rownames(installed.packages())) { 21 | message(p) 22 | suppressPackageStartupMessages(library(p, character.only = TRUE)) 23 | } 24 | ``` 25 | 26 | Working with methods: 27 | ```r 28 | # prints source code for method 29 | getMethod(method, "class") 30 | # find method 31 | selectMethod(method, "class") 32 | # show all methods for class 33 | showMethods(classes = "class") 34 | methods(class = "class") 35 | # method help for S3 objects 36 | ?"method.class" 37 | ``` 38 | 39 | *** 40 | 41 | ## GenomicRanges 42 | 43 | The GenomicRanges package defines general purpose containers for storing and manipulating genomic intervals and variables defined along a genome. 44 | 45 | Basics: 46 | ```r 47 | library(GenomicRanges) 48 | # create a new GRanges object with one range 49 | g = GRanges("chr1", IRanges(10001, 10100), strand = "+") 50 | # get basic info for ranges 51 | start(g) 52 | end(g) 53 | width(g) 54 | strand(g) 55 | # get metadata columns (additional optional information for ranges) 56 | mcols(g) 57 | # get IRanges 58 | ranges(g) 59 | # get chromosomes for each range 60 | seqnames(g) 61 | # get all chromosomes 62 | seqlevels(g) 63 | ``` 64 | 65 | Intra-range methods (modify each range independently): 66 | * `shift`: move the ranges by a specific number of base pairs 67 | * `resize`: resizes to width, keeping start for + and end for - 68 | * `narrow`: narrows by relative position within range 69 | * `flank`: returns flanking ranges upstream 70 | * `promoters`: similar to flank 71 | * `restrict`: restricts ranges to a start and end position 72 | * `trim`: trims out of bound ranges 73 | * `+/-`: add or subtract a fixed amount 74 | * `?"intra-range-methods"`: summarize all intra-range methods 75 | 76 | Inter-range methods (comparisons between ranges): 77 | * `reduce`: merge overlapping ranges to produce a simplified set 78 | * `gaps`: get gaps between the ranges 79 | * `disjoin`: break into discrete non-overlapping ranges based on original starts/ends 80 | * `?"inter-range-methods"`: summarize all inter-range methods 81 | 82 | Distance methods (compare each range in `x` to `subject`): 83 | * `nearest`: get an integer vector containing the index of the nearest neighbor range in subject 84 | * `precede`: get the index of the range in subject that is directly preceded by the range in x 85 | * `follow`: get the index of the range in subject that is directly followed by the range in x 86 | * `distanceToNearest`: get the distances to the nearest neighbor in subject (Hits object) 87 | * `distance`: get the distances to the nearest neighbor (integer vector) 88 | 89 | Overlaps: 90 | ```r 91 | # vector of which x ranges overlap y ranges 92 | x %over% y 93 | # overlaps Hits object 94 | o = findOverlaps(x, y) 95 | # relative to x 96 | queryHits(o) 97 
| # relative to y 98 | subjectHits(o) 99 | ``` 100 | 101 | *** 102 | 103 | Add chr to chromosome names RangedData data structure (from NCBI/Ensembl to UCSC style). 104 | ```r 105 | ann 106 | # RangedData with 38293 rows and 2 value columns across 51 spaces 107 | # space ranges | strand 108 | # | 109 | # ENSMUSG00000090025 1 [3054233, 3054733] | 1 110 | # ENSMUSG00000064842 1 [3102016, 3102125] | 1 111 | # ENSMUSG00000051951 1 [3205901, 3671498] | -1 112 | 113 | names(ann) = paste("chr", names(ann), sep="") 114 | 115 | ann 116 | # RangedData with 38293 rows and 2 value columns across 51 spaces 117 | # space ranges | strand 118 | # | 119 | # ENSMUSG00000090025 chr1 [3054233, 3054733] | 1 120 | # ENSMUSG00000064842 chr1 [3102016, 3102125] | 1 121 | # ENSMUSG00000051951 chr1 [3205901, 3671498] | -1 122 | ``` 123 | -------------------------------------------------------------------------------- /notes/rna-seq-strand.md: -------------------------------------------------------------------------------- 1 | # RNA-Seq Strand 2 | 3 | | | forward (transcript) | reverse (rev comp of transcript) | 4 | |:-------------------------------------|:-------------------------------------|:-------------------------------------| 5 | | TopHat/Cufflinks `--library-type` | `fr-secondstrand` | `fr-firststrand` | 6 | | STAR | 1st read strand | 2nd read strand | 7 | | Picard CollectRnaSeqMetrics `STRAND_SPECIFICITY` | `FIRST_READ_TRANSCRIPTION_STRAND` | `SECOND_READ_TRANSCRIPTION_STRAND` | 8 | | htseq-count `-s/--stranded` | `yes` | `reverse` | 9 | | subread featureCounts `-s` | `1` | `2` | 10 | | RSEM `--forward-prob` | `1` | `0` | 11 | | Salmon/Sailfish `--libType` | `SF`/`ISF` | `SR`/`ISR` | 12 | | HISAT2 `--rna-strandness` | `FR` (`F` for single-end) | `RF` (`R` for single-end) | 13 | | Library Kit | Illumina ScriptSeq | Illumina TruSeq Stranded Total RNA | 14 | 15 | *** 16 | 17 | Strand-specific protocols 18 | ([ref](http://onetipperday.sterding.com/2012/07/how-to-tell-which-library-type-to-use.html)): 19 | ![](https://3.bp.blogspot.com/-BkupUsIrnXk/UBbmmmx6T8I/AAAAAAAAAUU/_rcrd_ahT48/s1600/strand.png) 20 | 21 | *** 22 | 23 | Three widely used protocols for strand-specific RNA sequencing 24 | ([ref](http://www.nature.com/neuro/journal/v17/n11/full/nn.3814.html)): 25 | ![](https://images.nature.com/full/nature-assets/neuro/journal/v17/n11/images/nn.3814-F3.jpg) 26 | 27 | *** 28 | 29 | Illumina TruSeq Stranded Total RNA Kit 30 | ([ref](https://www.abmgood.com/marketing/knowledge_base/next_generation_sequencing_experimental_design.php)): 31 | ![](https://www.abmgood.com/marketing/knowledge_base/img/NGS/Next_Generation_Sequencing_NGS_TruSeq_Stranded_Total_RNA.png) 32 | -------------------------------------------------------------------------------- /notes/rna-seq.md: -------------------------------------------------------------------------------- 1 | # RNA-Seq Analysis 2 | 3 | 4 | Some basic RNA-seq analysis resources. 5 | 6 | ## Web Resources 7 | 8 | [RNA-seqlopedia](http://rnaseq.uoregon.edu/) 9 | > The RNA-seqlopedia provides an overview of RNA-seq and of the choices necessary to carry out a successful RNA-seq experiment. 10 | 11 | [Thinking About RNA Seq Experimental Design for Measuring Differential Gene Expression: The Basics](http://gkno2.tumblr.com/post/24629975632/thinking-about-rna-seq-experimental-design-for) 12 | > RNA-Seq is a powerful tool that can be used effectively by a diverse community of people with different backgrounds. 
We expect that some of those who could benefit from RNA Seq do not yet have the background in sequencing and statistics that is necessary to make effective use of this technology. Much of the existing literature may be time-consuming to read without this background. Therefore, we put together this primer with the intention of helping scientists and students understand the basic statistical principles associated with measuring gene expression using RNA Seq. 13 | 14 | [RNA Sequence Analysis Training/Courses/Papers](https://www.biostars.org/p/174376/) 15 | > I did a little bit of research and started to take free online courses and read papers. Could anybody recommend me more up-to-date online trainings or courses? Here are the resources that I found so far. 16 | 17 | ## Publications 18 | 19 | [RNA-Seq workflow: gene-level exploratory analysis and differential expression](http://f1000research.com/articles/4-1070/v1) (10/2015) 20 | > Here we walk through an end-to-end gene-level RNA-Seq differential expression workflow using Bioconductor packages. We will start from the FASTQ files, show how these were aligned to the reference genome, and prepare a count matrix which tallies the number of RNA-seq reads/fragments within each gene for each sample. We will perform exploratory data analysis (EDA) for quality assessment and to explore the relationship between samples, perform differential gene expression analysis, and visually explore the results. 21 | 22 | [A survey of best practices for RNA-seq data analysis](http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0881-8) (1/2016) 23 | > We review all of the major steps in RNA-seq data analysis, including experimental design, quality control, read alignment, quantification of gene and transcript levels, visualization, differential gene expression, alternative splicing, functional analysis, gene fusion detection and eQTL mapping. We highlight the challenges associated with each step. We discuss the analysis of small RNAs and the integration of RNA-seq with other functional genomics techniques. Finally, we discuss the outlook for novel technologies that are changing the state of the art in transcriptomics. 24 | 25 | [How many biological replicates are needed in an RNA-seq experiment and which differential expression tool should you use?](http://rnajournal.cshlp.org/content/22/6/839.long) (6/2016) 26 | > For future RNA-seq experiments, these results suggest that at least six biological replicates should be used, rising to at least 12 when it is important to identify SDE genes for all fold changes. If fewer than 12 replicates are used, a superior combination of true positive and false positive performances makes edgeR and DESeq2 the leading tools. For higher replicate numbers, minimizing false positives is more important and DESeq marginally outperforms the other tools. 
27 | -------------------------------------------------------------------------------- /notes/single-cell-rna-seq.md: -------------------------------------------------------------------------------- 1 | # Single-Cell RNA-Seq 2 | 3 | |tool |comment | 4 | |:----------|:----------| 5 | |[Monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html)|| 6 | |scLVM|| 7 | |SCDE|| 8 | |[scDD](https://github.com/kdkorthauer/scDD)|| 9 | |[MAST](https://github.com/RGLab/MAST)|| 10 | |[Sincera](https://research.cchmc.org/pbge/sincera.html)|| 11 | |[sincell](http://bioconductor.org/packages/devel/bioc/html/sincell.html)|| 12 | |[cellTree](http://bioconductor.org/packages/devel/bioc/html/cellTree.html)|| 13 | |[RaceID](https://github.com/dgrun/RaceID)|| 14 | |[SIMLR](https://github.com/BatzoglouLabSU/SIMLR)|| 15 | |[scater](https://github.com/davismcc/scater)|| 16 | 17 | This weak attempt to compile a collection of scRNA-seq tools has been rendered useless by [scRNA-tools](https://www.scrna-tools.org/) and [awesome-single-cell](https://github.com/seandavi/awesome-single-cell). 18 | -------------------------------------------------------------------------------- /scripts-bigpurple/assembly-10x-supernova.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova. 6 | ## 7 | ## Usage: 8 | ## sbatch --job-name=supernova --nodes=1 --ntasks=1 --cpus-per-task=17 --mem-per-cpu=32G --time=15-00 \ 9 | ## --partition=fn_long --mail-user=${USER}@nyulangone.org --mail-type=END,FAIL,REQUEUE --export=NONE \ 10 | ## --wrap="bash /gpfs/data/igorlab/public/genomics/scripts-bigpurple/assembly-10x-supernova.sh fastq_dir [max_reads]" 11 | ## 12 | ## optimal number of reads: 56x raw coverage (Supernova will estimate the genome size) 13 | ## number of reads calculator: (genome size) x 56 / 150, assuming reads are 150bp (default is 1,200M for human genome) 14 | ## https://support.10xgenomics.com/de-novo-assembly/guidance/doc/achieving-success-with-de-novo-assembly 15 | ## 16 | 17 | 18 | ######################### 19 | 20 | 21 | # system-specific settings 22 | 23 | # supernova directory 24 | supernova_version="2.1.1" 25 | supernova_dir="/gpfs/data/igorlab/software/supernova/supernova-${supernova_version}" 26 | 27 | # prepare environment 28 | module purge 29 | module add default-environment 30 | 31 | 32 | ######################### 33 | 34 | 35 | # check for correct number of arguments 36 | if [ $# -lt 1 ] ; then 37 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 38 | echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2 39 | exit 1 40 | fi 41 | 42 | # arguments 43 | fastq_dir=$(readlink -f "$1") 44 | max_reads="$2" 45 | 46 | # check that input exists 47 | if [ ! 
-d "$fastq_dir" ] ; then 48 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 49 | exit 1 50 | fi 51 | 52 | 53 | ######################### 54 | 55 | 56 | # step 1: assembly (supernova run) 57 | 58 | # million reads cutoff (default is 1200M) 59 | # set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads 60 | # coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset 61 | # default value is 1.2B, which only makes sense for ~3.2 Gb genomes 62 | if [ -n "$max_reads" ] ; then 63 | max_reads_m="$max_reads" 64 | else 65 | max_reads_m="1200" 66 | fi 67 | 68 | # system load settings (reserve an extra thread for overhead) 69 | threads=$SLURM_CPUS_PER_TASK 70 | threads=$(( threads - 1 )) 71 | mem=$(echo "$threads * 32" | bc) 72 | 73 | # run name (used to name output folder) 74 | supernova_version_nodot=$(echo "$supernova_version" | sed 's/\.//g') 75 | run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M" 76 | 77 | # display settingse 78 | echo 79 | echo " * fastq dir: $fastq_dir " 80 | echo " * supernova bin dir: $supernova_dir " 81 | echo " * reads cutoff (million): $max_reads_m " 82 | echo " * threads: $threads " 83 | echo " * mem: $mem " 84 | echo " * run name (output dir): $run_id " 85 | echo 86 | 87 | echo -e "\n assembly started: $(date) \n" >&2 88 | 89 | # supernova assembly command 90 | 91 | supernova_cmd=" 92 | ${supernova_dir}/supernova run \ 93 | --maxreads ${max_reads_m}000000 \ 94 | --localcores ${threads} \ 95 | --localmem ${mem} \ 96 | --id ${run_id} \ 97 | --fastqs ${fastq_dir} 98 | " 99 | echo -e "\n CMD: $supernova_cmd \n" 100 | $supernova_cmd 101 | 102 | echo -e "\n assembly ended: $(date) \n" >&2 103 | 104 | # check that output generated 105 | supernova_out_dir=$(readlink -f "$(pwd)/${run_id}") 106 | if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then 107 | echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2 108 | exit 1 109 | fi 110 | 111 | 112 | ######################### 113 | 114 | 115 | # step 2: generate fasta file (supernova mkoutput) 116 | 117 | # display settings 118 | echo 119 | echo " * assembly dir: ${supernova_out_dir}/outs/assembly " 120 | echo " * fasta prefix: ${supernova_out_dir}/assembly " 121 | echo 122 | 123 | # generate different style fasta files 124 | styles="raw megabubbles pseudohap pseudohap2" 125 | for s in $styles; do 126 | 127 | echo -e "\n generate fasta: style $s \n" >&2 128 | 129 | # supernova mkoutput command 130 | ${supernova_dir}/supernova mkoutput \ 131 | --asmdir "${supernova_out_dir}/outs/assembly" \ 132 | --outprefix "${supernova_out_dir}/assembly.${s}" \ 133 | --style "${s}" 134 | 135 | done 136 | 137 | # check that output generated 138 | styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2" 139 | for s in $styles_out; do 140 | 141 | # check that output generated 142 | if [ ! 
-e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then 143 | echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2 144 | exit 1 145 | fi 146 | 147 | done 148 | 149 | 150 | ######################### 151 | 152 | 153 | # cleanup 154 | 155 | # check file size before cleanup 156 | echo 157 | echo "file size before cleanup" 158 | du -sh "$run_id" 159 | echo 160 | 161 | # delete large assembly files (keep small ones just in case) 162 | rm -rf ${run_id}/outs/assembly/a* 163 | rm -rf ${run_id}/outs/assembly/closures* 164 | rm -rf ${run_id}/outs/assembly/data 165 | # delete temp files 166 | rm -rf ${run_id}/ASSEMBLER_CS 167 | 168 | # check file size after cleanup 169 | echo 170 | echo "file size after cleanup" 171 | du -sh "$run_id" 172 | echo 173 | 174 | 175 | ######################### 176 | 177 | 178 | 179 | # end 180 | -------------------------------------------------------------------------------- /scripts-bigpurple/cnvs-wgs-freec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## WGS copy number variant analysis using Control-FREEC (with optional matched normal). 6 | ## Based on SNS WES version. 7 | ## 8 | ## Usage: 9 | ## sbatch --job-name=cnvs-wgs-${sample} --nodes=1 --ntasks=1 --cpus-per-task=5 --mem=100G --time=5:00:00 \ 10 | ## --mail-user=${USER}@nyulangone.org --mail-type=FAIL,REQUEUE --export=NONE \ 11 | ## --wrap="bash ./cnvs-wgs-freec.sh project_dir genome_build sample_name bam [control_bam] [window_size]" 12 | ## 13 | 14 | 15 | ######################### 16 | 17 | 18 | # script filename 19 | script_path="${BASH_SOURCE[0]}" 20 | script_name=$(basename "$script_path") 21 | segment_name=${script_name/%.sh/} 22 | echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 23 | 24 | # check for correct number of arguments 25 | if [ $# -lt 4 ] ; then 26 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 27 | echo -e "\n USAGE: $script_name project_dir genome_build sample_name bam [control_bam] [window_size] \n" >&2 28 | exit 1 29 | fi 30 | 31 | # arguments 32 | proj_dir=$(readlink -f "$1") 33 | genome_build="$2" 34 | sample_t="$3" 35 | bam_t=$(readlink -f "$4") 36 | bam_n="$5" 37 | win_size="$6" 38 | 39 | 40 | ######################### 41 | 42 | 43 | # check if control sample and/or window size are specified 44 | 45 | if [ -n "$win_size" ] ; then 46 | # both control sample and window size are specified 47 | bam_n=$(readlink -f "$bam_n") 48 | win_size_label="paired-${win_size}" 49 | elif [ -n "$bam_n" ] ; then 50 | # either control sample or window size are specified 51 | if [ -e "$bam_n" ] ; then 52 | bam_n=$(readlink -f "$bam_n") 53 | win_size="" 54 | win_size_label="paired-auto" 55 | else 56 | win_size="$bam_n" 57 | bam_n="" 58 | win_size_label="$win_size" 59 | fi 60 | else 61 | # no control sample or window size are specified 62 | win_size_label="auto" 63 | fi 64 | 65 | # check that inputs exist 66 | 67 | if [ ! -d "$proj_dir" ] ; then 68 | echo -e "\n $script_name ERROR: PROJ DIR $proj_dir DOES NOT EXIST \n" >&2 69 | exit 1 70 | fi 71 | 72 | if [ ! -s "$bam_t" ] ; then 73 | echo -e "\n $script_name ERROR: BAM $bam_t DOES NOT EXIST \n" >&2 74 | exit 1 75 | fi 76 | 77 | if [ -n "$bam_n" ] && [ ! 
-s "$bam_n" ] ; then 78 | echo -e "\n $script_name ERROR: CONTROL BAM $bam_n DOES NOT EXIST \n" >&2 79 | exit 1 80 | fi 81 | 82 | 83 | ######################### 84 | 85 | 86 | # settings and files 87 | 88 | sample="${sample_t}" 89 | 90 | cnv_freec_dir="${proj_dir}/CNV-FREEC-${win_size_label}" 91 | mkdir -p "$cnv_freec_dir" 92 | 93 | # need a separate directory for each sample since some auto-generated files have identical filenames 94 | sample_freec_logs_base_dir="${proj_dir}/logs-${segment_name}-${win_size_label}" 95 | sample_freec_logs_dir="${sample_freec_logs_base_dir}/${sample_t}" 96 | mkdir -p "$sample_freec_logs_dir" 97 | 98 | summary_csv="${sample_freec_logs_base_dir}/${sample}.${segment_name}.csv" 99 | 100 | config_txt="${sample_freec_logs_dir}/config.txt" 101 | 102 | out_base_sample="${sample_freec_logs_dir}/$(basename $bam_t)" 103 | # out_base_control="${sample_freec_logs_dir}/$(basename $bam_n)" 104 | fixed_base="${cnv_freec_dir}/${sample_t}" 105 | 106 | cpn_sample="${out_base_sample}_sample.cpn" 107 | cpn_control="${out_base_control}_control.cpn" 108 | 109 | cnvs_original="${out_base_sample}_CNVs" 110 | ratio_original="${out_base_sample}_ratio.txt" 111 | info_original="${out_base_sample}_info.txt" 112 | 113 | # repeated later again based on resolution 114 | cnvs_fixed="${fixed_base}.CNVs.txt" 115 | ratio_fixed="${fixed_base}.ratio.txt" 116 | graph_fixed="${fixed_base}.png" 117 | 118 | # unload all loaded modulefiles 119 | module purge 120 | module add default-environment 121 | 122 | 123 | ######################### 124 | 125 | 126 | # genome-specific settings 127 | 128 | if [[ "$genome_build" == "hg19" ]] ; then 129 | chr_files_dir="/gpfs/data/igorlab/ref/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/" 130 | chr_len_file="/gpfs/data/igorlab/ref/hg19/genome.fa.fai" 131 | gem="/gpfs/data/igorlab/ref/hg19/FREEC/out100m2_hg19.gem" 132 | elif [[ "$genome_build" == "hg38" ]] ; then 133 | chr_files_dir="/gpfs/data/igorlab/ref/hg38/chromosomes/" 134 | chr_len_file="/gpfs/data/igorlab/ref/hg38/genome.fa.fai" 135 | gem="/gpfs/data/igorlab/ref/hg38/genome.len100.mm2.mappability" 136 | elif [[ "$genome_build" == "mm10" ]] ; then 137 | chr_files_dir="/gpfs/data/igorlab/ref/iGenomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/" 138 | chr_len_file="/gpfs/data/igorlab/ref/mm10/genome.fa.fai" 139 | gem="/gpfs/data/igorlab/ref/mm10/FREEC/out100m4_mm10.gem" 140 | else 141 | echo -e "\n $script_name ERROR: UNSUPPORTED GENOME \n" >&2 142 | exit 1 143 | fi 144 | 145 | if [ ! -s "$chr_len_file" ] ; then 146 | echo -e "\n $script_name ERROR: CHROM LENGTHS $chr_len_file DOES NOT EXIST \n" >&2 147 | exit 1 148 | fi 149 | 150 | if [ ! 
-s "$gem" ] ; then 151 | echo -e "\n $script_name ERROR: GEM $gem DOES NOT EXIST \n" >&2 152 | exit 1 153 | fi 154 | 155 | 156 | ######################### 157 | 158 | 159 | # skip if output exits already 160 | 161 | if [ -s "$cpn_sample" ] || [ -s "$cnvs_original" ] || [ -s "$cnvs_fixed" ] ; then 162 | echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 163 | exit 0 164 | fi 165 | 166 | 167 | ######################### 168 | 169 | 170 | # create config 171 | 172 | # either coefficientOfVariation or window must be specified for whole genome sequencing data 173 | if [ -n "$win_size" ] ; then 174 | win_size_config="window = $win_size" 175 | else 176 | win_size_config="coefficientOfVariation = 0.05" 177 | fi 178 | 179 | # config for control dataset 180 | 181 | if [ -n "$bam_n" ] ; then 182 | control_config=" 183 | [control] 184 | mateFile = $bam_n 185 | inputFormat = BAM 186 | mateOrientation = FR 187 | " 188 | else 189 | control_config="" 190 | fi 191 | 192 | config_contents=" 193 | 194 | # whole genome sequencing config 195 | # based on: https://github.com/BoevaLab/FREEC/blob/master/data/config_WGS.txt 196 | 197 | 198 | [general] 199 | 200 | # output directory 201 | outputDir = . 202 | 203 | # number of threads 204 | maxThreads = 4 205 | 206 | # path to sambamba (used only to read .BAM files) 207 | sambamba = /gpfs/data/igorlab/software/sambamba/sambamba-0.7.0 208 | 209 | # file with chromosome lengths (fa.fai accepted starting from v9.3) 210 | chrLenFile = $chr_len_file 211 | 212 | # path to the directory with chromosomes fasta files 213 | # necessary to calculate a GC-content profile if a control dataset and GC-content profile are not available 214 | chrFiles = $chr_files_dir 215 | 216 | # information about mappable positions (GEM output) 217 | gemMappabilityFile = $gem 218 | 219 | # use a mappability profile to correct read counts (provided with gemMappabilityFile) 220 | uniqueMatch = TRUE 221 | 222 | # genome ploidy 223 | # you can set different values and Control-FREEC will select the one that explains most observed CNAs 224 | ploidy = 2 225 | 226 | # sample sex 227 | # sex=XY will not annotate one copy of chr X and Y as a losssex=XY 228 | sex = XY 229 | 230 | # either coefficientOfVariation or window must be specified for whole genome sequencing data 231 | # for whole exome sequencing: window=0 232 | $win_size_config 233 | 234 | # set to 1 or 2 to correct the Read Count (RC) for GC-content bias and low mappability 235 | # Default (WGS): 0 236 | # Default (WES): 1 (≥ v9.5) and 0 (< v9.5) 237 | # forceGCcontentNormalization = 1 238 | 239 | # degree of polynomial 240 | # Default: 3&4 (GC-content based normalization, WGS) or 1 (control-read-count-based normalization, WES) 241 | # degree = 1 242 | 243 | # desired behavior in the ambiguous regions 244 | # 4: make a separate fragment of this unknown region and do not assign any copy number to this region at all 245 | # breakPointType = 4 246 | 247 | # positive value of threshold for segmentation of normalized profiles 248 | # Default: 0.8 (for WGS) 249 | breakPointThreshold = 0.8 250 | 251 | # threshold on the minimal number of reads per window in the control sample 252 | # recommended value >=50 for for exome data 253 | # readCountThreshold = 10 254 | 255 | # additional output in BedGraph format for the UCSC genome browser 256 | BedGraphOutput = TRUE 257 | 258 | 259 | [sample] 260 | 261 | # file with mapped reads (can be single end reads, mate-pairs or paired-end reads) 262 | mateFile = $bam_t 263 | 264 | # format of reads (in mateFile) 265 
| # SAM, BAM, pileup, bowtie, eland, arachne, psl (BLAT), BED, Eland 266 | inputFormat = BAM 267 | 268 | # format of reads (in mateFile) 269 | # 0 (for single ends), RF (Illumina mate-pairs), FR (Illumina paired-ends), FF (SOLiD mate-pairs) 270 | mateOrientation = FR 271 | 272 | $control_config 273 | 274 | " 275 | 276 | echo "$config_contents" > "$config_txt" 277 | 278 | 279 | ######################### 280 | 281 | 282 | # Control-FREEC 283 | 284 | # FREEC compiled with GCC 6.1.0 (same GCC must be in the environment when running) 285 | module add gcc/6.1.0 286 | # bedtools to create .pileup files for WES data 287 | module add bedtools/2.27.1 288 | # samtools to create .pileup files (for BAF) (even with sambamba enabled) 289 | module add samtools/1.3 290 | 291 | cd "$sample_freec_logs_dir" 292 | 293 | freec_dir="/gpfs/data/igorlab/software/FREEC/FREEC-11.6" 294 | freec_bin="${freec_dir}/src/freec" 295 | 296 | echo 297 | echo " * FREEC: $(readlink -f $freec_bin) " 298 | echo " * sample T : $sample_t " 299 | echo " * BAM T : $bam_t " 300 | echo " * BAM N : $bam_n " 301 | echo " * window size : $win_size " 302 | echo " * CNVs original: $cnvs_original " 303 | echo " * CNVs fixed: $cnvs_fixed " 304 | echo " * ratio original: $ratio_original " 305 | echo " * ratio fixed: $ratio_fixed " 306 | echo 307 | 308 | freec_cmd="$freec_bin -conf $config_txt" 309 | echo -e "\n CMD: $freec_cmd \n" 310 | ($freec_cmd) 311 | 312 | sleep 30 313 | 314 | 315 | ######################### 316 | 317 | 318 | # check that output generated 319 | 320 | if [ ! -s "$cnvs_original" ] ; then 321 | echo -e "\n $script_name ERROR: $cnvs_original NOT GENERATED \n" >&2 322 | exit 1 323 | fi 324 | 325 | if [ ! -s "$ratio_original" ] ; then 326 | echo -e "\n $script_name ERROR: $ratio_original NOT GENERATED \n" >&2 327 | exit 1 328 | fi 329 | 330 | 331 | ######################### 332 | 333 | 334 | # clean up 335 | 336 | # delete raw copy number profiles 337 | rm -v "$cpn_sample" 338 | 339 | if [ -s "$cpn_control" ] ; then 340 | rm -v "$cpn_control" 341 | fi 342 | 343 | 344 | ######################### 345 | 346 | 347 | # get resolution and add to output names 348 | 349 | # get resolution 350 | res=$(cat "$info_original" | grep "Window" | cut -f 2) 351 | 352 | # adjust output file names 353 | cnvs_fixed="${fixed_base}.${res}.CNVs.txt" 354 | ratio_fixed="${fixed_base}.${res}.ratio.txt" 355 | graph_fixed="${fixed_base}.${res}.png" 356 | 357 | echo 358 | echo " * res: $res " 359 | echo " * CNVs fixed: $cnvs_fixed " 360 | echo " * ratio fixed: $ratio_fixed " 361 | echo " * graph fixed: $graph_fixed " 362 | echo 363 | 364 | 365 | ######################### 366 | 367 | 368 | # post-processing 369 | 370 | module add r/3.6.1 371 | 372 | echo 373 | echo " * R: $(readlink -f $(which R)) " 374 | echo " * R version: $(R --version | head -1) " 375 | echo " * Rscript: $(readlink -f $(which Rscript)) " 376 | echo " * Rscript version: $(Rscript --version 2>&1) " 377 | echo 378 | 379 | # required libraries: rtracklayer 380 | 381 | # add p-value to the predicted CNVs 382 | # add Wilcoxon test and Kolmogorov-Smirnov test p-values to _CNVs file (also add headers to the columns) 383 | freec_asses_sig_cmd="cat ${freec_dir}/scripts/assess_significance.R | R --slave --args $cnvs_original $ratio_original" 384 | echo -e "\n CMD: $freec_asses_sig_cmd \n" 385 | eval "$freec_asses_sig_cmd" 386 | 387 | # add "chr" to CNV table chromosomes 388 | cat "${cnvs_original}.p.value.txt" | sed 's/^\([0-9XY]\)/chr\1/' | LC_ALL=C sort -k1,1 -k2,2n | uniq > "$cnvs_fixed" 389 | 
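# (note: the command above prepends "chr" to chromosome names that lack it, i.e. lines starting with 0-9/X/Y,
# then sorts and dedupes, so the CNV table matches the UCSC-style chromosome naming used elsewhere in this script)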
390 | # visualize normalized copy number profile with predicted CNAs as well as BAF profile by running makeGraph.R 
391 | freec_makegraph_cmd="cat ${freec_dir}/scripts/makeGraph.R | R --slave --args 2 $ratio_original" 
392 | echo -e "\n CMD: $freec_makegraph_cmd \n" 
393 | eval "$freec_makegraph_cmd" 
394 | 
395 | 
396 | ######################### 
397 | 
398 | 
399 | # fix some of the names 
400 | 
401 | mv -v "$ratio_original" "$ratio_fixed" 
402 | mv -v "${ratio_original}.png" "$graph_fixed" 
403 | 
404 | 
405 | ######################### 
406 | 
407 | 
408 | # check that output generated 
409 | 
410 | if [ ! -s "$cnvs_fixed" ] ; then 
411 | echo -e "\n $script_name ERROR: CNVs $cnvs_fixed NOT GENERATED \n" >&2 
412 | exit 1 
413 | fi 
414 | 
415 | 
416 | ######################### 
417 | 
418 | 
419 | # summary 
420 | 
421 | # ratios and predicted copy number alterations for each window 
422 | num_bins=$(cat "$ratio_fixed" | grep -v 'MedianRatio' | wc -l) 
423 | echo "num bins: $num_bins" 
424 | 
425 | # predicted copy number alterations 
426 | num_cnas=$(cat "$cnvs_fixed" | grep -v 'uncertainty' | wc -l) 
427 | echo "num CNAs: $num_cnas" 
428 | 
429 | # header for summary file 
430 | echo "#SAMPLE,res,bins,CNAs" > "$summary_csv" 
431 | 
432 | # summarize log file 
433 | echo "${sample},${res},${num_bins},${num_cnas}" >> "$summary_csv" 
434 | 
435 | sleep 30 
436 | 
437 | # combine all sample summaries 
438 | cat ${sample_freec_logs_base_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ 
439 | > "${proj_dir}/summary.${segment_name}-${win_size_label}.csv" 
440 | 
441 | 
442 | ######################### 
443 | 
444 | 
445 | # annotate 
446 | 
447 | annot_cmd="bash /gpfs/data/igorlab/public/sns/segments/annot-regions-annovar.sh $proj_dir $sample $cnvs_fixed" 
448 | echo -e "\n CMD: $annot_cmd \n" 
449 | ($annot_cmd) 
450 | 
451 | 
452 | ######################### 
453 | 
454 | 
455 | 
456 | # end 
457 | 
--------------------------------------------------------------------------------
/scripts-bigpurple/jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | 
3 | 
4 | ## 
5 | ## Use a BigPurple compute node to run a Jupyter notebook and access it from your local machine. 
6 | ## Can be executed through sbatch or directly. 
7 | ## Run this script on the cluster to start a Jupyter notebook. 
8 | ## 
9 | ## Usage (direct): 
10 | ## bash ./jupyter.sh 
11 | ## Usage (via sbatch): 
12 | ## sbatch --job-name=jupyter --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=8G --time=8:00:00 ./jupyter.sh 
13 | ## 
14 | ## Further instructions will be printed after executing the script (check the log file if executed via sbatch). 
15 | ## 
16 | 
17 | 
18 | ######################### 
19 | 
20 | 
21 | # check that Jupyter notebook is available 
22 | 
23 | if [ ! -x "$(command -v jupyter)" ] ; then 
24 | echo -e "\n ERROR: 'jupyter' is not available \n" >&2 
25 | exit 1 
26 | fi 
27 | 
28 | if [ ! -x "$(command -v jupyter-notebook)" ] ; then 
29 | echo -e "\n ERROR: 'jupyter-notebook' is not available \n" >&2 
30 | exit 1 
31 | fi 
32 | 
33 | 
34 | ######################### 
35 | 
36 | 
37 | # https://docs.ycrc.yale.edu/clusters-at-yale/guides/jupyter/ 
38 | # originally modified for use on BigPurple by Paul Glick 
39 | 
40 | # get tunneling info 
41 | XDG_RUNTIME_DIR="" 
42 | user=$(whoami) 
43 | node=$(hostname -s) 
44 | # port=$(shuf -i 8000-9999 -n 1) 
45 | # generate a unique port for each user 
46 | port=$(shuf -i 8000-9999 -n 1 --random-source <(echo "$user")) 
47 | 
48 | echo -e " 
49 | 
50 | Two additional steps should be performed on a local machine.
51 | 
52 | (1) Create an SSH tunnel in a new terminal on a local machine (there is no output): 
53 | ssh -N -L ${port}:${node}:${port} ${user}@bigpurple.nyumc.org 
54 | 
55 | (2) Access Jupyter through a web browser at: 
56 | http://127.0.0.1:${port} (complete URL with token string will be shown below) 
57 | 
58 | " 
59 | 
60 | # clean up the environment and load modules or conda environments (should be a parameter) 
61 | # module purge 
62 | # module add default-environment 
63 | 
64 | # classic Jupyter Notebook 
65 | # jupyter-notebook --no-browser --port=${port} --ip=${node} 
66 | 
67 | # JupyterLab 
68 | jupyter lab --no-browser --port=${port} --ip=${node} 
69 | 
70 | 
71 | ######################### 
72 | 
73 | 
74 | 
75 | # end 
76 | 
--------------------------------------------------------------------------------
/scripts-bigpurple/scrna-10x-cellranger-aggr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | 
3 | 
4 | ## 
5 | ## 10X Cell Ranger 
6 | ## cellranger aggr - aggregates count data from multiple runs of 'cellranger count' 
7 | ## 
8 | 
9 | 
10 | # script filename 
11 | script_name=$(basename "${BASH_SOURCE[0]}") 
12 | 
13 | # check for correct number of arguments 
14 | if [ $# -lt 1 ] || [ $# -gt 2 ] ; then 
15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 
16 | echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2 
17 | exit 1 
18 | fi 
19 | 
20 | # arguments 
21 | sample_sheet=$1 
22 | analysis_name=$2 
23 | 
24 | # settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant) 
25 | threads="4" 
26 | mem="32" 
27 | 
28 | # make the output group-writeable 
29 | umask 007 
30 | 
31 | # output (add analysis name if provided) 
32 | sample_name="aggregated" 
33 | if [ -n "$analysis_name" ] ; then 
34 | sample_name="${sample_name}-${analysis_name}" 
35 | fi 
36 | web_summary_html="${sample_name}/outs/web_summary.html" 
37 | 
38 | # check that input exists 
39 | if [ ! -s "$sample_sheet" ] ; then 
40 | echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2 
41 | exit 1 
42 | fi 
43 | 
44 | echo -e "\n $(date) \n" >&2 
45 | 
46 | # check if output already exists 
47 | if [ -s "$web_summary_html" ]; then 
48 | echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2 
49 | exit 1 
50 | fi 
51 | 
52 | module unload gcc 
53 | module load cellranger/3.0.0 
54 | module load dos2unix/7.4.0 
55 | 
56 | # clean up sample sheet 
57 | dos2unix -q "$sample_sheet" 
58 | sed -i 's/"//g' "$sample_sheet" 
59 | sed -i -e '$a\' "$sample_sheet" 
60 | 
61 | # display settings 
62 | echo " * cellranger: $(which cellranger) " 
63 | echo " * sample sheet: $sample_sheet " 
64 | 
65 | # cellranger aggr command 
66 | 
67 | # id A unique run id, used to name output folder [a-zA-Z0-9_-]+. 
68 | # csv Path of CSV file enumerating 'cellranger count' outputs. 
69 | 
70 | cellranger_cmd=" 
71 | cellranger aggr \ 
72 | --jobmode local \ 
73 | --localcores $threads \ 
74 | --localmem $mem \ 
75 | --id $sample_name \ 
76 | --csv $sample_sheet 
77 | " 
78 | echo -e "\n CMD: $cellranger_cmd \n" 
79 | $cellranger_cmd 
80 | 
81 | sleep 15 
82 | 
83 | # check that output html summary (and probably everything else) exists 
84 | if [ !
-s "$web_summary_html" ] ; then 85 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 86 | exit 1 87 | fi 88 | 89 | # copy html summary to top level for easy navigation 90 | rsync -t "$web_summary_html" "./${sample_name}.html" 91 | 92 | # clean up (temp files) 93 | rm -rf "${sample_name}/SC_RNA_COUNTER_CS" 94 | 95 | echo -e "\n $(date) \n" 96 | 97 | 98 | 99 | # end 100 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-count-features.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## processes Chromium single cell RNA-seq output with expression and antibody libraries 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ ! $# == 4 ] ; then 15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name genome_name sample_name libraries_csv features_csv \n" >&2 17 | exit 1 18 | fi 19 | 20 | # arguments 21 | genome_name=$1 22 | sample_name=$2 23 | sample_name_out="count-$sample_name" 24 | libraries_csv=$(readlink -f "$3") 25 | features_csv=$(readlink -f "$4") 26 | 27 | # settings 28 | # threads=$NSLOTS 29 | # threads=$SLURM_NTASKS 30 | threads=16 31 | # mem=$(echo "$threads * 8" | bc) 32 | mem=128 33 | 34 | # make the output group-writeable 35 | umask 007 36 | 37 | if [[ "$genome_name" == "hg19" ]] ; then 38 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-3.0.0" 39 | elif [[ "$genome_name" == "hg38" ]] ; then 40 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" 41 | elif [[ "$genome_name" == "GRCh38" ]] ; then 42 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" 43 | elif [[ "$genome_name" == "mm10" ]] ; then 44 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-mm10-3.0.0" 45 | elif [[ "$genome_name" == "hg19_and_mm10" ]] ; then 46 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-and-mm10-3.0.0" 47 | else 48 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/ref/${genome_name}/cellranger" 49 | #transcriptome_dir="/gpfs/home/id460/ref/${genome_name}/cellranger" 50 | fi 51 | 52 | # unload all loaded modulefiles 53 | module purge 54 | 55 | # check that input exists 56 | if [ ! -s "$libraries_csv" ] ; then 57 | echo -e "\n ERROR: libraries csv $libraries_csv does not exist \n" >&2 58 | exit 1 59 | fi 60 | 61 | if [ ! -s "$features_csv" ] ; then 62 | echo -e "\n ERROR: features csv $features_csv does not exist \n" >&2 63 | exit 1 64 | fi 65 | 66 | if [ ! 
-d "$transcriptome_dir" ] ; then 67 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 68 | exit 1 69 | fi 70 | 71 | module load cellranger/3.1.0 72 | 73 | # display settings 74 | echo " * cellranger: $(which cellranger) " 75 | echo " * threads: $threads " 76 | echo " * mem: $mem " 77 | echo " * transcriptome dir: $transcriptome_dir " 78 | echo " * libraries csv: $libraries_csv " 79 | echo " * features csv: $features_csv " 80 | echo " * sample: $sample_name " 81 | echo " * out dir: $sample_name_out " 82 | 83 | echo -e "\n $(date) \n" >&2 84 | 85 | # cellranger run command 86 | 87 | # id A unique run id, used to name output folder 88 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq (must not be passed) 89 | # sample Prefix of the filenames of FASTQs to select 90 | # transcriptome Path of folder containing 10X-compatible transcriptome 91 | # libraries Path to a file declaring FASTQ paths and library types of input libraries 92 | # feature-ref Path to a file declaring the Feature Barcoding reagents 93 | 94 | cellranger_cmd=" 95 | cellranger count \ 96 | --localmem $mem \ 97 | --localcores $threads \ 98 | --transcriptome $transcriptome_dir \ 99 | --libraries $libraries_csv \ 100 | --feature-ref $features_csv \ 101 | --id $sample_name_out \ 102 | " 103 | echo -e "\n CMD: $cellranger_cmd \n" 104 | $cellranger_cmd 105 | 106 | sleep 15 107 | 108 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 109 | 110 | # check that output html summary (and probably everything else) exists 111 | if [ ! -s "$web_summary_html" ] ; then 112 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 113 | exit 1 114 | fi 115 | 116 | # copy html summary to top level for easy navigation 117 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 118 | 119 | # delete temp files 120 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 121 | 122 | echo -e "\n $(date) \n" 123 | 124 | 125 | 126 | # end 127 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## Processes 10x Genomics Chromium single-cell RNA-seq FASTQs with Cell Ranger (cellranger count). 6 | ## Provide a FASTQ directory for classic expression libraries. 7 | ## Provide tables of libraries and features for expression and antibody/hashtag libraries. 
8 | ## 9 | ## Usage: 10 | ## sbatch --job-name=cellranger-${sample} --ntasks=1 --cpus-per-task=17 --mem=128G --time=10:00:00 \ 11 | ##   --mail-user=${USER}@nyumc.org --mail-type=FAIL,END --export=NONE \ 12 | ##   --wrap="bash ./scrna-10x-cellranger-count.sh module_version genome_name sample_name fastq_dir" 13 | ## 14 | 15 | 16 | # script filename 17 | script_name=$(basename "${BASH_SOURCE[0]}") 18 | 19 | # check for correct number of arguments 20 | if [ $# -lt 4 ] || [ $# -gt 5 ] ; then 21 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 22 | echo -e " Usage:" >&2 23 | echo -e "" >&2 24 | echo -e " RNA only: ./${script_name} module_version genome_name sample_name fastq_dir" >&2 25 | echo -e " RNA and ADT/HTO: ./${script_name} module_version genome_name sample_name libraries_csv features_csv" >&2 26 | echo -e "" >&2 27 | echo -e " RNA example: ./${script_name} 9.0.0 hg38 my_sample /gpfs/data/fastq" >&2 28 | echo -e "" >&2 29 | if [ $# -gt 0 ] ; then echo -e " Provided arguments: $* \n" >&2 ; fi 30 | exit 1 31 | fi 32 | 33 | # arguments 34 | module_version=$1 35 | genome_name=$2 36 | sample_name_fastq=$3 37 | sample_name_out="count-$sample_name_fastq" 38 | if [ $# -eq 4 ] ; then 39 | fastq_dir=$(readlink -f "$4") 40 | fi 41 | if [ $# -eq 5 ] ; then 42 | libraries_csv=$(readlink -f "$4") 43 | features_csv=$(readlink -f "$5") 44 | fi 45 | 46 | # settings (16 threads and 64G does not finish within the 10h time limit) 47 | threads=16 48 | mem=128 49 | 50 | # make the output group-writeable 51 | umask 007 52 | 53 | if [[ "$genome_name" == "hg19" ]] ; then 54 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-3.0.0" 55 | elif [[ "$genome_name" == "hg38" ]] ; then 56 | # transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2020-A" 57 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2024-A" 58 | elif [[ "$genome_name" == "GRCh38" ]] ; then 59 | # transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2020-A" 60 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-2024-A" 61 | elif [[ "$genome_name" == "mm10" ]] ; then 62 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-mm10-2020-A" 63 | elif [[ "$genome_name" == "mm39" ]] ; then 64 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCm39-2024-A" 65 | elif [[ "$genome_name" == "GRCm39" ]] ; then 66 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCm39-2024-A" 67 | elif [[ "$genome_name" == "GRCh38_and_mm10" ]] ; then 68 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38-and-mm10-2020-A" 69 | elif [[ "$genome_name" == "GRCh38_and_GRCm39" ]] ; then 70 | transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-gex-GRCh38_and_GRCm39-2024-A" 71 | else 72 | # additional custom genomes 73 | transcriptome_dir="/gpfs/data/igorlab/ref/${genome_name}/cellranger" 74 | fi 75 | 76 | # check that input exists 77 | 78 | if [ ! -d "$transcriptome_dir" ] ; then 79 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 80 | exit 1 81 | fi 82 | 83 | if [ -n "$fastq_dir" ] ; then 84 | # RNA-only command 85 | if [ ! -d "$fastq_dir" ] ; then 86 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 87 | exit 1 88 | fi 89 | else 90 | # RNA and ADT command 91 | if [ ! -s "$libraries_csv" ] ; then 92 | echo -e "\n ERROR: libraries csv $libraries_csv does not exist \n" >&2 93 | exit 1 94 | fi 95 | if [ !
-s "$features_csv" ] ; then 96 | echo -e "\n ERROR: features csv $features_csv does not exist \n" >&2 97 | exit 1 98 | fi 99 | fi 100 | 101 | # clean up the environment 102 | module purge 103 | module add default-environment 104 | module add cellranger/${module_version} 105 | 106 | # display settings 107 | echo " * cellranger: $(which cellranger) " 108 | echo " * threads: $threads " 109 | echo " * mem: $mem " 110 | echo " * transcriptome dir: $transcriptome_dir " 111 | echo " * out dir: $sample_name_out " 112 | if [ -n "$fastq_dir" ] ; then 113 | echo " * sample name: $sample_name_fastq " 114 | echo " * fastq dir: $fastq_dir " 115 | extra_args="--sample $sample_name_fastq --fastqs $fastq_dir" 116 | else 117 | echo " * libraries csv: $libraries_csv " 118 | echo " * features csv: $features_csv " 119 | extra_args="--libraries $libraries_csv --feature-ref $features_csv" 120 | fi 121 | 122 | echo -e "\n $(date) \n" >&2 123 | 124 | # cellranger count command 125 | 126 | # transcriptome Path of folder containing 10x-compatible transcriptome reference 127 | # id A unique run id and output folder name [a-zA-Z0-9_-]+ 128 | # sample Prefix of the filenames of FASTQs to select 129 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq 130 | # libraries CSV file declaring input library data sources 131 | # feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes 132 | 133 | cellranger_cmd=" 134 | cellranger count \ 135 | --create-bam true \ 136 | --localmem $mem \ 137 | --localcores $threads \ 138 | --transcriptome $transcriptome_dir \ 139 | --id $sample_name_out \ 140 | $extra_args \ 141 | --disable-ui \ 142 | " 143 | echo -e "\n CMD: $cellranger_cmd \n" 144 | eval "$cellranger_cmd" 145 | 146 | sleep 15 147 | 148 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 149 | 150 | # check that output html summary (and probably everything else) exists 151 | if [ ! -s "$web_summary_html" ] ; then 152 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 153 | exit 1 154 | fi 155 | 156 | # copy html summary to top level for easy navigation 157 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 158 | 159 | # delete temp files 160 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 161 | 162 | echo -e "\n $(date) \n" 163 | 164 | 165 | 166 | # end 167 | -------------------------------------------------------------------------------- /scripts-bigpurple/scrna-10x-cellranger-multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## Processes Chromium single-cell V(D)J and Gene Expression output (cellranger multi) 7 | ## Enables the analysis of multiple library types together (compared to using cellranger vdj and cellranger count separately) 8 | ## 9 | ## Usage: 10 | ## sbatch --job-name=cellranger-${sample} --ntasks=1 --cpus-per-task=17 --mem=128G --time=10:00:00 \ 11 | ## --mail-user=${USER}@nyumc.org --mail-type=FAIL,END --export=NONE \ 12 | ## --wrap="bash ./scrna-10x-cellranger-multi.sh module_version sample_name config_csv" 13 | ## 14 | 15 | 16 | # script filename 17 | script_name=$(basename "${BASH_SOURCE[0]}") 18 | 19 | # check for correct number of arguments 20 | if [ ! 
$# == 3 ] ; then 21 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 22 | echo -e "\n USAGE: $script_name module_version sample_name config_csv \n" >&2 23 | exit 1 24 | fi 25 | 26 | # arguments 27 | module_version=$1 28 | sample_name=$2 29 | config_csv=$(readlink -f "$3") 30 | 31 | # settings (16 threads and 64G does not finish within the 10h time limit) 32 | threads=16 33 | mem=128 34 | 35 | # make the output group-writeable 36 | umask 007 37 | 38 | # check that input exists 39 | if [ ! -s "$config_csv" ] ; then 40 | echo -e "\n ERROR: config $config_csv does not exist \n" >&2 41 | exit 1 42 | fi 43 | 44 | module purge 45 | module add default-environment 46 | module add cellranger/${module_version} 47 | 48 | # display settings 49 | echo " * cellranger: $(which cellranger) " 50 | echo " * threads: $threads " 51 | echo " * mem: $mem " 52 | echo " * out dir: $sample_name " 53 | 54 | echo -e "\n $(date) \n" >&2 55 | 56 | # cellranger multi command 57 | 58 | # id       A unique run id and output folder name [a-zA-Z0-9_-]+ 59 | # csv      Path of CSV file enumerating input libraries and analysis parameters 60 | # sample   Prefix of the filenames of FASTQs to select 61 | 62 | # The multi config CSV contains both the library definitions and experiment configuration variables. 63 | # It is composed of up to four sections: [gene-expression], [feature], [vdj], and [libraries]. 64 | # Template: https://support.10xgenomics.com/multi-config-template.csv 65 | 66 | cellranger_cmd=" 67 | cellranger multi \ 68 | --localmem $mem \ 69 | --localcores $threads \ 70 | --csv $config_csv \ 71 | --id $sample_name \ 72 | --disable-ui \ 73 | " 74 | echo -e "\n CMD: $cellranger_cmd \n" 75 | $cellranger_cmd 76 | 77 | sleep 15 78 | 79 | web_summary_html="./${sample_name}/outs/per_sample_outs/${sample_name}/web_summary.html" 80 | 81 | # check that output html summary (and probably everything else) exists 82 | if [ ! -s "$web_summary_html" ] ; then 83 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 84 | exit 1 85 | fi 86 | 87 | # copy html summary to top level for easy navigation 88 | rsync -tv "$web_summary_html" "./${sample_name}.html" 89 | 90 | # delete temp files 91 | rm -rf "./${sample_name}/SC_MULTI_CS" 92 | 93 | echo -e "\n $(date) \n" 94 | 95 | 96 | 97 | # end 98 | -------------------------------------------------------------------------------- /scripts-phoenix/assembly-10x-supernova.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova. 6 | ## 7 | ## Usage: 8 | ## qsub -N supernova -M ${USER}@nyumc.org -m ae -j y -cwd -pe threaded 16 -b y \ 9 | ##   -hard -l mem_free=512G -l mem_token=32G \ 10 | ##   bash /ifs/home/id460/public/genomics/scripts-phoenix/assembly-10x-supernova.sh fastq_dir [max_reads] 11 | ## 12 | 13 | 14 | ######################### 15 | 16 | 17 | # system-specific settings 18 | 19 | # supernova directory 20 | supernova_version="2.1.1" 21 | supernova_dir="/ifs/home/id460/software/supernova/supernova-${supernova_version}" 22 | 23 | 24 | ######################### 25 | 26 | 27 | # check for correct number of arguments 28 | if [ $# -lt 1 ] ; then 29 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 30 | echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2 31 | exit 1 32 | fi 33 | 34 | # arguments 35 | fastq_dir=$(readlink -f "$1") 36 | max_reads="$2" 37 | 38 | # check that input exists 39 | if [ !
-d "$fastq_dir" ] ; then 40 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 41 | exit 1 42 | fi 43 | 44 | 45 | ######################### 46 | 47 | 48 | # step 1: assembly (supernova run) 49 | 50 | # million reads cutoff (default is 1200M) 51 | # set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads 52 | # coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset 53 | # default value is 1.2B, which only makes sense for ~3.2 Gb genomes 54 | if [ -n "$max_reads" ] ; then 55 | max_reads_m="$max_reads" 56 | else 57 | max_reads_m="1200" 58 | fi 59 | 60 | # system load settings (leave extra room for memory) 61 | threads=$NSLOTS 62 | mem=$(echo "$threads * 30" | bc) 63 | 64 | # run name (used to name output folder) 65 | supernova_version_nodot=$(echo "$supernova_version" | sed 's/\.//g') 66 | run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M" 67 | 68 | # display settings 69 | echo 70 | echo " * fastq dir: $fastq_dir " 71 | echo " * supernova bin dir: $supernova_dir " 72 | echo " * reads cutoff (million): $max_reads_m " 73 | echo " * threads: $threads " 74 | echo " * mem: $mem " 75 | echo " * run name (output dir): $run_id " 76 | echo 77 | 78 | echo -e "\n assembly started: $(date) \n" >&2 79 | 80 | # supernova assembly command 81 | 82 | supernova_cmd=" 83 | ${supernova_dir}/supernova run \ 84 | --maxreads ${max_reads_m}000000 \ 85 | --localcores ${threads} \ 86 | --localmem ${mem} \ 87 | --id ${run_id} \ 88 | --fastqs ${fastq_dir} 89 | " 90 | echo -e "\n CMD: $supernova_cmd \n" 91 | $supernova_cmd 92 | 93 | echo -e "\n assembly ended: $(date) \n" >&2 94 | 95 | # check that output generated 96 | supernova_out_dir=$(readlink -f "$(pwd)/${run_id}") 97 | if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then 98 | echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2 99 | exit 1 100 | fi 101 | 102 | 103 | ######################### 104 | 105 | 106 | # step 2: generate fasta file (supernova mkoutput) 107 | 108 | # display settings 109 | echo 110 | echo " * assembly dir: ${supernova_out_dir}/outs/assembly " 111 | echo " * fasta prefix: ${supernova_out_dir}/assembly " 112 | echo 113 | 114 | # generate different style fasta files 115 | styles="raw megabubbles pseudohap pseudohap2" 116 | for s in $styles; do 117 | 118 | echo -e "\n generate fasta: style $s \n" >&2 119 | 120 | # supernova mkoutput command 121 | ${supernova_dir}/supernova mkoutput \ 122 | --asmdir "${supernova_out_dir}/outs/assembly" \ 123 | --outprefix "${supernova_out_dir}/assembly.${s}" \ 124 | --style "${s}" 125 | 126 | done 127 | 128 | # check that output generated 129 | styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2" 130 | for s in $styles_out; do 131 | 132 | # check that output generated 133 | if [ !
-e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then 134 | echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2 135 | exit 1 136 | fi 137 | 138 | done 139 | 140 | 141 | ######################### 142 | 143 | 144 | # cleanup 145 | 146 | # check file size before cleanup 147 | du -sh "$run_id" 148 | 149 | # delete large assembly files (keep small ones just in case) 150 | rm -rf ${run_id}/outs/assembly/a* 151 | rm -rf ${run_id}/outs/assembly/closures* 152 | rm -rf ${run_id}/outs/assembly/data 153 | # delete temp files 154 | rm -rf ${run_id}/ASSEMBLER_CS 155 | 156 | # check file size after cleanup 157 | du -sh "$run_id" 158 | 159 | 160 | ######################### 161 | 162 | 163 | 164 | # end 165 | -------------------------------------------------------------------------------- /scripts-phoenix/bcl2fastq-sample-sheet-fix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## fix bcl2fastq demultiplexing sample sheet to get rid of problematic characters 6 | ## 7 | 8 | 9 | # input 10 | proj=$1 11 | 12 | if [ -z "$1" ] 13 | then 14 | echo "ERROR! NO ARGUMENT SUPPLIED." 15 | exit 1 16 | fi 17 | 18 | basecalls_dir="/ifs/data/sequence/Illumina/production/${proj}/Data/Intensities/BaseCalls" 19 | ss=${basecalls_dir}/SampleSheet.csv 20 | 21 | printf "\n\n FIX SAMPLE SHEET $ss \n\n" 22 | 23 | # make the output group-writeable 24 | umask 007 25 | 26 | # fix and show sample sheet 27 | if [ -s $ss ] 28 | then 29 | 30 | # fix newlines 31 | dos2unix --quiet $ss 32 | mac2unix --quiet $ss 33 | 34 | # replace commas inside quoted fields with dashes 35 | awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $ss > ${ss}.tmp && mv ${ss}.tmp $ss 36 | # replace periods, parentheses, hashes, colons, slashes, quotes, and blanks in sample names with dashes 37 | awk -F ',' 'BEGIN { OFS="," } { gsub(/\.|\(|\)|\#|\:|\/|\047|[[:blank:]]/, "-", $3); print }' $ss > ${ss}.tmp && mv ${ss}.tmp $ss 38 | # collapse multiple consecutive dashes into one 39 | sed -i 's/--*/-/g' $ss 40 | # remove dashes at the beginning of the field 41 | sed -i 's/,-/,/g' $ss 42 | # remove dashes at the end of the field 43 | sed -i 's/-,/,/g' $ss 44 | # remove lines missing values 45 | sed -i '/^,,,,,/d' $ss 46 | # add newline to end of file if one does not exist (some scripts may complain) 47 | sed -i -e '$a\' $ss 48 | 49 | # check for extra columns in sample sheet 50 | max_comma_count=0 51 | while read i 52 | do 53 | comma_count=$(echo $i | tr -d -c "," | wc -c) 54 | if [ $comma_count -gt $max_comma_count ] 55 | then 56 | max_comma_count=$comma_count 57 | fi 58 | done < $ss 59 | 60 | # if too many commas, remove trailing commas 61 | if [ $max_comma_count -gt 9 ] 62 | then 63 | sed -i 's/,,*$//g' $ss 64 | fi 65 | 66 | # display sample sheet for easy review 67 | column -s "," -t $ss 68 | 69 | else 70 | 71 | printf "\n\n NO SAMPLE SHEET FOUND AT $ss \n\n" 72 | sleep 5 73 | exit 1 74 | 75 | fi 76 | 77 | 78 | 79 | # end 80 | -------------------------------------------------------------------------------- /scripts-phoenix/join-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## merge any number of tab or comma-separated files (coreutils join can only do 2 at a time) 6 | ## for tab field separator, use $'\t' 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ $# -lt 3 ] ; then 15 | echo -e "\n
$script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name field_separator missing_field_char in1.txt [in2.txt in3.txt ...] > merged.txt \n" >&2 17 | exit 1 18 | fi 19 | 20 | # load recent coreutils ("-o auto" support added in release 8.12) 21 | module load coreutils/8.24 22 | 23 | # arguments 24 | separator="$1" 25 | shift 26 | empty_char="$1" 27 | shift 28 | 29 | # check if at least the first file exists 30 | if [ ! -s "$1" ] ; then 31 | echo -e "\n $script_name ERROR: file $1 does not exist \n" >&2 32 | exit 1 33 | fi 34 | 35 | # recursive join function 36 | function rjoin { 37 | if [[ $# -gt 1 ]]; then 38 | LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") | rjoin "${@:2}" 39 | else 40 | LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") 41 | fi 42 | } 43 | 44 | rjoin "${@:2}" < "$1" 45 | 46 | 47 | 48 | # end 49 | -------------------------------------------------------------------------------- /scripts-phoenix/scrna-10x-cellranger-aggr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger 6 | ## cellranger aggr - aggregates count data from multiple runs of 'cellranger count' 7 | ## 8 | 9 | 10 | # script filename 11 | script_name=$(basename "${BASH_SOURCE[0]}") 12 | 13 | # check for correct number of arguments 14 | if [ $# -lt 1 ] || [ $# -gt 2 ] ; then 15 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 16 | echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2 17 | exit 1 18 | fi 19 | 20 | # arguments 21 | sample_sheet=$1 22 | analysis_name=$2 23 | 24 | # settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant) 25 | threads="4" 26 | mem="32" 27 | 28 | # output (add analysis name if provided) 29 | sample_name="aggregated" 30 | if [ -n "$analysis_name" ] ; then 31 | sample_name="${sample_name}-${analysis_name}" 32 | fi 33 | web_summary_html="${sample_name}/outs/web_summary.html" 34 | 35 | # check that input exists 36 | if [ ! -s "$sample_sheet" ] ; then 37 | echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2 38 | exit 1 39 | fi 40 | 41 | # delete empty .po files to keep directory clean 42 | rm -rf cellranger*.po* 43 | 44 | echo -e "\n $(date) \n" >&2 45 | 46 | # check if output already exists 47 | if [ -s "$web_summary_html" ]; then 48 | echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2 49 | exit 1 50 | fi 51 | 52 | # clean up sample sheet 53 | dos2unix -q "$sample_sheet" 54 | sed -i 's/"//g' "$sample_sheet" 55 | sed -i -e '$a\' "$sample_sheet" 56 | 57 | module unload gcc 58 | module load cellranger/2.1.0 59 | 60 | # display settings 61 | echo " * cellranger: $(which cellranger) " 62 | echo " * sample sheet: $sample_sheet " 63 | 64 | # cellranger aggr command 65 | 66 | # id    A unique run id, used to name output folder [a-zA-Z0-9_-]+. 67 | # csv   Path of CSV file enumerating 'cellranger count' outputs. 68 | 69 | cellranger_cmd=" 70 | cellranger aggr \ 71 | --jobmode local \ 72 | --localcores $threads \ 73 | --localmem $mem \ 74 | --id $sample_name \ 75 | --csv $sample_sheet 76 | " 77 | echo -e "\n CMD: $cellranger_cmd \n" 78 | $cellranger_cmd 79 | 80 | sleep 15 81 | 82 | # check that output html summary (and probably everything else) exists 83 | if [ !
-s "$web_summary_html" ] ; then 84 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 85 | exit 1 86 | fi 87 | 88 | # copy html summary to top level for easy navigation 89 | rsync -t "$web_summary_html" "./${sample_name}.html" 90 | 91 | # clean up (temp files) 92 | rm -rf "${sample_name}/SC_RNA_COUNTER_CS" 93 | 94 | echo -e "\n $(date) \n" 95 | 96 | 97 | 98 | # end 99 | -------------------------------------------------------------------------------- /scripts-phoenix/scrna-10x-cellranger-count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## 10X Cell Ranger - processes Chromium single cell RNA-seq output 6 | ## 7 | 8 | 9 | # script filename 10 | script_name=$(basename "${BASH_SOURCE[0]}") 11 | 12 | # check for correct number of arguments 13 | if [ ! $# == 3 ] ; then 14 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 15 | echo -e "\n USAGE: $script_name genome_name sample_name fastq_dir \n" >&2 16 | exit 1 17 | fi 18 | 19 | # arguments 20 | genome_name=$1 21 | sample_name_fastq=$2 22 | sample_name_out="count-$sample_name_fastq" 23 | fastq_dir=$(readlink -f "$3") 24 | 25 | # settings 26 | threads=$NSLOTS 27 | mem=$(echo "$threads * 8" | bc) 28 | transcriptome_dir="/ifs/data/cellranger-refdata/refdata-cellranger-${genome_name}-1.2.0" 29 | alt_transcriptome_dir="/ifs/home/id460/ref/${genome_name}/cellranger" 30 | 31 | # unload all loaded modulefiles 32 | module purge 33 | module load local 34 | 35 | # check that input exists 36 | if [ ! -d "$fastq_dir" ] ; then 37 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 38 | exit 1 39 | fi 40 | 41 | if [ ! -d "$transcriptome_dir" ] ; then 42 | echo -e "\n WARNING: genome dir $transcriptome_dir does not exist \n" >&2 43 | echo -e "\n setting genome dir to $alt_transcriptome_dir \n" >&2 44 | transcriptome_dir="${alt_transcriptome_dir}" 45 | fi 46 | 47 | if [ ! -d "$transcriptome_dir" ] ; then 48 | echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 49 | exit 1 50 | fi 51 | 52 | # delete empty .po files to keep directory clean 53 | rm -rf cellranger*.po* 54 | 55 | module load cellranger/2.1.0 56 | 57 | # display settings 58 | echo " * cellranger: $(which cellranger) " 59 | echo " * threads: $threads " 60 | echo " * mem: $mem " 61 | echo " * transcriptome dir: $transcriptome_dir " 62 | echo " * fastq dir: $fastq_dir " 63 | echo " * sample: $sample_name_fastq " 64 | echo " * out dir: $sample_name_out " 65 | 66 | echo -e "\n $(date) \n" >&2 67 | 68 | # cellranger run command 69 | 70 | # id A unique run id, used to name output folder 71 | # fastqs Path of folder created by 10x demultiplexing or bcl2fastq 72 | # sample Prefix of the filenames of FASTQs to select 73 | # transcriptome Path of folder containing 10X-compatible transcriptome 74 | 75 | cellranger_cmd=" 76 | cellranger count \ 77 | --localmem $mem \ 78 | --localcores $threads \ 79 | --transcriptome $transcriptome_dir \ 80 | --fastqs $fastq_dir \ 81 | --sample $sample_name_fastq \ 82 | --id $sample_name_out \ 83 | " 84 | echo -e "\n CMD: $cellranger_cmd \n" 85 | $cellranger_cmd 86 | 87 | sleep 15 88 | 89 | web_summary_html="./${sample_name_out}/outs/web_summary.html" 90 | 91 | # check that output html summary (and probably everything else) exists 92 | if [ ! 
-s "$web_summary_html" ] ; then 93 | echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 94 | exit 1 95 | fi 96 | 97 | # copy html summary to top level for easy navigation 98 | rsync -tv "$web_summary_html" "./${sample_name_out}.html" 99 | 100 | # delete temp files 101 | rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" 102 | 103 | # delete empty .po files to keep directory clean 104 | rm -rf cellranger*.po* 105 | 106 | echo -e "\n $(date) \n" 107 | 108 | 109 | 110 | # end 111 | -------------------------------------------------------------------------------- /scripts-phoenix/wgs-10x-longranger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## Whole Genome Phasing and SV Calling from 10x Genomics Chromium Linked-Reads using Long Ranger. 6 | ## 7 | ## Usage: 8 | ## qsub -N longranger -M ${USER}@nyumc.org -m ae -j y -cwd -pe threaded 16 -b y \ 9 | ## -hard -l mem_free=128G -l mem_token=8G \ 10 | ## bash /ifs/home/id460/public/genomics/scripts-phoenix/wgs-10x-longranger.sh 11 | ## 12 | 13 | 14 | ######################### 15 | 16 | 17 | # check for correct number of arguments 18 | if [ ! $# == 2 ] ; then 19 | echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 20 | echo -e "\n USAGE: bash wgs-10x-longranger.sh sample fastq_dir \n" >&2 21 | exit 1 22 | fi 23 | 24 | # arguments 25 | sample="$1" 26 | fastq_dir=$(readlink -f "$2") 27 | 28 | # check that input exists 29 | if [ ! -d "$fastq_dir" ] ; then 30 | echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 31 | exit 1 32 | fi 33 | 34 | 35 | ######################### 36 | 37 | 38 | # system-specific settings 39 | 40 | # Long Ranger directory 41 | longranger_version="2.2.2" 42 | longranger_dir="/ifs/home/id460/software/longranger/longranger-${longranger_version}" 43 | 44 | # Long Ranger reference directory 45 | longranger_ref_dir="/ifs/home/id460/ref/hg38/longranger-2.1.0" 46 | 47 | # GATK path (Long Ranger 2.2 compatible with versions 3.3-4.0, excluding 3.6) 48 | gatk_jar="/ifs/home/id460/software/GenomeAnalysisTK/gatk-4.0.4.0/gatk-package-4.0.4.0-local.jar" 49 | 50 | # unload all loaded modulefiles 51 | module purge 52 | module load local 53 | 54 | # load java (for GATK) 55 | module load java/1.8 56 | 57 | 58 | ######################### 59 | 60 | 61 | # system load settings 62 | threads=$NSLOTS 63 | mem=$(echo "$threads * 8" | bc) 64 | 65 | # output settings 66 | run_id="longranger-${sample}" 67 | 68 | # display settings 69 | echo 70 | echo " * sample: $sample " 71 | echo " * FASTQ dir: $fastq_dir " 72 | echo " * Long Ranger bin dir: $longranger_dir " 73 | echo " * GATK jar file: $gatk_jar " 74 | echo " * threads: $threads " 75 | echo " * mem: $mem " 76 | echo " * run output: $run_id " 77 | echo 78 | 79 | echo -e "\n analysis started: $(date) \n" >&2 80 | 81 | longranger_cmd=" 82 | ${longranger_dir}/longranger wgs \ 83 | --fastqs ${fastq_dir} \ 84 | --sample ${sample} \ 85 | --id ${run_id} \ 86 | --reference ${longranger_ref_dir} \ 87 | --vcmode=gatk:${gatk_jar} \ 88 | --localcores ${threads} \ 89 | --localmem ${mem} \ 90 | " 91 | echo -e "\n CMD: $longranger_cmd \n" 92 | $longranger_cmd 93 | 94 | longranger_out_dir=$(readlink -f "$(pwd)/${run_id}") 95 | 96 | echo -e "\n analysis ended: $(date) \n" >&2 97 | 98 | 99 | ######################### 100 | 101 | 102 | # check that output generated 103 | 104 | if [ ! 
-e "${longranger_out_dir}/outs/summary.csv" ] ; then 105 | echo -e "\n ERROR: output ${longranger_out_dir}/outs/summary.csv does not exist \n" >&2 106 | exit 1 107 | fi 108 | 109 | if [ ! -e "${longranger_out_dir}/outs/loupe.loupe" ] ; then 110 | echo -e "\n ERROR: output ${longranger_out_dir}/outs/loupe.loupe does not exist \n" >&2 111 | exit 1 112 | fi 113 | 114 | 115 | ######################### 116 | 117 | 118 | # cleanup 119 | 120 | # check file size before cleanup 121 | du -sh "$run_id" 122 | 123 | # delete temp files 124 | rm -rf "${run_id}/PHASER_SVCALLER_CS" 125 | 126 | # check file size after cleanup 127 | du -sh "$run_id" 128 | 129 | 130 | ######################### 131 | 132 | 133 | 134 | # end 135 | -------------------------------------------------------------------------------- /scripts/cnv-freec-genome-plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Plot genome-wide Control-FREEC copy number analysis results with proportional chromosomes in a single line. 7 | 8 | Usage: 9 | cnv-freec-genome-plot.R <genome> <sample_name> <ratio_txt> <out_png> 10 | 11 | Arguments: 12 | <genome>       genome build (UCSC-style such as "hg19" or "mm10") 13 | <sample_name>  sample name 14 | <ratio_txt>    Control-FREEC "_ratio.txt" file with ratios and predicted copy number alterations for each window 15 | <out_png>      output png image 16 | 17 | Options: 18 | -h, --help   show this screen 19 | ' -> doc 20 | 21 | 22 | # output width 23 | options(width = 120) 24 | # print warnings as they occur 25 | options(warn = 1) 26 | 27 | # retrieve the command-line arguments 28 | suppressPackageStartupMessages(library(docopt)) 29 | opts = docopt(doc) 30 | 31 | # relevant arguments 32 | genome = opts$genome 33 | sample_name = opts$sample_name 34 | ratio_txt = opts$ratio_txt 35 | ratio_png = opts$out_png 36 | 37 | # check that input file exists 38 | if (!file.exists(ratio_txt)) stop("file does not exist: ", ratio_txt) 39 | 40 | # load libraries 41 | suppressPackageStartupMessages({ 42 | library(dplyr) 43 | library(readr) 44 | library(karyoploteR) 45 | library(scales) 46 | }) 47 | 48 | # ploidy 49 | ploidy = 2 50 | # maximum copy number level to plot (to avoid very high values) 51 | max_cn = 6 52 | 53 | # import ratio table, remove uncertain regions, and cap copy numbers at defined maximum value 54 | ratio = 55 | read_tsv(ratio_txt, guess_max = 999999, show_col_types = FALSE, progress = FALSE) %>% 56 | dplyr::filter(CopyNumber >= 0) %>% 57 | dplyr::mutate(chr = Chromosome, start = Start, end = Start) %>% 58 | dplyr::mutate(Ratio = Ratio * ploidy) %>% 59 | dplyr::mutate(CopyNumber = ifelse(CopyNumber > max_cn, max_cn, CopyNumber)) %>% 60 | dplyr::mutate(Ratio = ifelse(Ratio > max_cn, max_cn, Ratio)) %>% 61 | dplyr::select(chr, start, end, Ratio, CopyNumber) 62 | 63 | # convert ratio table to GRanges 64 | ratio_gr = GRanges(ratio) 65 | seqlevelsStyle(ratio_gr) = "UCSC" 66 | 67 | # separate ratios based on amplifications/deletions 68 | ratio_filtered = ratio_gr[ratio_gr$Ratio > 0] 69 | ratio_norm = ratio_filtered[ratio_filtered$CopyNumber == ploidy] 70 | ratio_amp = ratio_filtered[ratio_filtered$CopyNumber > ploidy] 71 | ratio_del = ratio_filtered[ratio_filtered$CopyNumber < ploidy] 72 | 73 | # plot 74 | png(ratio_png, res = 300, width = 15, height = 3, units = "in") 75 | pp = getDefaultPlotParams(plot.type = 4) 76 | pp$data1inmargin = 0 77 | pp$bottommargin = 50 78 | pp$ideogramheight = 20 79 | kp = plotKaryotype(genome = genome, plot.type = 4, ideogram.plotter = NULL, labels.plotter = NULL, plot.params = pp, main = 
sample_name) 80 | kp = kpAxis(kp, ymin = 0, ymax = max_cn, tick.pos = 0:max_cn) 81 | kp = kpAddCytobandsAsLine(kp) 82 | kp = kpAddChromosomeNames(kp, srt = 45) 83 | kp = kpPoints(kp, data = ratio_norm, y = ratio_norm$Ratio, 84 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("darkolivegreen2", 0.3)) 85 | if (length(ratio_amp) > 0) { 86 | kp = kpPoints(kp, data = ratio_amp, y = ratio_amp$Ratio, 87 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("firebrick2", 0.3)) 88 | } 89 | if (length(ratio_del) > 0) { 90 | kp = kpPoints(kp, data = ratio_del, y = ratio_del$Ratio, 91 | cex = 0.3, ymin = 0, ymax = max_cn, col = alpha("royalblue4", 0.3)) 92 | } 93 | kp = kpPoints(kp, data = ratio_gr, y = ratio_gr$CopyNumber, 94 | cex = 0.5, ymin = 0, ymax = max_cn, col = "gray20") 95 | dev.off() 96 | 97 | 98 | 99 | # end 100 | -------------------------------------------------------------------------------- /scripts/cnv-freec-heatmap.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Generate a color-coded gain/loss plot (heatmap-style) from Control-FREEC output. 7 | 8 | Usage: 9 | cnv-freec-heatmap.R <genome> <cnvs_txt> <out_png> 10 | 11 | Arguments: 12 | <genome>     genome build (UCSC-style such as "hg19" or "mm10") 13 | <cnvs_txt>   Control-FREEC "_CNVs" file with copy number alterations and p-values added by assess_significance.R 14 | <out_png>    output png image 15 | 16 | Options: 17 | -h, --help   show this screen 18 | ' -> doc 19 | 20 | 21 | # print warnings as they occur 22 | options(warn = 1) 23 | 24 | # retrieve the command-line arguments 25 | suppressPackageStartupMessages(library(docopt)) 26 | opts = docopt(doc) 27 | 28 | # relevant arguments 29 | genome = opts$genome 30 | cnvs_txt = opts$cnvs_txt 31 | cnvs_png = opts$out_png 32 | 33 | # check that input file exists 34 | if (!file.exists(cnvs_txt)) stop("file does not exist: ", cnvs_txt) 35 | 36 | # load libraries 37 | suppressPackageStartupMessages(library(magrittr)) 38 | suppressPackageStartupMessages(library(tidyverse)) 39 | suppressPackageStartupMessages(library(karyoploteR)) 40 | 41 | # import CNVs table and filter by Wilcoxon p-value 42 | cnvs = read_tsv(cnvs_txt, guess_max = 999999) %>% 43 | filter(WilcoxonRankSumTestPvalue < 0.05) 44 | 45 | # convert CNVs table to GRanges 46 | cnvs_gr = GRanges(cnvs) 47 | seqlevelsStyle(cnvs_gr) = "UCSC" 48 | 49 | # separate CNVs based on gain/loss status 50 | cnvs_gain = cnvs_gr[cnvs_gr$status == "gain"] 51 | cnvs_loss = cnvs_gr[cnvs_gr$status == "loss"] 52 | 53 | # plot 54 | png(cnvs_png, res = 300, width = 15, height = 2, units = "in") 55 | pp = getDefaultPlotParams(plot.type = 4) 56 | pp$data1inmargin = 0 57 | pp$bottommargin = 80 58 | pp$ideogramheight = 20 59 | kp = plotKaryotype(genome = genome, plot.type = 4, ideogram.plotter = NULL, labels.plotter = NULL, plot.params = pp) %>% 60 | kpAddCytobandsAsLine() %>% 61 | kpAddChromosomeNames(srt = 90) %>% 62 | kpRect(data = cnvs_gain, y0 = 0, y1 = 1, col = "firebrick2", border = NA) %>% 63 | kpRect(data = cnvs_loss, y0 = 0, y1 = 1, col = "royalblue4", border = NA) 64 | dev.off() 65 | 66 | 67 | 68 | # end 69 | -------------------------------------------------------------------------------- /scripts/csv-clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## 5 | ## make csv file more compatible with various tools by removing problematic characters 6 | ## 7 | 8 | 9 | # script filename 10 | script_name=$(basename "${BASH_SOURCE[0]}") 11 | 12 | # check for
correct number of arguments 13 | if [ ! $# == 1 ] ; then 14 | echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 15 | echo -e "\n USAGE: $script_name file.csv \n" >&2 16 | exit 1 17 | fi 18 | 19 | # arguments 20 | csv=$1 21 | 22 | # check that input exists 23 | if [ ! -s "$csv" ] ; then 24 | echo -e "\n $script_name ERROR: file $csv does not exist \n" >&2 25 | exit 1 26 | fi 27 | 28 | # fix newlines 29 | dos2unix --quiet $csv 30 | mac2unix --quiet $csv 31 | 32 | # replace commas inside quoted fields with dashes 33 | awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $csv > ${csv}.tmp && mv ${csv}.tmp $csv 34 | 35 | # remove quotes 36 | sed -i 's/\"//g' $csv 37 | 38 | # remove lines missing any values (only commas present) 39 | sed -i '/^,,*$/d' $csv 40 | 41 | # add newline to end of file if one does not exist (some scripts may complain) 42 | sed -i -e '$a\' $csv 43 | 44 | 45 | 46 | # end 47 | -------------------------------------------------------------------------------- /scripts/fastq-merge.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $HELP = <>" instead of ">") 103 | print STDERR " MERGED R1 : $fastq_r1_merged \n"; 104 | my $merge_cmd = "cat $fastq_r1 >> $fastq_r1_merged"; 105 | print STDERR " CMD : $merge_cmd \n"; 106 | system($merge_cmd); 107 | 108 | # repeat merge for R2 if present 109 | if ( -e $fastq_r2 ) { 110 | print STDERR " MERGED R2 : $fastq_r2_merged \n"; 111 | $merge_cmd = "cat $fastq_r2 >> $fastq_r2_merged"; 112 | print STDERR " CMD : $merge_cmd \n"; 113 | system($merge_cmd); 114 | } 115 | 116 | sleep(1); 117 | 118 | } 119 | 120 | } 121 | 122 | 123 | 124 | # end 125 | -------------------------------------------------------------------------------- /scripts/fastq-quality-bars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Visualize FASTQ quality using bars and "animate" them by looping through the individual reads. 
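# Each record's quality string (every fourth line of the FASTQ) is transliterated from
# Phred+33 characters into nine bins (0-8) and drawn as one column of block characters per base.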
4 | # Demo: https://asciinema.org/a/194133 5 | # Usage: cat file.fastq | ./fastq-quality-bars.sh 6 | # 7 | 8 | 9 | # check if a stdin pipe exists 10 | if [ -p /dev/stdin ]; then 11 | 12 | # blank line for cleaner output 13 | echo "" 14 | 15 | # initiate line counter 16 | n=0 17 | 18 | while read -r line; do 19 | 20 | # update line counter 21 | n=$(($n+1)) 22 | 23 | # use only every fourth line (quality scores) 24 | if [ "$n" -eq 4 ]; then 25 | 26 | # clear screen 27 | printf "\033c" 28 | 29 | # bin quality scores 30 | qual8=$(echo "$line" | sed -e 'y/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKL/00001111122222333334444455555666667777788888/') 31 | 32 | # convert binned quality scores to vertical bars 33 | awk -v q="$qual8" 'BEGIN { 34 | # number of bases 35 | len = split(q, qarr, ""); 36 | # height of bars (plus an extra row on top and bottom) 37 | h = 8 + 2; 38 | # matrix for output characters 39 | for (i = 1; i <= len; i++) { 40 | # top border 41 | a[i,h] = "▁"; 42 | # bottom border 43 | a[i,1] = "▔"; 44 | # add quality bars 45 | for (j = 1; j <= qarr[i]; j++) { 46 | a[i,j+1] = "▊"; 47 | } 48 | } 49 | # transpose matrix and print from top 50 | for (j = h; j >= 1; j--) { 51 | out = ""; 52 | for (i = 1; i <= len; i++) { 53 | if (a[i,j] == "") a[i,j] = " "; 54 | out = out a[i,j]; 55 | } 56 | print out; 57 | } 58 | }' 59 | 60 | # pause 61 | sleep 0.2 62 | 63 | # reset line counter 64 | n=0 65 | 66 | fi 67 | 68 | done 69 | 70 | echo "fastq ended" 71 | 72 | else 73 | 74 | # show usage if nothing was piped in 75 | echo -e "\n Usage: cat file.fastq | ./fastq-quality-bars.sh \n" 76 | 77 | fi 78 | 79 | 80 | 81 | # end 82 | -------------------------------------------------------------------------------- /scripts/gtf-remove-overlapping.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my $HELP = <) { 46 | chomp; 47 | my ($chr, $src, $feat, $pos0, $pos1, $score, $strand, $frame, $attr) = split(/\t/); 48 | 49 | # extract just the transcript id from attributes column 50 | my $transcript = $attr; 51 | $transcript =~ s/.*transcript_id\s"(.+?)".*/$1/; 52 | 53 | # check if current region is same chr as previous region but transcript is different 54 | if ( ( $chr eq $prev_chr ) && ( $transcript ne $prev_transcript ) ) { 55 | # check if previous end pos is larger than current start pos 56 | if ( $pos0 < $prev_pos1 ) { 57 | # flag previous transcript if not flagged already 58 | unless ($prev_transcript ~~ @bad_transcripts) { 59 | push (@bad_transcripts, $prev_transcript); 60 | #print "$prev_transcript \n"; 61 | } 62 | # flag current transcript if not flagged already 63 | unless ($transcript ~~ @bad_transcripts) { 64 | push (@bad_transcripts, $transcript); 65 | #print "$transcript \n"; 66 | } 67 | } 68 | } 69 | 70 | #print " $i $gtf_array[$i][0] $gtf_array[$i][3] $gtf_array[$i][4] \n"; 71 | 72 | # update info of previous entry 73 | $prev_chr = $chr; 74 | $prev_pos0 = $pos0; 75 | $prev_pos1 = $pos1; 76 | $prev_transcript = $transcript; 77 | 78 | 79 | #for my $row (@gtf_array) { 80 | # print "@$row[0]\t@$row[1]\t@$row[2]\n"; 81 | #} 82 | } 83 | close(SORTED); 84 | 85 | # delete sorted cds-only gtf created at the beginning 86 | system("rm -f $gtf_sorted"); 87 | 88 | # count number of overlapping transcripts 89 | my $bad_transcript_count = scalar(@bad_transcripts); 90 | print " overlapping transcripts: $bad_transcript_count \n"; 91 | 92 | # create new gtf for overlapping and non-overlapping transcripts 93 | open(UNIQUE, ">", "${gtf}.unique.gtf"); 94 |
open(OVERLAPPING, ">", "${gtf}.overlapping.gtf"); 95 | 96 | # process original gtf and split entries based on whether the transcript is one of the overlapping transcripts 97 | open(GTF, "<", $gtf); 98 | while (<GTF>) { 99 | chomp; 100 | my ($chr, $src, $feat, $pos0, $pos1, $score, $strand, $frame, $attr) = split(/\t/); 101 | 102 | # extract just the transcript id from attributes column 103 | my $transcript = $attr; 104 | $transcript =~ s/.*transcript_id\s"(.+?)".*/$1/; 105 | 106 | # check if transcript is one of overlapping transcripts 107 | if ($transcript ~~ @bad_transcripts) { 108 | print OVERLAPPING "$_\n"; 109 | } 110 | else { 111 | print UNIQUE "$_\n"; 112 | } 113 | } 114 | close(GTF); 115 | 116 | close(UNIQUE); 117 | close(OVERLAPPING); 118 | 119 | } 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /scripts/hdcyto-1-import-fcs.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Import FCS files and generate a flowSet object" 3 | subtitle: "HDCyto workflow step 1" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the output directory 15 | project_name: "project" 16 | # path to the FCS files 17 | fcs_dir: "/path/to/fcs" 18 | # subset large samples to a maximum number of cells 19 | max_cells_per_sample: 500000 20 | # quarto render hdcyto-1-import-fcs.qmd -P project_name:? -P fcs_dir:? -P max_cells_per_sample:? 21 | --- 22 | 23 | This script generates a flowSet object from FCS files. 24 | The Quarto format allows both interactive and command-line execution. 25 | It additionally generates an HTML report that can be used for recordkeeping and troubleshooting. 26 | It is structured to facilitate the import and quality control of FCS files in a reproducible and organized manner. 27 | It searches for FCS files in the specified directory, imports the files into a flowSet object, downsamples large samples to avoid extremely large objects, and generates summary tables and plots. 28 | It generates `input/metadata-files.csv` and `input/metadata-channels.csv` metadata tables, which should be manually edited to clean up labels, define sample groups, and specify the relevant samples and markers/antibodies used for the next step (`hdcyto-2-prepare-sce.qmd`).
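After editing, a minimal `metadata-files.csv` could look like the following (illustrative file names, groups, and paths; the columns match what `CATALYST::prepData()` expects in the later steps):

```
file_name,sample_id,patient_id,condition,full_path
sample1.fcs,sample1,patient1,control,/path/to/fcs/sample1.fcs
sample2.fcs,sample2,patient2,treated,/path/to/fcs/sample2.fcs
```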
29 | 30 | # Settings 31 | 32 | ```{r packages} 33 | #| message: false 34 | #| warning: false 35 | library(tidyverse) 36 | library(glue) 37 | library(cowplot) 38 | library(qs2) 39 | library(flowCore) 40 | library(ncdfFlow) 41 | library(ggsci) 42 | ``` 43 | 44 | Check parameters 45 | 46 | ```{r params} 47 | params 48 | ``` 49 | 50 | Define inputs 51 | 52 | ```{r settings-inputs} 53 | if (!dir.exists(params$fcs_dir)) stop("FCS dir does not exist: ", params$fcs_dir) 54 | ``` 55 | 56 | Define outputs 57 | 58 | ```{r settings-outputs} 59 | out_dir <- glue("./out-{params$project_name}") 60 | input_dir <- glue("{out_dir}/input") 61 | data_dir <- glue("{out_dir}/r-data") 62 | qc_dir <- glue("{out_dir}/qc") 63 | 64 | dir.create(out_dir, showWarnings = FALSE) 65 | dir.create(input_dir, showWarnings = FALSE) 66 | dir.create(data_dir, showWarnings = FALSE) 67 | dir.create(qc_dir, showWarnings = FALSE) 68 | 69 | files_csv <- glue("{input_dir}/metadata-files.csv") 70 | channels_csv <- glue("{input_dir}/metadata-channels.csv") 71 | fs_qs2 <- glue("{data_dir}/flowset.qs2") 72 | nfs_cdf <- glue("{data_dir}/ncdfflowset.cdf") 73 | ``` 74 | 75 | Stop if the output files that need to be edited already exist (to prevent overwriting) 76 | 77 | ```{r} 78 | if (file.exists(files_csv)) stop("files metadata table already exists: ", files_csv) 79 | if (file.exists(channels_csv)) stop("channels metadata table already exists: ", channels_csv) 80 | ``` 81 | 82 | # Determine input files 83 | 84 | Find FCS files 85 | 86 | ```{r find-fcs-files} 87 | fcs_dir <- params$fcs_dir 88 | fcs_files <- list.files(path = fcs_dir, pattern = "\\.fcs$", full.names = TRUE, recursive = TRUE) 89 | ``` 90 | 91 | Generate a files/samples metadata table 92 | 93 | `CATALYST::prepData()` expects `file_name`, `sample_id`, `patient_id`, and `condition` columns 94 | 95 | ```{r files_df} 96 | files_df <- 97 | data.frame( 98 | file_name = basename(fcs_files), 99 | sample_id = str_remove(basename(fcs_files), "\\.fcs$"), 100 | patient_id = "?", 101 | condition = "?", 102 | full_path = fcs_files, 103 | row.names = basename(fcs_files) 104 | ) 105 | files_df 106 | ``` 107 | 108 | ```{r} 109 | write_csv(files_df, files_csv) 110 | ``` 111 | 112 | Generate an AnnotatedDataFrame 113 | 114 | ```{r files_adf} 115 | files_adf <- new("AnnotatedDataFrame", data = files_df) 116 | ``` 117 | 118 | # Generate flowSet 119 | 120 | Import FCS files 121 | 122 | ```{r read.flowSet} 123 | # fs <- read.flowSet(files, alter.names = TRUE) 124 | # fs <- read.flowSet(files, transformation = "scale", alter.names = TRUE) 125 | # fs <- suppressWarnings(read.flowSet(path = fcs_dir, alter.names = TRUE, transformation = FALSE, phenoData = samples_adf)) 126 | ``` 127 | 128 | The flowCore flowSet represents a set of FCS files and requires the data elements to remain in memory. The ncdfFlowSet inherits most of its data structures from flowSet, but stores event-level data on disk and only keeps the file handler and metadata in memory.
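As a rough sketch of that difference (illustrative only, not evaluated here since the objects are created below), the in-memory footprints of the two representations can be compared directly:

```{r}
# object.size(nfs) # ncdfFlowSet: small, events stay on disk
# object.size(ncdfFlow::as.flowSet(nfs)) # flowSet: large, all event-level data in memory
```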
129 | 130 | `read.ncdfFlowSet()` will load only common channels if there are discrepancies between files (`read.flowSet()` requires identical channels) 131 | 132 | ```{r read.ncdfFlowSet} 133 | nfs <- read.ncdfFlowSet(files = files_df$full_path, ncdfFile = nfs_cdf, alter.names = TRUE, transformation = FALSE, phenoData = files_adf, compress = 5, mc.cores = 4) 134 | ``` 135 | 136 | Check which files were imported 137 | 138 | ```{r} 139 | pData(nfs) 140 | ``` 141 | 142 | Save phenotypic data 143 | 144 | ```{r} 145 | write_csv(pData(nfs), glue("{qc_dir}/flowset-pdata.csv")) 146 | ``` 147 | 148 | Save a table of the number of cells in each imported file 149 | 150 | ```{r flowSet-num-cells} 151 | fsApply(nfs, nrow) |> 152 | as.data.frame() |> 153 | as_tibble(rownames = "filename") |> 154 | dplyr::rename(num_cells = V1) |> 155 | arrange(filename) |> 156 | write_csv(glue("{qc_dir}/fcs-num-cells.csv")) 157 | ``` 158 | 159 | Generate a channels/antibodies metadata table 160 | 161 | `CATALYST::prepData()` expects `fcs_colname`, `antigen`, and optional `marker_class` 162 | 163 | `CATALYST::guessPanel()` guesses the marker class based on the channel name and outputs CATALYST-compatible column names (some columns may need to be coerced to character from list) 164 | 165 | ```{r channels_df} 166 | channels_df <- 167 | CATALYST::guessPanel(nfs[[1]]) |> 168 | mutate(marker_class = ifelse(use_channel, "state", "none")) |> 169 | mutate(across(where(is.list), ~ map_chr(., toString))) 170 | channels_df 171 | ``` 172 | 173 | ```{r} 174 | write_csv(channels_df, channels_csv) 175 | ``` 176 | 177 | Randomly subsample ncdfFlowSet to avoid extremely large objects 178 | 179 | ```{r subsample} 180 | set.seed(99) 181 | fres <- filter(nfs, filter = sampleFilter(size = params$max_cells_per_sample)) 182 | # summary(fres) 183 | nfs <- Subset(nfs, fres) 184 | ``` 185 | 186 | Save a table of the number of cells in each file after subsampling 187 | 188 | ```{r} 189 | fsApply(nfs, nrow) |> 190 | as.data.frame() |> 191 | as_tibble(rownames = "filename") |> 192 | dplyr::rename(num_cells = V1) |> 193 | arrange(filename) |> 194 | write_csv(glue("{qc_dir}/flowset-num-cells.csv")) 195 | ``` 196 | 197 | Convert ncdfFlowSet to flowSet 198 | 199 | ```{r as.flowSet} 200 | fs <- ncdfFlow::as.flowSet(nfs) 201 | fs 202 | ``` 203 | 204 | Check that the conversion did not lose data 205 | 206 | ```{r} 207 | if (!identical(pData(fs), pData(nfs))) stop("pData does not match") 208 | if (!identical(exprs(fs[[1]]), exprs(nfs[[1]]))) stop("exprs does not match") 209 | ``` 210 | 211 | Save flowSet object 212 | 213 | ```{r save-flowSet} 214 | qs2::qs_save(object = fs, file = fs_qs2) 215 | unlink(nfs_cdf) 216 | ``` 217 | 218 | # Plot expression density histograms 219 | 220 | Subset to random samples to make plots more readable 221 | 222 | ```{r plot-subset-samples} 223 | samples_subset <- sampleNames(fs) 224 | if (length(samples_subset) > 15) { 225 | set.seed(99) 226 | samples_subset <- sort(sample(sampleNames(fs), 10)) 227 | } 228 | fs_min <- subset(fs, sampleNames(fs) %in% samples_subset) 229 | length(sampleNames(fs_min)) 230 | ``` 231 | 232 | Subset to random cells to speed up plot generation 233 | 234 | ```{r plot-subset-cells} 235 | set.seed(99) 236 | fres <- filter(fs_min, filter = sampleFilter(size = 5000)) 237 | fs_min <- Subset(fs_min, fres) 238 | sum(fsApply(fs_min, nrow)) 239 | ``` 240 | 241 | Generate a ggplot-friendly expression table with extreme outliers removed 242 | 243 | ```{r tidy-exprs-tbl} 244 | exprs_tbl <- 245 | bind_rows( 246 | 
lapply( 247 | 1:length(fs_min), 248 | function(i) { 249 | data.frame(sample_id = pData(fs_min)[i, "sample_id"], exprs(fs_min[[i]])) 250 | } 251 | ) 252 | ) |> 253 | pivot_longer(!sample_id, names_to = "channel", values_to = "exprs") |> 254 | left_join(channels_df, by = c("channel" = "fcs_colname")) |> 255 | dplyr::mutate(desc0 = if_else(is.na(desc0), channel, desc0)) |> 256 | group_by(channel) |> 257 | dplyr::mutate(min_cutoff = quantile(exprs, 0.01), max_cutoff = quantile(exprs, 0.99)) |> 258 | dplyr::mutate(zscore = scale(exprs)) |> 259 | ungroup() |> 260 | dplyr::filter(exprs >= min_cutoff, exprs <= max_cutoff) 261 | # dplyr::filter(zscore > -3, zscore < 3) 262 | dim(exprs_tbl) 263 | ``` 264 | 265 | Generate a density plot for the original unmodified values 266 | 267 | ```{r plot-density-original} 268 | dens_plot <- 269 | exprs_tbl |> 270 | ggplot(aes(x = exprs, color = sample_id)) + 271 | geom_density() + 272 | facet_wrap(vars(desc0), scales = "free") + 273 | theme_minimal() + 274 | theme( 275 | plot.background = element_rect(fill = "white"), 276 | panel.grid.minor = element_blank() 277 | ) + 278 | scale_color_igv() 279 | ggsave(glue("{qc_dir}/flowset-exprs-density-raw.png"), dens_plot, width = 16, height = 12) 280 | ``` 281 | 282 | Generate a density plot for the log-transformed values 283 | 284 | ```{r plot-density-log} 285 | if (min(exprs_tbl$exprs) >= 0) { 286 | dens_plot <- 287 | exprs_tbl |> 288 | mutate(exprs_log10 = log10(exprs + 1)) |> 289 | ggplot(aes(x = exprs_log10, color = sample_id)) + 290 | geom_density() + 291 | facet_wrap(vars(desc0), scales = "free") + 292 | theme_minimal() + 293 | theme( 294 | plot.background = element_rect(fill = "white"), 295 | panel.grid.minor = element_blank() 296 | ) + 297 | scale_color_igv() 298 | ggsave(glue("{qc_dir}/flowset-exprs-density-log.png"), dens_plot, width = 16, height = 12) 299 | } else { 300 | warning("negative expression values present") 301 | } 302 | ``` 303 | 304 | # Session info 305 | 306 | ```{r} 307 | sessionInfo() 308 | ``` 309 | -------------------------------------------------------------------------------- /scripts/hdcyto-2-prepare-sce.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Convert flowSet to SingleCellExperiment with sample annotations" 3 | subtitle: "HDCyto workflow step 2" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the input/output directory 15 | project_name: "project" 16 | # cofactor for arcsinh transformation (generally 5 for CyTOF and 150 for flow cytometry) 17 | arcsinh_cofactor: 150 18 | # quarto render hdcyto-2-prepare-sce.qmd -P project_name:? -P arcsinh_cofactor:? 19 | --- 20 | 21 | This script generates a SingleCellExperiment object with sample annotations. 22 | The previous step (`hdcyto-1-import-fcs.qmd`) saves an unfiltered flowSet object. 23 | It generates `input/metadata-files.csv` and `input/metadata-channels.csv` tables used to identify the relevant samples and markers/antibodies. 24 | Those files should be edited to clean up labels, define sample groups, and exclude problematic samples or markers/antibodies. 25 | The generated SingleCellExperiment object can then be used for CATALYST-based downstream analysis in the next step (`hdcyto-3-analyze-sce.qmd`). 
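For example (illustrative edits): deleting a sample's row from `metadata-files.csv` excludes it, since file names missing from the table are filtered out of the SingleCellExperiment below, while `metadata-channels.csv` controls marker labels and usage, e.g.:

```
fcs_colname,antigen,marker_class
FL1.A,CD3,type
FL2.A,CD4,type
Time,Time,none
```

Here `antigen` sets the marker name carried into the analysis and `marker_class` ("type", "state", or "none") indicates how CATALYST treats the marker downstream.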
26 | 27 | # Settings 28 | 29 | ```{r packages} 30 | #| message: false 31 | #| warning: false 32 | library(tidyverse) 33 | library(glue) 34 | library(cowplot) 35 | library(qs2) 36 | library(flowCore) 37 | library(CATALYST) 38 | ``` 39 | 40 | Check parameters 41 | 42 | ```{r params} 43 | params 44 | ``` 45 | 46 | Define inputs 47 | 48 | ```{r settings-inputs} 49 | out_dir <- glue("./out-{params$project_name}") 50 | input_dir <- glue("{out_dir}/input") 51 | data_dir <- glue("{out_dir}/r-data") 52 | qc_dir <- glue("{out_dir}/qc") 53 | if (!dir.exists(out_dir)) stop("output dir does not exist: ", out_dir) 54 | 55 | fs_qs2 <- glue("{data_dir}/flowset.qs2") 56 | if (!file.exists(fs_qs2)) stop("flowSet does not exist: ", fs_qs2) 57 | 58 | files_csv <- glue("{input_dir}/metadata-files.csv") 59 | if (!file.exists(files_csv)) stop("files metadata table does not exist: ", files_csv) 60 | 61 | channels_csv <- glue("{input_dir}/metadata-channels.csv") 62 | if (!file.exists(channels_csv)) stop("channels metadata table does not exist: ", channels_csv) 63 | ``` 64 | 65 | Define outputs 66 | 67 | ```{r settings-outputs} 68 | sce_qs2 <- glue("{data_dir}/sce.qs2") 69 | ``` 70 | 71 | # Import data 72 | 73 | Import files metadata table 74 | 75 | ```{r import-files-csv} 76 | files_tbl <- read_csv(files_csv, show_col_types = FALSE) 77 | files_tbl 78 | ``` 79 | 80 | Validate files metadata table 81 | 82 | ```{r check-files-table} 83 | if (!"file_name" %in% names(files_tbl)) { 84 | stop("files metadata table should have 'file_name' column") 85 | } 86 | if (!"sample_id" %in% names(files_tbl)) { 87 | stop("files metadata table should have 'sample_id' column (for CATALYST)") 88 | } 89 | if (!"patient_id" %in% names(files_tbl)) { 90 | stop("files metadata table should have 'patient_id' column (for CATALYST)") 91 | } 92 | if (!"condition" %in% names(files_tbl)) { 93 | stop("files metadata table should have 'condition' column (for CATALYST)") 94 | } 95 | if (anyNA(files_tbl$sample_id)) { 96 | stop("files metadata table 'sample_id' column has NAs") 97 | } 98 | if (anyNA(files_tbl$condition)) { 99 | stop("files metadata table 'condition' column has NAs") 100 | } 101 | if (n_distinct(files_tbl$condition) == 1) { 102 | stop("files metadata table should have multiple conditions") 103 | } 104 | if (!all(sapply(files_tbl, "class") %in% c("factor", "character"))) { 105 | stop("files metadata table column contents should be discrete") 106 | } 107 | ``` 108 | 109 | Import channels metadata table 110 | 111 | ```{r import-channels-csv} 112 | channels_tbl <- read_csv(channels_csv, show_col_types = FALSE) 113 | channels_tbl 114 | ``` 115 | 116 | Validate channels metadata table 117 | 118 | ```{r check-channels-table} 119 | if (!"fcs_colname" %in% names(channels_tbl)) { 120 | stop("channels metadata table should have 'fcs_colname' column") 121 | } 122 | if (!"antigen" %in% names(channels_tbl)) { 123 | stop("channels metadata table should have 'antigen' column (for CATALYST)") 124 | } 125 | ``` 126 | 127 | Import flowSet 128 | 129 | ```{r import-flowSet} 130 | fs <- qs2::qs_read(fs_qs2, validate_checksum = TRUE) 131 | fs 132 | ``` 133 | 134 | Check that the parameter names are not all identical 135 | 136 | ```{r check-flowset-parameters} 137 | if (n_distinct(fs[[1]]@parameters@data$name) == 1) stop("parameter names are all identical") 138 | if (n_distinct(fs[[1]]@parameters@data$desc) == 1) stop("parameter descriptions are all identical") 139 | ``` 140 | 141 | Check if any files were removed from the files table 142 | 143 | ```{r} 144 | 
removed_samples <- setdiff(rownames(pData(fs)), files_tbl$file_name) 145 | removed_samples 146 | ``` 147 | 148 | Add removed files back to the table (will be removed after SingleCellExperiment is generated) 149 | 150 | ```{r} 151 | if (length(removed_samples) > 0) { 152 | files_tbl <- bind_rows(files_tbl, data.frame(file_name = removed_samples)) 153 | } 154 | ``` 155 | 156 | ```{r} 157 | # remove rownames (causes an error with diffcyt) 158 | # rownames(files_tbl) <- NULL 159 | # rownames(channels_tbl) <- NULL 160 | ``` 161 | 162 | # Generate a SingleCellExperiment 163 | 164 | Convert a flowSet into a SingleCellExperiment 165 | 166 | ```{r prepData} 167 | arcsinh_cofactor <- params$arcsinh_cofactor 168 | 169 | # transform: arcsinh-transformation should be performed 170 | # FACS: keep all channels as assay data 171 | sce <- 172 | prepData( 173 | fs, 174 | panel = channels_tbl, 175 | md = files_tbl, 176 | transform = TRUE, 177 | cofactor = arcsinh_cofactor, 178 | FACS = TRUE 179 | ) 180 | sce 181 | ``` 182 | 183 | Remove files not found in the files table 184 | 185 | ```{r filter-samples} 186 | sce <- filterSCE(sce, !is.na(sample_id)) 187 | sce 188 | ``` 189 | 190 | Check the contents of the SingleCellExperiment object 191 | 192 | ```{r} 193 | rowData(sce) 194 | ``` 195 | 196 | ```{r save-rowdata} 197 | write_csv(as_tibble(rowData(sce), rownames = "feature_id"), glue("{qc_dir}/sce-rowdata.csv")) 198 | ``` 199 | 200 | `CATALYST::prepData()` does not keep all files metadata columns, so they need to be added 201 | 202 | ```{r clean-coldata} 203 | colData(sce) <- colData(sce)[, "sample_id", drop = FALSE] 204 | 205 | coldata_full_df <- left_join(as.data.frame(colData(sce)), files_tbl, by = c("sample_id")) 206 | if (!all(coldata_full_df$sample_id == colData(sce)$sample_id)) stop("colData order mismatch") 207 | coldata_full_df <- dplyr::select(coldata_full_df, !sample_id) 208 | coldata_full_df <- dplyr::select(coldata_full_df, !file_name) 209 | coldata_full_df <- dplyr::select(coldata_full_df, !full_path) 210 | 211 | colData(sce) <- cbind(colData(sce), coldata_full_df) 212 | colData(sce) 213 | ``` 214 | 215 | Add rownames to column metadata 216 | 217 | ```{r add-coldata-rownames} 218 | colnames(sce) <- make.names(colData(sce)$sample_id, unique = TRUE) 219 | ``` 220 | 221 | Extract the experimental design table 222 | 223 | ```{r ei} 224 | ei(sce) 225 | ``` 226 | 227 | ```{r save-ei} 228 | write_csv(as_tibble(ei(sce)), glue("{qc_dir}/sce-exp-design.csv")) 229 | ``` 230 | 231 | Plot the number of cells per sample 232 | 233 | ```{r plot-num-cells} 234 | # plotCounts(sce, group_by = "sample_id", color_by = NULL) 235 | # plotCounts(sce, group_by = "sample_id", color_by = "condition") + scale_fill_igv() 236 | num_cells_plot <- plotCounts(sce, group_by = "sample_id", color_by = NULL) 237 | ggsave(glue("{qc_dir}/sce-num-cells.png"), num_cells_plot, width = 15, height = 5) 238 | ``` 239 | 240 | Check expression 241 | 242 | ```{r check-assay-counts} 243 | quantile(assay(sce, "counts")[, sample(1:ncol(sce), 10000)]) 244 | ``` 245 | 246 | ```{r check-assay-exprs} 247 | quantile(assay(sce, "exprs")[, sample(1:ncol(sce), 10000)]) 248 | ``` 249 | 250 | ```{r} 251 | # p = plotExprs(sce, color_by = "condition") 252 | # p = plotExprs(sce, color_by = "sample_id") 253 | # p$facet$params$ncol <- 6 254 | # p 255 | ``` 256 | 257 | Save SingleCellExperiment 258 | 259 | ```{r save-sce} 260 | qs2::qs_save(object = sce, file = sce_qs2) 261 | ``` 262 | 263 | Delete Rplots.pdf 264 | 265 | ```{r} 266 | if 
(file.exists("Rplots.pdf")) { 267 | file.remove("Rplots.pdf") 268 | } 269 | ``` 270 | 271 | # Session info 272 | 273 | ```{r session-info} 274 | sessionInfo() 275 | ``` 276 | -------------------------------------------------------------------------------- /scripts/hdcyto-3-analyze-sce.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Perform CATALYST-based analysis" 3 | subtitle: "HDCyto workflow step 3" 4 | date-modified: last-modified 5 | format: 6 | html: 7 | embed-resources: true 8 | code-tools: true 9 | toc: true 10 | df-print: paged 11 | execute: 12 | cache: false 13 | params: 14 | # project label for the input/output directory 15 | project_name: "project" 16 | # quarto render hdcyto-3-analyze-sce.qmd -P project_name:? 17 | --- 18 | 19 | This script performs CATALYST-based analysis, including dimensionality reduction and clustering. 20 | It starts with a SingleCellExperiment object generated in the previous step (`hdcyto-2-prepare-sce.qmd`). 21 | 22 | # Settings 23 | 24 | ```{r packages} 25 | #| message: false 26 | #| warning: false 27 | library(tidyverse) 28 | library(glue) 29 | library(cowplot) 30 | library(qs2) 31 | library(RColorBrewer) 32 | library(ggsci) 33 | library(CATALYST) 34 | ``` 35 | 36 | Check parameters 37 | 38 | ```{r params} 39 | params 40 | ``` 41 | 42 | Define inputs 43 | 44 | ```{r settings-inputs} 45 | out_dir <- glue("./out-{params$project_name}") 46 | input_dir <- glue("{out_dir}/input") 47 | data_dir <- glue("{out_dir}/r-data") 48 | qc_dir <- glue("{out_dir}/qc") 49 | if (!dir.exists(out_dir)) stop("output dir does not exist: ", out_dir) 50 | 51 | sce_qs2 <- glue("{data_dir}/sce.qs2") 52 | if (!file.exists(sce_qs2)) stop("SingleCellExperiment does not exist: ", sce_qs2) 53 | ``` 54 | 55 | Define outputs 56 | 57 | ```{r settings-outputs} 58 | phono_dir <- glue("{out_dir}/phenotypes") 59 | exprs_dir <- glue("{out_dir}/expression") 60 | clust_dir <- glue("{out_dir}/clusters") 61 | 62 | dir.create(phono_dir, showWarnings = FALSE) 63 | dir.create(exprs_dir, showWarnings = FALSE) 64 | dir.create(clust_dir, showWarnings = FALSE) 65 | ``` 66 | 67 | # Import data 68 | 69 | Import SingleCellExperiment 70 | 71 | ```{r import-sce} 72 | sce <- qs2::qs_read(sce_qs2, validate_checksum = TRUE) 73 | sce 74 | ``` 75 | 76 | Check the contents of the SingleCellExperiment object 77 | 78 | ```{r} 79 | ei(sce) 80 | ``` 81 | 82 | ```{r} 83 | write_csv(as_tibble(rowData(sce), rownames = "feature_id"), glue("{qc_dir}/sce-rowdata.csv")) 84 | ``` 85 | 86 | ```{r} 87 | write_csv(as_tibble(ei(sce)), glue("{qc_dir}/sce-exp-design.csv")) 88 | ``` 89 | 90 | Define the color scheme, accounting for many groups 91 | 92 | ```{r color-scheme} 93 | color_scheme <- c(pal_igv("default")(51), pal_igv(alpha = 0.6)(51), pal_igv(alpha = 0.3)(51)) 94 | ``` 95 | 96 | Pseudobulk-level MDS plot (computed on median marker expressions in each sample) 97 | 98 | ```{r plot-pb-mds} 99 | for (p in names(colData(sce))) { 100 | mds_plot <- 101 | pbMDS(sce, by = "sample_id", color_by = p, pal = color_scheme) + 102 | theme_classic() + 103 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank()) 104 | ggsave(glue("{qc_dir}/dr-mds-pseudobulk-{p}.png"), mds_plot, width = 10, height = 6) 105 | } 106 | ``` 107 | 108 | # Plot expression patterns 109 | 110 | Subset to random samples to make plots more readable 111 | 112 | ```{r plot-subset-samples} 113 | samples_subset <- levels(sce$sample_id) 114 | if 
(length(samples_subset) > 15) {
115 | set.seed(99)
116 | samples_subset <- sort(sample(samples_subset, 10))
117 | }
118 | sce_rand <- sce[, sce$sample_id %in% samples_subset]
119 | ```
120 |
121 | Subset to random cells to speed up plot generation
122 |
123 | ```{r plot-subset-cells}
124 | if (ncol(sce_rand) > 100000) {
125 | set.seed(99)
126 | sce_rand <- sce_rand[, sample(colnames(sce_rand), 100000)]
127 | }
128 | ```
129 |
130 | Create a function to convert an expression matrix to a tidy data frame for ggplot2
131 |
132 | ```{r tidy_expression}
133 | tidy_expression <- function(sce, assay_name) {
134 | t(assay(sce, assay_name)) |>
135 | as_tibble(rownames = "cell_id") |>
136 | pivot_longer(!cell_id, names_to = "channel", values_to = "exprs") |>
137 | left_join(as_tibble(colData(sce), rownames = "cell_id"), by = "cell_id") |>
138 | group_by(channel) |>
139 | dplyr::mutate(min_cutoff = quantile(exprs, 0.01), max_cutoff = quantile(exprs, 0.99)) |>
140 | dplyr::mutate(zscore = as.vector(scale(exprs))) |>
141 | ungroup() |>
142 | dplyr::filter(exprs >= min_cutoff, exprs <= max_cutoff)
143 | # dplyr::filter(zscore > -3, zscore < 3)
144 | }
145 | ```
146 |
147 | Create a function to generate expression density plots
148 |
149 | ```{r}
150 | plot_expression_density <- function(x, values_col) {
151 | ggplot(x, aes(x = .data[[values_col]], color = sample_id)) +
152 | geom_density() +
153 | facet_wrap(vars(channel), scales = "free") +
154 | theme_minimal() +
155 | theme(
156 | plot.background = element_rect(fill = "white"),
157 | panel.grid.minor = element_blank()
158 | ) +
159 | scale_color_manual(values = color_scheme)
160 | }
161 | ```
162 |
163 | Generate a density plot for the original unmodified values
164 |
165 | ```{r plot-density-original}
166 | exprs_tbl <- tidy_expression(sce = sce_rand, assay_name = "counts")
167 | density_plot <-
168 | exprs_tbl |>
169 | plot_expression_density(values_col = "exprs")
170 | ggsave(glue("{qc_dir}/sce-exprs-density-raw.png"), density_plot, width = 16, height = 12)
171 | ```
172 |
173 | Generate a density plot for the log-transformed original values
174 |
175 | ```{r plot-density-log}
176 | if (min(exprs_tbl$exprs) >= 0) {
177 | density_plot <-
178 | exprs_tbl |>
179 | mutate(exprs_log10 = log10(exprs + 1)) |>
180 | plot_expression_density(values_col = "exprs_log10")
181 | ggsave(glue("{qc_dir}/sce-exprs-density-log.png"), density_plot, width = 16, height = 12)
182 | }
183 | ```
184 |
185 | Generate a density plot for the arcsinh-transformed values
186 |
187 | ```{r plot-density-arcsinh}
188 | density_plot <-
189 | tidy_expression(sce = sce_rand, assay_name = "exprs") |>
190 | plot_expression_density(values_col = "exprs")
191 | ggsave(glue("{qc_dir}/sce-exprs-density-arcsinh.png"), density_plot, width = 16, height = 12)
192 | ```
193 |
194 | Generate correlation plots for specific markers
195 |
196 | ```{r}
197 | if (all(c("CD4", "CD8") %in% rownames(sce))) {
198 | cor_plot <-
199 | t(assay(sce_rand, "exprs")) |>
200 | as_tibble(rownames = "cell_id") |>
201 | # dplyr::filter(CD3 > 4) |>
202 | left_join(as_tibble(colData(sce_rand), rownames = "cell_id"), by = "cell_id") |>
203 | ggplot(aes(x = CD4, y = CD8)) +
204 | # geom_point(size = 0.1, alpha = 0.2) +
205 | # geom_density_2d(color = "darkred", alpha = 0.8) +
206 | geom_density_2d_filled(contour_var = "ndensity") +
207 | facet_wrap(vars(sample_id), scales = "free") +
208 | theme_minimal() +
209 | theme(
210 | plot.background = element_rect(fill = "white"),
211 | panel.grid.minor = element_blank(), 212 
| aspect.ratio = 1 213 | ) + 214 | scale_fill_viridis_d(option = "plasma") 215 | save_plot(glue("{qc_dir}/expr-marker-cor-CD4-CD8.png"), cor_plot, base_width = 12, base_height = 12) 216 | } 217 | 218 | if (all(c("CD3", "CD19") %in% rownames(sce))) { 219 | cor_plot <- 220 | t(assay(sce_rand, "exprs")) |> 221 | as_tibble(rownames = "cell_id") |> 222 | left_join(as_tibble(colData(sce_rand), rownames = "cell_id"), by = "cell_id") |> 223 | ggplot(aes(x = CD3, y = CD19)) + 224 | # geom_point(size = 0.1, alpha = 0.2) + 225 | # geom_density_2d(color = "darkred", alpha = 0.8) + 226 | geom_density_2d_filled(contour_var = "ndensity") + 227 | facet_wrap(vars(sample_id), scales = "free") + 228 | theme_minimal() + 229 | theme( 230 | plot.background = element_rect(fill = "white"), 231 | panel.grid.minor = element_blank(), 232 | aspect.ratio = 1 233 | ) + 234 | scale_fill_viridis_d(option = "plasma") 235 | save_plot(glue("{qc_dir}/expr-marker-cor-CD3-CD19.png"), cor_plot, base_width = 12, base_height = 12) 236 | } 237 | ``` 238 | 239 | Plot median expression of all markers per sample 240 | 241 | ```{r plot-median-expression} 242 | pb_plot <- plotPbExprs(sce, features = NULL) + scale_color_manual(values = color_scheme) 243 | save_plot(glue("{exprs_dir}/exprs-markers-samples-boxplot.png"), pb_plot, base_width = 12, base_height = 6) 244 | ``` 245 | 246 | Modified `CATALYST::.anno_factors()` function 247 | 248 | ```{r} 249 | .anno_factors <- function(x, ids, which, type = c("row", "column")) { 250 | type <- match.arg(type) 251 | # get non-numeric cell metadata variables 252 | cd <- colData(x) 253 | df <- data.frame(cd, check.names = FALSE) 254 | df <- select_if(df, ~ !is.numeric(.)) 255 | df <- mutate_all(df, ~ droplevels(factor(.x))) 256 | 257 | # store sample matching 258 | m <- match(ids, df$sample_id) 259 | 260 | # get number of matches per variable 261 | ns <- split(df, df$sample_id) |> 262 | lapply(mutate_all, droplevels) |> 263 | lapply(summarize_all, nlevels) |> 264 | do.call(what = "rbind") 265 | 266 | # keep only uniquely mapable factors included in 'which' 267 | keep <- names(which(colMeans(ns) == 1)) 268 | keep <- setdiff(keep, c("sample_id", "cluster_id")) 269 | if (is.character(which)) { 270 | keep <- intersect(keep, which) 271 | } 272 | if (length(keep) == 0) { 273 | return(NULL) 274 | } 275 | df <- df[m, keep, drop = FALSE] 276 | 277 | # get list of colors for each annotation 278 | lvls <- lapply(as.list(df), levels) 279 | nlvls <- vapply(lvls, length, numeric(1)) 280 | pal <- pal_igv("default")(51) 281 | if (any(nlvls > length(pal))) { 282 | pal <- colorRampPalette(pal)(max(nlvls)) 283 | } 284 | names(is) <- is <- colnames(df) 285 | cols <- lapply(is, function(i) { 286 | u <- pal[seq_len(nlvls[i])] 287 | names(u) <- lvls[[i]] 288 | u 289 | }) 290 | 291 | ComplexHeatmap::HeatmapAnnotation( 292 | which = type, df = df, 293 | col = cols, gp = grid::gpar(col = "white") 294 | ) 295 | } 296 | ``` 297 | 298 | Modified `CATALYST::plotExprHeatmap()` function 299 | 300 | ```{r} 301 | plotExprHeatmap_ed <- function(x, features = NULL, 302 | by = c("sample_id", "cluster_id", "both"), k = "meta20", m = NULL, 303 | assay = "exprs", fun = c("median", "mean", "sum"), 304 | scale = c("first", "last", "never"), q = 0.01, 305 | row_anno = TRUE, col_anno = TRUE, 306 | row_clust = TRUE, col_clust = TRUE, 307 | row_dend = TRUE, col_dend = TRUE, 308 | bars = FALSE, perc = FALSE, bin_anno = FALSE, 309 | hm_pal = rev(RColorBrewer::brewer.pal(11, "RdYlBu")), 310 | k_pal = CATALYST:::.cluster_cols, m_pal = k_pal, 311 | 
distance = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"), 312 | linkage = c("average", "ward.D", "single", "complete", "mcquitty", "median", "centroid", "ward.D2")) { 313 | # check validity of input arguments 314 | args <- as.list(environment()) 315 | CATALYST:::.check_args_plotExprHeatmap(args) 316 | distance <- match.arg(distance) 317 | linkage <- match.arg(linkage) 318 | scale <- match.arg(scale) 319 | fun <- match.arg(fun) 320 | by <- match.arg(by) 321 | 322 | # subset features of interest 323 | x <- x[unique(CATALYST:::.get_features(x, features)), ] 324 | 325 | # get specified cluster IDs 326 | if (by != "sample_id") { 327 | CATALYST:::.check_k(x, k) 328 | x$cluster_id <- cluster_ids(x, k) 329 | } 330 | if (by == "both") { 331 | by <- c("cluster_id", "sample_id") 332 | } 333 | 334 | # aggregate to pseudobulks by sample/cluster/both 335 | # using 'assay' data & 'fun' as summary statistic 336 | .do_agg <- function() { 337 | z <- CATALYST:::.agg(x, by, fun, assay) 338 | if (length(by) == 1) { 339 | return(z) 340 | } 341 | set_rownames( 342 | do.call("rbind", z), 343 | levels(x$cluster_id) 344 | ) 345 | } 346 | # do 0-1 scaling for each marker trimming 347 | # lower ('q'%) & upper (1-'q'%) quantiles 348 | .do_scale <- function() { 349 | if (scale == "first") { 350 | z <- assay(x, assay) 351 | z <- CATALYST:::.scale_exprs(z, 1, q) 352 | assay(x, assay, FALSE) <- z 353 | return(x) 354 | } else { 355 | CATALYST:::.scale_exprs(z, 1, q) 356 | } 357 | } 358 | 359 | # apply one of... 360 | # - scale & trim then aggregate 361 | # - aggregate then scale & trim 362 | # - aggregate only 363 | z <- switch(scale, 364 | first = { 365 | x <- .do_scale() 366 | .do_agg() 367 | }, 368 | last = { 369 | z <- .do_agg() 370 | .do_scale() 371 | }, 372 | never = { 373 | .do_agg() 374 | } 375 | ) 376 | if (length(by) == 1) z <- t(z) 377 | 378 | if (scale != "never" && !(assay == "counts" && fun == "sum")) { 379 | qs <- round(quantile(z, c(0.01, 0.99)) * 5) / 5 380 | lgd_aes <- list(at = seq(qs[1], qs[2], 0.2)) 381 | } else { 382 | lgd_aes <- list() 383 | } 384 | lgd_aes$title_gp <- grid::gpar( 385 | fontsize = 10, 386 | fontface = "bold", 387 | lineheight = 0.8 388 | ) 389 | 390 | # left-hand side heatmap annotation: 391 | # non-numeric cell metadata variables 392 | if (!isFALSE(row_anno)) { 393 | left_anno <- switch(by[1], 394 | sample_id = .anno_factors(x, levels(x$sample_id), row_anno, "row"), 395 | CATALYST:::.anno_clusters(x, k, m, k_pal, m_pal) 396 | ) 397 | } else { 398 | left_anno <- NULL 399 | } 400 | if (!isFALSE(col_anno) && length(by) == 2) { 401 | top_anno <- .anno_factors(x, levels(x$sample_id), col_anno, "colum") 402 | } else { 403 | top_anno <- NULL 404 | } 405 | 406 | # right-hand side heatmap annotation: 407 | # labeled barplot of event counts by sample 408 | if (bars) { 409 | right_anno <- .anno_counts(x[[by[1]]], perc) 410 | } else { 411 | right_anno <- NULL 412 | } 413 | 414 | # get bin annotation 415 | if (bin_anno) { 416 | cell_fun <- function(j, i, x, y, ...) 
{ 417 | grid.text( 418 | gp = gpar(fontsize = 8), 419 | sprintf("%.2f", z[i, j]), x, y 420 | ) 421 | } 422 | } else { 423 | cell_fun <- NULL 424 | } 425 | 426 | a <- ifelse(assay == "exprs", "expression", assay) 427 | f <- switch(fun, 428 | "median" = "med", 429 | fun 430 | ) 431 | hm_title <- switch(scale, 432 | first = sprintf("%s %s\n%s", fun, "scaled", a), 433 | last = sprintf("%s %s\n%s", "scaled", fun, a), 434 | never = paste(fun, a, sep = "\n") 435 | ) 436 | if (length(by) == 2) { 437 | col_title <- features 438 | } else if (length(features) == 1 && 439 | features %in% c("type", "state")) { 440 | col_title <- paste0(features, "_markers") 441 | } else { 442 | col_title <- "" 443 | } 444 | 445 | ComplexHeatmap::Heatmap( 446 | matrix = z, 447 | name = hm_title, 448 | col = circlize::colorRamp2( 449 | seq(min(z), max(z), l = n <- 100), 450 | colorRampPalette(hm_pal)(n) 451 | ), 452 | column_title = col_title, 453 | column_title_side = ifelse(length(by) == 2, "top", "bottom"), 454 | cell_fun = cell_fun, 455 | cluster_rows = row_clust, 456 | cluster_columns = col_clust, 457 | show_row_dend = row_dend, 458 | show_column_dend = col_dend, 459 | clustering_distance_rows = distance, 460 | clustering_method_rows = linkage, 461 | clustering_distance_columns = distance, 462 | clustering_method_columns = linkage, 463 | show_row_names = ( 464 | is.null(left_anno) || 465 | isTRUE(by == "sample_id")) && !perc, 466 | row_names_side = ifelse( 467 | by[1] == "cluster_id" || 468 | isFALSE(row_anno) && !row_dend || 469 | isFALSE(row_clust), 470 | "left", "right" 471 | ), 472 | top_annotation = top_anno, 473 | left_annotation = left_anno, 474 | right_annotation = right_anno, 475 | rect_gp = grid::gpar(col = "white"), 476 | heatmap_legend_param = lgd_aes 477 | ) 478 | } 479 | ``` 480 | 481 | ```{r plot-markers-samples-heatmap} 482 | num_samples <- n_distinct(sce$sample_id) 483 | plot_height <- 4 + (num_samples / 8) 484 | plot_height <- round(plot_height, 1) 485 | # hm_plot <- plotExprHeatmap(sce, k_pal = pal_igv()(51), m_pal = pal_igv()(51), scale = "last") 486 | hm_plot <- plotExprHeatmap_ed(sce, scale = "last") 487 | png(glue("{exprs_dir}/exprs-markers-samples-heatmap.png"), width = 15, height = plot_height, units = "in", res = 300) 488 | print(hm_plot) 489 | dev.off() 490 | ``` 491 | 492 | Aggregate expression the same way as `CATALYST::plotExprHeatmap()` function 493 | 494 | ```{r} 495 | agg_exprs <- function(x, assay = "exprs", fun = c("median", "mean", "sum"), 496 | scale = c("first", "last", "never"), q = 0.01) { 497 | # check validity of input arguments 498 | scale <- match.arg(scale) 499 | fun <- match.arg(fun) 500 | by <- "sample_id" 501 | 502 | # aggregate to pseudobulks by sample/cluster/both 503 | # using 'assay' data & 'fun' as summary statistic 504 | .do_agg <- function() { 505 | z <- CATALYST:::.agg(x, by, fun, assay) 506 | if (length(by) == 1) { 507 | return(z) 508 | } 509 | set_rownames( 510 | do.call("rbind", z), 511 | levels(x$cluster_id) 512 | ) 513 | } 514 | # do 0-1 scaling for each marker trimming 515 | # lower ('q'%) & upper (1-'q'%) quantiles 516 | .do_scale <- function() { 517 | if (scale == "first") { 518 | z <- assay(x, assay) 519 | z <- CATALYST:::.scale_exprs(z, 1, q) 520 | assay(x, assay, FALSE) <- z 521 | return(x) 522 | } else { 523 | CATALYST:::.scale_exprs(z, 1, q) 524 | } 525 | } 526 | 527 | # apply one of... 
528 | # - scale & trim then aggregate 529 | # - aggregate then scale & trim 530 | # - aggregate only 531 | z <- switch(scale, 532 | first = { 533 | x <- .do_scale() 534 | .do_agg() 535 | }, 536 | last = { 537 | z <- .do_agg() 538 | .do_scale() 539 | }, 540 | never = { 541 | .do_agg() 542 | } 543 | ) 544 | if (length(by) == 1) z <- t(z) 545 | 546 | if (scale != "never" && !(assay == "counts" && fun == "sum")) { 547 | qs <- round(quantile(z, c(0.01, 0.99)) * 5) / 5 548 | lgd_aes <- list(at = seq(qs[1], qs[2], 0.2)) 549 | } else { 550 | lgd_aes <- list() 551 | } 552 | lgd_aes$title_gp <- grid::gpar( 553 | fontsize = 10, 554 | fontface = "bold", 555 | lineheight = 0.8 556 | ) 557 | 558 | as_tibble(t(z), rownames = "marker") 559 | } 560 | ``` 561 | 562 | Aggregate expression 563 | 564 | ```{r aggregate-expression} 565 | # aggregate assay data is just (no scaling) 566 | agg_exprs(sce, scale = "never") |> write_csv(glue("{exprs_dir}/exprs-samples-median.csv")) 567 | # aggregate assay data first and scale subsequently (range of each marker will be 0-1) 568 | agg_exprs(sce, scale = "last") |> write_csv(glue("{exprs_dir}/exprs-samples-median-scaled.csv")) 569 | # scale and trim then aggregate 570 | # agg_exprs(sce, scale = "first") |> write_csv(glue("{proj_dir}/expr-samples-scale-agg.csv")) 571 | ``` 572 | 573 | # Generate UMAPs 574 | 575 | Run UMAP 576 | 577 | ```{r run-umap} 578 | # cells: maximal number of cells per sample 579 | # features: "type"/"state" for type/state_markers(x) or NULL to use all features 580 | set.seed(99) 581 | sce <- runDR(sce, dr = "UMAP", cells = 1000, n_neighbors = 50, features = NULL, min_dist = 0.3, n_epochs = 500) 582 | sce 583 | ``` 584 | 585 | Get only the cells with UMAP coordinates 586 | 587 | ```{r} 588 | # umap_cells <- reducedDims(sce)$UMAP[,1] 589 | # umap_cells <- umap_cells[!is.na(umap_cells)] 590 | ``` 591 | 592 | Randomize cell order for plotting 593 | 594 | ```{r} 595 | set.seed(99) 596 | sce_rand <- sce[, sample(colnames(sce))] 597 | ``` 598 | 599 | Plot phenotypes overlaid on a UMAP 600 | 601 | ```{r umap-phenotypes} 602 | for (p in names(colData(sce))) { 603 | if (n_distinct(colData(sce)[[p]]) < length(color_scheme)) { 604 | umap_pheno <- 605 | plotDR(sce_rand, "UMAP", color_by = p) + 606 | theme_classic() + 607 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 608 | scale_color_manual(values = color_scheme) 609 | ggsave(glue("{phono_dir}/dr-umap-pheno-{p}.png"), umap_pheno, width = 10, height = 6) 610 | } 611 | } 612 | ``` 613 | 614 | Plot samples overlaid on a UMAP, split by patient/condition when there are a lot of samples 615 | 616 | ```{r umap-samples-subsets} 617 | if (n_distinct(sce$sample_id) > 15) { 618 | for (sub_p in sort(unique(sce$patient_id))) { 619 | umap_pheno <- 620 | plotDR(sce_rand[, sce_rand$patient_id == sub_p], "UMAP", color_by = "sample_id") + 621 | theme_classic() + 622 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 623 | scale_color_manual(values = color_scheme) 624 | ggsave(glue("{phono_dir}/dr-umap-pheno-subset-sample_id-patient_id-{sub_p}.png"), umap_pheno, width = 10, height = 6) 625 | } 626 | for (sub_c in sort(unique(sce$condition))) { 627 | umap_pheno <- 628 | plotDR(sce_rand[, sce_rand$condition == sub_c], "UMAP", color_by = "sample_id") + 629 | theme_classic() + 630 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank()) + 631 | scale_color_manual(values = color_scheme) 632 | 
ggsave(glue("{phono_dir}/dr-umap-pheno-subset-sample_id-condition-{sub_c}.png"), umap_pheno, width = 10, height = 6)
633 | }
634 | }
635 | ```
636 |
637 | Plot markers overlaid on a UMAP
638 |
639 | ```{r umap-markers}
640 | marker_colors <- rev(RColorBrewer::brewer.pal(11, "RdYlBu"))
641 | # hcl.colors(10, "reds", rev = TRUE)
642 | for (m in sort(rownames(sce))) {
643 | marker_plot <-
644 | plotDR(sce_rand, "UMAP", color_by = m, assay = "exprs", a_pal = marker_colors) +
645 | theme_cowplot() +
646 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank())
647 | ggsave(glue("{exprs_dir}/dr-umap-exprs-{m}.png"), marker_plot, width = 8, height = 6)
648 | }
649 | ```
650 |
651 | # Perform clustering
652 |
653 | Cluster
654 |
655 | ```{r cluster}
656 | sce <- cluster(sce, features = NULL, xdim = 15, ydim = 15, maxK = 50, seed = 99)
657 | ```
658 |
659 | Randomize cell order for plotting
660 |
661 | ```{r}
662 | set.seed(99)
663 | sce_rand <- sce[, sample(colnames(sce))]
664 | ```
665 |
666 | Generate cluster-related plots (UMAP, heatmap, abundance)
667 |
668 | ```{r plot-clusters}
669 | for (clust in c("meta5", "meta8", "meta10", "meta15", "meta20", "meta25", "meta30", "meta50")) {
670 | umap_clust <-
671 | plotDR(sce_rand, "UMAP", color_by = clust) +
672 | theme_classic() +
673 | theme(aspect.ratio = 1, axis.text = element_blank(), axis.ticks = element_blank(), strip.background = element_blank()) +
674 | scale_color_manual(values = color_scheme)
675 | ggsave(glue("{clust_dir}/dr-umap-clusters-{clust}.png"), umap_clust, width = 8, height = 6)
676 |
677 | hm_h <- n_distinct(cluster_ids(sce, clust))
678 | hm_h <- (hm_h / 5) + 4
679 | hm_plot <- plotExprHeatmap(sce, scale = "last", k = clust, by = "cluster_id", bars = TRUE, perc = TRUE, k_pal = pal_igv()(51))
680 | png(glue("{clust_dir}/exprs-markers-clusters-heatmap-{clust}.png"), width = 15, height = hm_h, units = "in", res = 300)
681 | print(hm_plot)
682 | dev.off()
683 |
684 | plot_bar <- plotAbundances(sce, k = clust, by = "sample_id", group_by = "condition", k_pal = pal_igv()(51))
685 | ggsave(glue("{clust_dir}/abundance-bar-{clust}.png"), plot_bar, width = 9, height = 9)
686 |
687 | plot_box <- plotAbundances(sce, k = clust, by = "cluster_id", group_by = "condition", k_pal = pal_igv()(51))
688 | ggsave(glue("{clust_dir}/abundance-box-{clust}.png"), plot_box, width = 9, height = 6)
689 | }
690 | ```
691 |
692 | Generate a complete metadata table (meta-clusters are not stored in colData)
693 |
694 | ```{r combine-coldata-clusters}
695 | # confirm that cluster_id and som225 (from the 15x15 SOM grid used above) refer to the same clusters
696 | # names(cluster_codes(sce))
697 | if (identical(table(colData(sce)$cluster_id), table(cluster_ids(sce, "som225")))) {
698 | metadata_tbl <-
699 | left_join(
700 | as_tibble(colData(sce), rownames = "cell_id"),
701 | cluster_codes(sce),
702 | by = c("cluster_id" = "som225")
703 | )
704 | } else {
705 | stop("cluster_id mismatch")
706 | }
707 | dim(metadata_tbl)
708 | ```
709 |
710 | ```{r save-metadata}
711 | write_csv(metadata_tbl, glue("{qc_dir}/sce-metadata.csv.gz"))
712 | ```
713 |
714 | # Finalize analysis
715 |
716 | Save SingleCellExperiment
717 |
718 | ```{r save-sce}
719 | qs2::qs_save(object = sce, file = sce_qs2)
720 | ```
721 |
722 | Delete Rplots.pdf
723 |
724 | ```{r}
725 | if (file.exists("Rplots.pdf")) {
726 | file.remove("Rplots.pdf")
727 | }
728 | ```
729 |
730 | # Session info
731 |
732 | ```{r session-info}
733 | sessionInfo()
734 | 
``` 735 | -------------------------------------------------------------------------------- /scripts/meth-minfi.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## minfi wrapper functions to streamline the analysis of methylation microarrays 3 | ## 4 | 5 | 6 | # output width 7 | options(width = 120) 8 | # print warnings as they occur 9 | options(warn = 1) 10 | # default type for the bitmap devices such as png (should default to "cairo") 11 | options(bitmapType = "cairo") 12 | 13 | # dependencies 14 | suppressPackageStartupMessages({ 15 | library(tidyverse) 16 | library(minfi) 17 | library(RColorBrewer) 18 | library(ggsci) 19 | }) 20 | 21 | # color scale for plots 22 | plot_colors = c(brewer.pal(5, "Set1"), brewer.pal(8, "Dark2"), pal_igv("default")(51)) 23 | 24 | # import data and generate some qc plots 25 | load_data = function(sample_sheet) { 26 | 27 | # check if sample sheet exists 28 | if (!file.exists(sample_sheet)) stop("sample sheet ", sample_sheet, " does not exist") 29 | 30 | # import sample sheet 31 | samples_tbl = read_csv(sample_sheet) 32 | 33 | # sample sheet needs to have "Basename" and "Sentrix_ID" for minfi 34 | if (!("Basename" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Basename\" column") 35 | if (!("Sentrix_ID" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Sentrix_ID\" column") 36 | 37 | # sample sheet needs to have "Sample" and "Condition" for this workflow 38 | if (!("Sample" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Sample\" column") 39 | if (!("Condition" %in% colnames(samples_tbl))) stop("sample sheet must contain \"Condition\" column") 40 | 41 | # add array ID based on basename (to check for batch effects, for example) 42 | samples_tbl$Array = gsub(".*/([0-9]*)_R[0-9][0-9]C[0-9][0-9]", "\\1", samples_tbl$Basename) 43 | 44 | message("\n\n ===== minfi::read.metharray.exp() ===== \n\n") 45 | 46 | # red and green channel measurements of the samples (combine() combines two sets of samples) 47 | raw_set = read.metharray.exp(targets = samples_tbl, recursive = TRUE, verbose = FALSE) 48 | 49 | # check that sample names and pData are in the same order (probably not necessary) 50 | if (!(identical(sampleNames(raw_set), sub(".*/", "", pData(raw_set)$Basename)))) stop("sample names not identical") 51 | 52 | # change sample identifier from "Basename" to "Sample" 53 | sampleNames(raw_set) = pData(raw_set)$Sample 54 | 55 | # show which array type and corresponding package are being used 56 | message("array: ", annotation(raw_set)[["array"]]) 57 | message("annotation: ", annotation(raw_set)[["annotation"]]) 58 | 59 | # show conditions 60 | message("samples per condition: ") 61 | raw_set$Condition %>% table(useNA = "ifany") %>% print() 62 | 63 | message("\n\n ===== minfi::read.qcReport() ===== \n\n") 64 | 65 | # PDF QC report of the most common plots 66 | qcReport(raw_set, sampGroups=pData(raw_set)$Condition, pdf="plot.qcreport.pdf") 67 | 68 | png("plot.density.raw.condition.png", width = 8, height = 5, units = "in", res = 300) 69 | densityPlot(raw_set, sampGroups = pData(raw_set)$Condition, pal = plot_colors) 70 | dev.off() 71 | 72 | png("plot.density.raw.array.png", width = 8, height = 5, units = "in", res = 300) 73 | densityPlot(raw_set, sampGroups = pData(raw_set)$Array, pal = plot_colors) 74 | dev.off() 75 | 76 | # delete Rplots.pdf 77 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 78 | 79 | message("\n\n ===== minfi::detectionP() ===== \n\n") 80 | 81 | # identify 
failed positions
82 | det_p = detectionP(raw_set)
83 |
84 | # save detection stats
85 | det_p_summary =
86 | tibble(
87 | sample = colnames(det_p),
88 | detected_positions = colSums(det_p < 0.01),
89 | failed_positions = colSums(det_p >= 0.01),
90 | failed_positions_pct = round(colMeans(det_p >= 0.01), digits = 3)
91 | ) %>%
92 | arrange(-failed_positions) %>%
93 | mutate(failed_positions_pct = failed_positions_pct * 100)
94 | write_csv(det_p_summary, "summary.detection.csv")
95 |
96 | return(raw_set)
97 |
98 | }
99 |
100 | # normalize raw data using functional normalization (FunNorm) and generate some qc plots
101 | normalize_data = function(raw_set) {
102 |
103 | # FunNorm: preprocessFunnorm() -> GenomicRatioSet
104 | # Noob: preprocessNoob() -> MethylSet -> mapToGenome() -> GenomicMethylSet -> ratioConvert() -> GenomicRatioSet
105 | # may not be necessary to convert to GenomicRatioSet, getBeta() works with both
106 |
107 | message("\n\n ===== minfi::preprocessRaw() ===== \n\n")
108 |
109 | mset = preprocessRaw(raw_set)
110 |
111 | # plot and save median intensity QC
112 | qc = getQC(mset)
113 | mset = addQC(mset, qc = qc)
114 | png("plot.medianintensity.png", width = 8, height = 8, units = "in", res = 300)
115 | plotQC(qc)
116 | dev.off()
117 |
118 | # worst samples (median intensity < 10.5 is failing by default)
119 | # qc[qc[,"mMed"] < 10.5 | qc[,"uMed"] < 10.5,]
120 |
121 | message("\n\n ===== minfi::preprocessFunnorm() ===== \n\n")
122 |
123 | # functional normalization (FunNorm) - produces GenomicRatioSet
124 | norm_set = preprocessFunnorm(raw_set, bgCorr = TRUE, dyeCorr = TRUE)
125 | # class(norm_set)
126 | write(paste0("total probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
127 |
128 | # identify failed positions
129 | det_p = detectionP(raw_set)
130 |
131 | # keep only probes that pass through preprocessFunnorm()
132 | det_p = det_p[intersect(rownames(det_p), rownames(norm_set)), ]
133 |
134 | # probes detected in at least 90% of the samples
135 | # normSet = normSet[rowSums(det_p < 0.01) > ncol(det_p) * 0.9, ]
136 | # probes detected in all samples
137 | norm_set = norm_set[rowSums(det_p < 0.01) == ncol(det_p), ]
138 | write(paste0("detected probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
139 |
140 | # drop the probes that contain either a SNP at the CpG interrogation or at the single nucleotide extension
141 | norm_set = addSnpInfo(norm_set)
142 | # head(granges(norm_set))
143 | norm_set = dropLociWithSnps(norm_set, snps = c("SBE","CpG"), maf = 0)
144 | write(paste0("non-SNP probes: ", nrow(norm_set)), file = "norm.log", append = TRUE)
145 |
146 | # sex prediction plot
147 | png("plot.sex.png", width = 8, height = 8, units = "in", res = 300)
148 | plotSex(getSex(norm_set), id = sampleNames(norm_set))
149 | dev.off()
150 |
151 | # annotation
152 | annot = getAnnotation(norm_set)
153 |
154 | # remove extra annotation columns and save
155 | annot_tbl = annot %>% as_tibble(rownames = "probe") %>% arrange(probe)
156 | remove_cols = c(
157 | "AddressA", "AddressB", "ProbeSeqA", "ProbeSeqB", "NextBase", "Color", "Forward_Sequence", "SourceSeq",
158 | "Probe_rs", "CpG_rs", "SBE_rs","Probe_maf", "CpG_maf", "SBE_maf", "Islands_Name", "UCSC_RefGene_Accession",
159 | "GencodeBasicV12_NAME", "GencodeBasicV12_Accession", "GencodeBasicV12_Group", "GencodeCompV12_Accession",
160 | "DNase_Hypersensitivity_NAME", "OpenChromatin_NAME", "Methyl27_Loci", "Methyl450_Loci", "Random_Loci")
161 | annot_tbl = annot_tbl %>% 
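# any_of() (rather than all_of()) silently skips columns that are missing,
# so the same remove_cols list works across array annotation versions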
dplyr::select(!any_of(remove_cols)) 162 | write_csv(head(annot_tbl, 100), "annot.head100.csv") 163 | write_csv(annot_tbl, "annot.csv.gz") 164 | 165 | # remove sex probes 166 | sex_probes = annot$Name[annot$chr %in% c("chrX", "chrY")] 167 | norm_set = norm_set[!(rownames(norm_set) %in% sex_probes), ] 168 | write(paste0("non-sex probes: ", nrow(norm_set)), file = "norm.log", append = TRUE) 169 | 170 | beta = getBeta(norm_set) 171 | 172 | png("plot.density.norm.fnorm.png", width = 8, height = 5, units = "in", res = 300) 173 | densityPlot(beta, sampGroups = pData(norm_set)$Condition, pal = plot_colors) 174 | dev.off() 175 | 176 | # MDS plots (must be an 'RGChannelSet', a 'MethylSet' or matrix) 177 | 178 | png("plot.mds.raw.condition.png", width = 8, height = 8, units = "in", res = 300) 179 | mdsPlot(raw_set, numPositions = 10000, sampNames = sampleNames(raw_set), sampGroups = pData(raw_set)$Condition, 180 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 181 | dev.off() 182 | 183 | png("plot.mds.norm.fnorm.condition.png", width = 8, height = 8, units = "in", res = 300) 184 | mdsPlot(beta, numPositions = 10000, sampNames = sampleNames(norm_set), sampGroups = pData(norm_set)$Condition, 185 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 186 | dev.off() 187 | 188 | png("plot.mds.norm.fnorm.array.png", width = 8, height = 8, units = "in", res = 300) 189 | mdsPlot(beta, numPositions = 10000, sampNames = sampleNames(norm_set), sampGroups = pData(norm_set)$Array, 190 | legendPos = "topright", legendNCol = 1, pal = plot_colors) 191 | dev.off() 192 | 193 | # save beta values 194 | beta = getBeta(norm_set) 195 | beta_tbl = beta %>% round(3) %>% as_tibble(rownames = "probe") %>% arrange(probe) 196 | write_csv(head(beta_tbl, 100), "beta.head100.csv") 197 | write_csv(beta_tbl, "beta.csv.gz") 198 | 199 | return(norm_set) 200 | 201 | } 202 | 203 | 204 | 205 | # end 206 | -------------------------------------------------------------------------------- /scripts/mut-mhc-binding.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use v5.10; 6 | use File::Basename; 7 | use Bio::SeqIO; 8 | 9 | my $HELP = <new(-file => $coding_change_fa, -format => 'fasta'); 204 | my $seqio_out = Bio::SeqIO->new(-file => ">$out_file", -format => 'fasta'); 205 | my $seq_num = 0; 206 | # tracking WT sequence info (listed before mutant) 207 | my ($seq_id_wt, $tx_name_wt, $sequence_wt, $seq_padded_wt) = ("ERR", "ERR", "ERR", "ERR"); 208 | while ( my $seq_in = $seqio_in->next_seq() ) { 209 | # extract relevant parts 210 | my $seq_id = $seq_in->id; 211 | my $seq_desc = $seq_in->desc; 212 | $seq_desc =~ s/\s+$//; 213 | my $sequence = $seq_in->seq; 214 | 215 | # extract mutation details and sequence around the WT sequence (listed before mutant) 216 | if ($seq_desc =~ m/(.+?)\sWILDTYPE/) { 217 | $seq_id_wt = $seq_id; 218 | $sequence_wt = $sequence; 219 | $tx_name_wt = $1; 220 | } 221 | 222 | # process only altered sequences and single amino acid substitutions 223 | if ($seq_desc !~ m/silent/ && $seq_desc =~ m/\sp\.\w\d+\w\s/) { 224 | my ($tx_name, $aa_num, $aa_from, $aa_to, $seq_padded) = ("ERR", "ERR", "ERR", "ERR", "ERR"); 225 | # extract mutation details and sequence around the altered amino acid 226 | if ($seq_desc =~ m/(.+?)\s.+?\(position\s(\d+)\schanged\sfrom\s(\w)\sto\s(\w)\)/) { 227 | $tx_name = $1; 228 | $aa_num = $2; 229 | $aa_from = $3; 230 | $aa_to = $4; 231 | # adjust offsets if amino acid is close to the 
beginning of sequence 232 | my $substr_start = $aa_num - 1 - $padding; 233 | $substr_start = $substr_start < 0 ? 0 : $substr_start; 234 | my $substr_length = $aa_num > $padding ? $padding + $padding + 1 : $aa_num + $padding; 235 | # get sequence (0-based) 236 | $seq_padded = substr($sequence, $substr_start, $substr_length); 237 | # remove trailing asterisk (stop codon) 238 | $seq_padded =~ s/\*$//; 239 | # say "${seq_id}\t${tx_name}\t${aa_from}${aa_num}${aa_to}\t${seq_padded}"; 240 | 241 | # repeat for WT (was processed as previous sequence) 242 | my $seq_padded_wt = substr($sequence_wt, $substr_start, $substr_length); 243 | $seq_padded_wt =~ s/\*$//; 244 | 245 | # create output FASTA record and write it (binding predictions will just assign consecutive numbers) 246 | $seq_num++; 247 | my $new_id = "${seq_num}|${seq_id}|${tx_name}|${aa_from}${aa_num}${aa_to}|${seq_padded}"; 248 | my $seq_out = Bio::Seq->new(-seq => $seq_padded, -id => $new_id); 249 | $seqio_out->write_seq($seq_out); 250 | 251 | # create WT output FASTA record and write it (binding predictions will just assign consecutive numbers) 252 | $seq_num++; 253 | my $new_id_wt = "${seq_num}|${seq_id_wt}|${tx_name_wt}|WILDTYPE|${seq_padded_wt}"; 254 | my $seq_out_wt = Bio::Seq->new(-seq => $seq_padded_wt, -id => $new_id_wt); 255 | $seqio_out->write_seq($seq_out_wt); 256 | } 257 | } 258 | } 259 | 260 | # confirm that padded mutations FASTA file generated 261 | unless ( -e $out_file ) { 262 | die "\n\n ERROR: $out_file DOES NOT EXIST \n\n"; 263 | } 264 | if ( -z $out_file ) { 265 | die "\n\n ERROR: $out_file IS EMPTY \n\n"; 266 | } 267 | 268 | say "created $out_file"; 269 | 270 | return $out_file; 271 | } 272 | 273 | # run IEDB MHC-I Binding Predictions 274 | sub predict_mhc_i_binding { 275 | my $base_name = $_[0]; 276 | my $mutpad_fa = $_[1]; 277 | my $peptide_length = $_[2]; 278 | my $predict_binding_py = $_[3]; 279 | 280 | my $raw_iedb_out_file = "${base_name}.iedb.txt"; 281 | 282 | # delete output if already exists 283 | if ( -e $raw_iedb_out_file ) { 284 | unlink $raw_iedb_out_file; 285 | } 286 | 287 | # a reference panel of 27 alleles (human HLA reference set with maximal population coverage) 288 | # http://help.iedb.org/hc/en-us/articles/114094151851 289 | # my @alleles = ('HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:03', 'HLA-A*02:06', 'HLA-A*03:01', 'HLA-A*11:01', 290 | # 'HLA-A*23:01', 'HLA-A*24:02', 'HLA-A*26:01', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01','HLA-A*32:01', 291 | # 'HLA-A*33:01', 'HLA-A*68:01', 'HLA-A*68:02', 'HLA-B*07:02', 'HLA-B*08:01', 'HLA-B*15:01', 'HLA-B*35:01', 292 | # 'HLA-B*40:01', 'HLA-B*44:02', 'HLA-B*44:03', 'HLA-B*51:01', 'HLA-B*53:01', 'HLA-B*57:01', 'HLA-B*58:01'); 293 | 294 | # mouse alleles 295 | my @alleles = ('H-2-Db', 'H-2-Dd', 'H-2-Kb', 'H-2-Kd', 'H-2-Kk', 'H-2-Ld'); 296 | 297 | # make predictions for each allele 298 | foreach (@alleles) { 299 | say "running predictions for allele $_"; 300 | 301 | # predict_binding.py command 302 | # ./src/predict_binding [method] [mhc] [peptide_length] [input_file] 303 | my $predict_binding_cmd = "$predict_binding_py consensus $_ $peptide_length $mutpad_fa >> $raw_iedb_out_file"; 304 | system $predict_binding_cmd; 305 | 306 | # confirm that binding predictions file generated 307 | unless ( -e $raw_iedb_out_file ) { 308 | die "\n\n ERROR: $raw_iedb_out_file DOES NOT EXIST \n\n"; 309 | } 310 | if ( -z $raw_iedb_out_file ) { 311 | die "\n\n ERROR: $raw_iedb_out_file IS EMPTY \n\n"; 312 | } 313 | } 314 | 315 | my $merged_output = 
merge_binding_predictions_input_output($base_name, $mutpad_fa, $raw_iedb_out_file); 316 | 317 | # clean up 318 | sleep(1); 319 | # unlink $mutpad_fa; 320 | 321 | say "created $merged_output"; 322 | 323 | return $merged_output; 324 | 325 | } 326 | 327 | # run IEDB MHC-II Binding Predictions 328 | sub predict_mhc_ii_binding { 329 | my $base_name = $_[0]; 330 | my $mutpad_fa = $_[1]; 331 | my $peptide_length = $_[2]; 332 | my $predict_binding_py = $_[3]; 333 | 334 | my $raw_iedb_out_file = "${base_name}.iedb.txt"; 335 | 336 | # delete output if already exists 337 | if ( -e $raw_iedb_out_file ) { 338 | unlink $raw_iedb_out_file; 339 | } 340 | 341 | # mouse alleles 342 | my @alleles = ('H2-IAb', 'H2-IAd'); 343 | 344 | # make predictions for each allele 345 | foreach (@alleles) { 346 | say "running predictions for allele $_"; 347 | 348 | # mhc_II_binding.py command 349 | # python mhc_II_binding.py prediction_method_name allele_name input_sequence_file_name 350 | # Example: python mhc_II_binding.py consensus3 HLA-DRB1*03:01 test.fasta 351 | my $predict_binding_cmd = "$predict_binding_py consensus3 $_ $mutpad_fa >> $raw_iedb_out_file"; 352 | system $predict_binding_cmd; 353 | say "$predict_binding_cmd"; 354 | 355 | # confirm that binding predictions file generated 356 | unless ( -e $raw_iedb_out_file ) { 357 | die "\n\n ERROR: $raw_iedb_out_file DOES NOT EXIST \n\n"; 358 | } 359 | if ( -z $raw_iedb_out_file ) { 360 | die "\n\n ERROR: $raw_iedb_out_file IS EMPTY \n\n"; 361 | } 362 | } 363 | 364 | my $merged_output = merge_binding_predictions_input_output($base_name, $mutpad_fa, $raw_iedb_out_file); 365 | 366 | # clean up 367 | sleep(1); 368 | # unlink $mutpad_fa; 369 | 370 | say "created $merged_output"; 371 | 372 | return $merged_output; 373 | 374 | } 375 | 376 | # combine binding predictions input FASTA and output table 377 | sub merge_binding_predictions_input_output { 378 | my $base_name = $_[0]; 379 | my $mutpad_fa = $_[1]; 380 | my $iedb_out_txt = $_[2]; 381 | 382 | my $out_file = "${base_name}.binding.txt"; 383 | 384 | # delete output if already exists 385 | if ( -e $out_file ) { 386 | unlink $out_file; 387 | } 388 | 389 | # $iedb_out_txt columns: allele, seq_num, start, end, peptide, ... 
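# an illustrative (made-up) $iedb_out_txt row, tab-separated:
#   H-2-Kb  3  1  9  SVGSDCTTI  ...
# the seq_num column (3 here) matches the numeric prefix assigned to the FASTA records above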
390 | # ${mutpad_fa}.txt columns: seq_num, line_id, transcript_id, mutation, peptide 391 | 392 | # header for the mutated and WT parts of the combined file 393 | my $bindpred_header = `cat $iedb_out_txt | head -1 | cut -f 1,3,5-11`; 394 | system "printf \"line_id\ttranscript_id\tp_change\taa_padded\t${bindpred_header}\" > ${out_file}.mut.txt"; 395 | $bindpred_header =~ s/\t/_wt\t/g; 396 | system "printf \"p_change_wt\taa_padded_wt\t${bindpred_header}\" > ${out_file}.wt.txt"; 397 | 398 | # clean up input files for joining 399 | system "cat $mutpad_fa | grep '^>' | cut -c 2- | tr '|' '\t' | LC_ALL=C sort -k1,1 > ${mutpad_fa}.txt"; 400 | system "cat $iedb_out_txt | grep -v '^allele' | LC_ALL=C sort -k2,2 -k1,1 -k3,3 | cut -f 1,2,3,5-11 > ${iedb_out_txt}.tmp"; 401 | 402 | system "cat ${mutpad_fa}.txt | grep -v 'WILDTYPE' > ${mutpad_fa}.mut.txt"; 403 | system "cat ${mutpad_fa}.txt | grep 'WILDTYPE' > ${mutpad_fa}.wt.txt"; 404 | 405 | # join by seq_num, remove seq_num col, sort by line_id and start 406 | my $join_mut_cmd = 'LC_ALL=C join -t $\'\t\' -a 1 -1 1 -2 2'; 407 | $join_mut_cmd .= " ${mutpad_fa}.mut.txt"; 408 | $join_mut_cmd .= " ${iedb_out_txt}.tmp"; 409 | $join_mut_cmd .= " | LC_ALL=C sort -k2,2 -k7,7"; 410 | $join_mut_cmd .= " | cut -f 2-"; 411 | $join_mut_cmd .= " >> ${out_file}.mut.txt"; 412 | system $join_mut_cmd; 413 | 414 | # join by seq_num 415 | my $join_wt_cmd = 'LC_ALL=C join -t $\'\t\' -a 1 -1 1 -2 2'; 416 | $join_wt_cmd .= " ${mutpad_fa}.wt.txt"; 417 | $join_wt_cmd .= " ${iedb_out_txt}.tmp"; 418 | $join_wt_cmd .= " | LC_ALL=C sort -k2,2 -k7,7"; 419 | $join_wt_cmd .= " | cut -f 4-"; 420 | $join_wt_cmd .= " >> ${out_file}.wt.txt"; 421 | system $join_wt_cmd; 422 | 423 | sleep(1); 424 | 425 | system "paste ${out_file}.mut.txt ${out_file}.wt.txt >> ${out_file}"; 426 | 427 | # confirm that binding predictions file generated 428 | unless ( -e $out_file ) { 429 | die "\n\n ERROR: $out_file DOES NOT EXIST \n\n"; 430 | } 431 | 432 | # clean up 433 | sleep(1); 434 | # unlink $mutpad_fa; 435 | unlink "${iedb_out_txt}.tmp"; 436 | unlink "${mutpad_fa}.mut.txt"; 437 | unlink "${mutpad_fa}.wt.txt"; 438 | unlink "${out_file}.mut.txt"; 439 | unlink "${out_file}.wt.txt"; 440 | 441 | return $out_file; 442 | 443 | } 444 | 445 | # combine binding predictions table with variant annotations 446 | sub annotate_binding_predictions { 447 | my $base_name = $_[0]; 448 | my $bindpred_txt = $_[1]; 449 | my $evf = $_[2]; 450 | 451 | my $out_file = "${base_name}.binding.annot.txt"; 452 | 453 | # header for the combined file 454 | my $bindpred_header = `cat $bindpred_txt | head -1 | cut -f 2-`; 455 | system "printf \"#MUT\taa_change\tchr\tpos\tref\talt\t${bindpred_header}\" > $out_file"; 456 | 457 | # clean up input files for joining 458 | system "cat $evf | LC_ALL=C sort -k1,1 | cut -f 1,3,4,5,7,8 > ${evf}.tmp"; 459 | system "cat $bindpred_txt | grep -v '^line_id' | LC_ALL=C sort -k1,1 > ${bindpred_txt}.tmp"; 460 | 461 | # join, remove seq col, add sample and mut cols, sort by consensus_percentile_rank 462 | my $join_cmd = 'LC_ALL=C join -t $\'\t\' -a 2'; 463 | $join_cmd .= " ${evf}.tmp"; 464 | $join_cmd .= " ${bindpred_txt}.tmp"; 465 | $join_cmd .= " | cut -f 2-"; 466 | $join_cmd .= ' | awk -F $\'\t\' \'BEGIN {OFS=FS} {print $2":"$3":"$4":"$5,$0}\''; 467 | $join_cmd .= " | LC_ALL=C sort -k13,13n -k14,14n"; 468 | $join_cmd .= " >> $out_file"; 469 | system $join_cmd; 470 | 471 | # clean up 472 | sleep(1); 473 | unlink "${evf}.tmp"; 474 | unlink "${bindpred_txt}.tmp"; 475 | 476 | return $out_file; 477 | } 478 
| 479 | 480 | 481 | # end 482 | -------------------------------------------------------------------------------- /scripts/scrna-decontaminate-soupx.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | '
4 | Description:
5 | Remove ambient RNA contamination from 10x Genomics Chromium single-cell RNA-seq data using SoupX.
6 | Input and output are in the format produced by the Cell Ranger software suite.
7 |
8 | Usage:
9 | scrna-decontaminate-soupx.R <in_dir> <out_dir>
10 |
11 | Arguments:
12 | <in_dir>   input directory
13 | <out_dir>  output directory (will contain "outs/filtered_feature_bc_matrix")
14 |
15 | Options:
16 | -h, --help show this screen
17 | ' -> doc
18 |
19 |
20 | # increase output width
21 | options(width = 120)
22 | # print warnings as they occur
23 | options(warn = 1)
24 |
25 | # retrieve the command-line arguments
26 | library(docopt)
27 | opts = docopt(doc)
28 |
29 | # relevant arguments
30 | in_dir = opts$in_dir
31 | out_dir = opts$out_dir
32 |
33 | # check if the input parameters are valid
34 | message("input dir: ", in_dir)
35 | if (!dir.exists(in_dir)) { stop("input dir does not exist") }
36 | message("output dir: ", out_dir)
37 | if (dir.exists(out_dir)) { stop("output dir already exists") }
38 |
39 | # load libraries
40 | suppressPackageStartupMessages({
41 | library(glue)
42 | library(Matrix)
43 | library(SoupX)
44 | library(DropletUtils)
45 | })
46 |
47 | # set output directory as working directory
48 | dir.create(out_dir)
49 | if (dir.exists(out_dir)) {
50 | setwd(out_dir)
51 | } else {
52 | stop(glue("output dir {out_dir} could not be created"))
53 | }
54 |
55 | # log to file
56 | write(glue("analysis: {out_dir}"), file = "create.log", append = TRUE)
57 | write(glue("soupx version: {packageVersion('SoupX')}"), file = "create.log", append = TRUE)
58 |
59 | # find "outs" dir (contains "raw_feature_bc_matrix")
60 | cr_outs_dir = list.files(path = in_dir, pattern = "raw_feature_bc_matrix$", full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
61 | cr_outs_dir = dirname(cr_outs_dir)
62 | if (length(cr_outs_dir) != 1) stop(glue("could not find a unique outs directory in {in_dir}"))
63 |
64 | # load data and estimate soup profile
65 | sc = SoupX::load10X(cr_outs_dir)
66 |
67 | # log the stats
68 | write(glue("counts matrix cells: {ncol(sc$toc)}"), file = "create.log", append = TRUE)
69 | write(glue("counts matrix genes: {nrow(sc$toc)}"), file = "create.log", append = TRUE)
70 | in_umis = Matrix::colSums(sc$toc)
71 | write(glue("unfiltered mean UMIs per cell: {round(mean(in_umis), 3)}"), file = "create.log", append = TRUE)
72 | write(glue("unfiltered median UMIs per cell: {median(in_umis)}"), file = "create.log", append = TRUE)
73 | in_detected_genes = Matrix::colSums(sc$toc > 0)
74 | write(glue("unfiltered min genes per cell: {min(in_detected_genes)}"), file = "create.log", append = TRUE)
75 | write(glue("unfiltered max genes per cell: {max(in_detected_genes)}"), file = "create.log", append = TRUE)
76 | write(glue("unfiltered mean genes per cell: {round(mean(in_detected_genes), 3)}"), file = "create.log", append = TRUE)
77 | write(glue("unfiltered median genes per cell: {median(in_detected_genes)}"), file = "create.log", append = TRUE)
78 |
79 | # estimate the level of background contamination (represented as rho)
80 | # creates a plot showing the density of estimates
81 | png("qc.soupx.estimates.png", res = 300, width = 8, height = 5, units = "in")
82 | sc = SoupX::autoEstCont(sc)
83 | dev.off()
84 |
85 | write(glue("estimated rho: {sc$fit$rhoEst}"), file = "create.log", append = TRUE)
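# rho is the contamination fraction: rho = 0.05 means an estimated ~5% of UMIs in each
# cell originate from the ambient soup rather than the cell itself
# if the automated estimate looks unreasonable, a manual value could be set instead,
# e.g. using SoupX::setContaminationFraction() (the 0.2 here is only an illustration):
# sc = SoupX::setContaminationFraction(sc, 0.2)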
"create.log", append = TRUE) 86 | 87 | # clean the data 88 | soupx_out = SoupX::adjustCounts(sc) 89 | dim(soupx_out) 90 | 91 | # log the stats 92 | out_umis = Matrix::colSums(soupx_out) 93 | write(glue("decontaminated mean UMIs per cell: {round(mean(out_umis), 3)}"), file = "create.log", append = TRUE) 94 | write(glue("decontaminated median UMIs per cell: {median(out_umis)}"), file = "create.log", append = TRUE) 95 | out_detected_genes = Matrix::colSums(soupx_out > 0) 96 | write(glue("decontaminated min genes per cell: {min(out_detected_genes)}"), file = "create.log", append = TRUE) 97 | write(glue("decontaminated max genes per cell: {max(out_detected_genes)}"), file = "create.log", append = TRUE) 98 | write(glue("decontaminated mean genes per cell: {round(mean(out_detected_genes), 3)}"), file = "create.log", append = TRUE) 99 | write(glue("decontaminated median genes per cell: {median(out_detected_genes)}"), file = "create.log", append = TRUE) 100 | 101 | # write count data in the 10x format (path must not exist) 102 | dir.create("./outs") 103 | DropletUtils::write10xCounts(x = soupx_out, path = "./outs/filtered_feature_bc_matrix", version = "3") 104 | 105 | # delete Rplots.pdf 106 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 107 | 108 | 109 | 110 | # end 111 | -------------------------------------------------------------------------------- /scripts/scrna-doublets-scdblfinder.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ' 4 | Description: 5 | Mark putative doublets in single-cell RNA-seq data stored as a Seurat object using scDblFinder. 6 | Input is a Seurat object stored as an RDS file. 7 | 8 | Usage: 9 | scrna-doublets-scdblfinder.R 10 | 11 | Arguments: 12 | input directory 13 | 14 | Options: 15 | -h, --help show this screen 16 | ' -> doc 17 | 18 | 19 | # increase output width 20 | options(width = 120) 21 | # print warnings as they occur 22 | options(warn = 1) 23 | 24 | # retrieve the command-line arguments 25 | library(docopt) 26 | opts = docopt(doc) 27 | 28 | # relevent arguments 29 | seurat_rds = opts$seurat_rds 30 | out_dir = dirname(seurat_rds) 31 | 32 | message("seurat object: ", seurat_rds) 33 | message("output dir: ", out_dir) 34 | 35 | # check if the input is valid 36 | if (!file.exists(seurat_rds)) { stop("object file does not exist") } 37 | if (!dir.exists(out_dir)) { stop("output dir does not exist") } 38 | 39 | # load libraries 40 | suppressPackageStartupMessages({ 41 | library(Seurat) 42 | library(tidyverse) 43 | library(cowplot) 44 | library(glue) 45 | library(scran) 46 | library(scDblFinder) 47 | }) 48 | 49 | # import seurat object 50 | seurat_obj = readRDS(seurat_rds) 51 | 52 | # set output directory as working directory 53 | setwd(out_dir) 54 | 55 | # check if output exists already 56 | if (file.exists("doublets.scDblFinder.csv.gz")) { stop("output already exists") } 57 | 58 | # log to file 59 | write(glue("scDblFinder version: {packageVersion('scDblFinder')}"), file = "create.log", append = TRUE) 60 | 61 | Idents(seurat_obj) = "orig.ident" 62 | sce = Seurat::as.SingleCellExperiment(seurat_obj, assay = "RNA") 63 | sce = scran::computeSumFactors(sce, BPPARAM = BiocParallel::MulticoreParam(4)) 64 | 65 | set.seed(99) 66 | if (("hash.ID" %in% names(seurat_obj@meta.data)) && ("HTO" %in% Seurat::Assays(seurat_obj))) { 67 | # hashed multi-sample experiment 68 | # `hash.ID` is created by `HTODemux`, `MULTI_ID` is the `MULTIseqDemux` equivalent and renamed `hash.ID` by scooter 69 | # samples 
are independent captures, not biological samples, if multiplexed using cell hashes 70 | if ("library" %in% names(seurat_obj@meta.data)) { 71 | if ("Doublet" %in% seurat_obj@meta.data$hash.ID) { 72 | message("library/batch: `library` with known doublets") 73 | known_doublets = sce$hash.ID == "Doublet" 74 | hto_doublet_rate = round(sum(known_doublets) / ncol(seurat_obj), 3) 75 | message(glue("hashtag doublet rate: {hto_doublet_rate}")) 76 | write(glue("hashtag doublet rate: {hto_doublet_rate}"), file = "create.log", append = TRUE) 77 | } else { 78 | message("library/batch: `library` without known doublets") 79 | known_doublets = NULL 80 | } 81 | doublet_tbl = 82 | scDblFinder( 83 | sce, samples = "library", knownDoublets = known_doublets, 84 | returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4) 85 | ) 86 | } else { 87 | stop("hashed multi-sample experiment should have a `library` metadata column") 88 | } 89 | } else if (n_distinct(seurat_obj@meta.data$orig.ident) > 1) { 90 | # multi-sample experiment 91 | message("library/batch: `orig.ident` without known doublets") 92 | doublet_tbl = scDblFinder(sce, samples = "orig.ident", returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4)) 93 | } else { 94 | message("library/batch: none") 95 | doublet_tbl = scDblFinder(sce, returnType = "table", BPPARAM = BiocParallel::MulticoreParam(4)) 96 | } 97 | 98 | # using the "samples" parameter does not return a table (fixed in 1.11.4) 99 | if (class(doublet_tbl) == "SingleCellExperiment") { 100 | doublet_tbl = colData(doublet_tbl) %>% as.data.frame() %>% dplyr::select(starts_with("scDblFinder")) 101 | colnames(doublet_tbl) = stringr::str_remove(colnames(doublet_tbl), "scDblFinder.") 102 | doublet_tbl$type = "real" 103 | } 104 | doublet_tbl = doublet_tbl %>% as_tibble(rownames = "cell") %>% dplyr::filter(type == "real") %>% dplyr::arrange(cell) 105 | write_csv(doublet_tbl, "doublets.scDblFinder.csv.gz") 106 | 107 | if (nrow(doublet_tbl) != ncol(seurat_obj)) { stop("doublet table and seurat object are not the same size") } 108 | 109 | # add doublet stats to the seurat object 110 | doublet_tbl = doublet_tbl %>% select(cell, doublet_score_scDblFinder = score, doublet_class_scDblFinder = class) 111 | doublet_df = doublet_tbl %>% as.data.frame() %>% column_to_rownames("cell") %>% sample_frac() 112 | seurat_obj = AddMetaData(seurat_obj, doublet_df) 113 | seurat_obj@meta.data$doublet_class_scDblFinder = factor(seurat_obj@meta.data$doublet_class_scDblFinder) 114 | 115 | # check doublet rate 116 | num_doublets = table(seurat_obj@meta.data$doublet_class_scDblFinder)[["doublet"]] 117 | doublet_rate = round(num_doublets / ncol(seurat_obj), 3) 118 | message(glue("doublet rate: {doublet_rate}")) 119 | write(glue("num doublets: {num_doublets}"), file = "create.log", append = TRUE) 120 | write(glue("doublet rate: {doublet_rate}"), file = "create.log", append = TRUE) 121 | 122 | # dot size for plots 123 | num_cells = ncol(seurat_obj) 124 | pt_size = 1.8 125 | if (num_cells > 1000) pt_size = 1.4 126 | if (num_cells > 5000) pt_size = 1.0 127 | if (num_cells > 10000) pt_size = 0.6 128 | if (num_cells > 50000) pt_size = 0.2 129 | 130 | # plot doublet score 131 | featplot_colors = colorRampPalette(c("#d9cfcb", "#d49070", "#ca5528", "#b72600", "#981000", "#730000"))(100) 132 | random_cells = sample(colnames(seurat_obj)) 133 | plot_umap = 134 | FeaturePlot( 135 | seurat_obj, features = "doublet_score_scDblFinder", reduction = "umap", 136 | cells = random_cells, pt.size = pt_size, cols = featplot_colors 137 | ) + 
138 | theme_cowplot() + 139 | theme( 140 | plot.background = element_rect(fill = "white"), 141 | aspect.ratio = 1, plot.title = element_text(hjust = 0.5), 142 | axis.ticks = element_blank(), axis.text = element_blank() 143 | ) 144 | save_plot("dr.umap.doublet_score_scDblFinder.png", plot = plot_umap, base_height = 6.5, base_width = 8) 145 | Sys.sleep(1) 146 | save_plot("dr.umap.doublet_score_scDblFinder.pdf", plot = plot_umap, base_height = 6.5, base_width = 8) 147 | Sys.sleep(1) 148 | 149 | # plot doublet class 150 | plot_umap = 151 | DimPlot( 152 | seurat_obj, group.by = "doublet_class_scDblFinder", reduction = "umap", 153 | cells = random_cells, pt.size = pt_size, cols = c("#E41A1C", "#377EB8") 154 | ) + 155 | theme_cowplot() + 156 | theme( 157 | plot.background = element_rect(fill = "white"), 158 | aspect.ratio = 1, plot.title = element_text(hjust = 0.5), 159 | axis.ticks = element_blank(), axis.text = element_blank() 160 | ) 161 | save_plot("dr.umap.doublet_class_scDblFinder.png", plot = plot_umap, base_height = 6.5, base_width = 8) 162 | Sys.sleep(1) 163 | save_plot("dr.umap.doublet_class_scDblFinder.pdf", plot = plot_umap, base_height = 6.5, base_width = 8) 164 | Sys.sleep(1) 165 | 166 | # delete Rplots.pdf 167 | if (file.exists("Rplots.pdf")) file.remove("Rplots.pdf") 168 | 169 | # save 170 | Idents(seurat_obj) = "orig.ident" 171 | saveRDS(seurat_obj, "seurat_obj.rds") 172 | 173 | 174 | 175 | # end 176 | -------------------------------------------------------------------------------- /scripts/snvs-cnvs-mutect-strelka-freec-pyclone.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ' 5 | Description: 6 | Convert mutations from Mutect and Strelka VCFs and CNVs from Control-FREEC to PyClone input table. 
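The resulting table typically provides the columns PyClone expects: mutation_id, ref_counts, var_counts, normal_cn, minor_cn, and major_cn.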
7 |
8 | Usage:
9 | snvs-cnvs-mutect-strelka-freec-pyclone.R <sample_name> <mutect_vcf> <strelka_vcf> <cnvs_txt> <out_txt>
10 |
11 | Arguments:
12 | <sample_name>  sample name in the tumor:normal format, must match the VCF sample names
13 | <mutect_vcf>   Mutect (GATK4) VCF
14 | <strelka_vcf>  Strelka VCF
15 | <cnvs_txt>     Control-FREEC "ratio.txt" file with ratios, copy numbers, and genotypes for each window
16 | <out_txt>      output PyClone TSV
17 |
18 | Options:
19 | -h, --help show this screen
20 | ' -> doc
21 |
22 |
23 | # print warnings as they occur
24 | options(warn = 1)
25 |
26 | # retrieve the command-line arguments
27 | suppressPackageStartupMessages(library(docopt))
28 | opts = docopt(doc)
29 |
30 | # relevant arguments
31 | sample_name = opts$sample_name
32 | mutect_vcf = opts$mutect_vcf
33 | strelka_vcf = opts$strelka_vcf
34 | cnvs_txt = opts$cnvs_txt
35 | out_txt = opts$out_txt
36 |
37 | # check that input files exist
38 | if (!file.exists(mutect_vcf)) stop("file does not exist: ", mutect_vcf)
39 | if (!file.exists(strelka_vcf)) stop("file does not exist: ", strelka_vcf)
40 | if (!file.exists(cnvs_txt)) stop("file does not exist: ", cnvs_txt)
41 |
42 | # load libraries
43 | suppressPackageStartupMessages({
44 | library("tidyverse")
45 | library("glue")
46 | library("vcfR")
47 | library("GenomicRanges")
48 | library("stringr")
49 | })
50 |
51 | # Mutect 2.1 (GATK 4)
52 | parse_mutect21 = function(vcfr_obj, sample_T, sample_N) {
53 |
54 | # confirm mutect version
55 | if (!any(str_detect(vcfr_obj@meta, "##Mutect Version=2.1"))) {
56 | stop("version mismatch: expecting Mutect 2.1")
57 | }
58 |
59 | # confirm sample names
60 | if (!any(str_detect(vcfr_obj@meta, glue("##tumor_sample={sample_T}")))) {
61 | stop("sample mismatch")
62 | }
63 | if (!any(str_detect(vcfr_obj@meta, glue("##normal_sample={sample_N}")))) {
64 | stop("sample mismatch")
65 | }
66 |
67 | # if a read is considered uninformative, it is counted towards the DP, but not the AD
68 | # an uninformative read is not reported in the AD, it is still used in calculations for genotyping
69 | # if uninformative reads are the only reads, we report the potential variant allele, but keep the AD values 0
70 | # AD is the number of reads that more likely than not support an allele
71 | # if you have 10 reads, each with 0.6 probability of having a certain alt allele, you get an AD of 10, whereas you get essentially 0.6 x 10 = 6 reads for the purpose of AF
72 |
73 | ##INFO DP Approximate read depth; some reads may have been filtered
74 | ##INFO TLOD Tumor LOD score
75 | ##FORMAT AD Allelic depths for the ref and alt alleles in the order listed
76 | ##FORMAT AF Allele fractions of alternate alleles in the tumor
77 | ##FORMAT DP Approximate read depth (reads with MQ=255 or with bad mates are filtered)
78 |
79 | # convert vcfR object to a tibble
80 | muts_tbl = vcfR2tidy(vcfr_obj, single_frame = TRUE, verbose = FALSE,
81 | info_fields = c("DP", "TLOD"),
82 | format_fields = c("AD", "AF", "DP"))
83 | muts_tbl = muts_tbl$dat
84 |
85 | # extract relevant metrics
86 | muts_tbl =
87 | muts_tbl %>%
88 | # unique mutation identifier (for joining T and N)
89 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>%
90 | filter(FILTER == "PASS") %>%
91 | separate(gt_AD, into = c("ref_counts", "alt_counts"), sep = ",", convert = TRUE, extra = "drop") %>%
92 | # manual AF calculation for comparison
93 | mutate(myAF = alt_counts / (ref_counts + alt_counts)) %>%
94 | mutate(
95 | QUAL = round(as.numeric(TLOD), 1),
96 | T_DEPTH = ref_counts + alt_counts,
97 | T_FREQ = round(as.numeric(gt_AF), 3)
98 | )
99 |
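# a worked (illustrative) example: gt_AD = "12,8" gives ref_counts = 12 and alt_counts = 8,
# so T_DEPTH = 20 and myAF = 8 / 20 = 0.4, which can differ slightly from the model-based gt_AF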
format) 101 | snvs_n_tbl = 102 | muts_tbl %>% 103 | filter(Indiv == sample_N) %>% 104 | dplyr::rename(N_DEPTH = T_DEPTH, N_FREQ = T_FREQ) %>% 105 | dplyr::select(mutation_id, N_DEPTH, N_FREQ) 106 | muts_tbl = 107 | muts_tbl %>% 108 | filter(Indiv == sample_T) %>% 109 | inner_join(snvs_n_tbl, by = "mutation_id") 110 | 111 | # manual filtering (the filtered table is the function's return value) 112 | muts_tbl %>% filter((alt_counts >= 3) & (T_DEPTH >= 10) & (T_FREQ > 0.01) & (T_FREQ > (N_FREQ * 5))) 113 | 114 | } 115 | 116 | # Strelka 2 117 | parse_strelka2 = function(vcfr_obj, sample_T, sample_N) { 118 | 119 | # confirm strelka version 120 | if (!any(str_detect(vcfr_obj@meta, "##source_version=2"))) { 121 | stop("version mismatch: expecting Strelka 2") 122 | } 123 | 124 | ##INFO QSS Quality score for any somatic snv 125 | ##INFO SOMATIC Somatic mutation 126 | ##INFO QSI Quality score for the ALT haplotype to be present at a significantly different freq in the T and N 127 | ##FORMAT AU Number of 'A' alleles used in tiers 1,2 128 | ##FORMAT CU Number of 'C' alleles used in tiers 1,2 129 | ##FORMAT GU Number of 'G' alleles used in tiers 1,2 130 | ##FORMAT TU Number of 'T' alleles used in tiers 1,2 131 | ##FORMAT TAR Reads strongly supporting alternate allele for tiers 1,2 132 | ##FORMAT TIR Reads strongly supporting indel allele for tiers 1,2 133 | 134 | # convert vcfR object to a tibble 135 | muts_tbl = vcfR2tidy(vcfr_obj, 136 | info_fields = c("SOMATIC", "QSS", "QSI"), 137 | format_fields = c("DP", "AU", "CU", "GU", "TU", "TAR", "TIR"), 138 | single_frame = TRUE, verbose = FALSE) 139 | muts_tbl = muts_tbl$dat 140 | 141 | 142 | # extract relevant metrics 143 | muts_tbl = 144 | muts_tbl %>% 145 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>% 146 | filter(FILTER == "PASS") %>% 147 | # extract tier1 counts for each nucleotide 148 | separate(gt_AU, into = "A_counts", sep = ",", convert = TRUE, extra = "drop") %>% 149 | separate(gt_CU, into = "C_counts", sep = ",", convert = TRUE, extra = "drop") %>% 150 | separate(gt_GU, into = "G_counts", sep = ",", convert = TRUE, extra = "drop") %>% 151 | separate(gt_TU, into = "T_counts", sep = ",", convert = TRUE, extra = "drop") %>% 152 | separate(gt_TAR, into = "indel_ref_counts", sep = ",", convert = TRUE, extra = "drop") %>% 153 | separate(gt_TIR, into = "indel_alt_counts", sep = ",", convert = TRUE, extra = "drop") %>% 154 | # set ref/alt counts 155 | mutate( 156 | ref_counts = case_when( 157 | is.na(QSS) ~ indel_ref_counts, 158 | REF == "A" ~ A_counts, 159 | REF == "C" ~ C_counts, 160 | REF == "G" ~ G_counts, 161 | REF == "T" ~ T_counts 162 | ) 163 | ) %>% 164 | mutate( 165 | alt_counts = case_when( 166 | is.na(QSS) ~ indel_alt_counts, 167 | ALT == "A" ~ A_counts, 168 | ALT == "C" ~ C_counts, 169 | ALT == "G" ~ G_counts, 170 | ALT == "T" ~ T_counts 171 | ) 172 | ) %>% 173 | # extract quality 174 | mutate( 175 | QUAL = case_when( 176 | is.na(QSI) ~ QSS, 177 | is.na(QSS) ~ QSI 178 | ) 179 | ) %>% 180 | mutate(T_DEPTH = gt_DP) %>% 181 | mutate(T_FREQ = round(alt_counts / (ref_counts + alt_counts), 3)) 182 | 183 | # extract samples T and N to put side by side ("wide" format) 184 | snvs_n_tbl = 185 | muts_tbl %>% 186 | filter(Indiv == "NORMAL") %>% 187 | dplyr::rename(N_DEPTH = T_DEPTH, N_FREQ = T_FREQ) %>% 188 | dplyr::select(mutation_id, N_DEPTH, N_FREQ) 189 | muts_tbl = 190 | muts_tbl %>% 191 | filter(Indiv == "TUMOR") %>% 192 | inner_join(snvs_n_tbl, by = "mutation_id") 193 | 194 | # manual filtering (the filtered table is the function's return value) 195 | muts_tbl %>% filter((alt_counts >= 3) & (T_DEPTH
>= 10) & (T_FREQ > 0.01) & (T_FREQ > (N_FREQ * 5))) 196 | 197 | } 198 | 199 | # parse either vcf 200 | parse_vcf = function(sample_name, vcf) { 201 | 202 | # split sample name for somatic variants 203 | if (str_detect(sample_name, ":")) { 204 | sample_T = str_split_fixed(sample_name, ":", 2)[1] 205 | sample_N = str_split_fixed(sample_name, ":", 2)[2] 206 | } 207 | 208 | # import VCF as a vcfR object 209 | muts_vcfr = read.vcfR(vcf, verbose = FALSE) 210 | 211 | # check if there are any variants 212 | if (nrow(muts_vcfr@fix) == 0) stop("no variants in imported VCF") 213 | 214 | # determine variant caller based on VCF contents and parse accordingly 215 | if (any(str_detect(muts_vcfr@meta, "##source=Mutect2"))) { 216 | 217 | # Mutect 2.1 (GATK 4) 218 | message("parsing Mutect 2.1 VCF") 219 | caller_type = "somatic" 220 | vcf_type = "mutect21" 221 | vcf_tbl = parse_mutect21(vcfr_obj = muts_vcfr, sample_T = sample_T, sample_N = sample_N) 222 | 223 | } else if (any(str_detect(muts_vcfr@meta, "##source=strelka"))) { 224 | 225 | # Strelka 2 226 | message("parsing Strelka 2 VCF") 227 | caller_type = "somatic" 228 | vcf_type = "strelka2" 229 | vcf_tbl = parse_strelka2(vcfr_obj = muts_vcfr, sample_T = sample_T, sample_N = sample_N) 230 | 231 | } else { 232 | 233 | stop("unknown variant caller") 234 | 235 | } 236 | 237 | # check if table is empty 238 | if (nrow(vcf_tbl) == 0) stop("output table is empty after parsing") 239 | 240 | # create and sort by mutation_id 241 | vcf_tbl = 242 | vcf_tbl %>% 243 | mutate(mutation_id = as.character(glue("{CHROM}:{POS}:{REF}:{ALT}"))) %>% 244 | arrange(mutation_id) %>% 245 | mutate( 246 | mut_type = case_when( 247 | str_length(REF) > str_length(ALT) ~ "del", 248 | str_length(ALT) > str_length(REF) ~ "ins", 249 | TRUE ~ "pt" 250 | ) 251 | ) %>% 252 | filter(mut_type == "pt") 253 | 254 | # output table columns 255 | vcf_tbl = vcf_tbl %>% mutate(SAMPLE_T = sample_T, SAMPLE_N = sample_N) 256 | vcf_tbl 257 | 258 | } 259 | 260 | # parse mutect VCF 261 | muts_mutect_tbl = parse_vcf(sample_name = sample_name, vcf = mutect_vcf) 262 | muts_mutect_tbl$variant_caller = "mutect" 263 | 264 | # parse strelka VCF 265 | muts_strelka_tbl = parse_vcf(sample_name = sample_name, vcf = strelka_vcf) 266 | muts_strelka_tbl$variant_caller = "strelka" 267 | 268 | # combine both tables 269 | muts_all = bind_rows(muts_mutect_tbl, muts_strelka_tbl) 270 | 271 | # determine number of callers (including duplicates) 272 | snvs_tbl = 273 | muts_all %>% 274 | add_count(mutation_id, SAMPLE_T, SAMPLE_N) %>% 275 | mutate(variant_caller = if_else(n == 2, "mutect+strelka", variant_caller)) %>% 276 | dplyr::select(-QUAL, -n) 277 | 278 | # keep one row if called by both callers 279 | # for mutations with entry for each caller, select entry with higher T_FREQ 280 | snvs_tbl = 281 | snvs_tbl %>% 282 | group_by(mutation_id, SAMPLE_T, SAMPLE_N) %>% 283 | arrange(-T_FREQ, -T_DEPTH) %>% 284 | dplyr::slice(1) %>% 285 | ungroup() 286 | 287 | # chrs to filter out 288 | chr_filter = c("chrX", "chrY", "chrM", "X", "Y", "MT", "M") 289 | 290 | # clean up and add PyClone columns 291 | snvs_tbl = snvs_tbl %>% 292 | filter(!CHROM %in% chr_filter) %>% 293 | dplyr::rename(chr = CHROM, variant_freq = T_FREQ, var_counts = alt_counts) %>% 294 | mutate(start = POS, end = POS) 295 | 296 | # convert SNVs to GRanges for overlapping 297 | snvs_gr = makeGRangesFromDataFrame(df = snvs_tbl, 298 | ignore.strand = TRUE, 299 | keep.extra.columns = TRUE, 300 | starts.in.df.are.0based = FALSE) 301 | names(snvs_gr) = snvs_gr$mutation_id 302 
| 303 | # import CNVs file 304 | cnvs_tbl = read_tsv(cnvs_txt, guess_max = 500000, progress = FALSE) 305 | 306 | # clean up 307 | cnvs_tbl = cnvs_tbl %>% 308 | transmute(chr = Chromosome, start = Start - 1, end = Start, genotype = Genotype) %>% 309 | dplyr::select(chr, start, end, genotype) %>% 310 | mutate(cnv_id = str_c(chr, ":", end)) %>% 311 | filter(!chr %in% chr_filter) 312 | 313 | # calculate window size (most common start site difference) 314 | win_size = cnvs_tbl %>% 315 | mutate(win_size = (start - lag(start))) %>% 316 | filter(win_size > 100) %>% 317 | na.omit() %>% 318 | pull(win_size) 319 | win_size = unique(win_size)[which.max(tabulate(match(win_size, unique(win_size))))] 320 | 321 | # update coordinates to reflect window size (0-based) 322 | # can be extracted from targeted sequencing results, but not WGS 323 | cnvs_tbl = cnvs_tbl %>% mutate(end = start + win_size) 324 | 325 | # add PyClone columns 326 | cnvs_tbl = cnvs_tbl %>% 327 | filter(genotype != "-", genotype != "") %>% 328 | mutate(normal_cn = 2, 329 | minor_cn = str_count(genotype, "B"), 330 | major_cn = str_count(genotype, "A")) 331 | 332 | # convert CNVs to GRanges for overlapping 333 | cnvs_gr = makeGRangesFromDataFrame(df = cnvs_tbl, 334 | ignore.strand = TRUE, 335 | keep.extra.columns = TRUE, 336 | starts.in.df.are.0based = TRUE) 337 | names(cnvs_gr) = cnvs_gr$cnv_id 338 | 339 | # set seqlevels in both GRanges to UCSC style 340 | seqlevelsStyle(snvs_gr) = "UCSC" 341 | seqlevelsStyle(cnvs_gr) = "UCSC" 342 | 343 | # get overlaps for SNVs and CNVs (consider adjacent windows as well) 344 | overlaps = distanceToNearest(x = snvs_gr, subject = cnvs_gr) 345 | overlaps = overlaps %>% as("data.frame") %>% as_tibble() %>% filter(distance < win_size * 2) 346 | 347 | # convert overlaps from identifiers to names 348 | overlaps$mutation_id = snvs_gr[overlaps$queryHits] %>% names() 349 | overlaps$cnv_id = cnvs_gr[overlaps$subjectHits] %>% names() 350 | overlaps = overlaps %>% dplyr::select(-queryHits, -subjectHits) 351 | 352 | # merge overlapping SNVs and CNVs 353 | overlaps = overlaps %>% 354 | inner_join(snvs_tbl, by = "mutation_id") %>% 355 | inner_join(cnvs_tbl, by = "cnv_id") 356 | 357 | # extract columns for PyClone 358 | pyclone_tbl = overlaps %>% 359 | dplyr::select(mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn, variant_freq, genotype) 360 | 361 | # save PyClone table 362 | write_tsv(pyclone_tbl, path = out_txt) 363 | 364 | 365 | 366 | # end 367 | -------------------------------------------------------------------------------- /workflows/draft-genome-init.md: -------------------------------------------------------------------------------- 1 | # Initializing draft genome directory 2 | 3 | 4 | The requirement for [setting up a reference genome directory](https://github.com/igordot/reference-genomes) is having two basic files: 5 | 6 | * `genome.fa` - genome sequence in FASTA format 7 | * `genes.gtf` - gene annotations in GTF format 8 | 9 | Properly formatted sequence and annotation files are readily available for commonly analyzed genomes. Less popular and 10 | draft genomes are less standardized and can be more difficult to work with. 11 | 12 | ## Saimiri boliviensis boliviensis (Bolivian squirrel monkey) example 13 | 14 | As of September 2016, the best reference for the Bolivian squirrel monkey is the first preliminary assembly SaiBol1.0 15 | (GCA_000235385.1), provided by the Broad Institute in October 2011. 
The assembly comprises 2685 top level sequences, all 16 | of which are unplaced scaffolds (from 151,413 contigs). 17 | 18 | When working with a new genome, Ensembl is usually a good place to start as it contains well-formatted reference files for 19 | many species. You can find the [SaiBol1 genome](http://pre.ensembl.org/Saimiri_boliviensis/Info/Index). 20 | 21 | The FASTA and GTF files are available and can be downloaded: 22 | 23 | ```bash 24 | wget -O genome.ensembl.pre.fa.gz ftp://ftp.ensembl.org/pub/pre/fasta/dna/saimiri_boliviensis/Saimiri_boliviensis.SaiBol1.0.dna_rm.toplevel.fa.gz 25 | wget -O genes.ensembl.pre.gtf.gz ftp://ftp.ensembl.org/pub/pre/gtf/saimiri_boliviensis/SaiBol1.0.genes.gtf.gz 26 | ``` 27 | 28 | Once you have the reference files, it's a good idea to spot-check them. 29 | 30 | ```bash 31 | zcat genome.ensembl.pre.fa.gz | grep ">" | head 32 | ``` 33 | 34 | Output: 35 | 36 | ``` 37 | >scaffold:SaiBol1.0:JH378105.1:1:72162052:1 scaffold JH378105.1 38 | >scaffold:SaiBol1.0:JH378106.1:1:71252344:1 scaffold JH378106.1 39 | >scaffold:SaiBol1.0:JH378107.1:1:58292249:1 scaffold JH378107.1 40 | >scaffold:SaiBol1.0:JH378108.1:1:54856640:1 scaffold JH378108.1 41 | >scaffold:SaiBol1.0:JH378109.1:1:50794693:1 scaffold JH378109.1 42 | >scaffold:SaiBol1.0:JH378110.1:1:49021937:1 scaffold JH378110.1 43 | >scaffold:SaiBol1.0:JH378111.1:1:46157118:1 scaffold JH378111.1 44 | >scaffold:SaiBol1.0:JH378112.1:1:45331107:1 scaffold JH378112.1 45 | >scaffold:SaiBol1.0:JH378113.1:1:44311105:1 scaffold JH378113.1 46 | >scaffold:SaiBol1.0:JH378114.1:1:44255708:1 scaffold JH378114.1 47 | ``` 48 | 49 | ```bash 50 | zcat genes.ensembl.pre.gtf.gz | head 51 | ``` 52 | 53 | Output: 54 | 55 | ``` 56 | JH378796.1 protein_coding exon 805 910 . + . gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "1"; gene_biotype "protein_coding"; 57 | JH378796.1 protein_coding CDS 805 910 . + 0 gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "1"; gene_biotype "protein_coding"; protein_id "ENSP00000271139_1"; 58 | JH378796.1 protein_coding exon 2580 3055 . + . gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "2"; gene_biotype "protein_coding"; 59 | JH378796.1 protein_coding CDS 2580 3055 . + 2 gene_id "ENSP00000271139_1"; transcript_id "ENSP00000271139_1"; exon_number "2"; gene_biotype "protein_coding"; protein_id "ENSP00000271139_1"; 60 | JH378584.1 protein_coding exon 731 835 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "1"; gene_biotype "protein_coding"; 61 | JH378584.1 protein_coding CDS 731 835 . + 0 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "1"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 62 | JH378584.1 protein_coding exon 2441 2603 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "2"; gene_biotype "protein_coding"; 63 | JH378584.1 protein_coding CDS 2441 2603 . + 0 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "2"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 64 | JH378584.1 protein_coding exon 3155 3293 . + . gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "3"; gene_biotype "protein_coding"; 65 | JH378584.1 protein_coding CDS 3155 3293 . 
+ 2 gene_id "ENSP00000250416_1"; transcript_id "ENSP00000250416_1"; exon_number "3"; gene_biotype "protein_coding"; protein_id "ENSP00000250416_1"; 66 | ``` 67 | 68 | There are several issues: 69 | 70 | * FASTA contig names have spaces. This is usually fine, but will cause errors with some tools. 71 | * GTF contig names (first column) do not match FASTA contig names (the first word after `>`). 72 | * GTF records have `gene_id`, but not `gene_name`. Most biologists will want to know the gene names. 73 | 74 | The first two problems can be solved with a few shell commands. The last one is more complicated. 75 | 76 | Let's try NCBI next. 77 | 78 | NCBI has a lot of databases, so it can be difficult to navigate. It also has the 79 | [SaiBol1 genome](https://www.ncbi.nlm.nih.gov/genome/6907), which is also based on GCA_000235385.1, like the Ensembl version. 80 | 81 | There is no GTF, but there is a GFF. 82 | 83 | ```bash 84 | wget -O genome.ncbi.fa.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/Saimiri_boliviensis/CHR_Un/39432_ref_SaiBol1.0_chrUn.fa.gz 85 | wget -O genes.ncbi.gff3.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/Saimiri_boliviensis/GFF/ref_SaiBol1.0_scaffolds.gff3.gz 86 | ``` 87 | 88 | There are actually two GFF files: `scaffolds.gff3.gz` and `top_level.gff3.gz`, but they are identical based on both file 89 | size and `md5sum`. 90 | 91 | ``` 92 | $ zcat genome.ncbi.fa.gz | grep ">" | head 93 | >gi|395726353|ref|NW_003943604.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00001, whole genome shotgun sequence 94 | >gi|395726233|ref|NW_003943605.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00002, whole genome shotgun sequence 95 | >gi|395726111|ref|NW_003943606.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00003, whole genome shotgun sequence 96 | >gi|395725977|ref|NW_003943607.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00004, whole genome shotgun sequence 97 | >gi|395725897|ref|NW_003943608.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00005, whole genome shotgun sequence 98 | >gi|395725895|ref|NW_003943609.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00006, whole genome shotgun sequence 99 | >gi|395725818|ref|NW_003943610.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00007, whole genome shotgun sequence 100 | >gi|395725816|ref|NW_003943611.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00008, whole genome shotgun sequence 101 | >gi|395725734|ref|NW_003943612.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00009, whole genome shotgun sequence 102 | >gi|395725652|ref|NW_003943613.1| Saimiri boliviensis boliviensis isolate 3227 unplaced genomic scaffold, SaiBol1.0 scaffold00010, whole genome shotgun sequence 103 | ``` 104 | 105 | ``` 106 | $ zcat genes.ncbi.gff3.gz | grep -v "#" | head 107 | NW_003943604.1 RefSeq region 1 72162052 . + . ID=id0;Dbxref=taxon:39432;Name=Unknown;chromosome=Unknown;gbkey=Src;genome=genomic;isolate=3227;mol_type=genomic DNA;sex=female;sub-species=boliviensis 108 | NW_003943604.1 Gnomon gene 8363 13782 . - . ID=gene0;Dbxref=GeneID:101049931;Name=LOC101049931;gbkey=Gene;gene=LOC101049931 109 | NW_003943604.1 Gnomon mRNA 8363 13782 . - .
ID=rna0;Parent=gene0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;Name=XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 110 | NW_003943604.1 Gnomon exon 13673 13782 . - . ID=id1;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 111 | NW_003943604.1 Gnomon exon 11227 11475 . - . ID=id2;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 112 | NW_003943604.1 Gnomon exon 8363 8619 . - . ID=id3;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XM_010336801.1;gbkey=mRNA;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;transcript_id=XM_010336801.1 113 | NW_003943604.1 Gnomon CDS 13673 13739 . - 0 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 114 | NW_003943604.1 Gnomon CDS 11227 11475 . - 2 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 115 | NW_003943604.1 Gnomon CDS 8363 8619 . - 2 ID=cds0;Parent=rna0;Dbxref=GeneID:101049931,Genbank:XP_010335103.1;Name=XP_010335103.1;gbkey=CDS;gene=LOC101049931;product=breast cancer type 2 susceptibility protein;protein_id=XP_010335103.1 116 | NW_003943604.1 Gnomon gene 16666 24141 . + . ID=gene1;Dbxref=GeneID:101050261;Name=ZAR1L;gbkey=Gene;gene=ZAR1L 117 | ``` 118 | 119 | These reference files aren't perfect either, but at least the gene names are present. 120 | 121 | Fix the FASTA contig names so they match the GFF contig names (`NW_*` or `NC_*`): 122 | 123 | ```bash 124 | zcat genome.ncbi.fa.gz | perl -pe 's/gi\|.*\|(N._.+?)\|.*/\1/g' > genome.fa 125 | ``` 126 | 127 | Convert GFF to GTF using `gffread` (part of Cufflinks suite): 128 | 129 | ```bash 130 | zcat genes.ncbi.gff3.gz | gffread - -T -o genes.ncbi.gff3.gtf 131 | ``` 132 | 133 | A few (24 out of over 850,000) of the GTF entries do not contain `gene_id` or `gene_name`. Remove those: 134 | 135 | ```bash 136 | cat genes.ncbi.gff3.gtf | grep "gene_name" > genes.gtf 137 | ``` 138 | 139 | This leaves us with a clean `genome.fa` and `genes.gtf` for [setting up a reference genome directory](https://github.com/igordot/reference-genomes). 140 | -------------------------------------------------------------------------------- /workflows/gatk-mouse-mm10.md: -------------------------------------------------------------------------------- 1 | # Creating GATK mm10 resource bundle 2 | 3 | 4 | The GATK resource bundle is a collection of standard files for working with human resequencing data. 5 | It contains known SNPs and indels to be used for BaseRecalibrator, RealignerTargetCreator, and IndelRealigner. 6 | This is an attempt to recreate a similar bundle for the mouse genome (UCSC build mm10). 7 | 8 | For mouse SNPs, it's possible to use the dbSNP database, which should be comparable to the human version. 
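For context, the known-sites files produced below (`dbsnp.vcf` and `mgp.v5.indels.pass.chr.sort.vcf`) are the ones that would be passed to the GATK tools mentioned above. A minimal GATK3 BaseRecalibrator sketch (`sample.bam` and the output table name are placeholders):

```bash
# base recalibration using the mm10 known sites built in this workflow
java -jar ${gatk_path}/GenomeAnalysisTK.jar -T BaseRecalibrator \
    -R genome.fa \
    -I sample.bam \
    -knownSites dbsnp.vcf \
    -knownSites mgp.v5.indels.pass.chr.sort.vcf \
    -o sample.recal.table
```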
9 | 10 | Download dbSNP GRCm38 VCF files (each chromosome is in a separate file): 11 | 12 | ```bash 13 | wget --recursive --no-parent --no-directories \ 14 | --accept vcf*vcf.gz \ 15 | ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/ 16 | ``` 17 | 18 | Delete the non-primary chromosomes if they are not included in the reference FASTA file. 19 | 20 | Add "chr" to each chromosome (convert from GRCm38 to mm10 format): 21 | 22 | ```bash 23 | for vcf in $(ls -1 vcf_chr_*.vcf.gz) ; do 24 | vcf_new=${vcf/.vcf.gz/.vcf} 25 | echo $vcf 26 | zcat $vcf | sed 's/^\([0-9XY]\)/chr\1/' > $vcf_new 27 | rm -fv $vcf 28 | done 29 | ``` 30 | 31 | Combine all dbSNP VCF files into one: 32 | 33 | ```bash 34 | # generate parameter string containing all VCF files 35 | vcf_file_string="" 36 | for vcf in $(ls -1 vcf_chr_*.vcf) ; do 37 | vcf_file_string="$vcf_file_string -V $vcf" 38 | done 39 | echo $vcf_file_string 40 | 41 | # concatenate VCF files 42 | java -Xms16G -Xmx16G -cp ${gatk_path}/GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \ 43 | -R genome.fa $vcf_file_string -out dbsnp.vcf 44 | ``` 45 | 46 | More recent dbSNP releases include a merged `00-All.vcf.gz` file in addition to the separate chromosome files. 47 | Although it will not need to be concatenated, it will likely still need to be adjusted for GATK compatibility. 48 | 49 | For mouse indels, the Sanger Mouse Genetics Programme (Sanger MGP) is probably the best resource. 50 | 51 | Download all MGP indels (5/2015 release): 52 | 53 | ```bash 54 | wget ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz \ 55 | -O mgp.v5.indels.vcf.gz 56 | ``` 57 | 58 | Filter for passing variants with chr added: 59 | 60 | ```bash 61 | # adjust header 62 | zcat mgp.v5.indels.vcf.gz | head -1000 | grep "^#" | cut -f 1-8 \ 63 | | grep -v "#contig" | grep -v "#source" \ 64 | > mgp.v5.indels.pass.chr.vcf 65 | # keep only passing and adjust chromosome name 66 | zcat mgp.v5.indels.vcf.gz | grep -v "^#" | cut -f 1-8 \ 67 | | grep -w "PASS" | sed 's/^\([0-9MXY]\)/chr\1/' \ 68 | >> mgp.v5.indels.pass.chr.vcf 69 | ``` 70 | 71 | Sort VCF (automatically generated index has to be deleted due to a known bug): 72 | 73 | ```bash 74 | java -Xms16G -Xmx16G -jar ${PICARD_ROOT}/picard.jar SortVcf VERBOSITY=WARNING \ 75 | SD=genome.dict \ 76 | I=mgp.v5.indels.pass.chr.vcf \ 77 | O=mgp.v5.indels.pass.chr.sort.vcf 78 | rm -fv mgp.v5.indels.pass.chr.sort.vcf.idx 79 | ``` 80 | 81 | Additional info: 82 | 83 | * [GATK Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle) 84 | * [What should I use as known variants/sites for running tool X?](http://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x) 85 | -------------------------------------------------------------------------------- /workflows/microarray.md: -------------------------------------------------------------------------------- 1 | # Microarray differential expression analysis 2 | 3 | Basic microarray differential expression analysis in R using `limma`. 
4 | 5 | ```r 6 | library(affy) 7 | library(limma) 8 | sample_info = read.AnnotatedDataFrame("samples.csv") 9 | eset = justRMA(celfile.path = "/path/to/cel-files", phenoData = sample_info) 10 | design = model.matrix(~ group, pData(eset)) 11 | fit = lmFit(eset, design) 12 | efit = eBayes(fit) 13 | topTable(efit, coef = 2) 14 | ``` 15 | -------------------------------------------------------------------------------- /workflows/nanopore-init.md: -------------------------------------------------------------------------------- 1 | # Processing of Oxford Nanopore Technologies (ONT) data 2 | 3 | 4 | ## Sequencing 5 | 6 | There are multiple sequencing protocols. The basic one is 1D, which is analogous to Illumina's single-read option where each DNA fragment is sequenced once. For higher accuracy, there was a 2D workflow where each fragment generated both a template and a complement read (separated by a hairpin). In May 2017, ONT replaced the 2D system (the subject of a legal dispute, since Pacific Biosciences patented the hairpin approach) with 1D^2, or "1D squared". 7 | 8 | During a standard MinION run with a single unbarcoded sample, the MinKNOW software writes a FAST5 file for each DNA molecule in a local directory. These FAST5 files contain aggregated signal measurements and may be basecalled. 9 | 10 | FAST5 files overview from [Simpson Lab blog](http://simpsonlab.github.io/2017/09/06/nanopolish-v0.8.0/): 11 | > Oxford Nanopore’s sequencers measure the disruption in electric current caused by single-stranded DNA moving through the nanopore. The device samples the current six thousand times per second and writes the samples to a FAST5 file. We refer to these measurements as “the raw signal” or “the raw samples”, or simply “the raw”. For the past three years nanopore basecallers have converted the raw samples into segments called “events”, with the boundaries between events roughly corresponding to movements of DNA through the pore. After the samples are segmented into events, the basecaller predicts which k-mer was in the pore when the samples for each event were taken. The basecalling results are stored in a new FAST5 file that has a table containing every event and its k-mer label. 12 | 13 | ## Basecalling 14 | 15 | MinKNOW can output basecalled and non-basecalled FAST5 files, but MinKNOW basecalled files may cause issues downstream. ONT also offers [Albacore](https://community.nanoporetech.com/protocols/albacore-offline-basecalli/v/abec_2003_v1_revx_29nov2016) for local offline basecalling.
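Since FAST5 is an HDF5 container, you can quickly check whether a run's files already contain basecalls before re-basecalling them (a sketch using the standard HDF5 command-line tools; `read.fast5` is a placeholder name, and the exact group names vary between software versions):

```bash
# basecalled FAST5 files gain an /Analyses/Basecall_* group
h5ls -r read.fast5 | grep -i basecall
```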
16 | 17 | Albacore needs Python 3.4+ and can be installed using `pip` (it's probably best to use `virtualenv`): 18 | ```bash 19 | 20 | # create a new environment in your virtualenv directory 21 | cd /virtualenv_path/ 22 | pyvenv albacore 23 | 24 | # activate virtualenv 25 | source /virtualenv_path/albacore/bin/activate 26 | 27 | # upgrade pip 28 | pip install --upgrade pip 29 | 30 | # install albacore from .whl (from ONT website) 31 | pip install /path/ont_albacore-x.x.x.whl 32 | 33 | # deactivate virtualenv 34 | deactivate 35 | ``` 36 | 37 | Perform basecalling: 38 | ```bash 39 | # it may be necessary to unset PYTHONPATH if it was set 40 | unset PYTHONPATH 41 | 42 | # activate virtualenv 43 | source /virtualenv_path/albacore/bin/activate 44 | 45 | # check available flowcells and kits (must be specified for basecalling) 46 | read_fast5_basecaller.py --list_workflows 47 | 48 | # run basecaller (use & to run the process in the background) 49 | read_fast5_basecaller.py \ 50 | --worker_threads 8 \ 51 | --recursive \ 52 | --save_path ./albacore-out \ 53 | --output_format fast5 \ 54 | --input ./fast5 \ 55 | --flowcell FLO-MIN000 \ 56 | --kit SQK-NSK000 \ 57 | & 58 | 59 | # check the number of processed reads 60 | cat ./albacore-out/pipeline.log | grep "Finished" | wc -l 61 | 62 | # deactivate virtualenv 63 | deactivate 64 | ``` 65 | 66 | ## Data extraction 67 | 68 | Basecalled FAST5 files can be converted to the FASTQ format, which is more compatible with various downstream analysis tools. 69 | 70 | [Poretools](http://poretools.readthedocs.io/) is a popular tool for extracting data and information from FAST5 files. 71 | 72 | Create a variable for the basecalled reads directory: 73 | ```bash 74 | fast5_dir="/albacore_path/workspace" 75 | ``` 76 | 77 | Calculate overall stats (number of reads, mean read length, etc.): 78 | ```bash 79 | poretools stats --type 2D $fast5_dir > reads.2D.stats.txt 80 | ``` 81 | 82 | Determine nucleotide composition: 83 | ```bash 84 | poretools nucdist $fast5_dir > reads.2D.nucdist.txt 85 | ``` 86 | 87 | Generate gzipped FASTQ file: 88 | ```bash 89 | poretools fastq --min-length 500 --type 2D $fast5_dir | gzip > reads.2D.fastq.gz 90 | ``` 91 | 92 | Generate FASTA file: 93 | ```bash 94 | poretools fasta --min-length 500 --type 2D $fast5_dir > reads.2D.fasta 95 | ``` 96 | 97 | Generate a histogram of read lengths: 98 | ```bash 99 | poretools hist --theme-bw --min-length 0 --max-length 20000 --num-bins 39 --saveas reads.2D.hist.png $fast5_dir 100 | ``` 101 | 102 | Determine read lengths: 103 | ```bash 104 | samtools faidx reads.2D.fasta 105 | cat reads.2D.fasta.fai | grep "_Basecall_2D_2d" | cut -f 1,2 > reads.2D.length.txt 106 | ``` 107 | 108 | ## Assembly 109 | 110 | [Canu](https://canu.readthedocs.io/) is a common de novo assembler for Nanopore long reads (a minimal example run is sketched at the end of this page). It performs error correction, but additional polishing is helpful. [Nanopolish](https://github.com/jts/nanopolish) can calculate an improved consensus sequence for a draft genome assembly. 111 | 112 | If you have both Illumina and Nanopore data, then [SPAdes](http://bioinf.spbau.ru/spades) is a good option for hybrid assembly. SPAdes will use Nanopore reads for gap closure and repeat resolution. 113 | 114 | ## Additional info 115 | 116 | PoreCamp is a training bootcamp based around Oxford Nanopore MinION sequencing that provides great [tutorials](https://porecamp.github.io/2017/) for basic processing of the data.
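As mentioned in the assembly section above, Canu is a common starting point. A minimal sketch of a Canu 1.x run on the 2D reads extracted earlier (`genomeSize` is a required estimate; the 4.6m value here is just an example for a bacterial-sized genome):

```bash
# assemble long reads; Canu performs its own correction and trimming first
canu -p asm -d canu-out \
    genomeSize=4.6m \
    -nanopore-raw reads.2D.fastq.gz
```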
117 | -------------------------------------------------------------------------------- /workflows/ref-genome-gfp.md: -------------------------------------------------------------------------------- 1 | # Creating a reference genome with exogenous sequences such as GFP 2 | 3 | 4 | A standard reference genome is sufficient for most sequencing-based studies, but the experiment may be more complicated. 5 | Studies may involve knock-in or transgenic organisms where the genome sequence is altered. 6 | Since a FASTA file can contain multiple sequences, it's trivial to create a combined one if the exact position in the genome where the foreign sequence is introduced is not relevant. 7 | However, for RNA-seq, you additionally need to modify gene annotations, which is more involved. 8 | 9 | Green fluorescent protein (GFP) is a frequently introduced sequence. 10 | Searching for the exact GFP sequence yields many variants, since there are multiple source species of wild-type GFP and 11 | various engineered derivatives, such as yellow fluorescent protein (YFP) or TurboGFP. 12 | This example uses the "enhanced" or "eukaryotic" GFP (EGFP), commonly used in the mammalian expression vectors. 13 | 14 | This is the sequence that was used to create a GFP FASTA file `genome.EGFP.fa`: 15 | ``` 16 | >EGFP 17 | ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAA 18 | GTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGC 19 | TGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAG 20 | CAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTA 21 | CAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGG 22 | ACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAAC 23 | GGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACAC 24 | CCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACG 25 | AGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA 26 | ``` 27 | 28 | To create the corresponding GTF, it's necessary to find the sequence length: 29 | ```bash 30 | cat genome.EGFP.fa | grep -v "^>" | tr -d "\n" | wc -c 31 | ``` 32 | 33 | The length is 720. This was used to manually create `genes.EGFP.gtf`: 34 | ``` 35 | EGFP unknown gene 1 720 . + . gene_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 36 | EGFP unknown transcript 1 720 . + . gene_id "EGFP"; transcript_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 37 | EGFP unknown exon 1 720 . + . gene_id "EGFP"; transcript_id "EGFP"; gene_name "EGFP"; gene_biotype "protein_coding"; 38 | ``` 39 | 40 | Make sure that the sequence name (column 1) in the GTF matches the one in the FASTA file. 41 | 42 | Different tools expect different content from a GTF file. 43 | Using `gene`, `transcript`, and `exon` features seems to be sufficient. 44 | The `gene_biotype` attribute was only added to help with filtering. 45 | 46 | Finally, merge the standard genome FASTA and GTF files with the GFP ones: 47 | ```bash 48 | cat genome.mm10.fa genome.EGFP.fa > genome.fa 49 | cat genes.mm10.gtf genes.EGFP.gtf > genes.gtf 50 | ``` 51 | 52 | This produces the FASTA and GTF files for the combined reference genome that should be compatible with most GTF-based tools, such as STAR or Cell Ranger. 
53 | Pseudoaligners like Kallisto or Salmon build an index from a FASTA-formatted file of target sequences, so you can simply append `genome.EGFP.fa` to the cDNA FASTA file. 54 | -------------------------------------------------------------------------------- /workflows/ref-genome-init.md: -------------------------------------------------------------------------------- 1 | # Initializing reference genome directory 2 | 3 | 4 | The workflow is now in a [dedicated reference genomes repo](https://github.com/igordot/reference-genomes). 5 | -------------------------------------------------------------------------------- /workflows/rna-seq-diff-exp.md: -------------------------------------------------------------------------------- 1 | # RNA-seq differential expression analysis 2 | 3 | Basic RNA-seq differential expression analysis in R. 4 | 5 | ## DESeq2 6 | 7 | Load library: 8 | ```r 9 | library(DESeq2) 10 | ``` 11 | 12 | Import data: 13 | ```r 14 | # import raw counts matrix 15 | dds = DESeqDataSetFromMatrix(counts, coldata, ~ group) 16 | # import SummarizedExperiment 17 | dds = DESeqDataSet(se, ~ group) 18 | # import tximport 19 | dds = DESeqDataSetFromTximport(txi, coldata, ~ group) 20 | ``` 21 | 22 | Analysis: 23 | ```r 24 | dds = DESeq(dds) 25 | res = results(dds) 26 | ``` 27 | 28 | *** 29 | 30 | ## edgeR 31 | 32 | Load library: 33 | ```r 34 | library(edgeR) 35 | ``` 36 | 37 | Import data (assuming four RNA-seq libraries in two groups, with counts stored in a tab-delimited text file and gene symbols in a column named "gene"): 38 | ```r 39 | # import data 40 | counts = read.delim("counts.txt", row.names = "gene") 41 | group = factor(c(1,1,2,2)) 42 | # edgeR stores data in a list-based data object called a DGEList 43 | y = DGEList(counts = counts, group = group) 44 | y = calcNormFactors(y) 45 | design = model.matrix(~ group) 46 | y = estimateDisp(y, design) 47 | ``` 48 | 49 | Perform likelihood ratio tests: 50 | ```r 51 | fit = glmFit(y, design) 52 | lrt = glmLRT(fit) 53 | topTags(lrt) 54 | ``` 55 | 56 | *** 57 | 58 | ## limma-voom 59 | 60 | ```r 61 | library(limma) 62 | library(edgeR) # provides DGEList() and calcNormFactors() 63 | design = model.matrix(~ group) 64 | dgel = DGEList(counts) 65 | dgel = calcNormFactors(dgel) 66 | v = voom(dgel, design, plot = FALSE) 67 | fit = lmFit(v, design) 68 | fit = eBayes(fit) 69 | topTable(fit) 70 | ``` 71 | -------------------------------------------------------------------------------- /workflows/rrna-ref.md: -------------------------------------------------------------------------------- 1 | # Creating ribosomal RNA reference sequence 2 | 3 | 4 | RNA-seq libraries are typically prepared from total RNA using poly(A) enrichment of the mRNA to remove ribosomal RNAs, but this method fails to capture non-poly(A) transcripts or partially degraded mRNAs. 5 | As an alternative, there are total RNA-seq protocols that require a separate rRNA depletion step. 6 | To test the effectiveness of rRNA depletion, it's a good idea to check rRNA levels in the final RNA-seq library. 7 | 8 | Some rRNAs are annotated in the GTF file and show up along with other genes in the final output. 9 | However, rRNA abundance may be substantially underrepresented, since those sequences can fall in the repetitive regions of the genome and many tools filter out multi-mapping reads. 10 | Thus, it may be useful to create a separate set of rRNA sequences to align against.
11 | This would also be necessary for tools like [FastQ Screen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/), which check the composition of a library by screening it against a set of sequence databases. 12 | 13 | A good resource for rRNA sequences is [RNAcentral](https://rnacentral.org/), a database of non-coding RNA from multiple databases such as Rfam and RDP (Ribosomal Database Project). 14 | 15 | Download the RNAcentral FASTA file with species-specific IDs (release 12.0): 16 | 17 | ```bash 18 | wget ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/releases/12.0/sequences/rnacentral_species_specific_ids.fasta.gz 19 | ``` 20 | 21 | Convert multi-line sequences to single-line (using `fasta_formatter` from FASTX-Toolkit): 22 | 23 | ```bash 24 | gzip -cd rnacentral_species_specific_ids.fasta.gz \ 25 | | fasta_formatter -w 0 \ 26 | | gzip \ 27 | > rnacentral.nowrap.fasta.gz 28 | ``` 29 | 30 | Remove empty lines, replace spaces with underscores, and keep just ribosomal sequences: 31 | 32 | ```bash 33 | gzip -cd rnacentral.nowrap.fasta.gz \ 34 | | sed '/^$/d' \ 35 | | sed 's/\s/_/g' \ 36 | | grep -E -A 1 "ribosomal_RNA|rRNA" \ 37 | | grep -v "^--$" \ 38 | | gzip \ 39 | > rnacentral.ribosomal.nowrap.fasta.gz 40 | ``` 41 | 42 | Set a variable for the species of interest. For example: 43 | 44 | ```bash 45 | species="homo_sapiens" 46 | species="mus_musculus" 47 | species="drosophila_melanogaster" 48 | ``` 49 | 50 | Extract species-specific ribosomal sequences: 51 | 52 | ```bash 53 | zcat rnacentral.ribosomal.nowrap.fasta.gz \ 54 | | grep -A 1 -F -i "${species}" \ 55 | | grep -v "^--$" \ 56 | | fasta_formatter -w 80 \ 57 | > rRNA.${species}.fa 58 | ``` 59 | 60 | Index the species-specific FASTA file (not necessary, but will confirm that the FASTA file is valid): 61 | 62 | ```bash 63 | samtools faidx rRNA.${species}.fa 64 | ``` 65 | 66 | Check the number of sequences per species (there are currently around 6,000 for human and 600 for mouse in RNAcentral): 67 | 68 | ```bash 69 | wc -l *fai 70 | ``` 71 | 72 | Build the species-specific Bowtie2 index for tools like FastQ Screen: 73 | 74 | ```bash 75 | bowtie2-build rRNA.${species}.fa rRNA.${species} 76 | ```
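77 | 78 | The Bowtie2 index basename can then be referenced from the FastQ Screen configuration file, which takes one tab-delimited `DATABASE` entry (a label followed by the index path) per database to screen against. A minimal sketch, assuming a hypothetical index location: 79 | 80 | ``` 81 | DATABASE	rRNA_homo_sapiens	/path/to/rRNA.homo_sapiens 82 | ``` 83 | --------------------------------------------------------------------------------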