├── _config.yml └── scRNAseq ├── .DS_Store ├── docs └── bcl2fastq2_guide_15051736_v2.pdf ├── scRNAseq_analysis_tutorial ├── .DS_Store ├── README.md ├── img │ ├── .DS_Store │ ├── cell_cycle.png │ ├── download.png │ ├── sc_clus_no_cellcycle_regress.png │ ├── sc_clus_pc_heatmap.png │ ├── sc_clus_pca_group.png │ ├── sc_clus_sig_pcs.png │ ├── sc_clus_tsne_cellmarkers.png │ ├── sc_clus_tsne_clusters.png │ ├── sc_clus_tsne_clusters_lowres.png │ ├── sc_clus_tsne_cyclemarkers.png │ ├── sc_clus_tsne_group.png │ ├── sc_clus_tsne_pcs.png │ ├── sc_clus_tsne_qc.png │ ├── sc_clus_variable_genes.png │ ├── sc_clus_violin_genes.png │ ├── sc_clus_viz_pca.png │ ├── sc_clus_yes_cellcycle_regress.png │ ├── sc_metadata.png │ ├── sc_qc_UMIsVsGenesDetected.png │ ├── sc_qc_cellcounts.png │ ├── sc_qc_filtered_UMIsVsGenesDetected.png │ ├── sc_qc_filtered_cellcounts.png │ ├── sc_qc_filtered_genesDetected.png │ ├── sc_qc_filtered_mitoRatio.png │ ├── sc_qc_filtered_novelty.png │ ├── sc_qc_filtered_reads.png │ ├── sc_qc_filtered_umisPerCell.png │ ├── sc_qc_genesDetected.png │ ├── sc_qc_mitoRatio.png │ ├── sc_qc_novelty.png │ ├── sc_qc_reads_histogram.png │ ├── sc_qc_reads_ridgeline.png │ ├── sc_qc_umisPerCell.png │ ├── sc_seq_method.png │ └── sequencing_dir_org.png └── lessons │ ├── .DS_Store │ ├── 01_bcbio_run.md │ ├── 02_QC_report.md │ ├── 03_seurat_clustering_analysis.md │ ├── 04_seurat_markers.md │ ├── Monocle.md │ ├── R_set-up.md │ ├── SPRING.md │ ├── bcbioSingleCell_setup.md │ ├── cell_hashing.md │ ├── clustering_report_bcbioSingleCell.md │ ├── seurat_loom_subset_velocity.md │ └── velocity.md ├── scripts ├── clustering_pre_regress_v2.R ├── clustering_regress.R ├── sc_DESeq2_analysis_inner.R └── sc_DESeq2_analysis_outer.R └── templates ├── README.md ├── power_analysis.Rmd ├── sc_DESeq2_analysis_report_template.Rmd ├── sc_QC_template.Rmd ├── sc_clustering_template.Rmd ├── sc_marker_identification_template.Rmd └── sc_prep_for_DESeq2_analysis.Rmd /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /scRNAseq/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/.DS_Store -------------------------------------------------------------------------------- /scRNAseq/docs/bcl2fastq2_guide_15051736_v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/docs/bcl2fastq2_guide_15051736_v2.pdf -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/.DS_Store -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/README.md: -------------------------------------------------------------------------------- 1 | ## Single-cell RNA-seq analysis tutorials 2 | 3 | This repository contains tutorials for how to perform each part of a single-cell RNA-seq analysis, from running bcbio on the raw data to performing clustering, marker identificaton and differential expression analysis with DESeq2 
and EdgeR. It also contains documents for generating data to use with online exploratory tools such as SPRING. 4 | 5 | ### Generating abundance estimates with bcbio 6 | 7 | - [Running bcbio](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/01_bcbio_run.md) 8 | 9 | ### Setting up to run scRNA-seq analyses using R on O2 10 | 11 | - [R on O2](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/R_set-up.md) 12 | 13 | ### Analysis workflow with Seurat (version 3) 14 | All steps from QC to integration, clustering, and marker identification can be found in the teaching team repo: https://hbctraining.github.io/scRNA-seq/schedule/. The hands-on lessons from the workshop can be found below: 15 | 16 | - [Generation of count matrix](https://hbctraining.github.io/scRNA-seq/lessons/02_SC_generation_of_count_matrix.html) 17 | - [Quality control set-up](https://hbctraining.github.io/scRNA-seq/lessons/03_SC_quality_control-setup.html) 18 | - [Quality control](https://hbctraining.github.io/scRNA-seq/lessons/04_SC_quality_control.html) 19 | - [Normalization and Integration](https://hbctraining.github.io/scRNA-seq/lessons/06_SC_SCT_and_integration.html) 20 | - [Clustering](https://hbctraining.github.io/scRNA-seq/lessons/07_SC_clustering_cells_SCT.html) 21 | - [Clustering QC](https://hbctraining.github.io/scRNA-seq/lessons/08_SC_clustering_quality_control.html) 22 | - [Marker Identification](https://hbctraining.github.io/scRNA-seq/lessons/09_merged_SC_marker_identification.html) 23 | 24 | ### Downstream and other analyses 25 | 26 | - [Cell hashing](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/cell_hashing.md) 27 | - [Generating data for SPRING](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/SPRING.md) 28 | - [Differential expression analysis - pseudobulk method with DESeq2](https://hbctraining.github.io/scRNA-seq/lessons/pseudobulk_DESeq2_scrnaseq.html) 29 | - Velocity analysis 30 | - [Using Seurat clusters](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/velocity.md) 31 | - Using scanpy - follow [Scanpy tutorial](https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html) to generate clusters, then use scVelo, which is the same as with Seurat clusters ([documented here](https://github.com/hbc/hbc_10x_scRNAseq_Feinberg_Aortic_cells_from_TKO_and_WT_mouse_hbc04205_3/blob/master/velocyto/feinberg_velocity_analysis_KLF10_KO_vs_Cre_all_results/velocity_jupyter_notebook.pdf)) and in the [scVelo documentation](https://scvelo.readthedocs.io/VelocityBasics). 
32 | - Trajectory analysis: Slingshot 33 | - [Analysis/benchmarking Rmd report](https://github.com/hbc/hbc_scrnaseq_tseng_10x_brown_fat_mouse_hbc03764/blob/master/2019_09_tseng_multisample_analysis/analysis_reports/slingshot/tseng_slingshot_comprehensive_report.tar.gz) 34 | - [R script using Seurat clusters and UMAP dim reduction](https://github.com/hbc/hbc_scrnaseq_tseng_10x_brown_fat_mouse_hbc03764/blob/master/2019_09_tseng_multisample_analysis/analysis_reports/slingshot/VSM_only_slingshot_to_adipo24_UMAP.R) 35 | - Power analysis 36 | - [Current] Associated `.Rmd` template available [here](https://github.com/hbc/tutorials/blob/master/scRNAseq/templates/) 37 | 38 | ## Past analysis workflows (deprecated) 39 | 40 | ### Power analysis prior to pseudobulk technique 41 | 42 | - [Deprecated] Preparation of data for DE analysis with DESeq2 (associated `.Rmd` template available [here](https://github.com/hbc/tutorials/blob/master/scRNAseq/templates/sc_prep_for_DESeq2_analysis.Rmd)) - Needs to be changed to pseudobulk analysis 43 | - [Deprecated] DE analysis report (associated `.Rmd` template available [here](https://github.com/hbc/tutorials/blob/master/scRNAseq/templates/sc_DESeq2_analysis_report_template.Rmd)) - Needs to be changed to pseudobulk analysis 44 | 45 | ### Analysis workflow with Seurat (version 2) 46 | 47 | - [Quality control analysis](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionIV/lessons/SC_quality_control_analysis.html) (associated `.Rmd` template available [here](https://github.com/hbc/tutorials/blob/master/scRNAseq/templates/sc_QC_template.Rmd)) 48 | - [Clustering analysis](https://hbctraining.github.io/scRNA-seq/lessons/05_SC_clustering_cells.html) 49 | - [Marker identification analysis](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionIV/lessons/SC_marker_identification.html) (associated `.Rmd` template available [here](https://github.com/hbc/tutorials/blob/master/scRNAseq/templates/sc_marker_identification_template.Rmd)) 50 | 51 | ### Analysis workflow with bcbioSingleCell [last update: 2017] 52 | 53 | - [bcbioSingleCell set-up](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/bcbioSingleCell_setup.md) 54 | - [Quality control analysis](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/02_QC_report.md) 55 | - [Clustering analysis](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/clustering_report_bcbioSingleCell.md) 56 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/.DS_Store -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/cell_cycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/cell_cycle.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/download.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/download.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_no_cellcycle_regress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_no_cellcycle_regress.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_pc_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_pc_heatmap.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_pca_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_pca_group.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_sig_pcs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_sig_pcs.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_cellmarkers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_cellmarkers.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_clusters.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_clusters_lowres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_clusters_lowres.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_cyclemarkers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_cyclemarkers.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_group.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_group.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_pcs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_pcs.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_qc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_tsne_qc.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_variable_genes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_variable_genes.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_violin_genes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_violin_genes.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_viz_pca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_viz_pca.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_yes_cellcycle_regress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_clus_yes_cellcycle_regress.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_metadata.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_UMIsVsGenesDetected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_UMIsVsGenesDetected.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_cellcounts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_cellcounts.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_UMIsVsGenesDetected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_UMIsVsGenesDetected.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_cellcounts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_cellcounts.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_genesDetected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_genesDetected.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_mitoRatio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_mitoRatio.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_novelty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_novelty.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_reads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_reads.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_umisPerCell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_filtered_umisPerCell.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_genesDetected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_genesDetected.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_mitoRatio.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_mitoRatio.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_novelty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_novelty.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_reads_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_reads_histogram.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_reads_ridgeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_reads_ridgeline.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_umisPerCell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_qc_umisPerCell.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sc_seq_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sc_seq_method.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/img/sequencing_dir_org.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/img/sequencing_dir_org.png -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbc/tutorials/7ff670c5e3b477da09b6c2e832e05bd43e25448f/scRNAseq/scRNAseq_analysis_tutorial/lessons/.DS_Store -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/01_bcbio_run.md: -------------------------------------------------------------------------------- 1 | # bcbio Run 2 | 3 | ## Setting up for bcbio single cell RNA-Seq analysis 4 | 1. **Questions during initial consult** 5 | - Protocol for disassociation - how difficult? 6 | - Type of method used? 7 | - % viability of cells 8 | - What types of cells expect to exist and a ballpark proportion expected of each 9 | - If rare cell type, any method to enrich for cell type? 10 | - Potential sources of variation that we might expect? 11 | 12 | 1. **Analyst should ask client for the following:** 13 | - How many samples were sequenced? 
14 | - What were the sample indices used? 15 | - How many cells were encapsulated and sequenced per sample? 16 | - What is the main experimental question - does it require clustering using markers and/or cell trajectory analyses? 17 | - What are a handful of markers for the expected cell types using official gene symbols or Ensembl IDs. 18 | - Human gene symbols, clients can search [here](https://www.genenames.org) 19 | - Mouse gene symbols, clients can search [here](http://www.informatics.jax.org/marker). 20 | 21 | 22 | 2. **Acquire data from sequencing core.** The way in which you handle/process your data will differ depending on the sequencing core that you obtain it from. The key thing to keep in mind is that the input to `bcbio` cannot be demultiplexed. **The data needs to remain multiplexed, but split into four FASTQ files** (R1-R4, as described in detail below). 23 | 24 | - **Bauer sequencing core:** uses Basespace. To download the sequencing files use [BaseMount](https://help.basespace.illumina.com/articles/descriptive/introduction-to-basemount/). 25 | 26 | - The BaseSpaceRunDownloader tool previously used and shown below is deprecated: 27 | 28 | ``` 29 | wget https://da1s119xsxmu0.cloudfront.net/sites/knowledgebase/API/08052014/Script/BaseSpaceRunDownloader_v2.zip 30 | unzip BaseSpaceRunDownloader_v2.zip 31 | python BaseSpaceRunDownloader_v2.py -r -a 32 | ``` 33 | 34 | The option `-r` is the number in the basespace url and the [access token](https://developer.basespace.illumina.com/docs/content/documentation/authentication/obtaining-access-tokens) is something you have to get for your basespace account. 35 | 36 | The files output will be BCL files that can be turned into FASTQ files with the `bcl2fastq` tool (instructions below). 37 | 38 | - **DFCI sequencing center (Zach):** will output the FASTQ files in the correct format since the Core has provided a script to Zach, but should check the files - should have 4 reads, not a huge undetermined file, etc. 39 | 40 | - **Biopolymers sequencing facility:** should be FASTQ, but should check the files - should have 4 reads, not a huge undetermined file, etc 41 | 42 | - **Broad Institute:** has their own single cell distribution platform - should be FASTQ, but should check the files - should have 4 reads, not a huge undetermined file, etc 43 | 44 | - **CCCB:** will generally provide tarballs that correspond to different runs. Sometimes they have run `bcl2fastq` on the data but you do not want to use this output. It is likely demultiplexed and cannot be used as input to `bcbio`. 45 | 46 | 47 | 4. If downloaded sequencing files are BCL format, then need to **convert to FASTQ** after completing changes to the `Samplesheet` that are detailed below. 48 | 49 | - Change directories to the sequencing folder downloaded from the facility. The folder should be arranged according to the image below for NextSeq or MiniSeq: 50 | 51 | 52 | 53 | 54 | *Image acquired from [bcl2fastq documentation](../../docs/bcl2fastq2_guide_15051736_v2.pdf).* 55 | 56 | ``` 57 | cd path/to/YYMMDD_machinename_XXXX_FCexperimentname 58 | ``` 59 | - Update the `Samplesheet.csv` so that it does not demultiplex. 60 | - In the run-level folder (decompress the tarball), you should see a `Samplesheet.csv` file. This is a standard file obtained from Illumina sequencing. In the file you will notice four sections (Header, Reads, Settings, Data). The `[Data]` section is what we are interested in. 
It should look something like: 61 | 62 | ``` 63 | [Data],,,,,,,,, 64 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description 65 | Sample_JP1_11a_1,JP1_11a_1,,,D701,ATTACTCG,D501,AGGCTATA,JP_06092018_1641, 66 | 67 | ``` 68 | There are two things we need to check in this section of the CSV file: 69 | 70 | 1. The columns `I7_Index` and `I5_index` are empty. 71 | 2. The barcode sequences that are in `index` and `index2` columns do not match sample barcodes. These can be changed to a dummy sequence like `AAAAAAAA` just to be safe. 72 | 73 | > **NOTE:** If we do not make these changes, `bcl2fastq` will attempt to demultiplex the samples. We can make changes because we don't need this samplesheet for any steps downstream other than the `bcl2fastq` step. 74 | 75 | - Log on to O2 to run `bcl2fastq`. Load `bcl2fastq` module and convert files to FASTQ by using the following command: 76 | 77 | ``` 78 | bcl2fastq \ 79 | --use-bases-mask y*,y*,y*,y* \ 80 | --mask-short-adapter-reads 0 \ 81 | --minimum-trimmed-read-length 0 82 | ``` 83 | 84 | More information regarding the `bcl2fastq` command and directory structures for other sequencing machines can be found in the [documentation](../../docs/bcl2fastq2_guide_15051736_v2.pdf). 85 | 86 | > **NOTE:** This can sometimes take awhile and is best run as a job submission script. 87 | 88 | 5. The output files should be in the `BaseCalls` directory. For each file of sequenced reads, there should be four associated FASTQ files (R1-R4) for the inDrops technology. 89 | 90 | - **R1 (61 bp Read 1):** sequence of the read 91 | - **R2 (8 bp Index Read 1 (i7)):** cellular barcode - which cell read originated from 92 | - **R3 (8 bp Index Read 2 (i5)):** library index - which sample read originated from 93 | - **R4 (14 bp Read 2):** read 2 and barcode/UMI - remaining cellular barcode and UMI - which transcript read originated from (to find PCR duplicates) 94 | 95 | The reads for each sequence are depicted in the image below: 96 | 97 | 98 | 99 | *Image credit: Sarah Boswell, Harvard Staff Scientist for Sequencing Technologies* 100 | 101 | 102 | 6. To quickly view the counts for the barcodes with the top five highest counts based on the first 10,000 reads in a file: 103 | 104 | ``` 105 | gzip -cd filename_R3.fq.gz | head -40000 | awk 'NR % 4 == 2' | sort | uniq -c | awk '{ print $2 "," $1}' | sort -t"," -n --key=2 | tail -5 106 | ``` 107 | 108 | >**NOTE:** `awk 'NR % 4 == 2'` gets every 4th line starting from the 2nd, which is a useful trick when you want to count up FASTQ file entries (Rory's code) 109 | 110 | The reverse complement sequences of the sample indices given by the client should correspond to the most abundant indices in the file. 111 | 112 | **Automatization** 113 | 114 | Alternatively, you can use this [script](https://gist.github.com/lpantano/2a8d5b14fa6f5df7be3b68c006ef729d) to make a list of top N barcodes and match them with a list provided in a CSV file: 115 | 116 | ``` 117 | python check_sc_barcode.py --fastq FILE_R3.fastq.gz --barcodes barcodes.csv 118 | ``` 119 | 120 | It needs python3 and biopython (for people on O2 python is available at `/n/app/bcbio/conda3/bin/python` and the script at `/n/app/bcbio/scripts/check_sc_barcode.py`). 
121 | 122 | The **barcode.csv** file looks like this: 123 | 124 | ``` 125 | CTATTAAG,M3L_Basal amygdala 126 | AAGGCTAT,M2_BA25_1 127 | GAGCCTTA,M3R_BA25 128 | TTATGCGA,M3L_BA25 129 | ``` 130 | 131 | The output will look like this: 132 | 133 | ``` 134 | This barcode CTTAATAG has not been detected. 135 | This barcode ATAGCCTT has not been detected. 136 | This barcode TAAGGCTC (M2_BA25 2) is detected with 1394 reads. 137 | This barcode AGATCTCG is not in your list (2893 reads). 138 | ``` 139 | 140 | 7. Use the `cat` command to concatenate all of the files for a given sample across lanes: 141 | 142 | ``` 143 | cat Undetermined_S0_L001_R1_001.fastq.gz Undetermined_S0_L002_R1_001.fastq.gz Undetermined_S0_L003_R1_001.fastq.gz Undetermined_S0_L004_R1_001.fastq.gz > cat_R1.fastq.gz 144 | ``` 145 | 146 | or 147 | 148 | ``` 149 | cat *R1*.fastq.gz > cat_R1.fastq.gz 150 | ``` 151 | 152 | Do the same for the R2, R3, and R4 files. 153 | 154 | 8. Create metadata file as normal for bcbio run. Note that your FASTQ files are not demultiplexed, so you will often have multiple samples in each of the FASTQ files. 155 | 156 | ``` 157 | fileName,description 158 | cat,run1 159 | ``` 160 | 161 | 162 | 9. Download the most recent transcriptome FASTA and GTF files: 163 | 164 | ``` 165 | # Most recent mouse FASTA from Ensembl FTP 166 | wget ftp://ftp.ensembl.org/pub/current_fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz 167 | 168 | # Most recent mouse GTF from Ensembl FTP 169 | wget ftp://ftp.ensembl.org/pub/release-92/gtf/mus_musculus/Mus_musculus.GRCm38.92.gtf.gz 170 | 171 | # Perform the checksums 172 | sum Mus_musculus.GRCm38.cdna.all.fa.gz 173 | sum Mus_musculus.GRCm38.92.gtf.gz 174 | 175 | # Decompress FASTA and GTF to run in bcbio 176 | gzip -d Mus_musculus.GRCm38.cdna.all.fa.gz 177 | gzip -d Mus_musculus.GRCm38.92.gtf.gz 178 | ``` 179 | ## Overview of bcbio single cell RNA-Seq workflow on O2 180 | 181 | The bcbio single cell RNA-Seq pipeline will perform the following steps: 182 | 183 | 1. Identify the sample barcodes in the R3 read, which will be provided in the `config` file in the `sample_barcodes` parameter. A single mismatch between known sample barcodes and sequences is allowed. 184 | 185 | 2. Identify the cellular barcodes by parsing the R2 and R4 reads. 186 | 187 | 3. Identify the unique molecular identifiers (UMIs) by parsing R4 read. 188 | 189 | 4. Filter out the sequence data with cellular barcodes matching less than 1000 reads (indicating poor quality cells due to encapsulation of free floating RNA from dying cells, small cells, or set of cells that failed for some reason). The threshold for the number of matching reads used for filtering can be specified in the `config` file with the `minimum_barcode_depth` parameter. 190 | 191 | 5. Align reads with [Rapmap](https://academic.oup.com/bioinformatics/article/32/12/i192/2288985/RapMap-a-rapid-sensitive-and-accurate-tool-for) tool. 192 | 193 | 6. Take reads that mapped to more than one transcript and divide the count between all of the transcripts to which the reads aligned. 194 | 195 | > **NOTE:** The location of the barcodes and UMIs differs by library method, and the description given above reflects the locations for inDrop data. However, bcbio will perform similar steps for other methods; it will just parse the reads a bit differently. 196 | 197 | ## Running bcbio single cell RNA-Seq workflow on O2 198 | 199 | 1. Create sample barcodes file (`.txt`) to identify samples in bcbio. 
The **reverse-complement of the sample barcodes supplied by the client** are written as a single barcode per line in a file. No other text should be present in the file, for example the following is the contents of a barcode file for an experiment with four samples: 200 | 201 | ``` 202 | AGGCTTAG 203 | CGGAGAGA 204 | TACTCCTT 205 | ATTAGACG 206 | ``` 207 | 208 | > **NOTE:** This is information that should have been supplied by the client. While it is possible to run bcbio without this, it is advisable not to. 209 | 210 | > **NOTE:** The barcodes written here should match the most prevalent barcodes in the **Setting up for bcbio single cell RNA-Seq analysis** section, Step 5. 211 | 212 | 2. Create configuration template for single cell run: 213 | 214 | ``` 215 | details: 216 | - analysis: scRNA-seq 217 | algorithm: 218 | transcriptome_fasta: /n/data1/cores/bcbio/PIs/PI_name/ref_data/Mus_musculus.GRCm38.cdna.all.fa 219 | transcriptome_gtf: /n/data1/cores/bcbio/PIs/PI_name/ref_data/Mus_musculus.GRCm38.92.gtf 220 | umi_type: harvard-indrop-v3 221 | minimum_barcode_depth: 1000 222 | cellular_barcode_correction: 1 223 | sample_barcodes: /n/data1/cores/bcbio/PIs/PI_name/meta/hbc02055-sample-barcodes-rc.txt 224 | genome_build: mm10 225 | ``` 226 | 227 | **NOTE:** If you want to perform the same barcode selection as cellranger, then you can add `auto` in `minimum_barcode_depth` parameter. 228 | 229 | **NOTE:** The `.gtf` is only used to link genes to transcripts - does not use the coordinates, so it is fine that the coordinates reference the genome. 230 | 231 | 3. Normal bcbio configuration file creation: 232 | 233 | ``` 234 | bcbio_nextgen.py -w template ../config/scRNAseq_config_template.yaml ../meta/PI_name.csv ../hbcXXXXX/seq_dir/Data/Intensities/BaseCalls/cat*fastq.gz 235 | ``` 236 | 237 | 4. Create script (below) to run job on O2 and run with `sbatch ../../runJob-PI_name-scRNAseq.slurm`: 238 | 239 | ``` 240 | #!/bin/sh 241 | #SBATCH -p medium 242 | #SBATCH -J win-full 243 | #SBATCH -o run.o 244 | #SBATCH -e run.e 245 | #SBATCH -t 4-00:00 246 | #SBATCH --cpus-per-task=1 247 | #SBATCH --mem=8000 248 | #SBATCH --mail-type=ALL 249 | #SBATCH --mail-user=piper@hsph.harvard.edu 250 | 251 | /n/app/bcbio/dev/anaconda/bin/bcbio_nextgen.py ../config/PI_name.yaml -n 48 -t ipython -s slurm -q medium -r t=4-00:00 252 | ``` 253 | 254 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/02_QC_report.md: -------------------------------------------------------------------------------- 1 | # bcbioSingleCell QC Report 2 | 3 | ## Setting up 4 | 5 | 1. Choose the quality control template. 6 | 7 | > Documentation for all functions available from the bcbioSingleCell package is available at [http://bioinformatics.sph.harvard.edu/bcbioSingleCell/reference/index.html](http://bioinformatics.sph.harvard.edu/bcbioSingleCell/reference/index.html) 8 | 9 | 2. Edit the information in the files `_header.Rmd` and `_footer.Rmd` with experiment-specific information. 10 | 11 | 3. Install `bcbioSingleCell` and load the library: 12 | 13 | ```r 14 | # devtools::install_github("hbc/bcbioSingleCell") # Add argument `ref = "develop"` if need development branch 15 | 16 | library(bcbioSingleCell) 17 | ``` 18 | 19 | 4. 
Bring in data from bcbio: 20 | 21 | ```r 22 | bcbio <- bcbioSingleCell("~/bcbio/PIs/path/to/final/", 23 | organism = "Homo sapiens", 24 | interestingGroups = "sampleName", 25 | sampleMetadataFile = "~/path/to/metadata.csv", 26 | ensemblRelease = 92L, 27 | genomeBuild = "GRCh38") 28 | 29 | save(bcbio_output, file="data/bcb.rda") 30 | ``` 31 | 32 | 33 | 5. Choose the filtering parameters to use. You can start with these parameters, then after viewing the data, change to better values. Generally, you don't want `minGenes`/`minUMIs` to be any lower than 500. You would hope for at least 1000 genes/UMIs detected per sample. After choosing parameters, run the entire `r setup` chunk by clicking on the green triangle at the top of the setup chunk (if you clear your environment, you need to run the chunk this way to make the `params` reappear. 34 | 35 | **Choosing parameters** 36 | ```r 37 | params: 38 | bcb_file: "data/bcb.rda" 39 | min_genes: 500 40 | max_genes: !r Inf 41 | max_mito_ratio: 0.25 42 | min_novelty: 0.85 43 | min_cells_per_gene: 10 44 | data_dir: !r file.path("data", Sys.Date()) 45 | ``` 46 | 47 | **Running setup chunk** 48 | ```r 49 | # Shared RMarkdown settings 50 | prepareSingleCellTemplate() 51 | if (file.exists("setup.R")) { 52 | source("setup.R") 53 | } 54 | 55 | # Directory paths 56 | dataDir <- file.path(params$outputDir, "data") 57 | 58 | # Load bcbioSingleCell object 59 | bcbName <- load(params$bcbFile) 60 | bcb <- get(bcbName, inherits = FALSE) 61 | ``` 62 | 63 | ```r 64 | eval=file.exists("_header.Rmd") 65 | ``` 66 | 67 | ```r 68 | sampleMetadata(bcb) 69 | ``` 70 | 71 | 6. For the count alignment, be sure to update the **linked Ensembl** to be accurate for the organism. This information is present in the file: `_footer.Rmd`. 72 | 73 | 7. To explore the raw data stored inside the `bcb` object, the following functions can be helpful: 74 | 75 | ```r 76 | # Access metadata for each sample: "sampleID", "sampleName", "description", "fileName", "index", "sequence", "revcomp" 77 | sampleMetadata(bcb) 78 | 79 | # Access metadata for each cell: "nCount", "nUMI", "nGene", "nCoding", "nMito", "log10GenesPerUMI", "mitoRatio" 80 | colData(bcb) 81 | 82 | # Access raw counts - each column represents a single cell 83 | counts <- counts(bcb) 84 | 85 | # Can return cells from a particular sample by using metadata information about which sample corresponds to each barcode 86 | unsort_counts <- counts[, str_detect(colnames(counts), "run1_ATTAGACG")] # Return only the counts for the `Unsorted` sample 87 | 88 | # Extract information associated with each gene including "ensgene", "symbol", "description", "biotype", "broadClass" 89 | rowData(bcb) 90 | 91 | # Return the genes that are associated with a broad class (ex: mitochondrial contamination) 92 | subset(rowData(bcb), broadClass == "mito") 93 | ``` 94 | 95 | ## Quality Control Metrics 96 | 97 | ### Reads per cell 98 | 99 | 8. Evaluate the number of reads per cell: 100 | 101 | ```r 102 | plotReadsPerCell(bcb) 103 | ``` 104 | 105 | The three plots give different ways of looking at the number of reads per cell. Generally you would like to see a large peak at around 10,000 reads per cell, and you hope your filtering threshold of 1,000 reads per cell used in bcbio has removed the poor quality cells with few number of reads. The filtering threshold of 1,000 is represented by the vertical dotted line. 
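If a numeric companion to these plots is useful, the same reads-per-cell values can be summarized straight from the cell-level metadata. The following is a minimal sketch rather than part of the original template: it assumes `colData(bcb)` exposes the `nCount` column listed in step 7, and that cell identifiers are prefixed with the run/sample barcode as in the counts example above (`run1_ATTAGACG`).

```r
suppressPackageStartupMessages(library(tidyverse))

# Cell count and median reads (nCount) per cell for each sample, grouping cells
# by the run/sample barcode prefix of the cell identifier. The prefix pattern is
# an assumption based on the "run1_ATTAGACG" example in step 7.
colData(bcb) %>%
    as.data.frame() %>%
    rownames_to_column("cell_id") %>%
    mutate(sample_barcode = str_extract(cell_id, "^[^_]+_[A-Z]+")) %>%
    group_by(sample_barcode) %>%
    summarize(cells = n(),
              median_reads = median(nCount))
```
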
106 | 107 | For example, in the figures below, the yellow sample is worrisome because we see a small peak at 10,000 reads per cell, but a much larger peak at 1,000 reads per cell. The larger peak merges into the poor quality cells with few reads per cell. 108 | 109 | 110 | 111 | The proportional histogram looks a bit better, as you hope to see all of the samples with peaks in relatively the same location between 10,000 and 100,000 reads per cell. However, the yellow sample still has this shoulder, which is indicative of many poor quality cells. If this were the only issue with the data, we may want to set the threshold to be more strict to ~10,000 reads per cell to get rid of the cells constituting the shoulder in the yellow sample. 112 | 113 | 114 | 115 | ### Cell counts 116 | 117 | 9. Determine the number of cells detected per sample: 118 | 119 | ```r 120 | plotCellCounts(bcb) 121 | ``` 122 | 123 | The cell counts are determined by the number of unique cellular barcodes detected. During the inDrop protocol, the cellular barcodes are present in the hydrogels, which are encapsulated in the droplets with a single cell and lysis/reaction mixture. Upon treatment of UV and cell lysis, all components mix together inside the droplet and reverse transcription proceeds, followed by droplet breakup and linear amplification for library preparation. While each hydrogel should have a single cellular barcode associated with it, occasionally a hydrogel can have more than one cellular barcode. We often see all possible combinations of cellular barcodes at a low level, leading to a higher number of cellular barcodes than cells. 124 | 125 | You expect the number of unique cellular barcodes to be around the number of sequenced cells (determined in step 1) or greater due to some hydrogels having more than one cellular barcode. The yellow sample below seems to have at least double the number of cellular barcodes as the other samples. 126 | 127 | 128 | 129 | ### UMI counts per cell 130 | 131 | 10. Determine the number of UMI counts (transcripts) per cell: 132 | 133 | ```r 134 | plotUMIsPerCell( 135 | bcb, 136 | min = params$minUMIs) 137 | ``` 138 | 139 | The UMI counts per cell should be generally above 500, although usable, it's still low if between 500-1000 counts. If UMIs per cell is 500-1000 counts, then the cells probably should have been sequenced more deeply. The threshold of 500 was given in the `params`, and this is represented by the vertical dashed line in the plots. 140 | 141 | The number of UMIs per cell tends to be very low for the Unsorted sample (yellow). The other samples have good numbers of UMIs per cell, indicating a problem only with the Unsorted sample. Using this cutoff, we will lose the majority of the Unsorted cells. 142 | 143 | 144 | 145 | ### Genes detected per cell 146 | 147 | 11. Discover the number of genes detected per cell: 148 | 149 | ```r 150 | plotGenesPerCell( 151 | bcb, 152 | min = params$minGenes, 153 | max = params$maxGenes) 154 | ``` 155 | 156 | Seeing gene detection in the range of 500-5000 is normal for inDrop analysis. Similar expectations for gene detection as for UMI detection. 157 | 158 | All samples other than the Unsorted sample have a good number of genes detected (with medians between 1,000 - 3,000 genes), which correspond to the numbers of UMIs per cell for each sample. However, the Unsorted sample has a very low median number of genes per cell, indicating a sample failure. 159 | 160 | 161 | 162 | ### UMIs vs. genes detected 163 | 164 | 12. 
Identify whether large number of poor quality cells present in any samples with low UMI/genes detected: 165 | 166 | ```r 167 | plotUMIsVsGenes(bcb) 168 | ``` 169 | 170 | Poor quality cells are likely to have low genes and UMIs per cell. Therefore, a poor sample is likely to have cells in the lower left of the graph. Good cells should exhibit both higher number of genes per cell and higher numbers of UMIs. We also expect similar lines with similar slopes for all samples. 171 | 172 | The Unsorted sample has many cells with few UMIs and low number of genes per cell. The other samples look fine. 173 | 174 | 175 | 176 | ### Mitochondrial counts ratio 177 | 178 | 13. Identify whether there is a large amount of mitochondrial contamination from dead or dying cells: 179 | 180 | ```r 181 | plotMitoRatio( 182 | bcb, 183 | max = params$maxMitoRatio) 184 | ``` 185 | 186 | Poor quality samples for mitochondrial counts would have larger peaks above the 0.1 mitochondrial ratio mark, unless it is expected based on sample type. 187 | 188 | There was just a very low number of genes detected for the Unsorted sample, so mitochondrial expression appears higher mainly due to this fact. The poor quality of the Unsorted sample does not appear to be due to dead or dying cells. The other samples have little mitochondrial expression, although hPSC sample has a bit more than the Sorted samples, and these cells will likely be removed using the threshold of 0.1. The hPSC sample was expected to contain brown adipocytes, which have higher quantities of mitochondrial expression, so it may have been advisable to keep these cells and move the threshold to 0.2. 189 | 190 | 191 | 192 | ### Novelty 193 | 194 | 14. Explore the novelty for contamination with low complexity cell types: 195 | 196 | ```r 197 | plotNovelty( 198 | bcb, 199 | min = params$minNovelty) 200 | ``` 201 | 202 | We can see the samples where we sequenced each cell less have a higher overall novelty, that is because we have not started saturated the sequencing for any given gene for these samples. Outlier cells in these samples might be cells that we have a less complex RNA species than other cells. Sometimes we can detect contamination with low complexity cell types like red blood cells via this metric. 203 | 204 | All of the samples look fine for complexity, except for the Unsorted sample, so it is unlikely that there is contamination with low complexity cell types in these of the samples. The Unsorted sample has a larger shoulder than desired, but is not bad by this metric. 205 | 206 | 207 | 208 | 209 | ## Filtered results 210 | 211 | 15. Run the filtering criteria and explore the plots again. The metrics should have improved greatly after removing low gene/UMI cells and high mitochondrial cells. 212 | 213 | ```r 214 | bcbFiltered <- filterCells(bcb, 215 | minUMIs = params$minUMIs, 216 | minGenes = params$minGenes, 217 | maxGenes = params$maxGenes, 218 | maxMitoRatio = params$maxMitoRatio, 219 | minNovelty = params$minNovelty, 220 | minCellsPerGene = params$minCellsPerGene) 221 | ``` 222 | 223 | One main plot to look at to determine the success of the filtering criteria is the number of cell counts. You should expect roughly the number of sequenced cells per sample. We found out from the client that they had sequenced 2000-3000 cells, so the final numbers were around our expectations. If the number of cells sequenced is vastly different than the number returned after filtering, then you may need to re-visit the threshold criteria used for filtering. 
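As a quick numeric check on this, the number of cells retained per sample can be tabulated directly from the objects before examining the filtered plots. This is a minimal sketch rather than part of the original template: it assumes the filtered object is named `bcbFiltered` as above, and that cell identifiers carry the run/sample barcode prefix shown in the earlier counts example (e.g. `run1_ATTAGACG`).

```r
library(stringr)

# Cells per sample before and after filterCells(), grouped by the run/sample
# barcode prefix of the cell identifiers (an assumed naming convention).
cells_before <- table(str_extract(colnames(bcb), "^[^_]+_[A-Z]+"))
cells_after  <- table(str_extract(colnames(bcbFiltered), "^[^_]+_[A-Z]+"))

data.frame(
    sample_barcode = names(cells_before),
    before         = as.integer(cells_before),
    after          = as.integer(cells_after[names(cells_before)])
)
```

An `NA` in the `after` column simply means that no cells from that sample survived the filtering thresholds.
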
224 | 225 | **Cell counts** 226 | 227 | 228 | 229 | In addition, it is a good idea to explore all of the quality plots for the filtered data. All plots should be much improved for the number of reads per cell, genes detected, UMIs per cell, mitochondrial ratio, and novelty. The plots below show the filtered plots from the example data. Since the `Unsorted` sample was a poor quality sample, the filter will remove a large number of the cells for this sample; in this case all cells except 1 were filtered out. 230 | 231 | **Reads per cell** 232 | 233 | The majority of cells have between 10,000 and 100,000 reads per cell, which is good. 234 | 235 | 236 | 237 | **Genes detected** 238 | 239 | The number of genes detected has also improved after the removal of the cells with low genes and or low UMIs. 240 | 241 | 242 | 243 | **UMIs per cell** 244 | 245 | The numbers of UMIs per cell has also improved significantly, with the low quality cells dropped. It is worth noting here that the sample `Sort1` has many more UMIs per cell than the replicate `Sort2`. We will definitely want to regress out the variation due to numbers of UMI per cell in the clustering analysis. 246 | 247 | 248 | 249 | **UMIs versus genes detected** 250 | 251 | The correlations look more similar between samples, with few low gene and/or low UMI cells. 252 | 253 | 254 | 255 | **Mitochondrial ratio** 256 | 257 | The mitochondrial ratios are improved with no cells present with the high mitochondrial contamination. 258 | 259 | 260 | 261 | **Novelty** 262 | 263 | The novelty is also improved, with no shoulder for any of the samples. 264 | 265 | 266 | 267 | 16. When you are satisfied with the filtered results, save the filtered data. You may need to adjust the filtering criteria multiple times to optimize the filtering results prior to saving the report. 268 | 269 | ```r 270 | assignAndSaveData(name = "bcbFiltered", object = bcbFiltered, dir = dataDir) 271 | ``` 272 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/03_seurat_clustering_analysis.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Clustering with Seurat 3 | description: This code is for clustering single cell rnaseq data with Seurat. 4 | category: research 5 | subcategory: scrnaseq 6 | tags: [clustering] 7 | --- 8 | 9 | # Seurat singlecell RNA-Seq clustering analysis 10 | 11 | This is a clustering analysis workflow to be run mostly on O2 using the output from the QC which is the `bcb_filtered` object. This workflow incorporates [**Lorena's script**](https://github.com/hbc/hbcABC/blob/master/inst/rmarkdown/Rscripts/singlecell/from_bcb_to_seurat.R) to set up for clustering run by changing the `bcb` object output to a `seurat` object. 12 | 13 | ## Creating Seurat object at the end of the QC analysis 14 | 15 | The first thing needed is to convert the `bcb_filtered` object in the QC to a Seurat object. We can do this by running Lorena's [`bcb_to_seurat.R`](https://github.com/hbc/hbcABC/blob/master/inst/rmarkdown/Rscripts/singlecell/from_bcb_to_seurat.R) script at the end of the QC analysis. The contents of the script are described below. 
16 | 17 | ### Setting up the parameters 18 | 19 | We need to load the `bcbioSingleCell` library and specify the appropriate organism and data directory to store output: 20 | 21 | ```r 22 | library(bcbioSingleCell) 23 | 24 | species <- "mus musculus" # change to appropriate species 25 | 26 | data_dir <- "data" 27 | ``` 28 | 29 | ### Define the cell cycle markers and save to file along with rowData 30 | 31 | In the clustering analysis, we need to determine the likely phase of the cell cycle for each cell, to do this we need a list of markers for our organism output from the bcbioSingleCell package: 32 | 33 | ```r 34 | cell_cycle_markers <- bcbioSingleCell::cellCycleMarkers[[camel(species)]] 35 | 36 | s_genes <- cell_cycle_markers %>% 37 | filter(phase == "S") %>% 38 | pull("geneID") 39 | 40 | g2m_genes <- cell_cycle_markers %>% 41 | filter(phase == "G2/M") %>% 42 | pull("geneID") 43 | 44 | save(g2m_genes, s_genes, file = file.path(data_dir,"cycle.rda")) 45 | ``` 46 | 47 | Now save the rowData of the `bcb_filtered` data to file: 48 | 49 | ```r 50 | saveRDS(rowData(bcb_filtered), file = file.path(data_dir,"rowData.rds")) 51 | ``` 52 | 53 | ### Create Seurat object 54 | 55 | To create the Seurat object we need only our `bcb_filtered` object, which contains the raw counts from the cells that have passed our quality control filtering parameters: 56 | 57 | ```r 58 | seurat_raw <- CreateSeuratObject(raw.data = counts(bcb_filtered), 59 | meta.data = metrics(bcb_filtered)) 60 | 61 | saveRDS(seurat_raw, file = file.path(data_dir,"seurat_raw.rds")) 62 | ``` 63 | 64 | Now that we have the Seurat object created, we can move on to the Seurat clustering analysis on O2. 65 | 66 | ## Setting up O2 environment to run clustering analysis 67 | 68 | To run the clustering analysis on O2, be sure to have X11 forwarding working if you want to visualize any of the images. To do this, you may need to have XQuartz running on your local machine and log onto O2 with the terminal: 69 | 70 | ```bash 71 | ssh -XY username@o2.hms.harvard.edu 72 | ``` 73 | 74 | Edit the your `.Renviron` file to have the following inside: 75 | 76 | ```bash 77 | vim ~/.Renviron 78 | 79 | 80 | R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.4-bioc-release/library" 81 | 82 | R_MAX_NUM_DLLS=150 83 | ``` 84 | 85 | 86 | 87 | Then start an interactive session with extra memory and x11: 88 | 89 | ```bash 90 | srun --pty -p interactive -t 0-12:00 --x11 --mem 96G /bin/bash 91 | ``` 92 | 93 | After starting the interactive session, load the necessary R modules and start R: 94 | 95 | ```bash 96 | module load gcc/6.2.0 R/3.4.1 hdf5/1.10.1 97 | 98 | R 99 | ``` 100 | 101 | ## Running the Seurat clustering analysis on O2 102 | 103 | The next step is performing the actual clustering analysis with Seurat on O2. We are following **Lorena's `clustering_seurat.R` script** with descriptions and some additional plots added in. We are also using some descriptions from the **bcbioSingleCell clustering template**. 104 | 105 | This workflow is adapted from the following sources: 106 | 107 | - Satija Lab: [Seurat v2 Guided Clustering Tutorial](http://satijalab.org/seurat/pbmc3k_tutorial.html) 108 | - Paul Hoffman: [Cell-Cycle Scoring and Regression](http://satijalab.org/seurat/cell_cycle_vignette.html) 109 | 110 | To identify clusters, the following steps will be performed: 111 | 112 | 1. Normalization and transformation of the raw gene counts per cell to account for differences in sequencing depth. 113 | 2. Identification of high variance genes. 114 | 3. 
Regression of sources of unwanted variation (e.g. number of UMIs per cell, mitochondrial transcript abundance, cell cycle phase). 115 | 4. Identification of the primary sources of heterogeneity using principal component (PC) analysis and heatmaps. 116 | 5. Clustering cells based on significant PCs (metagenes). 117 | 118 | ## Pre-regression workflow 119 | 120 | To run this workflow optimally, we have split the workflow into pre-regression and regression steps. To run the pre-regression steps outlined below, we have a [`clustering_pre_regress.R`](../scripts/clustering_pre_regress.R) script that you can run on O2. 121 | 122 | ### Setting up the R environment 123 | 124 | Load the necessary libraries: 125 | 126 | ```r 127 | library(Seurat) 128 | library(tidyverse) 129 | ``` 130 | 131 | Create variable for where you store the needed data and load the cell cycle file stored in this directory: 132 | 133 | ```r 134 | data_dir <- "data" 135 | 136 | load(file.path(data_dir, "cycle.rda")) 137 | 138 | set.seed(1454944673L) 139 | 140 | # Load Seurat object created 141 | 142 | seurat_raw <- readRDS(file.path(data_dir, "seurat_raw.rds")) 143 | ``` 144 | 145 | >**NOTE:** Often identifying cell types is easiest for a single sample type. To subset the Seurat object, we can use the `SubsetData()` function. For example: 146 | > 147 | >```r 148 | > pre_regressed_seurat <- SubsetData(seurat_raw, 149 | > cells.use = rownames(seurat_raw@meta.data[which(seurat_raw@meta.data$interestingGroups == "control")]) 150 | >``` 151 | 152 | ### Normalizing counts, finding variable genes, and scaling the data 153 | 154 | The raw counts are normalized using global-scaling normalization with the `NormalizeData()` function, which performs the following: 155 | 156 | 1. normalizes the gene expression measurements for each cell by the total expression 157 | 2. multiplies this by a scale factor (10,000 by default) 158 | 3. log-transforms the result 159 | 160 | ```r 161 | # Normalize counts for total cell expression and take log value 162 | 163 | pre_regressed_seurat <- seurat_raw %>% 164 | NormalizeData(normalization.method = "LogNormalize", 165 | scale.factor = 10000) 166 | ``` 167 | 168 | Following normalization, the most variable genes are identified and will be used for downstream clustering analyses. The `FindVariableGenes()` function is called, which performs the following calculations: 169 | 170 | 1. calculates the average expression and dispersion for each gene 171 | 2. places these genes into bins 172 | 3. calculates a z-score for dispersion within each bin 173 | 174 | This helps control for the relationship between variability and average expression. 175 | 176 | ```r 177 | # Find variable genes based on the mean-dispersion relationship based on z-score for dispersion. 178 | 179 | pre_regressed_seurat <- pre_regressed_seurat %>% 180 | FindVariableGenes( 181 | mean.function = ExpMean, 182 | dispersion.function = LogVMR, 183 | do.plot = FALSE) 184 | ``` 185 | 186 | It's recommended to set parameters as to mark visual outliers on dispersion plot - default parameters are for ~2,000 variable genes. 187 | 188 | Finally, the genes are scaled and centered using the `ScaleData()` function. 189 | 190 | ```r 191 | # Scale and center data 192 | 193 | pre_regressed_seurat <- pre_regressed_seurat %>% 194 | ScaleData(model.use = "linear") 195 | ``` 196 | 197 | We can plot dispersion (a normalized measure of to cell-to-cell variation) as a function of average expression for each gene to identify a set of high-variance genes. 
To check that the dispersions behave as expected, decreasing with increasing mean, and to identify the most variable genes, we can visualize the dispersions with the `VariableGenePlot()` function. 198 | 199 | ```r 200 | # Plot variable genes 201 | 202 | VariableGenePlot(pre_regressed_seurat) 203 | ``` 204 | 205 | We can also check the number of variable genes: 206 | 207 | ```r 208 | # Check number of variable genes to determine if correct parameters used 209 | 210 | length(x = pre_regressed_seurat@var.genes) 211 | ``` 212 | 213 | ### Examining sources of variation in the data 214 | 215 | Your single-cell dataset likely contains "uninteresting" sources of variation. This can include technical noise, batch effects, and/or uncontrolled biological variation (e.g. cell cycle). We can use PCA to identify these sources of variation, which can then be regressed out prior to further analysis. 216 | 217 | ### Cell cycle scoring 218 | 219 | If we want to examine cell cycle variation in our data, we assign each cell a score, based on its expression of G2/M and S phase markers. These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling and in G1 phase. We assign scores in the `CellCycleScoring()` function, which stores S and G2/M scores in `seurat@meta.data`, along with the predicted classification of each cell in either G2M, S or G1 phase. 220 | 221 | ```r 222 | # Perform cell cycle scoring 223 | 224 | pre_regressed_seurat <- CellCycleScoring( 225 | pre_regressed_seurat, 226 | g2m.genes = g2m_genes, 227 | s.genes = s_genes) 228 | ``` 229 | 230 | Here we are checking to see if the cells are grouping by cell cycle. If we don't see clear grouping of the cells into `G1`, `G2M`, and `S` clusters on the PCA plot, then it is recommended that we don't regress out cell-cycle variation. When this is the case, remove `S.Score` and `G2M.Score` from the variables to regress (`vars_to_regress`) in the R Markdown YAML parameters. 231 | 232 | ```r 233 | # Perform PCA and color by cell cycle phase 234 | 235 | pre_regressed_seurat = RunPCA( 236 | pre_regressed_seurat, 237 | pc.genes = c(s_genes, g2m_genes), 238 | do.print = FALSE) 239 | 240 | PCAPlot(pre_regressed_seurat, group.by= "Phase") 241 | ``` 242 | 243 | Now save the pre-regressed Seurat object: 244 | 245 | ```r 246 | # Save pre-regression Seurat object 247 | 248 | saveRDS(pre_regressed_seurat, file = file.path(data_dir, "seurat_pre_regress.rds")) 249 | ``` 250 | 251 | ## Apply regression variables 252 | 253 | To run these regression steps outlined below, we have a [`clustering_regress.R`](../scripts/clustering_regress.R) script that can be run on O2. The scripts do not include the visualizations, but these can be included in the final report. 254 | 255 | In this step, we are regressing out variables of uninteresting variation, using the `vars.to.regress` argument in the `ScaleData()` function. When variables are defined in the `vars.to.regress` argument, [Seurat][] regresses them individually against each gene, then rescales and centers the resulting residuals. 256 | 257 | We generally recommend minimizing the effects of variable read count depth (`nUMI`) and mitochondrial gene expression (`mitoRatio`) as a standard first-pass approach. If the differences in mitochondrial gene expression represent a biological phenomenon that may help to distinguish cell clusters, then we advise not passing in `mitoRatio` here. 
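For reference, a minimal sketch of that first-pass regression (before deciding whether to also regress out cell cycle) might look like the following; it assumes the mitochondrial metric is stored in the metadata as `mitoRatio`, as in the QC report:

```r
# First-pass regression of sequencing depth and mitochondrial content only
vars_to_regress <- c("nUMI", "mitoRatio")

seurat <- ScaleData(pre_regressed_seurat, vars.to.regress = vars_to_regress)
```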
258 | 259 | When regressing out the effects of cell-cycle variation, include `S.Score` and `G2M.Score` in the `vars.to.regress` argument. Cell-cycle regression is generally recommended but should be avoided for samples containing cells undergoing differentiation. 260 | 261 | ```r 262 | # Regress out the uninteresting sources of variation in the data 263 | 264 | vars_to_regress <- c("nUMI", "S.Score", "G2M.Score") 265 | 266 | seurat <- ScaleData(pre_regressed_seurat, vars.to.regress = vars_to_regress) 267 | ``` 268 | 269 | Now that regression has been applied, let's recheck to see if the cells are no longer clustering by cycle. We should now see the phase clusters superimpose. 270 | 271 | ```r 272 | # Re-run the PCA plots and color by cell cycle phase 273 | 274 | seurat <- RunPCA( 275 | seurat, 276 | pc.genes = c(s_genes, g2m_genes), 277 | do.print = FALSE) 278 | 279 | PCAPlot(seurat, group.by= "Phase") 280 | ``` 281 | 282 | ## Linear dimensionality reduction 283 | 284 | Next, we perform principal component analysis (PCA) on the scaled data with `RunPCA()`. By default, the genes in `seurat@var.genes` are used as input, but can be defined using the `pc.genes` argument. `ProjectPCA()` scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don't use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting `use.full = TRUE` for `PrintPCA()`. 285 | 286 | ```r 287 | # Perform the scoring for all genes 288 | 289 | seurat <- seurat %>% 290 | RunPCA(do.print = FALSE) %>% 291 | ProjectPCA(do.print = FALSE) 292 | ``` 293 | 294 | ## Determine statistically significant principal components 295 | 296 | To overcome the extensive technical noise in any single gene for scRNA-seq data, [Seurat][] clusters cells based on their PCA scores, with each PC essentially representing a "metagene" that combines information across a correlated gene set. Determining how many PCs to include downstream is therefore an important step. To accomplish this, we plot the standard deviation of each PC as an elbow plot with our `plotPCElbow()` function. 297 | 298 | PC selection — identifying the true dimensionality of a dataset — is an important step for [Seurat][], but can be challenging/uncertain. We therefore suggest these three approaches to consider: 299 | 300 | 1. Supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. 301 | 2. Implement a statistical test based on a random null model. This can be time-consuming for large datasets, and may not return a clear PC cutoff. 302 | 3. **Heuristic approach**, using a metric that can be calculated instantly. 303 | 304 | We're using a heuristic approach here, by calculating where the principal components start to elbow. The plots below show where we have defined the principal compoment cutoff used downstream for dimensionality reduction. This is calculated automatically as the larger value of: 305 | 306 | 1. The point where the principal components only contribute 5% of standard deviation (bottom left). 307 | 2. The point where the principal components cumulatively contribute 90% of the standard deviation (bottom right). 308 | 309 | This methodology is also commonly used for PC covariate analysis on bulk RNA-seq samples. 
310 | 311 | ```r 312 | # Create elbow plot 313 | 314 | PCElbowPlot(seurat) 315 | 316 | # Determine the estimate for significant PCs 317 | 318 | pct = seurat@dr$pca@sdev / sum(seurat@dr$pca@sdev) * 100 319 | cum = cumsum(pct) 320 | co1 = which(cum > 90 & pct < 5)[1] 321 | co2 = sort(which((pct[1:length(pct)-1] - pct[2:length(pct)]) > 0.1), 322 | decreasing = T)[1] + 1 # last point where change of % of variation is more than 0.1%. 323 | pcs = min(co1, co2) # change to any other number 324 | ``` 325 | 326 | ## Cluster the cells 327 | 328 | Seurat uses a graph-based clustering approach, inspired by SNN-Cliq [@Xu2015-je] and PhenoGraph [@Levine2015-hr]. This approach embeds cells in a graph structure, by default using a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar gene expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’. As in PhenoGraph, [Seurat][] first constructs a KNN graph based on the euclidean distance in PCA space, and refines the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard distance). To cluster the cells, it then applies modularity optimization techniques [@Blondel2008-rf], to iteratively group cells together, with the goal of optimizing the standard modularity function. 329 | 330 | The `FindClusters()` function implements the procedure, and contains a `resolution` argument that sets the "granularity" of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between `0.6`-`1.2` typically returns good results for single cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters are saved in the `seurat@ident` slot. 331 | 332 | Regarding the value of the `resolution` argument, use a value < 1 if you want to obtain fewer clusters. We provide a series of options and downstream we can choose the best resolution. 333 | 334 | ```r 335 | # Find cell clusters 336 | 337 | seurat <- FindClusters( 338 | seurat, 339 | dims.use = 1:pcs, 340 | force.recalc = TRUE, 341 | print.output = TRUE, 342 | resolution = c(0.6, 0.8, 1.0, 1.2), 343 | save.SNN = TRUE) 344 | ``` 345 | ## t-SNE 346 | 347 | [Seurat][] continues to use t-distributed stochastic neighbor embedding (t-SNE) as a powerful tool to visualize and explore these datasets. While we no longer advise clustering directly on t-SNE components, cells within the graph-based clusters determined above should co-localize on the t-SNE plot. This is because the t-SNE aims to place cells with similar local neighborhoods in high-dimensional space together in low-dimensional space. As input to the t-SNE, we suggest using the same PCs as input to the clustering analysis, although computing the t-SNE based on scaled gene expression is also supported using the `genes.use` argument. 348 | 349 | ```r 350 | # Choose a resolution 351 | seurat <- SetAllIdent(object = seurat, id = "res.0.8") 352 | 353 | # Run the TSNE and plot 354 | seurat <- RunTSNE( 355 | seurat, 356 | dims.use = 1:pcs, 357 | do.fast = TRUE) 358 | ``` 359 | 360 | ```r 361 | # Plot the TSNE 362 | TSNEPlot(object = seurat) 363 | ``` 364 | 365 | Once a resolution has been chosen, a useful feature in [Seurat][] v2.0 is the ability to recall the parameters that were used in the latest function calls for commonly used functions. 
For `FindClusters()`, the authors provide the function `PrintFindClustersParams()` to print a nicely formatted formatted summary of the parameters that were chosen. 366 | 367 | ```r 368 | PrintFindClustersParams(seurat) 369 | ``` 370 | 371 | ```r 372 | # Save clustered cells 373 | 374 | saveRDS(seurat, file = file.path(data_dir, "name_seurat_tsne.rds")) 375 | ``` 376 | # Creating the clustering report 377 | 378 | To create the clustering report, `rsync` the `seurat_tsne.rds` object to your local computer and run the code for the visualizations as provided in the [template](). 379 | 380 | > - *Use the saved Seurat objects on a local computer to make report with figures.* 381 | > - *rsync your data if you work on the cluster and local computer with the same data.* 382 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/04_seurat_markers.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Seurat Markers 3 | description: This code is for finding Seurat markers 4 | category: research 5 | subcategory: scrnaseq 6 | tags: [differential_analysis] 7 | --- 8 | 9 | ```bash 10 | ssh -XY username@o2.hms.harvard.edu 11 | 12 | srun --pty -p interactive -t 0-12:00 --x11 --mem 128G /bin/bash 13 | 14 | module load gcc/6.2.0 R/3.5.1 hdf5/1.10.1 15 | 16 | R 17 | ``` 18 | 19 | ```r 20 | library(Seurat) 21 | library(tidyverse) 22 | 23 | set.seed(1454944673L) 24 | data_dir <- "data" 25 | seurat <- readRDS(file.path(data_dir, "seurat_tsne_all_res0.6.rds")) 26 | ``` 27 | 28 | Make sure the TSNEPlot looks as expected 29 | 30 | ```r 31 | TSNEPlot(seurat) 32 | ``` 33 | 34 | Check markers for any particular cluster against all others 35 | 36 | ```r 37 | cluster14_markers <- FindMarkers(object = seurat, ident.1 = 14, min.pct = 0.25) 38 | ``` 39 | 40 | Or look for markers of every cluster against all others 41 | 42 | ```r 43 | seurat_markers <- FindAllMarkers(object = seurat, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25) 44 | ``` 45 | 46 | >**NOTE:** The `seurat_markers` object with be a dataframe with the row names as Ensembl IDs; however, since row names need to be unique, if a gene is a marker for more than one cluster, then Seurat will add a number to the end of the Ensembl ID. Therefore, do not use the row names as the gene identifiers. Use the `gene` column. 47 | 48 | Save the markers for report generation 49 | 50 | ```r 51 | saveRDS(seurat_markers, "data/seurat_markers_all_res0.6.rds") 52 | ``` 53 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/Monocle.md: -------------------------------------------------------------------------------- 1 | # Monocle 2 | 3 | ## Bringing in data 4 | 5 | Bringing data into Monocle can be achieved in a few different ways depending on the source of the data. Regardless of source, a `CellDataSet` object needs to be created. 
A `CellDataSet` consists of: 6 | 7 | - **expression matrix:** counts matrix, can be dense or sparse - in `CellDataSet` object stored in `assayData` slot 8 | - **phenotype data:** metadata - in `CellDataSet` object stored in `pData` slot 9 | - **feature data:** gene annotations, including a column named `gene_short_name` - in `CellDataSet` object stored in `fData` slot 10 | 11 | Before we can construct the `CellDataSet`, the following libraries need to be loaded: 12 | 13 | ```r 14 | library(Seurat) 15 | library(monocle) 16 | library(tidyverse) 17 | library(readxl) 18 | library(AnnotationHub) 19 | library(ensembldb) 20 | ``` 21 | 22 | ### Using Seurat output 23 | 24 | Often pseudotime analysis is performed after performing QC, clustering and marker identification using Seurat. If this is the case, bringing the data into Monocle is quite easy. The `importCDS()` function is supposed to work, but I have had issues using this if my metadata is for a different number of cells than my raw data: 25 | 26 | ```r 27 | # Read in Seurat object 28 | seurat <- readRDS("path/to/seurat.rds") 29 | 30 | # Create the 'CellDataSet' 31 | cds <- importCDS(seurat) 32 | ``` 33 | 34 | Now we have our data stored as a `CellDataSet` object, and we can proceed through the Monocle workflow. 35 | 36 | If this doesn't work for the creation of the object, then we can create the CDS using the metadata and raw counts. 37 | 38 | ### Using raw count matrix and metadata objects 39 | 40 | **Step 1:** Read in count matrix - this should only be the filtered cells output from QC 41 | 42 | If we have our Seurat object, we can access the different slots in which this data is stored: 43 | 44 | ```r 45 | seurat <- readRDS("path/to/seurat.rds") 46 | 47 | raw_counts <- seurat@raw.data 48 | 49 | metadata <- seurat@meta.data 50 | ``` 51 | 52 | If no Seurat object, then we can read in the individual count matrix and metadata objects: 53 | 54 | ```r 55 | # Bring in count matrix from bcbio 56 | raw_counts <- readMM("path/to/tagcounts.mtx") 57 | 58 | # Assign row names and column names of matrix 59 | gene_names <- read.csv("path/to/tagcounts.mtx.rownames", header = FALSE) 60 | 61 | cell_ids <- read.csv("path/to/tagcounts.mtx.colnames", header = FALSE) 62 | 63 | rownames(raw_counts) <- gene_names[, 1] 64 | 65 | colnames(raw_counts) <- cell_ids[, 1] 66 | ``` 67 | 68 | **Step 2:** Create the annotations - this could be brought in from the previous QC or clustering analysis or created again: 69 | 70 | ```r 71 | # Acquire the gene names for the Ensembl IDs 72 | ## Connect to AnnotationHub 73 | ah <- AnnotationHub() 74 | 75 | ## Access the Ensembl database for organism 76 | ahDb <- query(ah, 77 | pattern = c("Homo sapiens", "EnsDb"), 78 | ignore.case = TRUE) 79 | 80 | ## Acquire the latest annotation files 81 | id <- ahDb %>% 82 | mcols() %>% 83 | rownames() %>% 84 | tail(n = 1) 85 | 86 | ## Download the appropriate Ensembldb database 87 | edb <- ah[[id]] 88 | 89 | ## Extract gene-level information from database 90 | annotations <- genes(edb, 91 | return.type = "data.frame") 92 | 93 | ## Select annotations of interest 94 | annotations <- annotations %>% 95 | dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description) 96 | ``` 97 | 98 | Now that we have the annotations, we can subset those to only the genes present in the counts data: 99 | 100 | ```r 101 | ## Subset to include only those genes in data frame 102 | monocle_annotations <- annotations[which(annotations$gene_id %in% rownames(raw_counts)), ] 103 | ``` 104 | 105 | Monocle expects 
the annotations to be formatted with the gene IDs as row names and a column corresponding to gene symbol named `gene_short_name`: 106 | 107 | ```r 108 | ## Make the row names of the annotations to be the same gene IDs as in the count matrix 109 | rownames(monocle_annotations) <- monocle_annotations$gene_id 110 | 111 | ## Change name of gene symbol column to 'gene_short_name' 112 | colnames(monocle_annotations)[colnames(monocle_annotations) == "gene_name"] <- "gene_short_name" 113 | ``` 114 | 115 | Finally, the order of genes need to match between the features and the counts: 116 | 117 | ```r 118 | # Check if all genes are annotated 119 | which(!(rownames(raw_counts) %in% rownames(monocle_annotations))) 120 | 121 | # Remove genes not annotated 122 | raw_counts <- raw_counts[which(rownames(raw_counts) %in% rownames(monocle_annotations)), ] 123 | 124 | ## Check all of the row names of the annotations match the row names of the counts 125 | all(rownames(raw_counts) %in% rownames(monocle_annotations)) 126 | 127 | all(rownames(raw_counts) == rownames(monocle_annotations)) 128 | 129 | ## If not, then match them 130 | idx <- match(rownames(raw_counts), rownames(monocle_annotations)) 131 | 132 | monocle_annotations <- monocle_annotations[idx, ] 133 | 134 | ## Sanity check 135 | all(rownames(raw_counts) == rownames(monocle_annotations)) 136 | ``` 137 | 138 | Then, we can create the feature data object used to create the `CellDataSet` as an `AnnotatedDataFrame`: 139 | 140 | ```r 141 | ## Create feature data as an 'AnnotatedDataFrame' 142 | fd <- new("AnnotatedDataFrame", data = monocle_annotations) 143 | ``` 144 | 145 | **Step 3:** Create an `AnnotatedDataFrame` to be used to create the `CellDataSet`: 146 | 147 | ```r 148 | ## Read in the metadata if not already present 149 | metadata <- read.csv("path/to/metadata.csv") 150 | 151 | ## Check that the columns of the counts corresponds to the rows of the metadata 152 | all(rownames(metadata) == colnames(raw_counts)) 153 | 154 | # if not matching, then use match() similar to above 155 | 156 | ## Create the phenotype data as an 'AnnotatedDataFrame' 157 | pd <- new("AnnotatedDataFrame", data = metadata) 158 | ``` 159 | 160 | **Step 4:** Create the `CellDataSet` object - `expressionFamily` depends on the type of data. 161 | 162 | ```r 163 | cds <- newCellDataSet(raw_counts, 164 | phenoData = pd, 165 | featureData = fd, 166 | expressionFamily=negbinomial.size()) 167 | ``` 168 | 169 | > **NOTE:** If the data have UMIs, and it's not an extremely small dataset, then `negbinomial.size()` is the correct option. More details available in the [Monocle docs](http://cole-trapnell-lab.github.io/monocle-release/docs/#choosing-a-distribution-for-your-data-required). 170 | 171 | > **NOTE:** ## Using Cell Ranger output 172 | > 173 | >Taken directly from the Monocle documentation: 'If you have 10X Genomics data and are using cellrangerRkit, you can use it to load your data and then pass that to Monocle as follows:' 174 | > 175 | >```r 176 | >cellranger_pipestance_path <- "/path/to/your/pipeline/output/directory" 177 | >gbm <- load_cellranger_matrix(cellranger_pipestance_path) 178 | > 179 | >fd <- fData(gbm) 180 | > 181 | ># The number 2 is picked arbitrarily in the line below. 182 | ># Where "2" is placed you should place the column number that corresponds to your 183 | ># featureData's gene short names. 
184 | > 185 | >colnames(fd)[2] <- "gene_short_name" 186 | > 187 | >gbm_cds <- newCellDataSet(exprs(gbm), 188 | > phenoData = new("AnnotatedDataFrame", data = pData(gbm)), 189 | > featureData = new("AnnotatedDataFrame", data = fd), 190 | > lowerDetectionLimit = 0.5, 191 | > expressionFamily = negbinomial.size()) 192 | >``` 193 | 194 | ## Estimating size factors and dispersions 195 | 196 | Similar to any other RNA-seq analysis exploring differential expression, we need to calculate the size factors for normalization and dispersions per gene: 197 | 198 | ```r 199 | # Estimate the size factors 200 | cds <- estimateSizeFactors(cds) 201 | 202 | # Estimate the gene dispersions 203 | cds <- estimateDispersions(cds) 204 | ``` 205 | 206 | ## Additional QC suggested by Monocle 207 | 208 | The Monocle tutorial suggests filtering low quality cells for minimum expression levels and for doublets. We likely have already performed the filtering for minimum expression during the original QC, but the doublet filtering has not been performed. Evidently, the trajectory analysis is quite sensitive to the presence of doublets. Moving forward we may consider different tools to perform this filtering. 209 | 210 | ```r 211 | # Additional filtering of low quality cells 212 | 213 | # Removing lowly expressed genes 214 | cds <- detectGenes(cds, min_expr = 0.1) 215 | 216 | # Removing genes not expressed in at least 10 cells 217 | expressed_genes <- row.names(subset(fData(cds), num_cells_expressed >= 10)) 218 | 219 | # Removing cells that may be doublets 220 | pData(cds)$Total_mRNAs <- Matrix::colSums(exprs(cds)) 221 | 222 | cds <- cds[,pData(cds)$Total_mRNAs < 1e6] 223 | 224 | upper_bound <- 10^(mean(log10(pData(cds)$Total_mRNAs)) + 225 | 2*sd(log10(pData(cds)$Total_mRNAs))) 226 | lower_bound <- 10^(mean(log10(pData(cds)$Total_mRNAs)) - 227 | 2*sd(log10(pData(cds)$Total_mRNAs))) 228 | 229 | qplot(Total_mRNAs, data = pData(cds), color = viralLoad, geom = 230 | "density") + 231 | geom_vline(xintercept = lower_bound) + 232 | geom_vline(xintercept = upper_bound) 233 | 234 | cds <- cds[,pData(cds)$Total_mRNAs > lower_bound & 235 | pData(cds)$Total_mRNAs < upper_bound] 236 | cds <- detectGenes(cds, min_expr = 0.1) 237 | ``` 238 | 239 | After performing the filtering, it is suggested to check the data to ensure the counts follow an approximate log-normal scale. 240 | 241 | ```r 242 | # Check expression to make sure filtered counts follow approximate log-normal distribution 243 | 244 | # Log-transform each value in the expression matrix. 245 | L <- log(exprs(cds[expressed_genes,]) + 1) 246 | 247 | # Standardize each gene, so that they are all on the same scale, 248 | # Then melt the data with plyr so we can plot it easily 249 | melted_dens_df <- melt(Matrix::t(scale(Matrix::t(L)))) 250 | 251 | # Plot the distribution of the standardized gene expression values. 252 | qplot(value, geom = "density", data = melted_dens_df) + 253 | stat_function(fun = dnorm, size = 0.5, color = 'red') + 254 | xlab("Standardized log(norm_counts)") + 255 | ylab("Density") 256 | ``` 257 | 258 | Now we are ready for classifying cells by cell type. 259 | 260 | ## Cell classification 261 | 262 | Monocle uses a bit different method for clustering cells by taking in known marker genes to aid with clustering and identification of cell type. We need to provide Monocle with the gene IDs for the marker genes of the different clusters. 
263 | 264 | For example, if working with immune cells, we could have identified good cell markers for our dataset with Seurat previously: 265 | 266 | ```r 267 | # Acquiring the rownames of markers 268 | CD14_id <- row.names(subset(fData(cds), gene_short_name == "CD14")) # Monocytes 269 | CD3_id <- row.names(subset(fData(cds), 270 | gene_short_name == "CD3D")) # T cells 271 | CD4_id <- rownames(subset(fData(cds),gene_short_name == "CD4")) # CD4+ T cells 272 | CD8_id <- row.names(subset(fData(cds), 273 | gene_short_name == "CD8A")) # CD8+ T cells 274 | CD19_id <- row.names(subset(fData(cds), 275 | gene_short_name == "CD19")) # B cells 276 | 277 | # Creating hierarchy for cell assignment 278 | cth <- newCellTypeHierarchy() 279 | cth <- addCellType(cth, "Monocytes", classify_func = 280 | function(x) { x[CD14_id,] > 1 }) 281 | cth <- addCellType(cth, "T cell", 282 | classify_func=function(x) {x[CD3_id,] > 0}) 283 | 284 | cth <- addCellType(cth, "CD4+ T cells", classify_func = function(x) 285 | { x[CD14_id,] < 1 & x[CD4_id,] > 1 & x[CD8_id,] < 1 }, 286 | parent_cell_type_name = "T cell") 287 | 288 | cth <- addCellType(cth, "CD8+ T cells", classify_func = function(x) 289 | { x[CD14_id,] < 1 & x[CD8_id,] > 1 & x[CD4_id,] < 1 }, 290 | parent_cell_type_name = "T cell") 291 | 292 | cth <- addCellType(cth, "B cells", classify_func = 293 | function(x) { x[CD19_id,] > 1 }) 294 | 295 | ``` 296 | 297 | Now that we have the heirarchy for cell type assignment, we can assign cells to a known cell type. A cell is assigned to a cell type if at least the fraction of counts specified with the `frequency_thres` argument correspond to that cell type marker. 298 | 299 | ```r 300 | # Should assign cells to one of the cell types specified in the heirarchy, Ambiguous, or Unknown 301 | cmv <- classifyCells(cds = cds, cth = cth, frequency_thres = 0.1) 302 | ``` 303 | 304 | We can explore the assignments to see if they make sense. At this stage in the analysis, it is normal for the majority of cells to be of 'Unknown' cell type. However, if there are a lot of 'Ambiguous' cells, then you may want to modify your assignment heirarchy. 305 | 306 | ```r 307 | # Check number of cells per celltype - at this stage majority of cells are often unknown 308 | table(pData(cmv)$CellType) 309 | 310 | # Visualize by pie chart 311 | pie <- ggplot(pData(cmv), 312 | aes(x = factor(1), fill = factor(CellType))) + geom_bar(width = 1) 313 | 314 | pie + coord_polar(theta = "y") + 315 | theme(axis.title.x = element_blank(), axis.title.y = element_blank()) 316 | 317 | # Check for specific cell types if desired 318 | subset(pData(cmv), CellType == "pDCs") 319 | ``` 320 | 321 | ## Identify clustering genes using an 'Unsupervised' method 322 | 323 | Now we can try to assign identity to the 'Unknown' cells by using the prinicipal components that explain the largest amount of variance in the data, somewhat similar to Seurat's method. 
324 | 325 | ```r 326 | # Assign celltype to Unknown cells 327 | disp_table <- dispersionTable(cmv) 328 | 329 | # Identify ordering genes - unsupervised clustering 330 | 331 | ## Subset those genes with expression higher than 0.1 332 | unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1) 333 | 334 | ## Mark genes to be used for clustering 335 | cmv <- setOrderingFilter(cmv, unsup_clustering_genes$gene_id) 336 | 337 | ## View genes to be used for clustering 338 | plot_ordering_genes(cmv) 339 | 340 | ## Determine number of principal components to use based on where elbow meets the surface 341 | # x11(type="cairo") # Run if error viewing the following plot 342 | plot_pc_variance_explained(cmv, return_all = F) # norm_method='log' 343 | ``` 344 | 345 | Now we can perform the dimensionality reduction using the identified principal components and the tSNE method. You will need to choose the number of clusters to return; I randomly chose 15 clusters, but you could choose more or less based on expectations. If you choose more, than expected, you can always merge together later on in the analysis. Also, we need to choose the number of dimensions to use, which should be based on analysis of the Elbow (skree) plot where the elbow just seems to touch the base. 346 | 347 | ```r 348 | ## Reduce dimensions for tSNE viewing with max components of 2 and number of dimensions equal to the PCs determined in elbow plot 349 | cmv <- reduceDimension(cmv, max_components = 2, num_dim = 9, 350 | reduction_method = 'tSNE', verbose = T) 351 | 352 | ## Cluster the cells to a certain number of clusters - will limit # clusters returned - randomly chose 15, but may return less 353 | cmv <- clusterCells(cmv, num_clusters = 15) 354 | ``` 355 | 356 | Let's explore the quality of our clustering by checking our known markers: 357 | 358 | ```r 359 | ## Explore cluster assignment 360 | head(pData(cmv)) 361 | 362 | plot_cell_clusters(cmv, 1, 2, color = "CellType", 363 | markers = c("CD14", "CD36", "CD3D", "CD8A", "CD4", "CD19")) 364 | 365 | cmv <- reduceDimension(cmv, max_components = 2, num_dim = 9, 366 | reduction_method = 'tSNE', 367 | residualModelFormulaStr = "~ condition + num_genes_expressed", 368 | verbose = T) 369 | 370 | cmv <- clusterCells(cmv, num_clusters = 15) 371 | 372 | plot_cell_clusters(cmv, 1, 2, color = "Cluster") + 373 | facet_wrap(~CellType) 374 | ``` 375 | 376 | ## Further identify clustering genes using a 'Supervised' method 377 | 378 | While the unsupervised clustering method allowed for using genes that were more highly expressed and variable for determining the principal components to use for clustering, the supervised method will instead choose genes that co-vary with the cell type markers given. After identifying the genes that co-vary signficantly with the cell type markers, we will select genes with high specificity; usually it's best to pick the top 10 or 20 genes most specific per cell type. 
379 | 380 | ```r 381 | # Identify ordering genes - supervised method 382 | 383 | # Identifying genes that co-vary with markers 384 | marker_diff <- markerDiffTable(cmv[expressed_genes,], 385 | cth, 386 | residualModelFormulaStr = "~ condition + num_genes_expressed", 387 | cores = 1) 388 | 389 | # Selecting the genes that significantly co-vary 390 | candidate_clustering_genes <- 391 | row.names(subset(marker_diff, qval < 0.01)) 392 | 393 | # Determine specificity of the markers 394 | marker_spec <- calculateMarkerSpecificity(cmv[candidate_clustering_genes,], cth) 395 | 396 | head(selectTopMarkers(marker_spec, 3)) 397 | ``` 398 | 399 | Now we can use these specific markers to cluster the cells. We will pick the top 500 markers for each cell type (although I am unsure why the choice is 500 here and not the 10-20 genes mentioned previously. We determine the unique top 500 specific markers for each cluster, then mark the genes 400 | ```r 401 | # Select the specific cell type genes to use 402 | semisup_clustering_genes <- unique(selectTopMarkers(marker_spec, 500)$gene_id) 403 | 404 | # Mark that these are the genes to be used for clustering 405 | cmv <- setOrderingFilter(cmv, semisup_clustering_genes) 406 | 407 | # Explore the variance explained by the genes 408 | plot_ordering_genes(cmv) 409 | 410 | plot_pc_variance_explained(cmv, return_all = F) 411 | 412 | # Use these genes for the clustering 413 | cmv <- reduceDimension(cmv, max_components = 2, num_dim = 9, 414 | norm_method = 'log', 415 | reduction_method = 'tSNE', 416 | residualModelFormulaStr = "~ condition + num_genes_expressed", 417 | verbose = T) 418 | 419 | # Cluster the genes similar to previously 420 | cmv <- clusterCells(cmv, num_clusters = 15) 421 | 422 | # Explore the clustering 423 | plot_cell_clusters(cmv, 1, 2, color = "CellType", 424 | markers = c("CD14", "CD36", "CD3D", "CD8A", "CD4", "CD19")) 425 | 426 | plot_cell_clusters(cmv, 1, 2, color = "Cluster") + 427 | facet_wrap(~CellType) 428 | ``` 429 | 430 | ## Imputing cell types 431 | 432 | For those cells that are still of 'Unknown' cell type, we can impute the identity based on the expression of markers from the other cells in that cluster. We will impute the identities of the 'Unknown' cells using a threshold of 10% for the percentage of cluster marked as a certain type of cell to impute the values of the remaining cells. 433 | 434 | ```r 435 | # Impute cell type 436 | imputed <- clusterCells(cmv, 437 | num_clusters = 15, 438 | frequency_thresh = 0.1, 439 | cell_type_hierarchy = cth) 440 | 441 | plot_cell_clusters(imputed, 1, 2, color = "CellType", 442 | markers = c("CD14", "CD36", "CD3D", "CD8A", "CD4", "CD19")) 443 | 444 | pie <- ggplot(pData(imputed), 445 | aes(x = factor(1), fill = factor(CellType))) + geom_bar(width = 1) 446 | 447 | pie + coord_polar(theta = "y") + 448 | theme(axis.title.x = element_blank(), axis.title.y = element_blank()) 449 | 450 | table(pData(imputed)$CellType) 451 | ``` 452 | 453 | ## Trajectory analysis 454 | 455 | To perform trajectory analysis, you will need to subset out the cells of interest from your object. In this example, I'm interested in monocytes. 
456 | 457 | ```r 458 | # Subset out monocytes 459 | monocyte_cells <- row.names(subset(pData(imputed), CellType == "Monocytes")) 460 | 461 | monocytes <- imputed[ , monocyte_cells] 462 | 463 | dim(monocytes) 464 | ``` 465 | Now, we can order cells based on genes that differ between clusters using the 'dpFeature' unsupervised method after selecting a subset of genes expressed in 5% of the cells. 466 | 467 | ```r 468 | # Trajectory analysis 469 | # This filtering should have already be performed, so probably not necessary 470 | monocytes <- detectGenes(monocytes, min_expr = 0.1) 471 | 472 | # Selecting the genes expressed in 5% of cells 473 | fData(monocytes)$use_for_ordering <- fData(monocytes)$num_cells_expressed > 0.05 * ncol(monocytes) 474 | ``` 475 | 476 | After determining the genes to use for the ordering of cells for the trajectory analysis, we can perform PCA analysis to identify the variance explained by each PC and choose the number of dimensions to include in the TSNE reduction based on where the elbow approaches the base. 477 | 478 | ```r 479 | # Elbow or scree plot 480 | plot_pc_variance_explained(monocytes, return_all = F) 481 | 482 | # Only include highest PCs with large gaps following components 483 | monocytes <- reduceDimension(monocytes, 484 | max_components = 2, 485 | norm_method = 'log', 486 | num_dim = 6, 487 | reduction_method = 'tSNE', 488 | verbose = T) 489 | ``` 490 | 491 | The next step involves the density peak clustering, which clusters the cells based on the cell's local density. It is explained by monocle as: "The densityPeak algorithm clusters cells based on each cell's local density (Ρ) and the nearest distance (Δ) of a cell to another cell with higher distance. We can set a threshold for the Ρ, Δ and define any cell with a higher local density and distance than the thresholds as the density peaks. Those peaks are then used to define the clusters for all cells. By default, clusterCells choose 95% of Ρ and Δ to define the thresholds. We can also set a number of clusters (n) we want to cluster. In this setting, we will find the top n cells with high Δ with Δ among the top 50% range. The default setting often gives good clustering." 492 | 493 | ```r 494 | # Perform the density peak clustering 495 | monocytes <- clusterCells(monocytes, verbose = F) 496 | ``` 497 | We can explore the clustering using the `plot_cell_clusters()` function: 498 | 499 | ```r 500 | # Plotting the clusters to explore clustering 501 | plot_cell_clusters(monocytes, color_by = 'as.factor(Cluster)') 502 | plot_cell_clusters(monocytes, color_by = 'as.factor(condition)') 503 | ``` 504 | 505 | Then we want to identify the genes that are differentially expressed between the beginning and end of our process/time/condition. We can perform the differential expression and adding the time/condition that is changing to the model. 
506 | 507 | ```r 508 | # Subsetting the genes to only those genes expressed in at least 10 cells within this monocyte subset 509 | mono_expressed_genes <- row.names(subset(fData(monocytes), 510 | num_cells_expressed >= 10)) 511 | 512 | # Performing the DE gene test 513 | clustering_DEG_genes <- 514 | differentialGeneTest(monocytes[mono_expressed_genes,], 515 | fullModelFormulaStr = '~ Cluster', 516 | cores = 1) 517 | 518 | # Select the top 1000 most significant genes 519 | mono_ordering_genes <- 520 | row.names(clustering_DEG_genes)[order(clustering_DEG_genes$qval)][1:1000] 521 | ``` 522 | 523 | Through the differential expression test we have identitified the genes that will be used for ordering our cells along a trajectory. We now need to set them in the 'CellDataSet' object (`monocytes`), then reduce the dimensions to 2 for viewing and perform the ordering of the cells along the trajectory. 524 | 525 | ```r 526 | 527 | monocytes <- 528 | setOrderingFilter(monocytes, 529 | ordering_genes = mono_ordering_genes) 530 | 531 | monocytes <- 532 | reduceDimension(monocytes, method = 'DDRTree') 533 | 534 | monocytes <- 535 | orderCells(monocytes) 536 | 537 | monocytes <- 538 | orderCells(monocytes, root_state = GM_state(monocytes)) 539 | 540 | plot_cell_trajectory(monocytes, color_by = "viralLoad") 541 | 542 | ``` 543 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/R_set-up.md: -------------------------------------------------------------------------------- 1 | # Set-up for using R on O2 for single-cell RNA-seq 2 | 3 | Log onto O2 and start an interactive session - more memory is often required, and I start with `--mem` of `64G` for ~ 10,000 - 15,000 cells. It also helps to have X11 forwarding enabled, `--x11`, to work through the analysis and view plots. 4 | 5 | If X11 is not working for you, then make sure you follow the directions given on the [O2 wiki](https://wiki.rc.hms.harvard.edu/display/O2/Using+X11+Applications+Remotely). 6 | 7 | Generally, I have my script open in a different terminal window and start an interactive session to copy and paste each line of the script. I explore the output and decide whether to adjust parameters. Alternatively, you could just run the script and explore the output. 8 | 9 | ```r 10 | srun --pty -p interactive -t 0-12:00 --x11 --mem 64G /bin/bash 11 | ``` 12 | 13 | We should specify the R library to use in our `~/.Renviron` file. You can either create a personal R library or use those provided by the core. Either way you will need to store the path to the `R_LIBS_USER` variable. 14 | 15 | Within the same file, it will help to set the `R_MAX_NUM_DLLS` variable to a high number in order to use many of the single cell packages. I have mine set to `200`, which has worked so far. 
16 | 17 | ```r 18 | # Library using R 3.6.1 with Seurat 3.1+ 19 | R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.6.1-bioc-release/library" 20 | 21 | # Library using R 3.5.1 with Seurat 3.0 22 | R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.5.1-bioc-release_Seurat3.0" 23 | 24 | # Library using R 3.5.1 but with version 2 Seurat 25 | #R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.5.1-bioc-release/library" 26 | 27 | # Library using R 3.4 28 | #R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.4-bioc-release/library" 29 | 30 | R_MAX_NUM_DLLS=200 31 | ``` 32 | 33 | Now to use R we need to load the required modules for our analysis: 34 | 35 | ```r 36 | module load gcc/6.2.0 R/3.5.1 37 | ``` 38 | 39 | > Be sure to match the module version of R to the library specified in the `~/.Renviron` file. 40 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/SPRING.md: -------------------------------------------------------------------------------- 1 | # SPRING 2 | 3 | [SPRING](https://github.com/AllonKleinLab/SPRING) is a tool for visualizing and interacting with high dimensional data. The SPRING tool can be accessed [online](https://kleintools.hms.harvard.edu/tools/spring.html) or through a [local instance](https://github.com/AllonKleinLab/SPRING). 4 | 5 | ## Online webserver 6 | 7 | When using the webserver for SPRING, the maximum number of cells to be used as input is **10,000 cells**. If looking to use SPRING for analysis of >10,000 cells, then a local instance would be required. 8 | 9 | ### Generating data for upload 10 | 11 | Generally when using SPRING to visualize our data, we will need several files: 12 | 13 | - the filtered raw counts 14 | - gene symbols for genes in raw counts matrix 15 | - metadata for the counts 16 | 17 | ### Filtered raw counts 18 | 19 | The first file to attain is the filtered raw counts object. The easiest method is to access it from the Seurat object with clusters assigned. It is helpful if the clusters are named with the known or hypothesized cell types. So let's read in the seurat object from R: 20 | 21 | ```r 22 | # Read in seurat object 23 | seurat <- readRDS("path/to/data/seurat_tsne.rds") 24 | ``` 25 | 26 | **We can directly use this object if it contains less than 10,000 cells.** Alternatively, we could subset the object to the cells that we would like to view: 27 | 28 | ```r 29 | # Get cell ids 30 | spring_cells <- rownames(seurat@meta.data[seurat@meta.data$sample == "sample1" | seurat@meta.data$sample == "sample2", ]) 31 | 32 | # Subset the raw counts to these cells 33 | spring_counts <- as.matrix(seurat@raw.data[ ,spring_cells]) 34 | 35 | # Write counts to file 36 | write.csv(spring_counts, "spring/spring_counts.csv", quote= F) 37 | ``` 38 | 39 | > **NOTE:** We could subset by any other factor in the metadata similarly. We could also subset randomly to 10,000 cells by taking a sample of cells: 40 | > 41 | > ```r 42 | > # Get cell ids 43 | > sampled_cells <- sample(x = seurat@cell.names, size = 10000, replace = F) 44 | > 45 | > # Use cell ids to subset seurat 46 | > spring_counts_10000 <- as.matrix(seurat@raw.data[ ,sampled_cells]) 47 | > ``` 48 | 49 | ### Gene names for extracted counts 50 | 51 | The next data we need is the gene names for the extracted counts. We can get the gene names directly from the counts file we just created. 
52 | 53 | ```r 54 | # Write genes to file 55 | write(rownames(spring_counts), "spring/spring_genes.txt") 56 | ``` 57 | 58 | 59 | ### Metadata 60 | 61 | Finally, the last object we need is any metadata we might want to visualize. Now we can subset the Seurat object and extract the metadata stored in the `meta.data` slot of the Seurat object: 62 | 63 | ```r 64 | # Add cell type to the metadata for each cell 65 | seurat@meta.data$ident <- seurat@ident 66 | 67 | # Use cell ids to subset seurat object 68 | spring_seurat <- SubsetData(seurat, cells.use = spring_cells) 69 | 70 | # Extract metadata including cell type 71 | spring_meta <- spring_seurat@meta.data 72 | ``` 73 | 74 | #### Writing metadata to file 75 | 76 | To write the metadata to file it needs to be in a particular format, which we can output using the `write()` function. We can specify any column of metadata that we would like to include in the SPRING visualizations. 77 | 78 | ```r 79 | write(c("Cluster", spring_meta$ident), 80 | file = "spring/spring_meta.csv", 81 | sep = ",", 82 | ncolumns = length(spring_meta$ident) + 1, 83 | append = FALSE) 84 | 85 | write(c("Condition", spring_meta$interestingGroups), 86 | file = "spring/spring_meta.csv", 87 | sep = ",", 88 | ncolumns = length(spring_meta$ident) + 1, 89 | append = TRUE) 90 | 91 | write(c("Phase", spring_meta$phase), 92 | file = "spring/spring_meta.csv", 93 | sep = ",", 94 | ncolumns = length(spring_meta$ident) + 1, 95 | append = TRUE) 96 | ``` 97 | 98 | ## SPRING interface 99 | 100 | Now that we have the data that we would like to visualize, we can upload it to the [SPRING webserver](https://kleintools.hms.harvard.edu/tools/spring.html). 101 | 102 | Create a name for your dataset and a password, and choose the `Load new files` option. Load the following files by clicking on `Choose file`: 103 | 104 | - **Expression data:** `spring_counts.csv` 105 | - **Gene list:** `spring_genes.txt` 106 | - **Cell groupings:** `spring_meta.csv` 107 | 108 | Once loaded, select the `Upload` button. After the upload is successful, click on `Step 2: Process data`. There are a few parameters here that are available if you wish to adjust, then continue by clicking on `Begin processing data!`. Finally click on `Step 3: Click here to view data`, which should take you to the web interface for your data. 109 | 110 | We encourage watching the 1-2 minute videos available on the [SPRING website](https://kleintools.hms.harvard.edu/tools/spring.html). Briefly, you can look at markers for different clusters by positively and negatively selecting cells. 111 | 112 | - Positive selection: `Shift` 113 | - Negative selection: `Shift` + `Esc` 114 | - Deselection: `command` 115 | 116 | After positively and negatively selecting cells, you can view the DE genes/marker genes by clicking on the `enriched genes` on the left-hand side. 117 | 118 | **Share the link to this page with your client, and they can interactively play/view the data.** 119 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/bcbioSingleCell_setup.md: -------------------------------------------------------------------------------- 1 | # Different approaches to running `bcbioSingleCell` 2 | 3 | There are various approaches to running `bcbioSingleCell` to generate the QC report. The first part is getting all of the output from the **`bcbio` final directory loaded in to create the `bcb` object**. 
The next step is running through code which will **compute metrics and generate figures for quality assessment**. This second step is best done locally so you can run the code interactively and assess things as you run through the report code chunks. For approaches #1 and #2 listed below, you are doing everything locally. For #3 and #4 you are creating the `bcb` object on the cluster, and then moving it to your local machine to create the report. 4 | 5 | 1. **Using a [Docker image](https://hub.docker.com/r/lpantano/bcbiosinglecell/)**. 6 |     - First, install Docker 7 |     - Pull the Docker image: `docker pull lpantano/bcbiosinglecell:r3.5-bsc0.1.5` 8 |     - Set your Docker memory (RAM) limit to 4G or more. This is done with the Docker main application (Preferences -> Advanced) 9 |     - Mount the O2 `final` directory from your `bcbio` run on your laptop 10 |     - Open up a terminal and make sure you are in your home directory (or a place where you can easily navigate to the mount space) 11 |     - Run the Docker image: `docker run -d -p 8787:8787 -e ROOT=TRUE -v $(pwd):/home/rstudio lpantano/bcbiosinglecell` 12 |     - In a browser connect to RStudio: localhost:8787 with user and password: rstudio/rstudio. From here you can start [Creating the metadata file](#metadata), and continue working within the Docker container to create the QC report. 13 | 14 | --- 15 | 16 | > **NOTE:** If you start a Docker container and realize you want to start a new one, you will want to stop this one and remove it using the commands below (substituting the container ID reported by `docker ps`): 17 | > ``` 18 | > docker ps # to see your containers listed by id 19 | > docker stop <container_id> 20 | > docker rm <container_id> 21 | > ``` 22 | 2. Running it **locally on your laptop RStudio**. This will require you to install `bcbioSingleCell` and also mount O2. Note that if you have more than 400K-500K cells this will max out your memory. Also, note you may have to deal with problems with various dependency packages as you update R. If you choose this method, skip down to the [Creating the metadata file section](#metadata) and get started. 23 | 24 | 3. **Generate the `bcb` object on the O2 cluster**. You are limited to using R 3.4.1 because that is what is available for conda and the modules, but `bcbioSingleCell` is backwards compatible with R 3.4.1. The code is as follows: 25 | 26 |     ```r 27 |     bcbio <- loadSingleCell("~/bcbio/PIs/path/to/final/", 28 |                             interestingGroups = "sampleName", 29 |                             sampleMetadataFile = "~/path/to/metadata", 30 |                             gtfFile = "~/bcbio/PIs/path/to/Homo_sapiens.GRCh38.90.chr_patch_hapl_scaff.gtf") 31 | 32 |     save(bcbio, file = "data/bcb.rda") 33 |     ``` 34 | 35 |     The above code chunk can be run on O2 in one of two ways: 36 | 37 |     - **A.** Using a **conda install of R 3.4.1** and pointing to a [shared R library](#rlib). For the conda recipe you can find more information [here](https://steinbaugh.com/r_bioconda). Keep note of the different versions when you create your environment (i.e. pandoc 1 is required for rmarkdown, since version 2 is super buggy, and hdf5 1.10.1 is required for the latest version of Seurat, or it won’t compile) 38 | 39 |     - **B.** Using the **R 3.4.1 module** and pointing to the [shared R library](#rlib). This may require some troubleshooting with the HMS RC folks as it has been known to be problematic. 40 | 41 | 42 | > #### Using a pre-existing shared R library on O2 (for single cell RNA-seq) 43 | > This library has been created for use with single cell RNA-seq analysis. It can be used not only for QC but also for clustering with Seurat.
First, you will need to edit your `.Renviron` file to have the following inside: 44 | > 45 | > ``` 46 | > R_LIBS_USER="/n/data1/cores/bcbio/R/library/3.4-bioc-release/library" 47 | > R_MAX_NUM_DLLS=150 48 | > ``` 49 | > 50 | > Then start an interactive session with extra memory and x11: 51 | > 52 | > `$ srun --pty -p interactive -t 0-12:00 --x11 --mem 128G /bin/bash` 53 | > 54 | > After starting the interactive session, load the necessary R modules and start R as described at https://github.com/hbc/knowledgebase/blob/master/research/scrnaseq/Single-Cell.md#shared-installation-in-o2. 55 | 56 | 57 | 58 | ### Creating the metadata file 59 | 60 | Use the information from the client to construct the metadata table to use with bcbioSingleCell R package according to the specifications detailed at [https://github.com/hbc/bcbioSingleCell](https://github.com/hbc/bcbioSingleCell). You will need the columns for `description`, `index`, `sequence`, and `sampleName`. You can add any additional metadata as desired. 61 | 62 | - **Example metadata table:** 63 | 64 | ![example metadata](../img/sc_metadata.png) 65 | 66 | - **Important:** the `sequence` column for the inDrop metadata is the **Forward** sequence, not the same as the sequences present in the `sample_barcodes` file, which is the reverse complement. 67 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/cell_hashing.md: -------------------------------------------------------------------------------- 1 | # Cell Hashing 2 | 3 | 'Cell Hashing, where oligo-tagged antibodies against ubiquitously expressed surface proteins uniquely label cells from distinct samples, which can be subsequently pooled. By sequencing these tags alongside the cellular transcriptome, we can assign each cell to its original sample, robustly identify cross-sample multiplets, and “super-load” commercial droplet-based systems for significant cost reduction.' [Stoeckius, M, et. al.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1) 4 | 5 | 'Cell hashtags allow for robust sample multiplexing, confident multiplet identification, and discrimination of low-quality cells from ambient RNA. In addition to enabling “super-loading” of commercial scRNA-seq platforms to substantially reduce costs, this strategy represents a generalizable approach for multiplet identification and multiplexing that can be tailored to any biological sample or experimental design'. [Stoeckius, M, et. al.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1) 6 | 7 | Therefore, the benefits of cell hashing include: 8 | 9 | - Reduction in batch effects since library preparation between conditions occurs at the same time. 10 | - Reliable identification of multiplets (allows for removal of 'cells' containing more than one cell. 11 | - Detection of low quality cells that contain only ambient RNA 12 | - Super-loading allows for reduction of costs for high number of cells 13 | 14 | 15 | ## Generate FASTQ files from BCL files 16 | 17 | Using the 10X Cell Ranger workflow, we can efficiently demultiplex our samples. The first step is to generate the FASTQ files as we normally would using `cellranger mkfastq`. However, for this step we need to **attain the indices that correspond to the gene expression reads and those that correspond to the antibody barcode reads from the sequencing facility**. 
We supply the indices using the standard samplesheet as described in the [documentation](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq?src=pr&lss=none&cnm=&cid=NULL&src=pr&lss=none&cnm=&cid=NULL#simple_csv). An example samplesheet for cell hashing with reads distributed across all lanes (`*`) is given below. 18 | 19 | ``` 20 | Lane,Sample,Index 21 | *,gene,SI-GA-F1 22 | *,barcode,SI-GA-F2 23 | ``` 24 | 25 | By running the `cellranger mkfastq` command with this samplesheet, we will generate separate FASTQ files for our gene expression reads and our antibody barcode reads. An example script for creating the FASTQ files is given below (the run_folder contains the `Data` directory, which has the `Intensities` folder inside): 26 | 27 | ``` 28 | #!/bin/bash 29 | 30 | #SBATCH -p priority 		# partition name 31 | #SBATCH -t 0-12:00 		# hours:minutes runlimit after which job will be killed 32 | #SBATCH --mem 32G 33 | #SBATCH --job-name mkfastq 		# Job name 34 | #SBATCH -o %j.out			# File to which standard out will be written 35 | #SBATCH -e %j.err 		# File to which standard err will be written 36 | #SBATCH --mail-type=ALL 37 | #SBATCH --mail-user=piper@hsph.harvard.edu 38 | 39 | # Load modules 40 | module load cellranger/6.1.0 bcl2fastq/2.20.0.422 41 | 42 | cellranger mkfastq --id=mycellranger_mkfastq --run=path/to/run_folder --samplesheet=samplesheet_mkfastq.csv 43 | ``` 44 | 45 | ## Perform alignment and counting 46 | 47 | The next step is to perform the alignment and counting of the reads. The `cellranger count` pipeline allows for the quantification of the gene expression and feature barcodes for each cell barcode. Cell Ranger expects the gene expression and feature barcodes to be in separate FASTQ files, which we generated with the `cellranger mkfastq` command previously. The [documentation](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis) is quite good and thorough. Basically, we need to provide the following information to `cellranger count`: 48 | 49 | - **Library CSV file**: This file assigns each set of FASTQ files to a library type. You need the following columns: 50 | 	- `fastqs`: path to the directory containing the demultiplexed FASTQ files 51 | 	- `sample`: same as the `Sample` given in the samplesheet for the `cellranger mkfastq` command 52 | 	- `library_type`: should be 'Antibody Capture' for the antibody barcode FASTQ files for cell hashing experiments, and 'Gene Expression' for your gene expression FASTQ files 53 | 	- Below is an example `library.csv` file 54 | 55 | 	``` 56 | 	fastqs, sample, library_type 57 | 	cellranger_mkfastq/outs/fastq_path/HGFCGBGXK/, barcode, Antibody Capture 58 | 	cellranger_mkfastq/outs/fastq_path/HGFCGBGXK/, gene, Gene Expression 59 | 	``` 60 | - **Feature Reference CSV file**: This file provides the information needed to parse the antibody capture barcodes and assign each one to a sample. **You will need to acquire this information from the group preparing the libraries.** The following information is needed: 61 | 	- Are the hashing antibodies from Biolegend? If so, then the parsing information can be found in the [documentation](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref) 62 | 	- What antibodies did you use and what are their barcodes? 63 | 	- What are the corresponding samples?
64 | - Might also be helpful to ask for the antibody description / documentation 65 | 66 | Upon asking, the client returned the following, which was helpful for proceeding: 67 | 68 | > **WT** 69 | > - [TotalSeq™-B0301 anti-mouse Hashtag 1 Antibody](https://www.biolegend.com/en-us/search-results/totalseq-b0301-anti-mouse-hashtag-1-antibody-17771) 70 | > - Barcode Sequence: ACCCACCAGTAAGAC 71 | > 72 | > **KO** 73 | > - [TotalSeq™-B0302 anti-mouse Hashtag 2 Antibody](https://www.biolegend.com/en-us/search-results/totalseq-b0302-anti-mouse-hashtag-2-antibody-17772) 74 | > - Barcode Sequence: GGTCGAGAGCATTCA 75 | 76 | > _**Important information from the antibody documentation (above links) to note:** The antibodies are specific against mouse CD45 and MHC class I (of a, b, d, j, k, s, and u haplotypes) and can be used to label hematopoietic and non-hematopoietic cells in most commonly used mouse strains for multiplex single cell sequencing analysis. CD45 (LCA, T200, or Ly-5) is expressed on all hematopoietic cells except mature erythrocytes and platelets. CD45 plays a key role in TCR and BCR signal transduction. The MHC class I M1/42 antibody reacts with the H-2 MHC class I alloantigens expressed on nucleated cells from mice of the a, b, d, j, k, s, and u haplotypes_ 77 | 78 | To construct the feature reference file, the client's barcoding information needs to be provided in a specific format. The `feature_ref.csv` file should have the following columns: 79 | - `id`: unique ID corresponding to feature 80 | - `name`: name for feature 81 | - `read`: which read the antibody capture barcode is present within 82 | - `pattern`: the pattern used to parse the barcode 83 | - `sequence`: the barcode sequence for each sample 84 | - `feature_type`: for cell hashing, this should be 'Antibody Capture' 85 | 86 | An example `feature_ref.csv` file is given below: 87 | 88 | ``` 89 | id, name, read, pattern, sequence, feature_type 90 | WT, WT_TotalSeqB0301, R2, 5PNNNNNNNNNN(BC)NNNNNNNNN, ACCCACCAGTAAGAC, Antibody Capture 91 | KO, KO_TotalSeqB0302, R2,5PNNNNNNNNNN(BC)NNNNNNNNN, GGTCGAGAGCATTCA, Antibody Capture 92 | ``` 93 | 94 | Now to run the `cellranger count` command, we can include this information: 95 | 96 | ``` 97 | cellranger count --id=name_for_output_folder\ 98 | --libraries=library.csv \ 99 | --transcriptome=/n/shared_db/mm10/uk/cellranger/6.0.0/6.0.0/refdata-gex-mm10-2020-A/ \ # change for experiment 100 | --feature-ref=feature_ref.csv \ 101 | --force-cells=30000 \ # change for experiment 102 | --localcores 6 \ 103 | --localmem 64 104 | ``` 105 | 106 | > _**NOTE:** After bringing count matrix into R we roughly follow [this vignette](https://satijalab.org/seurat/articles/hashing_vignette.html)._ 107 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/clustering_report_bcbioSingleCell.md: -------------------------------------------------------------------------------- 1 | # bcbioSingleCell Clustering Report 2 | 3 | * All **bcbioSingleCell functions** are available at: [http://bioinformatics.sph.harvard.edu/bcbioSingleCell/reference/index.html](http://bioinformatics.sph.harvard.edu/bcbioSingleCell/reference/index.html). 4 | 5 | * Much of the content is from the bcbioSingleCell template, but with additional explanations and hints. 6 | 7 | #### Setting up 8 | 9 | 1. 
Install `bcbioSingleCell` and load the library: 10 | 11 | ```r 12 | # devtools::install_github("hbc/bcbioSingleCell") # Add argument `ref = "develop"` if need development branch 13 | 14 | library(bcbioSingleCell) 15 | ``` 16 | 17 | 2. Create new RMarkdown file and choose the clustering template. 18 | 19 | 3. Edit the information in the files `_header.Rmd` and `_footer.Rmd` with experiment-specific information (I also include the experimental description with design). 20 | 21 | 3. We will perform clustering using the output from our QC analysis. To use the filtered data, fill in the `params` for `bcbFile` with the path to the filtered output data: 22 | 23 | ```r 24 | title: "Seurat Clustering" 25 | author: "`r getOption('author')`" 26 | date: "`r Sys.Date()`" 27 | bibliography: bibliography.bib 28 | params: 29 | bcbFile: "data/bcbFiltered.rda" 30 | seuratName: "seurat" 31 | pcCompute: 20 32 | pcUse: FALSE 33 | varsToRegress: !r c("nUMI", "mitoRatio", "S.Score", "G2M.Score") 34 | resolution: 0.8 35 | outputDir: "." 36 | --- 37 | ``` 38 | 39 | 3. At he beginning of the Clustering analysis report, I added a summary of the clustering analysis workflow just below the `params` chunk: 40 | 41 | **Clustering analysis on all samples** 42 | 43 | For this clustering analysis, we will take the filtered cells output from the quality control analysis to identify cellular populations with similar transcriptional profiles. To identify these clusters the following steps need to be performed: 44 | 45 | 1. Normalization and transformation of the raw gene counts per cell 46 | 2. Identification of high variance genes 47 | 3. Regression of unwanted variation (mitochondrial content, number of genes per cell, cell cycle, etc.) 48 | 4. Identification of the primary sources of heterogeneity using PCA analysis and heatmaps 49 | 5. Clustering cells based on significant PCs (metagenes) 50 | 6. Evaluation of cell clusters 51 | 52 | 2. Run the setup chunk using the green arrow - this code will load your filtered data file specified in the `bcbFile` param (`bcbFiltered.rda`) and will save it to the variable `bcb`. 53 | 54 | 3. Generate the `seurat` object using the filtered data (`bcb`), then normalize and transform the raw gene counts per cell. 55 | 56 | Prior to any clustering analysis, the raw counts need to be normalized using global-scaling normalization. Global-scaling normalization (1) normalizes the gene expression measurements for each cell by the total expression, (2) multiplies this by a scale factor (10,000 by default), and (3) log-transforms the result. Following normalization, the average expression and dispersion for each gene is calculated, which places these genes into bins, and then a z-score for dispersion within each bin is calculated. This helps control for the relationship between variability and average expression. Finally, the genes are scaled and centered. 57 | 58 | ```r 59 | seurat <- as(bcb, "seurat") %>% 60 | NormalizeData( 61 | object = ., 62 | normalization.method = "LogNormalize", 63 | scale.factor = 10000) %>% 64 | FindVariableGenes( 65 | object = ., 66 | mean.function = ExpMean, 67 | dispersion.function = LogVMR, 68 | do.plot = FALSE) %>% 69 | ScaleData( 70 | object = ., 71 | model.use = "linear") 72 | ``` 73 | 74 | 4. Ensure that the data in the seurat object is properly filtered (do not need to include this inside the actual clustering report). These violin plots should match up with the histograms and bar plots in the quality control report. 
75 | 76 | ```r 77 | features <- c("nUMI", "nGene", "mitoRatio") 78 | sapply(seq_along(features), function(a) { 79 | VlnPlot( 80 | seurat, 81 | features.plot = features[[a]], 82 | x.lab.rot = TRUE) %>% 83 | show 84 | }) %>% 85 | invisible 86 | ``` 87 | 88 | 5. We can also explore the presence of cell markers of interest in all cells. 89 | 90 | We can see in the violin plots below that we have a subset of cells expressing the markers of interest (ex. PAX7 and MYF5 genes). This was a useful step in this experiment, since if these markers weren't expressed, then the experiment did not work and there would be no need to continue. 91 | 92 | ```{r qc_plots_markers, message=FALSE, warning=FALSE} 93 | VlnPlot(seurat, 94 | features.plot = c("PAX7", "MYF5"), 95 | x.lab.rot = TRUE, 96 | do.return = TRUE) 97 | ``` 98 | 99 | 100 | 101 | 6. Plot the high variance genes. Interpret this plot much as you would the dispersion plot in DESeq2 - look for decreasing dispersion with increasing mean expression. Generally this plot should be fine - there shouldn't be a cloud of data/bullseye. 102 | 103 | ```r 104 | VariableGenePlot(seurat) 105 | ``` 106 | 107 | 108 | 7. Regress out unwanted sources of variation 109 | 110 | The single-cell dataset contains "uninteresting" sources of variation in addition to interesting sources. This can include technical noise, batch effects, and/or uncontrolled biological variation (e.g. cell cycle). Regressing these signals out of the analysis can improve downstream dimensionality reduction and clustering. To mitigate the effect of these signals, Seurat constructs linear models to predict gene expression based on user-defined variables. The scaled z-scored residuals of these models are used for dimensionality reduction and clustering. 111 | 112 | First, we will explore cell cycle variation among the cells and see if the cells cluster by cell cycle in the PCA. As a reminder of the cell cycle phases: 113 | 114 | 115 | 116 | *Adapted from [Wikipedia](https://en.wikipedia.org/wiki/Cell_cycle) (Image License is [CC BY-SA 3.0](https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License))* 117 | 118 | - **G0:** Quiescence or resting phase. The cell is not actively dividing, which is common for cells that are fully differentiated. Some types of cells enter G0 for long periods of time (many neuronal cells), while other cell types divide continuously and never enter G0 (epithelial cells). 119 | - **G1:** Gap 1 phase represents the **beginning of interphase**. During G1 there is growth of the non-chromosomal components of the cells. From this phase, the cell may enter G0 or S phase. 120 | - **S:** Synthesis phase for the replication of the chromosomes (also part of interphase). 121 | - **G2:** Gap 2 phase represents the **end of interphase**, prior to entering the mitotic phase. During this phase the cell grows in preparation for mitosis and the spindle forms. 122 | - **M:** M phase is the nuclear division of the cell (consisting of prophase, metaphase, anaphase and telophase). 123 | 124 | We assign each cell a score based on its expression of G2/M and S phase markers (provided by Seurat, which is described in [this publication](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4944528/)). These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling (G0) or in G1 phase. 125 | 126 | In the following PCA plot, we are checking to see if the cells are grouping by cell cycle.
If we don't see clear grouping of the cells into `G1` (including G0), `G2M`, and `S` clusters, then we don't need to regress out cell-cycle variation. 127 | 128 | >***NOTE:** There are differences regarding which cells are in which phase of the cell cycle depending on whether the cells are analyzed as a single sample or analyzed with other samples. We still need to figure out why.* 129 | 130 | In the PCA plot below, we can see clear clusters by cell cycle on the PCA. Therefore, we will plan to regress out the cell cycle variation. 131 | 132 | ```r 133 | ccm <- metadata(bcbFiltered)$organism %>% 134 | str_match("^([A-Z])[a-z]+ ([a-z]+)$") %>% 135 | .[, 2:3] %>% 136 | as.character() %>% 137 | paste0(collapse = "") %>% 138 | tolower() %>% 139 | cellCycleMarkers[[.]] 140 | sGenes <- ccm %>% 141 | dplyr::filter(phase == "S") %>% 142 | pull("symbol") 143 | g2mGenes <- ccm %>% 144 | dplyr::filter(phase == "G2/M") %>% 145 | pull("symbol") 146 | 147 | seurat <- CellCycleScoring( 148 | seurat, 149 | g2m.genes = g2mGenes, 150 | s.genes = sGenes) 151 | # Cell-cycle `Phase` column should now be added to `seurat@meta.data` 152 | seuratPreregress <- seurat 153 | assignAndSaveData( 154 | name = "seurat_preregress", 155 | object = seuratPreregress, 156 | dir = dataDir) 157 | 158 | RunPCA( 159 | seuratPreregress, 160 | pc.genes = c(sGenes, g2mGenes), 161 | do.print = FALSE) %>% 162 | plotPCA(interestingGroups = "phase", label = FALSE) 163 | ``` 164 | 165 | 166 | 167 | 8. Apply regression variables 168 | 169 | [Seurat][] regresses out variables of uninteresting variation individually against each gene, then rescales and centers the resulting residuals. We generally recommend minimizing the effects of variable read count depth (`nUMI`) and mitochondrial gene expression (`mitoRatio`). If the differences in mitochondrial gene expression represent a biological phenomenon that may help to distinguish cell clusters, then we advise not regressing it out. Cell-cycle regression is generally recommended but should be avoided for samples containing cells undergoing differentiation. 170 | 171 | In this report we will regress out the cell-cycle variation, so that we can examine clustering not due to cell cycle stage. However, we may not pick up different clusters of the Pax7+ cells at different stages of differentiation. We will explore the clustering without regressing out the cell cycle stages later. 172 | 173 | Now that regression has been applied, let's recheck to see if the cells are no longer clustering by cycle. We now see the phase clusters superimpose. 174 | 175 | ```r 176 | seurat <- ScaleData(seurat, vars.to.regress = params$varsToRegress) 177 | 178 | RunPCA( 179 | seurat, 180 | pc.genes = c(sGenes, g2mGenes), 181 | do.print = FALSE) %>% 182 | plotPCA(interestingGroups = "phase", label = FALSE) 183 | ``` 184 | 185 | 186 | 9. Linear dimensionality reduction 187 | 188 | Next, we perform principal component analysis (PCA) on the scaled data and score each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. 189 | 190 | In particular, a heatmap of the PCs allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and genes are ordered according to their PCA scores. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated gene sets. 
191 | 192 | ```r 193 | seurat <- seurat %>% 194 | RunPCA(do.print = FALSE) %>% 195 | ProjectPCA(do.print = FALSE) 196 | ``` 197 | 198 | The heatmap shows the expression of the top 15 genes that most contribute to the PCs (positively and negatively). 199 | 200 | ```r 201 | PCHeatmap( 202 | seurat, 203 | col.use = CustomPalette( 204 | low = viridis(3)[[1]], 205 | mid = viridis(3)[[2]], 206 | high = viridis(3)[[3]]), 207 | do.balanced = TRUE, 208 | label.columns = FALSE, 209 | pc.use = 1:params$pcCompute, 210 | remove.key = TRUE) 211 | ``` 212 | 213 | 214 | 215 | The visualizations of the PCs show the scores for each of the top 15 +/- genes contributing to each PC. 216 | 217 | ```r 218 | VizPCA( 219 | seurat, 220 | pcs.use = 1:params$pcCompute, 221 | do.balanced = TRUE, 222 | nCol = 2) 223 | ``` 224 | 225 | 226 | 227 | The printed list include the top 15 genes contributing positively / negatively the most to each PC. 228 | 229 | ```r 230 | PrintPCA( 231 | seurat, 232 | pcs.print = 1:params$pcCompute) 233 | ``` 234 | 235 | 236 | 10. Determine statistically significant principal components 237 | 238 | To overcome the extensive technical noise in any single gene for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a "metagene" that combines information across a correlated gene set. Determining how many PCs to include downstream is therefore an important step. To accomplish this, we plot the standard deviation of each PC as an elbow plot. 239 | 240 | The plots below show where we have defined the principal component cutoff used downstream for dimensionality reduction. This is calculated automatically as the larger value of: 241 | 242 | 1. The point where the principal components only contribute 5% of standard deviation (bottom left). 243 | 2. The point where the principal components cumulatively contribute 80% of the standard deviation (bottom right). 244 | 245 | This methodology is also commonly used for PC covariate analysis on bulk RNA-seq samples. 246 | 247 | ```r 248 | pcUse <- params$pcUse 249 | if (!is.numeric(params$pcUse)) { 250 | pcUse <- pcCutoff(seurat) %>% 251 | seq(from = 1, to = .) 252 | } 253 | ``` 254 | 255 | 256 | 257 | 258 | Based on these plots, we will use 10 principal components for dimensionality reduction calculations. 259 | 260 | 11. Cluster the cells 261 | 262 | Seurat now includes an graph-based clustering approach. Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [SNN-Cliq, Xu and Su, Bioinformatics, 2015] and CyTOF data [PhenoGraph, Levine et al., Cell, 2015]. 263 | 264 | Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar gene expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’. As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard distance). 
To cluster the cells, we apply modularity optimization techniques [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function. 265 | 266 | ```r 267 | seurat <- FindClusters( 268 | seurat, 269 | dims.use = pcUse, 270 | force.recalc = TRUE, 271 | print.output = TRUE, 272 | resolution = params$resolution, 273 | save.SNN = TRUE) 274 | 275 | # A summary of the parameters that were chosen for clustering are given below. 276 | 277 | PrintFindClustersParams(seurat) 278 | ``` 279 | 280 | 12. Run non-linear dimensional reduction (tSNE) 281 | 282 | Seurat continues to use tSNE as a powerful tool to visualize and explore these datasets. While we no longer advise clustering directly on tSNE components, cells within the graph-based clusters determined above should co-localize on the tSNE plot. This is because the tSNE aims to place cells with similar local neighborhoods in high-dimensional space together in low-dimensional space. As input to the tSNE, we use the same PCs as input to the clustering analysis. 283 | 284 | ```r 285 | seurat <- RunTSNE( 286 | seurat, 287 | dims.use = pcUse, 288 | do.fast = TRUE) 289 | assignAndSaveData( 290 | name = "seuratTSNE", 291 | object = seurat, 292 | dir = dataDir) 293 | 294 | PrintTSNEParams(seurat) 295 | 296 | lapply(seq_along(groupBy), function(a) { 297 | if (groupBy[[a]] == "ident") { 298 | label <- TRUE 299 | } else { 300 | label <- FALSE 301 | } 302 | plotTSNE( 303 | seurat, 304 | interestingGroups = groupBy[[a]], 305 | label = label) %>% 306 | show() 307 | plotPCA( 308 | seurat, 309 | interestingGroups = groupBy[[a]], 310 | label = label) %>% 311 | show() 312 | }) %>% 313 | invisible() 314 | ``` 315 | 316 | 317 | 318 | This PCA looks like there may be differentiation occuring, which is separating the clusters. Should not automatically assume that is the case though. 319 | 320 | 321 | 322 | **Note that tSNE is not PCA! The measurement of distance in a tSNE plot is difficult to interpret. To better infer separation distance between the putative clusters, let's reapply PCA.** 323 | 324 | 13. Cluster quality control 325 | 326 | Let's look at the variance in the number of UMI counts (`nUMI`), gene detection (`nGene`), and the percentage of mitochondrial gene expression (`mitoRatio`), to see if there are any obvious cluster artefacts. We can also assess cell cycle batch effects (`S.Score`, `G2M.Score`) and any principal component bias toward individual clusters. 327 | 328 | ```r 329 | plotFeatures( 330 | seurat, 331 | features = c("nUMI", "nGene", 332 | "log10GenesPerUMI", "mitoRatio", 333 | "S.Score", "G2M.Score")) 334 | plotFeatures( 335 | seurat, 336 | features = paste0("PC", pcUse)) 337 | ``` 338 | 339 | 14. Adjusting parameters 340 | 341 | Based on your assumptions for the experiment, you may not expect so many clusters or you may expect more clusters. If you do not expect so many clusters, then it would be useful to reduce the resolution used for the clustering in the `FindClusters()` function (step 11) and re-do the rest of this analysis. You can reduce this parameter all the way down to 0.1 if necessary. Similarly, if you expect more clusters, you can increase the resolution. 342 | 343 | 344 | 345 | 15. If you determine that your cells are differentiating, then better to perform analysis using the pseudotime tool from [Monocle](http://cole-trapnell-lab.github.io/monocle-release/). 
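For reference, a minimal sketch of handing the clustered object off to Monocle 2 for pseudotime analysis is given below. This is illustrative only and assumes Monocle 2 is installed (it provides `importCDS()` for converting Seurat v2 objects); the choice of ordering genes and root state should be tailored to the experiment.

```r
library(monocle)

# Convert the Seurat (v2) object to a CellDataSet and estimate size factors/dispersions
cds <- importCDS(seurat, import_all = TRUE)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

# Order cells along a trajectory using the variable genes identified earlier
cds <- setOrderingFilter(cds, seurat@var.genes)
cds <- reduceDimension(cds, max_components = 2, reduction_method = "DDRTree")
cds <- orderCells(cds)

# Visualize the trajectory colored by pseudotime and by trajectory state
plot_cell_trajectory(cds, color_by = "Pseudotime")
plot_cell_trajectory(cds, color_by = "State")
```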
346 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/seurat_loom_subset_velocity.md: -------------------------------------------------------------------------------- 1 | # Subset loom file to cells of interest 2 | 3 | ```r 4 | # Use devtools to install hdf5r and loomR from GitHub 5 | # devtools::install_github(repo = "hhoeflin/hdf5r") 6 | # devtools::install_github(repo = "mojaveazure/loomR") 7 | # library(devtools) 8 | # install_github("velocyto-team/velocyto.R") 9 | # remotes::install_github('satijalab/seurat-wrappers') 10 | # remotes::install_github("mojaveazure/seurat-disk") 11 | 12 | library(Seurat) 13 | library(SeuratDisk) 14 | library(SeuratWrappers) 15 | library(loomR) 16 | library(stringr) # for str_detect()/str_replace() below 17 | 18 | # Get loom data 19 | ldat <- ReadVelocity(file = "data/merged_loom_files/all_merged.loom") 20 | 21 | # Turn loom to seurat 22 | bm <- as.Seurat(x = ldat) 23 | bm[["RNA"]] <- bm[["spliced"]] 24 | 25 | # Bring in Seurat with known clusters 26 | seurat_object <- readRDS('data/seurat_combined_sct_umap_FBS.rds') 27 | 28 | # Change cell ids of loom seurat to match cell ids in seurat with known clusters 29 | all_cells <- Cells(bm) 30 | all_cells[str_detect(all_cells, pattern = "A1_")] <- all_cells[str_detect(all_cells, pattern = "A1_")] %>% str_replace("x$", "-1_1") 31 | all_cells[str_detect(all_cells, pattern = "A2_")] <- all_cells[str_detect(all_cells, pattern = "A2_")] %>% str_replace("x$", "-1_2") 32 | all_cells[str_detect(all_cells, pattern = "A3_")] <- all_cells[str_detect(all_cells, pattern = "A3_")] %>% str_replace("x$", "-1_3") 33 | all_cells[str_detect(all_cells, pattern = "A4_")] <- all_cells[str_detect(all_cells, pattern = "A4_")] %>% str_replace("x$", "-1_4") 34 | 35 | all_cells <- gsub('A1_CKDL210009739-1a-SI_TT_B3_HC2W5DSX2:', '', all_cells) 36 | all_cells <- gsub('A2_CKDL210009740-1a-SI_TT_B6_HC2W5DSX2:', '', all_cells) 37 | all_cells <- gsub('A3_CKDL210009741-1a-SI_TT_B2_HC2W5DSX2:', '', all_cells) 38 | all_cells <- gsub('A4_CKDL210009742-1a-SI_TT_B7_HC2W5DSX2:', '', all_cells) 39 | 40 | new_names <- all_cells 41 | bm <- RenameCells(bm, new.names = new_names) 42 | 43 | # Get names of seurat_object 44 | DefaultAssay(seurat_object) <- "RNA" 45 | 46 | sub_genes <- rownames(seurat_object) 47 | sub_cells <- colnames(seurat_object) 48 | 49 | # Subset loom seurat 50 | bm <- subset(bm, features = sub_genes, cells = sub_cells) 51 | 52 | # Add cluster ID to metadata file for each cell 53 | bm <- AddMetaData(bm, seurat_object@meta.data[, c("DE_group", "sample_simple", "seurat_clusters")]) 54 | 55 | # Add all slots to object 56 | bm@reductions[["pca"]] <- seurat_object@reductions[["pca"]] 57 | bm@reductions[["umap"]] <- seurat_object@reductions[["umap"]] 58 | 59 | bm@assays$integrated <- seurat_object@assays$integrated 60 | bm@assays$SCT <- seurat_object@assays$SCT 61 | 62 | # Save object and convert to h5ad format for scvelo 63 | 64 | DefaultAssay(bm) <- "RNA" 65 | SaveH5Seurat(bm, filename = "all_samples.h5Seurat") 66 | Convert("all_samples.h5Seurat", dest = "h5ad") 67 | 68 | 69 | # Split into individual objects 70 | bm_cre <- subset(bm, subset = sample_simple == "re") 71 | DefaultAssay(bm_cre) <- "RNA" 72 | SaveH5Seurat(bm_cre, filename = "cre_samples.h5Seurat") 73 | Convert("cre_samples.h5Seurat", dest = "h5ad") 74 | 75 | bm_ko <- subset(bm, subset = sample_simple == "KO") 76 | DefaultAssay(bm_ko) <- "RNA" 77 | SaveH5Seurat(bm_ko, filename = "ko_samples.h5Seurat") 78 | Convert("ko_samples.h5Seurat", dest = "h5ad") 79 |
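# Optional sanity check (illustrative, not part of the original workflow): after renaming,
# the loom-derived barcodes should cover the cells of the clustered object before subsetting
# length(intersect(Cells(bm), colnames(seurat_object))) == ncol(seurat_object)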
80 | ``` 81 | 82 | 83 | -------------------------------------------------------------------------------- /scRNAseq/scRNAseq_analysis_tutorial/lessons/velocity.md: -------------------------------------------------------------------------------- 1 | # Velocity analysis using velocyto, Seurat and scVelo 2 | 3 | > _**NOTE:** To run analysis in a Jupyter notebook on O2, please follow the [available instructions](https://github.com/hbc/knowledgebase/blob/master/rc/jupyter_notebooks.md)._ 4 | 5 | 1. Generate loom files containing layers for spliced and unspliced reads using Velocyto. 6 | 7 | ``` 8 | # On command line 9 | 10 | # Load modules 11 | module load gcc/9.2.0 python/3.8.12 12 | 13 | # Create virtual environment 14 | # virtualenv velocyto --system-site-packages 15 | 16 | # Activate virtual environment 17 | source velocyto/bin/activate 18 | 19 | # Install tools 20 | # pip3 install numpy scipy cython numba matplotlib scikit-learn h5py click 21 | # pip3 install velocyto 22 | # pip3 install scvelo 23 | 24 | # Run velocyto 25 | velocyto run10x -m ../data/mm10_rmsk.gtf ../final/cellranger_6.0.0/count/expect_cells/A1_CKDL210009739-1a-SI_TT_B3_HC2W5DSX2/ genes.gtf 26 | 27 | # Create merged loom object - start by copying the first file in 'files' below, then add the other loom files in python below. 28 | # cp path_to_file1.loom all_merged.loom 29 | ``` 30 | 31 | 2. In Python, merge any loom files desired 32 | 33 | ```python 34 | #python3 35 | import velocyto as vcy 36 | import loompy 37 | import scvelo as scv 38 | import numpy as np 39 | import h5py 40 | import scipy 41 | import cython 42 | import numba 43 | import matplotlib 44 | 45 | import click 46 | 47 | # files = ["path_to_file1.loom", "path_to_file2.loom"] 48 | files = ["../final/cellranger_6.0.0/count/expect_cells/A1_CKDL210009739-1a-SI_TT_B3_HC2W5DSX2/velocyto/A1_CKDL210009739-1a-SI_TT_B3_HC2W5DSX2.loom", "../final/cellranger_6.0.0/count/expect_cells/A2_CKDL210009740-1a-SI_TT_B6_HC2W5DSX2/velocyto/A2_CKDL210009740-1a-SI_TT_B6_HC2W5DSX2.loom", "../final/cellranger_6.0.0/count/expect_cells/A3_CKDL210009741-1a-SI_TT_B2_HC2W5DSX2/velocyto/A3_CKDL210009741-1a-SI_TT_B2_HC2W5DSX2.loom", "../final/cellranger_6.0.0/count/expect_cells/A4_CKDL210009742-1a-SI_TT_B7_HC2W5DSX2/velocyto/A4_CKDL210009742-1a-SI_TT_B7_HC2W5DSX2.loom"] 49 | 50 | ds = loompy.connect("data/merged_loom_files/all_merged.loom") 51 | for fn in files[1:]: 52 | ds.add_loom(fn, batch_size=1000) 53 | ``` 54 | 55 | 3. Perform all QC, normalization and clustering using Seurat as described at [http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html). 56 | 57 | If you cannot do your own analyses and must use the data given to you by the client, then documentation for merging/subsetting loom files with their Seurat data/clusters is [available](https://github.com/hbc/tutorials/blob/master/scRNAseq/scRNAseq_analysis_tutorial/lessons/seurat_loom_subset_velocity.md).
58 | 59 | ```r 60 | deactivate velocyto 61 | module load gcc/6.2.0 R/4.1.1 62 | 63 | # library(devtools) 64 | # install_github("velocyto-team/velocyto.R") 65 | # remotes::install_github('satijalab/seurat-wrappers') 66 | # remotes::install_github("mojaveazure/seurat-disk") 67 | 68 | library(Seurat) 69 | library(SeuratDisk) 70 | library(SeuratWrappers) 71 | 72 | ldat <- ReadVelocity(file = "path_to_file.loom") 73 | bm <- as.Seurat(x = ldat) 74 | bm[["RNA"]] <- bm[["spliced"]] 75 | bm <- SCTransform(bm) 76 | bm <- RunPCA(bm) 77 | bm <- RunUMAP(bm, dims = 1:30) 78 | bm <- FindNeighbors(bm, dims = 1:30) 79 | bm <- FindClusters(bm) 80 | DefaultAssay(bm) <- "RNA" 81 | SaveH5Seurat(bm, filename = "mouseBM.h5Seurat") 82 | Convert("mouseBM.h5Seurat", dest = "h5ad") 83 | ``` 84 | 85 | 4. Use scVelo in python to construct velocity estimates and trajectories and continue following [http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html) and consulting [scVelo documentation](https://scvelo.readthedocs.io/VelocityBasics) 86 | 87 | ```python 88 | import scvelo as scv 89 | adata = scv.read("mouseBM.h5ad") 90 | adata 91 | ``` 92 | 93 | The scVelo code used for a scVelo analysis can be found as a [pdf](https://github.com/hbc/hbc_10x_scRNAseq_Feinberg_Aortic_cells_from_TKO_and_WT_mouse_hbc04205_3/blob/master/velocyto/feinberg_velocity_analysis_KLF10_KO_vs_Cre_all_results/velocity_jupyter_notebook.pdf) or [Jupyter notebook](https://github.com/hbc/hbc_10x_scRNAseq_Feinberg_Aortic_cells_from_TKO_and_WT_mouse_hbc04205_3/blob/master/velocyto/velocity_jupyter_notebook.ipynb). 94 | -------------------------------------------------------------------------------- /scRNAseq/scripts/clustering_pre_regress_v2.R: -------------------------------------------------------------------------------- 1 | # Usage: this Rscript is using Seurat to perform normalization, calculation of variable genes and significant PCs, finally it will score cells for cell cycle. The output is ready for the regress.R script. 2 | 3 | # To run: Rscript name_of_script 4 | 5 | library(Seurat) 6 | library(tidyverse) 7 | data_dir <- "data" 8 | load(file.path(data_dir, "cycle.rda")) 9 | set.seed(1454944673L) 10 | 11 | # Load pre_regressed Seurat object 12 | pre_regressed_seurat <- readRDS(file.path(data_dir, "seurat_raw.rds")) 13 | 14 | 15 | 16 | # Normalize counts for total cell expression and take log value 17 | pre_regressed_seurat <- pre_regressed_seurat %>% NormalizeData( 18 | normalization.method = "LogNormalize", 19 | scale.factor = 10000) 20 | 21 | # Find variable genes based on the mean-dispersion relationship based on z-score for dispersion. 
It is recommended to set the parameters so as to mark visual outliers on the dispersion plot - the defaults return ~2,000 variable genes 22 | 23 | pre_regressed_seurat = pre_regressed_seurat %>% 24 | FindVariableGenes( 25 | mean.function = ExpMean, 26 | dispersion.function = LogVMR, 27 | do.plot = FALSE) 28 | 29 | pre_regressed_seurat = pre_regressed_seurat %>% 30 | ScaleData(model.use = "linear") 31 | 32 | 33 | # Check number of variable genes to determine if correct parameters used 34 | length(x = pre_regressed_seurat@var.genes) 35 | 36 | pre_regressed_seurat <- CellCycleScoring( 37 | pre_regressed_seurat, 38 | g2m.genes = g2m_genes, 39 | s.genes = s_genes) 40 | 41 | pre_regressed_seurat = RunPCA( 42 | pre_regressed_seurat, 43 | pc.genes = c(s_genes, g2m_genes), 44 | do.print = FALSE) 45 | 46 | saveRDS(pre_regressed_seurat, file = file.path(data_dir, "seurat_pre_regress.rds")) 47 | 48 | # If needed, you could subset by sample using a command line argument at the beginning of the analysis - uncomment the code below if desired 49 | # Use command line argument to specify sample extracted 50 | # options(echo=TRUE) 51 | # args <- commandArgs(trailingOnly = TRUE) 52 | # 53 | # pre_regressed_seurat <- SubsetData(pre_regressed_seurat, 54 | # cells.use = rownames(pre_regressed_seurat@meta.data)[which(pre_regressed_seurat@meta.data$interestingGroups == args[1])]) 55 | -------------------------------------------------------------------------------- /scRNAseq/scripts/clustering_regress.R: -------------------------------------------------------------------------------- 1 | # Usage: This Rscript takes as input the path to the pre-regressed rds file to be used in the analysis and regresses out the chosen sources of variation. To run: 2 | # Rscript name_of_script "path/to/file.rds" 3 | # To change the variables to regress, edit the `vars_to_regress` vector below. 4 | 5 | # Load libraries and provide data directory 6 | library(Seurat) 7 | library(tidyverse) 8 | data_dir <- "data" 9 | load(file.path(data_dir, "cycle.rda")) 10 | set.seed(1454944673L) 11 | 12 | # Use command line argument to specify sample extracted 13 | options(echo=TRUE) 14 | args <- commandArgs(trailingOnly = TRUE) 15 | 16 | # Read in desired data 17 | pre_regressed_seurat <- readRDS(args[1]) 18 | 19 | # Regress out the uninteresting sources of variation in the data (need to decide whether or not to include cell cycle and mitoRatio as variables to regress) 20 | vars_to_regress <- c("nUMI", "S.Score", "G2M.Score", "mitoRatio") 21 | 22 | seurat <- ScaleData(pre_regressed_seurat, vars.to.regress = vars_to_regress) 23 | 24 | # Re-run the PCA plots and color by cell cycle phase 25 | seurat <- RunPCA( 26 | seurat, 27 | pc.genes = c(s_genes, g2m_genes), 28 | do.print = FALSE) 29 | 30 | PCAPlot(seurat, group.by= "Phase") 31 | 32 | # Perform the scoring for all genes 33 | seurat <- seurat %>% 34 | RunPCA(do.print = FALSE) %>% 35 | ProjectPCA(do.print = FALSE) 36 | 37 | # Create elbow plot 38 | PCElbowPlot(seurat) 39 | 40 | # Determine the estimate for significant PCs 41 | pct <- seurat@dr$pca@sdev / sum(seurat@dr$pca@sdev) * 100 42 | cumu <- cumsum(pct) 43 | co1 <- which(cumu > 90 & pct < 5)[1] 44 | co2 <- sort(which((pct[1:length(pct)-1] - pct[2:length(pct)]) > 0.1), 45 | decreasing = T)[1] + 1 # last point where change of % of variation is more than 0.1%.
46 | pcs <- min(co1, co2) # change to any other number 47 | 48 | # Find cell clusters for different resolutions 49 | seurat <- FindClusters( 50 | seurat, 51 | dims.use = 1:pcs, 52 | force.recalc = TRUE, 53 | print.output = TRUE, 54 | resolution = c(0.1, 0.6, 0.8, 1.0, 1.2, 1.8), 55 | save.SNN = TRUE) 56 | 57 | PrintFindClustersParams(seurat) 58 | 59 | # Save clustered cells 60 | saveRDS(seurat, file = file.path(data_dir, "seurat_tsne.rds")) 61 | -------------------------------------------------------------------------------- /scRNAseq/scripts/sc_DESeq2_analysis_inner.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Usage: script to run DE analysis using DESeq2 on single-cell RNA-seq clusters generated by Seurat. The DE analysis is performed on the cell type cluster specified on the command line. The command line argument for cell type cluster should contain a value output from levels(seurat@ident) 4 | 5 | library(dplyr) 6 | library(Matrix) 7 | library(DESeq2) 8 | library(tibble) 9 | library(BiocParallel) 10 | 11 | load("data/DESeq_object.rda") 12 | 13 | # Use command line argument for cluster ID 14 | options(echo=TRUE) 15 | args <- commandArgs(trailingOnly = TRUE) 16 | 17 | dds_cluster <- dds[ , dds$ident %in% as.character(args[1])] 18 | 19 | dds_cluster$ident <- droplevels(dds_cluster$ident) 20 | dds_cluster$sample <- droplevels(dds_cluster$sample) 21 | 22 | dds_cluster_lrt <- DESeq(dds_cluster, 23 | test = "LRT", 24 | full = ~ nUMI + Phase + condition, 25 | reduced = ~ nUMI + Phase, 26 | sfType = "poscounts", 27 | minmu = 1e-6, 28 | minRep = Inf, 29 | parallel = TRUE, 30 | BPPARAM = MulticoreParam(8)) 31 | 32 | print("working on next cluster conditionA_vs_conditionB") 33 | 34 | # conditionA vs. conditionB 35 | contrast_conditionA_vs_conditionB <- c("condition","conditionA","conditionB") 36 | dds_conditionA_vs_conditionB_lrt_results_unshrunken <- results(dds_cluster_lrt, contrast=contrast_conditionA_vs_conditionB, cooksCutoff = FALSE) 37 | dds_conditionA_vs_conditionB_lrt_results_shrunken <- lfcShrink(dds_cluster_lrt,contrast = contrast_conditionA_vs_conditionB, res = dds_conditionA_vs_conditionB_lrt_results_unshrunken) 38 | save(dds_cluster_lrt, dds_conditionA_vs_conditionB_lrt_results_unshrunken,dds_conditionA_vs_conditionB_lrt_results_shrunken,file = paste0("results/dds_results_conditionA_vs_conditionB",as.character(args[1]),".Rdata")) 39 | 40 | print("moving on to conditionA_vs_conditionC") 41 | 42 | # conditionA vs. conditionC 43 | contrast_conditionA_vs_conditionC <- c("condition","conditionA","conditionC") 44 | dds_conditionA_vs_conditionC_lrt_results_unshrunken <- results(dds_cluster_lrt, contrast=contrast_conditionA_vs_conditionC, cooksCutoff = FALSE) 45 | dds_conditionA_vs_conditionC_lrt_results_shrunken <- lfcShrink(dds_cluster_lrt,contrast = contrast_conditionA_vs_conditionC, res = dds_conditionA_vs_conditionC_lrt_results_unshrunken) 46 | save(dds_cluster_lrt, dds_conditionA_vs_conditionC_lrt_results_unshrunken,dds_conditionA_vs_conditionC_lrt_results_shrunken,file = paste0("results/dds_results_conditionA_vs_conditionC",as.character(args[1]),".Rdata")) 47 | 48 | print("moving on to conditionB_vs_conditionC") 49 | 50 | # conditionB vs.
conditionC 51 | contrast_conditionB_vs_conditionC <- c("condition","conditionB","conditionC") 52 | dds_conditionB_vs_conditionC_lrt_results_unshrunken <- results(dds_cluster_lrt, contrast=contrast_conditionB_vs_conditionC, cooksCutoff = FALSE) 53 | dds_conditionB_vs_conditionC_lrt_results_shrunken <- lfcShrink(dds_cluster_lrt,contrast = contrast_conditionB_vs_conditionC, res = dds_conditionB_vs_conditionC_lrt_results_unshrunken) 54 | save(dds_cluster_lrt, dds_conditionB_vs_conditionC_lrt_results_unshrunken,dds_conditionB_vs_conditionC_lrt_results_shrunken,file = paste0("results/dds_results_conditionB_vs_conditionC",as.character(args[1]),".Rdata")) 55 | -------------------------------------------------------------------------------- /scRNAseq/scripts/sc_DESeq2_analysis_outer.R: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p priority # partition name 4 | #SBATCH -t 0-12:00 # hours:minutes runlimit after which job will be killed 5 | #SBATCH --job-name DESeq2 # Job name 6 | #SBATCH -o %j.out # File to which standard out will be written 7 | #SBATCH -e %j.err # File to which standard err will be written 8 | 9 | module load gcc/6.2.0 R/3.5.1 hdf5/1.10.1 10 | 11 | 12 | # This `for` loop will take the single-cell RNA-seq cluster ids as input and run the script for each of them on a different set of cores. The clusternames should be what is output on the seurat object from `seurat@ident` in the preparation for DESeq2 script. 13 | 14 | for cluster_n in "clustername0" "clustername1" "clustername2" "clustername3" "clustername4" "clustername5" 15 | 16 | do 17 | 18 | sbatch -p medium -t 3-12:00 -c 8 --mem 64G --job-name DEseq2 --wrap="Rscript sc_DESeq2_analysis_inner.R $cluster_n" 19 | 20 | sleep 1 # wait 1 second between each job submission 21 | 22 | done 23 | -------------------------------------------------------------------------------- /scRNAseq/templates/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scRNA-seq templates to use during analyses. 2 | -------------------------------------------------------------------------------- /scRNAseq/templates/power_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Power analysis of scRNA-seq experiment" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | collapsed: true 10 | toc_depth: 2 11 | theme: paper 12 | --- 13 | 14 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} 15 | knitr::opts_chunk$set(echo = FALSE) 16 | # Load libraries 17 | library(Seurat) 18 | library(knitr) 19 | library(rmarkdown) 20 | library(tidyverse) 21 | library(Matrix) 22 | library(AnnotationHub) 23 | library(ensembldb) 24 | library(scales) 25 | library(SingleCellExperiment) 26 | 27 | # Set seed for reproducibility 28 | set.seed(1454944673L) 29 | opts_chunk[["set"]]( 30 | audodep = TRUE, 31 | cache = FALSE, 32 | cache.lazy = FALSE, 33 | error = TRUE, 34 | echo = FALSE, 35 | fig.height = 10L, 36 | fig.retina = 2L, 37 | fig.width = 10L, 38 | message = FALSE, 39 | tidy = TRUE, 40 | warning = TRUE 41 | ) 42 | 43 | ``` 44 | 45 | # Power analysis 46 | 47 | Power analyses were performed to determine the number of samples needed to identify differentially expressed genes between `samplegroup1` and `samplegroup2` given a false positive rate of 0.05 (alpha) and a false negative rate of 0.1 or 0.2 (80% or 90% power). 
48 | 49 | For single-cell analyses with multiple biological replicates, differential expression analysis can be performed by summing the single-cell counts for each gene to the sample level for the cell type of interest [[1](https://goo.gl/hWpXQF)]. Therefore, to determine the number of biological replicates needed to achieve a specific power to detect differentially expressed genes for a given effect size, the single cells are collapsed to the sample level to perform a pseudo-bulk power analysis. 50 | 51 | To perform the power analysis, the power was estimated by calculating the average number of counts per gene in the cell type population of interest and exploring a range of effect sizes using the RNASeqPower package in R [[2](https://www.bioconductor.org/packages/release/bioc/vignettes/RNASeqPower/inst/doc/samplesize.pdf)]. With the pilot analysis containing a single patient sample, we cannot calculate the biological coefficient of variation (CV); however, within group CV has been found to be less than 0.4 for a range of human studies in bulk RNA-seq [[3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3842884/)]. Since single-cell RNA-seq analyses have additional variation, we will use a stringent CV, set to 0.5. It is likely a conservative estimate of variability between patients, but we will be able to better estimate the variability after sequencing additional patients. 52 | 53 | Our cell population of most interest is `celltype`. We performed the power calculation for this population first. Below are the sample sizes needed for 80% and 90% power and a range of effect sizes given on the left: 54 | 55 | ```{r power_analysis, message=FALSE, warning=FALSE} 56 | # Load library for power analysis 57 | library(RNASeqPower) 58 | library(basejump) 59 | 60 | # Bring in clustering information 61 | seurat_tsne <- readRDS("path/to/seurat.rds") 62 | 63 | metadata <- seurat_tsne@meta.data 64 | metadata$ident <- seurat_tsne@ident 65 | 66 | # Subset cells to cluster of interest 67 | # Example Monocyte cells 68 | 69 | cellIDs_monocytes <- rownames(metadata[metadata$ident == "Monocytes", ]) 70 | monocyte_counts <- seurat_tsne@raw.data[, cellIDs_monocytes] 71 | #all(cellIDs_monocytes %in% colnames(monocyte_counts)) 72 | 73 | # Determine average number of reads per gene for each sample 74 | total_gene_reads <- rowSums(as.matrix(monocyte_counts)) 75 | avg_gene_reads <- geometricMean(total_gene_reads) 76 | 77 | # Determine the biological coefficient of variation 78 | ## If only a single sample and can't estimate, use bcv of 0.5 79 | bcv <- 0.5 80 | 81 | ## If more than a single sample: 82 | 83 | library(edgeR) 84 | edgeR_dgelist <- DGEList(counts=monocyte_counts, group=condition) 85 | edgeR_dgelist <- calcNormFactors(edgeR_dgelist, method = "TMM") 86 | edgeR_dgelist <- estimateCommonDisp(edgeR_dgelist) 87 | bcv <- sqrt(edgeR_dgelist$common.dispersion) 88 | 89 | # Run the power analysis 90 | power_table <- rnapower(depth=avg_gene_reads, 91 | cv=bcv, 92 | effect=c(1.5, 1.75, 2), 93 | alpha= .05, 94 | power=c(.8, .9)) 95 | knitr::kable(power_table) 96 | ``` 97 | 98 | To achieve 80% power for identifying genes differentially expressed between `samplegroup1` and `samplegroup2` monocytes by at least 2-fold would require x replicates per sample group. If we wanted to achieve 90% power, then we would require x replicates. Similarly, if we preferred to identify DE genes with 1.75-fold differences in expression, then we would need at least x replicates for 80% power and x replicates for 90% power.
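As an illustration of the pseudo-bulk collapse described above (not part of the template's calculation, which uses the average per-gene depth), summing the counts of the cells belonging to each sample could look like the sketch below, assuming the metadata contains a `sample` column for these cells:

```r
# Illustrative only: collapse monocyte single-cell counts to one pseudo-bulk column per sample
sample_of_cell <- metadata[cellIDs_monocytes, "sample"]
pseudobulk <- t(rowsum(t(as.matrix(monocyte_counts)), group = sample_of_cell))
dim(pseudobulk) # genes x samples
```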
99 | 100 | Since monocytes are one of the larger cell populations we could explore with 9,674 cells, we could also test one of the smaller cell populations. 101 | 102 | ```{r cell_pop} 103 | cell_freqs <- data.frame(sort(table(metadata_all$ident), decreasing = TRUE)) 104 | colnames(cell_freqs) <- c("Cell type", "Number of cells") 105 | knitr::kable(cell_freqs) 106 | ``` 107 | 108 | For the T cells with x cells, we can perform the same power analysis: 109 | 110 | ```{r power_analysis_Tcells, message=FALSE, warning=FALSE} 111 | # Load library for power analysis 112 | # Subset cells to cluster of interest 113 | # Example T cells 114 | 115 | cellIDs_Tcells <- rownames(metadata[metadata$ident == "T cells", ]) 116 | Tcells_counts <- seurat_tsne@raw.data[, cellIDs_Tcells] 117 | #all(cellIDs_Tcells %in% colnames(Tcells_counts)) 118 | 119 | # Determine average number of reads per gene for each sample 120 | total_gene_reads <- rowSums(as.matrix(Tcells_counts)) 121 | avg_gene_reads <- geometricMean(total_gene_reads) 122 | 123 | # Determine the biological coefficient of variation 124 | ## Since only a single sample and can't estimate, use bcv of 0.5 125 | bcv <- 0.5 126 | 127 | # Run the power analysis 128 | power_table <- rnapower(depth=avg_gene_reads, 129 | cv=bcv, 130 | effect=c(1.5, 1.75, 2), 131 | alpha= .05, 132 | power=c(.8, .9)) 133 | knitr::kable(power_table) 134 | ``` 135 | 136 | With the fewer number of cells, we would need more samples to detect differences in expression between the groups. For 80% power, we would need x replicates to detect an effect size of 2 and x replicates to detect an effect size of 1.75. 137 | 138 | **References:** 139 | 140 | 1. H.M. Kang, M. Subramaniam, S. Targ, M. Nguyen, L. Maliskova, E. Wan, S. Wong, L. Byrnes, C. Lanata, R. Gate, et al. Multiplexing droplet-based single cell RNA-sequencing using natural genetic barcodes. Nature Biotechnology (2018), 36:89–94. Doi: 10.1038/nbt.4042 141 | 142 | 2. Terry Therneau, Steven Hart and Jean-Pierre Kocher (2019). Calculating 143 | samplesSize estimates for RNA Seq studies. R package version 1.22.1. 144 | 145 | 3. Hart SN, Therneau TM, Zhang Y, Poland GA, Kocher JP. Calculating sample size estimates for RNA sequencing data. J Comput Biol. (2013), 12:970-8. Doi: 10.1089/cmb.2012.0283. 
146 | -------------------------------------------------------------------------------- /scRNAseq/templates/sc_DESeq2_analysis_report_template.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | # Load libraries 11 | library(Seurat) 12 | library(knitr) 13 | library(rmarkdown) 14 | library(tidyverse) 15 | library(Matrix) 16 | library(AnnotationHub) 17 | library(ensembldb) 18 | library(scales) 19 | library(cowplot) 20 | library(gridExtra) 21 | # Set seed for reproducibility 22 | set.seed(1454944673L) 23 | opts_chunk[["set"]]( 24 | autodep = TRUE, 25 | cache = FALSE, 26 | cache.lazy = FALSE, 27 | error = TRUE, 28 | echo = FALSE, 29 | fig.height = 10L, 30 | fig.retina = 2L, 31 | fig.width = 10L, 32 | message = FALSE, 33 | tidy = TRUE, 34 | warning = TRUE 35 | ) 36 | ``` 37 | 38 | # Overview 39 | 40 | - Principal Investigator: 41 | - Researcher: 42 | - Experiment: 43 | - Description: 44 | 45 | 46 | * * * 47 | 48 | # Differential expression analysis report 49 | 50 | ## All cell types 51 | 52 | Across many of the different cell types we see the same genes appearing as differentially expressed, including Prr15, many mitochondrial genes and hemoglobin genes (as well as some pseudogenes). If a gene is DE in the majority of the cell type clusters, then I have included it in this section and removed it from the other overviews for each of the other cell types. I plotted the gene expression across temperature to explore these genes in a bit more detail below: 53 | 54 | ```{r all_sig, out.width='33%'} 55 | cluster_sig_results <- list() 56 | 57 | for (cluster in c()){ 58 | 59 | # Loading cluster-specific data - conditionA_vs_conditionB 60 | load(paste0("path/to/results/dds_results_conditionA_vs_conditionB_", cluster, ".Rdata")) # This file was output on O2 from the DESeq2 script (https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/sc_DESeq2_analysis_inner.R).
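# Each .Rdata file saved by sc_DESeq2_analysis_inner.R contains three objects that become
# available here after load(): dds_cluster_lrt plus the unshrunken and shrunken results
# objects for the corresponding contrast (e.g. dds_conditionA_vs_conditionB_lrt_results_shrunken).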
61 | 62 | # Extracting results 63 | all_results_conditionA_vs_conditionB <- data.frame(dds_conditionA_vs_conditionB_lrt_results_shrunken) 64 | 65 | colnames(all_results_conditionA_vs_conditionB)[2] <- "LFC-conditionA_vs_conditionB" 66 | colnames(all_results_conditionA_vs_conditionB)[3] <- "SE-conditionA_vs_conditionB" 67 | 68 | # Merge results with annotations 69 | cluster_annot_results <- merge(all_results_conditionA_vs_conditionB, annotations, by.x=0, by.y="gene_name") 70 | 71 | # Loading cluster-specific data - conditionA_vs_conditionC 72 | load(paste0("path/to/results/dds_results_conditionA_vs_conditionC_", cluster, ".Rdata")) 73 | 74 | # Extracting results 75 | all_results_conditionA_vs_conditionC <- data.frame(dds_conditionA_vs_conditionC_lrt_results_shrunken) 76 | 77 | colnames(all_results_conditionA_vs_conditionC)[2] <- "LFC-conditionA_vs_conditionC" 78 | colnames(all_results_conditionA_vs_conditionC)[3] <- "SE-conditionA_vs_conditionC" 79 | 80 | # Merge results with annotations 81 | cluster_annot_results <- merge(cluster_annot_results, all_results_conditionA_vs_conditionC, by.x="Row.names", by.y=0) 82 | 83 | # Loading cluster-specific data - conditionB_vs_conditionC 84 | load(paste0("path/to/results/dds_results_conditionB_vs_conditionC_", cluster, ".Rdata")) 85 | 86 | # Extracting results 87 | all_results_conditionB_vs_conditionC <- data.frame(dds_conditionB_vs_conditionC_lrt_results_shrunken) 88 | 89 | colnames(all_results_conditionB_vs_conditionC)[2] <- "LFC-conditionB_vs_conditionC" 90 | colnames(all_results_conditionB_vs_conditionC)[3] <- "SE-conditionB_vs_conditionC" 91 | 92 | # Merge results with annotations 93 | cluster_annot_results <- merge(cluster_annot_results, all_results_conditionB_vs_conditionC, by.x="Row.names", by.y=0) 94 | 95 | # Arrange by padj 96 | cluster_annot_results <- dplyr::arrange(cluster_annot_results, padj) 97 | 98 | # Reorder columns 99 | cluster_annot_results <- cluster_annot_results[, c(1:4, 11:14, 5:10)] 100 | 101 | colnames(cluster_annot_results)[1] <- "Gene_symbol" 102 | 103 | cluster_annot_results <- cluster_annot_results[which(!(duplicated(cluster_annot_results$Gene_symbol))), ] 104 | 105 | 106 | # write.csv(cluster_annot_results, paste0("results/", cluster, "_DE_all_results.csv"), quote = FALSE) 107 | 108 | # Extract significant genes 109 | sig_genes <- dplyr::filter(cluster_annot_results, padj < 0.05) 110 | 111 | cluster_sig_results[[cluster]] <- sig_genes 112 | 113 | # write.csv(sig_genes, paste0("results/", cluster, "_DE_sig_results.csv"), quote = FALSE) 114 | } 115 | 116 | # Genes present in the majority of analyses 117 | 118 | intersection_genes <- Reduce(intersect, lapply(cluster_sig_results, function(x) x$Gene_symbol)) 119 | 120 | 121 | 122 | all_genes <- as.character(intersection_genes) 123 | 124 | cells_conditionA<- rownames(seurat@meta.data[which(seurat@meta.data$sample == "conditionA"), ]) 125 | cells_conditionB<- rownames(seurat@meta.data[which(seurat@meta.data$sample == "conditionB"), ]) 126 | cells_conditionC<- rownames(seurat@meta.data[which(seurat@meta.data$sample == "conditionC"), ]) 127 | 128 | for (gene in all_genes){ 129 | 130 | FeaturePlot(object = seurat, 131 | features.plot = gene, 132 | cells.use = cells_conditionA, 133 | do.return=TRUE)[[gene]] 134 | FeaturePlot(object = seurat, 135 | features.plot = gene, 136 | cells.use = cells_conditionB, 137 | do.return=TRUE)[[gene]] 138 | FeaturePlot(object = seurat, 139 | features.plot = gene, 140 | cells.use =
cells_conditionC, 141 | do.return=TRUE)[[gene]] 142 | } 143 | 144 | 145 | ``` 146 | **Figure Legend:** *From left to right for each gene are the samples from the conditions: A, B, and C* 147 | 148 | ## Differentially expressed genes by cell type 149 | 150 | The DE genes listed in the tables of top 12 genes for each cell type were filtered to remove the genes present in the majority of analyses and non-protein-coding genes. However, these genes are present in the downloadable results files. 151 | 152 | The DE results tables give the following information: 153 | 154 | - **Ensembl_id:** Gene Ensembl ID 155 | - **baseMean:** Mean gene expression across all cells in cluster 156 | - **LFC-coldvsTN:** Log2 fold change (shrunken) of gene expression for cold cells relative to thermal neutral cells in cluster 157 | - **SE-coldvsTN:** Standard error of the log2 fold change estimate for cold cells relative to thermal neutral cells in cluster 158 | - **LFC-coldvsRT:** Log2 fold change (shrunken) of gene expression for cold cells relative to room temperature cells in cluster 159 | - **SE-coldvsRT:** Standard error of the log2 fold change estimate for cold cells relative to room temperature cells in cluster 160 | - **LFC-RTvsTN:** Log2 fold change (shrunken) of gene expression for room temperature cells relative to thermal neutral cells in cluster 161 | - **SE-RTvsTN:** Standard error of the log2 fold change estimate for room temperature cells relative to thermal neutral cells in cluster 162 | - **stat:** Statistic used for measuring p-value 163 | - **pvalue:** P-value 164 | - **padj:** P-value adjusted for multiple test correction 165 | - **gene_name:** Official gene symbol 166 | - **gene_biotype:** Type of gene: protein-coding, rRNA, pseudogene, etc. 167 | - **description:** Full name of gene 168 | 169 | _**NOTE:** The results do not describe at which temperatures the gene expression is significantly different. 
However, the log2 foldchanges can help discern the differences in gene expression across temperature._ 170 | 171 | ## Cluster1 172 | 173 | ```{r cluster1} 174 | cluster1_sig_genes <- cluster_sig_results[["cluster1"]] 175 | cluster1_sig_genes <- cluster1_sig_genes[which(!(cluster1_sig_genes %in% intersection_genes))] 176 | 177 | knitr::kable(cluster1_sig_genes[1:12,]) 178 | ``` 179 | 180 | [Download all results for cluster1](results/cluster1_DE_all_results.csv) 181 | 182 | [Download significant results for cluster1](results/cluster1_DE_sig_results.csv) 183 | -------------------------------------------------------------------------------- /scRNAseq/templates/sc_QC_template.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quality Control" 3 | author: 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | # Load libraries 12 | library(Seurat) 13 | library(knitr) 14 | library(rmarkdown) 15 | library(tidyverse) 16 | library(Matrix) 17 | library(AnnotationHub) 18 | library(ensembldb) 19 | library(scales) 20 | 21 | # Set seed for reproducibility 22 | set.seed(1454944673L) 23 | 24 | opts_chunk[["set"]]( 25 | audodep = TRUE, 26 | cache = FALSE, 27 | cache.lazy = FALSE, 28 | error = TRUE, 29 | echo = FALSE, 30 | fig.height = 10L, 31 | fig.retina = 2L, 32 | fig.width = 10L, 33 | message = FALSE, 34 | tidy = TRUE, 35 | warning = TRUE 36 | ) 37 | ``` 38 | 39 | # Overview 40 | 41 | - Principal Investigator / Researcher: 42 | - Experiment: 43 | 44 | Experimental description: 45 | 46 | Goals: 47 | 48 | Expectations: 49 | 50 | - Reads per cell: 51 | - Cells per sample: 52 | - Cell type populations: 53 | 54 | * * * 55 | 56 | 57 | 58 | ```{r raw_data} 59 | # Bring in count matrix from bcbio 60 | raw_counts <- readMM("path/to/tagcounts.mtx") 61 | 62 | # Assign row names and column names of matrix 63 | gene_names <- read.csv("path/to/tagcounts.mtx.rownames", header = FALSE) 64 | cell_ids <- read.csv("path/to/tagcounts.mtx.colnames", header = FALSE) 65 | rownames(raw_counts) <- gene_names[, 1] 66 | colnames(raw_counts) <- cell_ids[, 1] 67 | 68 | # Acquire the gene names for the Ensembl IDs 69 | ## Connect to AnnotationHub 70 | ah <- AnnotationHub() 71 | 72 | ## Access the Ensembl database for organism 73 | ahDb <- query(ah, 74 | pattern = c("Homo sapiens", "EnsDb"), 75 | ignore.case = TRUE) 76 | 77 | ## Acquire the latest annotation files 78 | id <- ahDb %>% 79 | mcols() %>% 80 | rownames() %>% 81 | tail(n = 1) 82 | 83 | ## Download the appropriate Ensembldb database 84 | edb <- ah[[id]] 85 | 86 | ## Extract gene-level information from database 87 | annotations <- genes(edb, 88 | return.type = "data.frame") 89 | 90 | ## Select annotations of interest 91 | annotations <- annotations %>% 92 | dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description) 93 | 94 | # Getting gene names in place of Ensembl IDs as row IDs 95 | gene_symbols <- merge(raw_counts[, 1], annotations, by.x = 0, by.y = "gene_id") 96 | 97 | gene_symbols <- gene_symbols[, -c(2)] 98 | 99 | duplicated_ids <- which(duplicated(gene_symbols$gene_name)) 100 | # length(duplicated_ids) 101 | 102 | raw_counts <- raw_counts[-c(duplicated_ids), ] 103 | 104 | gene_symbols <- gene_symbols[-c(duplicated_ids), ] 105 | 106 | raw_counts <- raw_counts[which(rownames(raw_counts) %in% gene_symbols$Row.names), ] 107 | 108 | gene_symbols <- gene_symbols[which(gene_symbols$Row.names 
%in% rownames(raw_counts)), ] 109 | 110 | # all(gene_symbols$Row.names == rownames(raw_counts)) 111 | 112 | rownames(raw_counts) <- gene_symbols$gene_name 113 | 114 | #write.csv(as.matrix(raw_counts), "data/raw_counts_symbols.csv") 115 | #write(gene_symbols$gene_name, "data/gene_names.txt") 116 | #write(colnames(raw_counts), "data/cell_ids.txt") 117 | 118 | # Create a sparse matrix for more efficient computation 119 | counts <- as(raw_counts, "dgCMatrix") 120 | 121 | # Format cells properly 122 | colnames(counts) <- str_replace_all(colnames(counts), "-", "_") 123 | colnames(counts) <- str_replace_all(colnames(counts), ":", "_") 124 | 125 | # Create metadata containing only the cell IDs 126 | metadata <- data.frame(row.names = colnames(counts), cells = colnames(counts), stringsAsFactors = F) 127 | 128 | # Add number of UMIs per cell to metadata 129 | metadata$nUMI <- Matrix::colSums(counts) 130 | 131 | # Add number of genes detected per cell to metadata 132 | metadata$nGene <- Matrix::colSums(counts > 0) 133 | 134 | # Add novelty score (log10 genes detected per log10 UMIs) for each cell to metadata 135 | metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI) 136 | 137 | # Add sample name 138 | sample1 <- which(str_detect(metadata$cells, "run1_ATAGAGAG")) 139 | sample2 <- which(str_detect(metadata$cells, "run1_ATTAGACG")) 140 | sample3 <- which(str_detect(metadata$cells, "run1_CTCCTTAC")) 141 | sample4 <- which(str_detect(metadata$cells, "run1_TACTCCTT")) 142 | 143 | metadata$sample <- "x" 144 | metadata$sample[sample1] <- "Good name for sample1" 145 | metadata$sample[sample2] <- "Good name for sample2" 146 | metadata$sample[sample3] <- "Good name for sample3" 147 | metadata$sample[sample4] <- "Good name for sample4" 148 | #which(metadata$sample == "x") 149 | 150 | # Get the total number of reads per sample 151 | nReads_sample1 <- read_tsv("path/to/final/run1-ATAGAGAG/run1-ATAGAGAG-barcodes.tsv", col_names = c("cell", "nReads")) %>% 152 | mutate(cell = make.names(cell)) %>% 153 | mutate(sample = "Good name for sample1") 154 | 155 | nReads_sample2 <- read_tsv("path/to/final/run1-ATTAGACG/run1-ATTAGACG-barcodes.tsv", col_names = c("cell", "nReads")) %>% 156 | mutate(cell = make.names(cell)) %>% 157 | mutate(sample = "Good name for sample2") 158 | 159 | nReads_sample3 <- read_tsv("path/to/final/run1-CTCCTTAC/run1-CTCCTTAC-barcodes.tsv", col_names = c("cell", "nReads")) %>% 160 | mutate(cell = make.names(cell)) %>% 161 | mutate(sample = "Good name for sample3") 162 | 163 | nReads_sample4 <- read_tsv("path/to/final/run1-TACTCCTT/run1-TACTCCTT-barcodes.tsv", col_names = c("cell", "nReads")) %>% 164 | mutate(cell = make.names(cell)) %>% 165 | mutate(sample = "Good name for sample4") 166 | 167 | nReads <- rbind(nReads_sample1, nReads_sample2, nReads_sample3, nReads_sample4) 168 | 169 | nReads$nReads <- log10(nReads$nReads) 170 | ``` 171 | 172 | # Quality control metrics 173 | 174 | ## Reads per cell 175 | 176 | These are counts of how many reads are assigned to a given cellular barcode. It is normal for single cell RNA-seq data to contain a large number of low complexity barcodes. The bcbio pipeline filters out most of these barcodes, and here we have applied a threshold cutoff of a minimum of 1000 reads per cell. The unfiltered read count distributions are shown here. 177 | 178 | The plot shows the frequency distribution of the reads per cell. You can see that the vast majority of low complexity barcodes plateau at a read depth below 1000 reads per cell.
This is to be expected, and we will remove these low quality "cells" by filtering. 179 | 180 | ```{r reads_per_cell} 181 | # Number of reads per cell 182 | nReads %>% 183 | ggplot(aes(color = sample, x = nReads, fill = sample)) + 184 | geom_density(alpha = 0.2) + 185 | xlab("log10 reads per cell") + 186 | geom_vline(xintercept = 3) 187 | 188 | # Extract IDs for mitochondrial genes 189 | mt <- annotations %>% 190 | dplyr::filter(seq_name == "MT") %>% 191 | dplyr::pull(gene_name) 192 | 193 | # Number of UMIs assigned to mitochondrial genes 194 | metadata$mtUMI <- Matrix::colSums(counts[which(rownames(counts) %in% mt),], na.rm = T) 195 | 196 | # Ensure all NAs receive zero counts 197 | metadata$mtUMI[is.na(metadata$mtUMI)] <- 0 198 | 199 | # Calculate mitoRatio per cell 200 | metadata$mitoRatio <- metadata$mtUMI/metadata$nUMI 201 | 202 | # Keep cells with nUMI greater than 100 203 | idx <- which(metadata$nUMI > 100) 204 | 205 | # Extract the counts for those cells 206 | counts_c <- counts[, idx] 207 | 208 | 209 | # Extract the metadata for those cells 210 | metadata_c <- metadata[idx,] 211 | 212 | # Save data to a SingleCellExperiment object (the package is not loaded in the setup chunk, so load it here) 213 | library(SingleCellExperiment) 214 | se <- SingleCellExperiment(assays = list(counts = counts_c), colData = metadata_c) 215 | 216 | # Create a data frame containing the metrics for visualizations 217 | metrics <- colData(se) %>% 218 | as.data.frame 219 | ``` 220 | 221 | 222 | We expect to have ~3,000 cells sequenced per sample, so we can explore the number of cells detected per sample. After removing low quality cells based on the QC metrics, we expect to retain close to 3,000 cells per sample. 223 | 224 | ```{r counts_per_sample} 225 | # Visualize the number of cells per sample 226 | metrics %>% 227 | ggplot(aes(x=sample, fill=sample)) + 228 | geom_bar() + 229 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 230 | ggtitle("NCells") 231 | ``` 232 | 233 | ## UMI counts per cell 234 | 235 | Now let's assess the distribution of unique molecular identifier (UMI)-deconvoluted counts per cell. In general, the distributions should be relatively uniform per sample. 236 | 237 | ```{r UMIs_per_cell} 238 | # Visualize the number of UMIs/transcripts per cell 239 | metrics %>% 240 | ggplot(aes(color=sample, x=nUMI, fill = sample)) + 241 | geom_density(alpha = 0.2) + 242 | scale_x_log10() + 243 | ylab("log10 cell density") + 244 | geom_vline(xintercept = 500) 245 | ``` 246 | 247 | ## Genes detected per cell 248 | 249 | Here by "detected", we mean genes with a non-zero count measurement per cell. Seeing gene detection in the range of `500`-`5000` is normal for most single-cell experiments. 250 | 251 | ```{r genes_detected} 252 | # Visualize the distribution of genes detected per cell via density plot 253 | metrics %>% 254 | ggplot(aes(color=sample, x=nGene, fill = sample)) + 255 | geom_density(alpha = 0.2) + 256 | scale_x_log10() + 257 | geom_vline(xintercept = 300) 258 | 259 | # Visualize the distribution of genes detected per cell via boxplot 260 | metrics %>% 261 | ggplot(aes(x=sample, y=log10(nGene), fill=sample)) + 262 | geom_boxplot() + 263 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 264 | ggtitle("NCells vs NGenes") 265 | ``` 266 | 267 | ## UMIs vs. genes detected 268 | 269 | If we plot the total number of UMI counts per cell against the number of genes detected per cell, we can assess whether there is a large population of low quality cells with low counts and/or low gene detection.
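In addition to the faceted scatterplot below, the strength of this relationship can be summarized numerically per sample (a minimal, optional sketch using the `metrics` data frame created above; this chunk is not part of the original template):

```{r umis_vs_genes_correlation}
# Pearson correlation between log10(UMIs) and log10(genes detected) within each sample;
# lower values can indicate a sizeable population of cells that deviate from the
# expected UMI-gene relationship
metrics %>%
  group_by(sample) %>%
  summarise(correlation = cor(log10(nUMI), log10(nGene)))
```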
270 | 271 | ```{r umis_vs_genes} 272 | # Visualize the correlation between genes detected and number of UMIs, and determine whether there is a strong presence of cells with low numbers of genes/UMIs 273 | metrics %>% 274 | ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 275 | geom_point() + 276 | stat_smooth(method=lm) + 277 | scale_x_log10() + 278 | scale_y_log10() + 279 | geom_vline(xintercept = 800) + 280 | facet_wrap(~sample) 281 | ``` 282 | 283 | ## Mitochondrial abundance 284 | 285 | We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. 286 | 287 | ```{r mito_ratios} 288 | # Visualize the distribution of mitochondrial gene expression detected per cell 289 | metrics %>% 290 | ggplot(aes(color=sample, x=mitoRatio, fill=sample)) + 291 | geom_density(alpha = 0.2) + 292 | scale_x_log10() + 293 | geom_vline(xintercept = 0.25) 294 | ``` 295 | 296 | ## Novelty 297 | 298 | Another way to QC the data is to look for cells with low novelty, that is, cells that have fewer genes detected per count than other cells. The samples in which each cell was sequenced less deeply have a higher overall novelty; this is because we have not yet begun to saturate the sequencing for any given gene in these samples. Outlier cells in these samples might be cells that contain a less complex pool of RNA species than other cells. Sometimes we can detect contamination with low complexity cell types, such as red blood cells, via this metric. 299 | 300 | ```{r novelty} 301 | # Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI 302 | metrics %>% 303 | ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) + 304 | geom_density(alpha = 0.2) 305 | ``` 306 | 307 | 308 | # Filter cells 309 | 310 | Based on the above metrics, we can filter out low quality cells using the thresholds discussed. We will also filter out those genes that do not have counts in at least 10 cells.
311 | 312 | ```{r filtering} 313 | # Filter out low quality cells using selected thresholds - these will change with the experiment 314 | keep_cells <- metrics %>% 315 | dplyr::filter(nUMI >= 1300, 316 | nGene >= 500, 317 | log10GenesPerUMI >= 0.85, 318 | mitoRatio <= 0.25 319 | ) %>% 320 | pull(cells) 321 | 322 | # Subset the cells to only include those cells and genes that meet the thresholds specified 323 | se_c <- se[, keep_cells] 324 | 325 | # Output a logical matrix specifying, for each gene, which cells have more than zero counts 326 | nonzero <- counts(se_c) > 0L 327 | 328 | # Sum all TRUE values and return TRUE for genes with at least 10 TRUE values (cells) 329 | keep_genes <- rowSums(as.matrix(nonzero)) >= 10 330 | 331 | # Only keeping those genes expressed in at least 10 cells 332 | se_c <- se_c[keep_genes, ] 333 | 334 | # Save subset to new metrics variable 335 | metrics_clean <- colData(se_c) %>% 336 | as.data.frame() 337 | 338 | # Save cleaned single-cell experiment as an .rds file to load at any time 339 | saveRDS(se_c, file = "data/se_filtered.rds") 340 | 341 | # The following can be used for SPRING interface 342 | #write.csv(as.matrix(counts(se_c)), "data/tseng_filtered_counts_baseR.csv") 343 | #write(rownames(counts(se_c)), "data/tseng_filtered_gene_names.csv") 344 | #write(colnames(counts(se_c)), "data/tseng_filtered_cell_ids.csv") 345 | ``` 346 | 347 | ```{r setting_up_for_clustering} 348 | # Create Seurat object from filtered SingleCellExperiment object 349 | seurat_raw <- CreateSeuratObject(raw.data = counts(se_c), 350 | meta.data = colData(se_c) %>% 351 | data.frame()) 352 | saveRDS(seurat_raw, file = file.path("data", "seurat_raw.rds")) 353 | 354 | # Download the cell cycle genes for your organism from https://github.com/hbc/tinyatlas/tree/master/cell_cycle and read them in with: 355 | cell_cycle_genes <- read.csv("path/to/organism.csv") 356 | 357 | cell_cycle_markers <- dplyr::left_join(cell_cycle_genes, annotations, by = c("geneID" = "gene_id")) 358 | 359 | s_genes <- cell_cycle_markers %>% 360 | dplyr::filter(phase == "S") %>% 361 | pull("gene_name") 362 | g2m_genes <- cell_cycle_markers %>% 363 | dplyr::filter(phase == "G2/M") %>% 364 | pull("gene_name") 365 | save(g2m_genes, s_genes, file = "data/cycle.rda") 366 | 367 | # Copy cycle.rda and seurat_raw.rds to O2 if not already there to perform clustering. 368 | # I perform clustering on O2 using: https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/clustering_pre_regress.R 369 | # Next, on O2 I run https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/clustering_regress.R.
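# As a hypothetical example, the two scripts could be submitted on O2 via SLURM along these lines
# (partition, time, and memory are placeholder values to adjust for the dataset):
# sbatch -p medium -t 1-00:00 --mem=64G --wrap="Rscript clustering_pre_regress.R"
# sbatch -p medium -t 1-00:00 --mem=64G --wrap="Rscript clustering_regress.R"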
370 | ``` 371 | -------------------------------------------------------------------------------- /scRNAseq/templates/sc_clustering_template.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | # Load libraries 12 | library(Seurat) 13 | library(knitr) 14 | library(rmarkdown) 15 | library(tidyverse) 16 | library(Matrix) 17 | library(AnnotationHub) 18 | library(ensembldb) 19 | library(scales) 20 | library(cowplot) 21 | library(gridExtra) 22 | 23 | # Set seed for reproducibility 24 | set.seed(1454944673L) 25 | 26 | opts_chunk[["set"]]( 27 | audodep = TRUE, 28 | cache = FALSE, 29 | cache.lazy = FALSE, 30 | error = TRUE, 31 | echo = FALSE, 32 | fig.height = 10L, 33 | fig.retina = 2L, 34 | fig.width = 10L, 35 | message = FALSE, 36 | tidy = TRUE, 37 | warning = TRUE 38 | ) 39 | ``` 40 | 41 | # Overview 42 | 43 | - Principal Investigator: 44 | - Researcher: 45 | - Experiment: 46 | 47 | Experimental description: 48 | 49 | 50 | * * * 51 | 52 | This workflow is adapted from the following sources: 53 | 54 | - Satija Lab: [Seurat v2 Guided Clustering Tutorial](http://satijalab.org/seurat/pbmc3k_tutorial.html) 55 | - Paul Hoffman: [Cell-Cycle Scoring and Regression](http://satijalab.org/seurat/cell_cycle_vignette.html) 56 | 57 | To identify clusters, the following steps will be performed: 58 | 59 | 1. Normalization and transformation of the raw gene counts per cell to account for differences in sequencing depth. 60 | 2. Identification of high variance genes. 61 | 3. Regression of sources of unwanted variation (e.g. number of UMIs per cell, mitochondrial transcript abundance, cell cycle phase). 62 | 4. Identification of the primary sources of heterogeneity using principal component (PC) analysis and heatmaps. 63 | 5. Clustering cells based on significant PCs (metagenes). 64 | 65 | * * * 66 | 67 | # Clustering report 68 | 69 | In this report, we are exploring the clustering of cells by similarities in gene expression. 70 | 71 | ```{r data, message=FALSE, warning=FALSE} 72 | # You can use scripts at https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/clustering_pre_regress.R and https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/clustering_regress.R on O2 to generate the cell clusters and save to .rds files 73 | 74 | pre_regressed_seurat <- readRDS("path/to/data/pre_regressed_seurat.rds") 75 | 76 | seurat <- readRDS("path/to/data/seurat_tsne.rds") # path on O2 - usually copy to local computer 77 | 78 | ``` 79 | 80 | ## Normalizing counts, finding variable genes, and scaling the data 81 | 82 | The raw counts for the sample are normalized using global-scaling normalization, which normalizes each cell by total expression and applies a log transformation. Then, the most variable genes are identified after calculation of the gene-wise dispersions. 83 | 84 | We can plot dispersion (a normalized measure of to cell-to-cell variation) as a function of average expression for each gene to identify a set of high-variance genes, which are used to determine the principal components. 
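The normalization, variable gene detection, and scaling steps themselves are performed by the O2 clustering scripts referenced above rather than in this report. A minimal sketch of the Seurat (v2) calls involved is shown below; the cutoffs, regression variables, and number of PCs are illustrative assumptions, so check the scripts for the values actually used:

```{r preprocessing_sketch, eval=FALSE}
# Global-scaling log-normalization of the raw counts
seurat_raw <- NormalizeData(seurat_raw, normalization.method = "LogNormalize",
                            scale.factor = 10000)

# Score each cell for cell cycle phase using the S and G2/M gene lists saved during QC
seurat_raw <- CellCycleScoring(seurat_raw, s.genes = s_genes, g2m.genes = g2m_genes)

# Identify high-variance genes from the mean/dispersion relationship
pre_regressed_seurat <- FindVariableGenes(seurat_raw, mean.function = ExpMean,
                                          dispersion.function = LogVMR,
                                          x.low.cutoff = 0.0125, x.high.cutoff = 3,
                                          y.cutoff = 0.5)

# Scale the data, regressing out unwanted sources of variation
seurat <- ScaleData(pre_regressed_seurat,
                    vars.to.regress = c("nUMI", "mitoRatio", "S.Score", "G2M.Score"))

# Run PCA on the variable genes and cluster on the significant PCs
# (dims.use should match the PCs selected further down in this report)
seurat <- RunPCA(seurat, pc.genes = seurat@var.genes, do.print = FALSE)
seurat <- FindClusters(seurat, reduction.type = "pca", dims.use = 1:10,
                       resolution = 0.8, save.SNN = TRUE)
```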
85 | 86 | ```{r dispersion} 87 | # Plot variable genes 88 | VariableGenePlot(pre_regressed_seurat) 89 | ``` 90 | 91 | ## Cell cycle expression 92 | 93 | We can check if our cell cycle is a major source of variation in our data by visualizing a PCA plot and coloring by cell cycle phase of each cell. 94 | 95 | ```{r preregressed_cell_cycle} 96 | # PCA by cell cycle of pre-regressed seurat 97 | PCAPlot(pre_regressed_seurat, group.by= "Phase") 98 | ``` 99 | 100 | ## Apply regression variables 101 | 102 | To explore the clustering of all of the samples together, we regressed out variation due to read count depth, cell cycle, and mitochondrial content. 103 | 104 | Now that regression has been applied, we should see the phase clusters superimpose by PCA. 105 | 106 | ```{r cell_cycle_pca} 107 | # PCA plot after regression 108 | PCAPlot(seurat, group.by= "Phase") 109 | ``` 110 | 111 | ## Linear dimensionality reduction and determining significant prinicipal components 112 | 113 | Next, we perform scoring of each gene in the dataset based on correlation with the calculated principal components. 114 | 115 | Then, to overcome the extensive technical noise in any single gene for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include downstream is therefore an important step. Often it is useful to explore the PCs prior to identifying the significant principal components to include for the downstream clustering analysis. 116 | 117 | We can print out the top most variant genes for the select PCs. Here we are identifying the five most positively and negatively variant genes for the top 10 PCs: 118 | 119 | ```{r exploring_pcs_list} 120 | ## Genes associated with principal components 121 | PrintPCA(object = seurat, 122 | pcs.print = 1:10, 123 | genes.print = 5, 124 | use.full = FALSE) 125 | ``` 126 | 127 | We can also explore the expression of the top most variant genes for select PCs. The genes and cells are ordered by PC scores: 128 | 129 | ```{r exploring_pcs_heatmap} 130 | # Explore expression of most extreme genes per PC 131 | PCHeatmap(object = seurat, 132 | pc.use = 1:10, 133 | cells.use = 500, 134 | do.balanced = TRUE, 135 | label.columns = FALSE, 136 | use.full = FALSE) 137 | ``` 138 | 139 | PC selection — identifying the true dimensionality of a dataset — is an important step for our clustering analysis, but can be challenging/uncertain. While there are a variety of ways to choose a threshold, we’re going to calculate where the principal components start to elbow by taking the larger value of: 140 | 141 | 1. The point where the principal components only contribute 5% of standard deviation (bottom left). 142 | 2. The point where the principal components cumulatively contribute 90% of the standard deviation (bottom right). 143 | 144 | 145 | ```{r elbow_plot} 146 | # Create elbow plot 147 | PCElbowPlot(seurat) 148 | 149 | # Determine the estimate for significant PCs 150 | 151 | pct <- seurat@dr$pca@sdev / sum(seurat@dr$pca@sdev) * 100 152 | cumu <- cumsum(pct) 153 | co1 <- which(cumu > 90 & pct < 5)[1] 154 | co2 <- sort(which((pct[1:length(pct)-1] - pct[2:length(pct)]) > 0.1), 155 | decreasing = T)[1] + 1 # last point where change of % of variation is more than 0.1%. 
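# co1: first PC at which the cumulative percent of standard deviation exceeds 90%
#      while that PC itself contributes less than 5%
# co2: one PC past the last point at which the percent of variation drops by more
#      than 0.1% from one PC to the next
# The minimum of the two estimates is used below as the number of significant PCs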
156 | pcs <- min(co1, co2) # change to any other number 157 | ``` 158 | 159 | ## Cluster the cells 160 | 161 | We can now use these significant PCs to determine which cells exhibit similar expression patterns for clustering. To do this, Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’. Details on this clustering methods are available in the Seurat paper. 162 | 163 | To cluster cells we used a `resolution` argument that sets the "granularity" of the downstream clustering, with increased values leading to a greater number of clusters. We used a resolution of X for this analysis. 164 | 165 | A useful feature in [Seurat][] is the ability to recall the parameters that were used in the analysis. Important are the resolution used for clustering and the number of principal components (dimensions) used for the calcuations if repeating. We have provided these below. 166 | 167 | ```{r parameters} 168 | # Choose a resolution and set identity 169 | seurat <- SetAllIdent(object = seurat, id = "res.0.8") 170 | 171 | PrintFindClustersParams(seurat, resolution = 0.8) 172 | 173 | seurat <- RunTSNE( 174 | seurat, 175 | dims.use = 1:pcs, 176 | do.fast = TRUE) 177 | 178 | # TSNEPlot(object = seurat) 179 | ``` 180 | 181 | ### t-SNE 182 | 183 | Seurat continues to use t-distributed stochastic neighbor embedding (t-SNE) as a powerful tool to visualize and explore these datasets. Generally, t-SNE aims to place cells with similar local neighborhoods in high-dimensional space together in low-dimensional space. We will use the selected prinicipal components to aid in clustering of cells with similar gene expression together. **Note that distance between clusters on the t-SNE plots does not represent degree of similarity between clusters.** 184 | 185 | ```{r tsne, results="asis"} 186 | # Choose a resolution 187 | seurat <- SetAllIdent(object = seurat, id = "res.0.8") 188 | 189 | # Run the TSNE to determine the clusters 190 | seurat <- RunTSNE( 191 | seurat, 192 | dims.use = 1:pcs, 193 | do.fast = TRUE) 194 | 195 | # Plot the TSNE 196 | DimPlot(seurat, 197 | "tsne", 198 | do.label = TRUE, 199 | do.return = TRUE, 200 | label.size = 6, 201 | plot.title = "tSNE") 202 | ``` 203 | 204 | 205 | ### PCA 206 | 207 | Note that t-SNE is not PCA! The measurement of distance in a t-SNE plot is difficult to interpret, and is most helpful for the relationships of close neighbors. To better infer separation distance between the putative clusters, let's visualize using PCA. 208 | 209 | ```{r pca, results="asis"} 210 | # Plot the PCA 211 | DimPlot(seurat, 212 | "pca", 213 | do.label = TRUE, 214 | do.return = TRUE, 215 | label.size = 6, 216 | plot.title = "PCA") 217 | ``` 218 | 219 | ### UMAP 220 | 221 | Uniform Manifold Approximation and Projection (UMAP) is a dimensionality reduction technique that is similar to t-SNE, but where the distances between cells represent similarity in expression. We can explore the similarity in gene expression between clusters a bit more easily with UMAP. 222 | 223 | ```{r umap, results="asis"} 224 | # To run UMAP, you need to install the tool using the command line with `conda install -c conda-forge umap-learn`. 
After successfully installed you will be able to run the following commands: 225 | 226 | # Run UMAP 227 | seurat <- RunUMAP(seurat, reduction.use = "pca", dims.use = 1:pcs) 228 | 229 | # Plot the UMAP 230 | DimPlot(seurat, 231 | "umap", 232 | do.label = TRUE, 233 | do.return = TRUE, 234 | label.size = 6, 235 | plot.title = "UMAP") 236 | ``` 237 | 238 | ## Exploration of quality control metrics 239 | 240 | To determine whether our clusters might be due to artifacts such as cell cycle phase or mitochondrial expression, it can be useful to explore these metrics visually to see if any clusters exhibit enrichment or are different from the other clusters. However, if enrichment or differences are observed for particular clusters it may not be worrisome if it can be explained by the cell type. 241 | 242 | We can start by exploring the distribution of cells per cluster for each sample: 243 | 244 | ```{r cell_counts} 245 | # Extract identity and sample information from seurat object to determine the number of cells per cluster per sample 246 | n_cells <- FetchData(seurat, vars.all = c("ident", "sample")) %>% 247 | dplyr::count(sample, ident) %>% 248 | spread(ident, n) 249 | 250 | # View table 251 | knitr::kable(n_cells) 252 | ``` 253 | 254 | Then, we can acquire the different cluster QC metrics. We will explore sample and cell cycle to view by tSNE and PCA: 255 | 256 | ```{r plot_feature_tsne} 257 | # Establishing groups to color plots by 258 | group_by <- c("Phase", "sample") 259 | 260 | # Getting coordinates for cells to use for tSNE and associated grouping variable information 261 | class_tsne_data <- FetchData(seurat, vars.all = c("ident", "tSNE_1", "tSNE_2", group_by)) 262 | 263 | # Adding cluster label to center of cluster on tSNE 264 | tsne_label <- FetchData(seurat, 265 | vars.all = c("ident", "tSNE_1", "tSNE_2")) %>% 266 | as.data.frame() %>% 267 | group_by(ident) %>% 268 | summarise(x=mean(tSNE_1), y=mean(tSNE_2)) 269 | 270 | # Getting coordinates for cells to use for PCA and associated grouping variable information 271 | class_pca_data <- FetchData(seurat, vars.all = c("ident", "PC1", "PC2", group_by)) 272 | 273 | # Adding cluster label to center of cluster on PCA 274 | pca_label <- FetchData(seurat, vars.all = c("ident", "PC1", "PC2")) %>% 275 | as.data.frame() %>% 276 | mutate(ident = seurat@ident) %>% 277 | group_by(ident) %>% 278 | summarise(x=mean(PC1), y=mean(PC2)) 279 | 280 | # Function to plot tSNE and PCA as grids 281 | map(group_by, function(metric) { 282 | cat("\n\n###", metric, "\n\n") 283 | p <- plot_grid( 284 | ggplot(class_tsne_data, aes(tSNE_1, tSNE_2)) + 285 | geom_point(aes_string(color = metric), alpha = 0.7) + 286 | scale_color_brewer(palette = "Set2") + 287 | geom_text(data=tsne_label, aes(label=ident, x, y)), 288 | ggplot(class_pca_data, aes(PC1, PC2)) + 289 | geom_point(aes_string(color = metric), alpha = 0.7) + 290 | scale_color_brewer(palette = "Set2") + 291 | geom_text(data=pca_label, aes(label=ident, x, y)), 292 | nrow = 1, align = "v" 293 | ) 294 | print(p) 295 | }) %>% invisible() 296 | 297 | ``` 298 | 299 | Next we will explore additional metrics, such as the number of UMIs and genes per cell, S-phase and G2M-phase markers, and mitochondrial gene expression by tSNE: 300 | 301 | ```{r dim_features} 302 | # Determine metrics to plot present in seurat@meta.data 303 | metrics <- c("nUMI", "nGene", "S.Score", "G2M.Score", "mitoRatio") 304 | 305 | # Extract the TSNE coordinates for each cell and include information about the metrics to plot 306 | qc_data <- 
FetchData(seurat, vars.all = c(metrics, "ident", "tSNE_1", "tSNE_2")) 307 | 308 | # Plot a tSNE plot for each metric 309 | map(metrics, function(qc){ 310 | ggplot(qc_data, aes(tSNE_1, tSNE_2)) + 311 | geom_point(aes_string(color=qc), alpha = 0.7) + 312 | scale_color_gradient(guide = FALSE, low = "grey90", high = "blue") + 313 | geom_text(data=tsne_label, aes(label=ident, x, y)) + 314 | ggtitle(qc) 315 | }) %>% 316 | plot_grid(plotlist = .) 317 | ``` 318 | 319 | We can also explore how well our clusters separate by the different PCs; we hope that the defined PCs separate the cell types well. In the tSNE plots below, the cells are colored by their PC score for each respective principal component. 320 | 321 | ```{r feature_pcs} 322 | # Defining the information in the seurat object of interest 323 | columns <- c(paste0("PC", 1:pcs), 324 | "ident", 325 | "tSNE_1", "tSNE_2") 326 | 327 | # Extracting this data from the seurat object 328 | pc_data <- FetchData(seurat, vars.all = columns) 329 | 330 | # Plotting a tSNE plot for each of the PCs 331 | map(paste0("PC", 1:pcs), function(pc){ 332 | ggplot(pc_data, aes(tSNE_1, tSNE_2)) + 333 | geom_point(aes_string(color=pc), alpha = 0.7) + 334 | scale_color_gradient(guide = FALSE, low = "grey90", high = "blue") + 335 | geom_text(data=tsne_label, aes(label=ident, x, y)) + 336 | ggtitle(pc) 337 | }) %>% plot_grid(plotlist = .) 338 | ``` 339 | 340 | We can also view which cells from each condition are in the different clusters. 341 | 342 | ```{r cluster_sample} 343 | 344 | cells_sample1 <- rownames(seurat@meta.data[which(seurat@meta.data$interestingGroups == "control"), ]) 345 | 346 | p1 <- TSNEPlot(object = seurat, do.label = TRUE, cells.use = cells_sample1, do.return=TRUE) 347 | 348 | cells_sample2 <- rownames(seurat@meta.data[which(seurat@meta.data$interestingGroups == "treatment"), ]) 349 | 350 | p2 <- TSNEPlot(object = seurat, do.label = TRUE, cells.use = cells_sample2, do.return=TRUE) 351 | 352 | ``` 353 | 354 | ```{r samplegroup_clustering} 355 | grid.arrange(p1, p2, ncol=2, nrow = 1) 356 | ``` 357 | 358 | **Figure Legend:** *From left to right, control, treatment** 359 | 360 | ## Marker checks 361 | 362 | To determine whether the clustering seems appropriate, we checked for known markers of different cell types. 363 | 364 | ```{r all_markers} 365 | # Download all markers at https://github.com/hbc/tinyatlas/blob/master/cell_type 366 | 367 | # Read into R: 368 | hbc_markers <- read.csv("path/to/organism.csv") 369 | hbc_markers <- hbc_markers[hbc_markers$highConf == TRUE, ] 370 | hbc_markers <- left_join(hbc_markers, annotations, by = c("geneID" = "gene_id")) 371 | 372 | # Extract the count information for each cell for the genes of interest from the seurat object 373 | gene_data <- FetchData(seurat, vars.all = hbc_markers$gene_name) 374 | 375 | # Extract tSNE coordinates, sample name, and cluster identity information from the seurat object 376 | tsne <- FetchData(seurat, 377 | vars.all = c("tSNE_1", "tSNE_2", "sample", "ident")) 378 | 379 | 380 | # Merge counts for genes of interest with tSNE information 381 | gene_data <- cbind(tsne, gene_data) 382 | 383 | 384 | # Plot the expression of each of the genes of interest on the tSNE 385 | map(hbc_markers$gene_name, function(g){ 386 | ggplot(gene_data, aes(tSNE_1, tSNE_2)) + 387 | geom_point(aes_string(color=g), alpha = 0.7, size = 0.3) + 388 | scale_color_gradient(guide = FALSE, low = "grey90", high = "blue") + 389 | ggtitle(g) 390 | }) %>% 391 | plot_grid(plotlist = .) 
392 | ``` 393 | 394 | We can also check markers of interest: 395 | 396 | ```{r client_markers} 397 | # Read into R: 398 | client_markers <- read.csv("path/to/client_markers.csv") 399 | #client_markers <- left_join(client_markers, annotations, by = c("geneID" = "gene_id")) 400 | 401 | # Extract the count information for each cell for the genes of interest from the seurat object 402 | gene_data <- FetchData(seurat, vars.all = client_markers$gene_name) 403 | 404 | # Extract tSNE coordinates, sample name, and cluster identity information from the seurat object 405 | tsne <- FetchData(seurat, 406 | vars.all = c("tSNE_1", "tSNE_2", "sample", "ident")) 407 | 408 | 409 | # Merge counts for genes of interest with tSNE information 410 | gene_data <- cbind(tsne, gene_data) 411 | 412 | 413 | # Plot the expression of each of the genes of interest on the tSNE 414 | map(client_markers$gene_name, function(g){ 415 | ggplot(gene_data, aes(tSNE_1, tSNE_2)) + 416 | geom_point(aes_string(color=g), alpha = 0.7, size = 0.3) + 417 | scale_color_gradient(guide = FALSE, low = "grey90", high = "blue") + 418 | ggtitle(g) 419 | }) %>% 420 | plot_grid(plotlist = .) 421 | 422 | # Save seurat object for marker identification and DE analysis 423 | write_rds(seurat, "data/seurat_tsne.rds") 424 | ``` 425 | 426 | **Summary of the clusters using the given markers:** 427 | 428 | Based on these markers, the following are the hypothesized identities of the clusters: 429 | 430 | | Cluster | Cell type | 431 | |:---:|:---:| 432 | | 0 | Cell type | 433 | | 1 | Cell type | 434 | | 2 | Cell type | 435 | | 3 | Cell type | 436 | | 4 | Cell type | 437 | 438 | 439 | ```{r new_ids} 440 | # List of current cluster IDs 441 | current_cluster_ids <- 0:(length(levels(seurat@ident)) - 1) 442 | 443 | # List of new cluster IDs 444 | new_cluster_ids <- c("") 445 | 446 | # Changing IDs to cell type 447 | seurat_assigned <- seurat 448 | seurat_assigned@ident <- plyr::mapvalues(x = seurat_assigned@ident, 449 | from = current_cluster_ids, 450 | to = new_cluster_ids) 451 | # Re-plot the tSNE with cell type labels 452 | TSNEPlot(object = seurat_assigned, 453 | do.label = TRUE, 454 | pt.size = 0.5) 455 | 456 | # Save assigned seurat for SPRING or marker identification 457 | write_rds(seurat_assigned, "data/seurat_tsne_assigned.rds") 458 | ``` 459 | -------------------------------------------------------------------------------- /scRNAseq/templates/sc_marker_identification_template.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | # Load libraries 11 | library(Seurat) 12 | library(knitr) 13 | library(rmarkdown) 14 | library(tidyverse) 15 | library(Matrix) 16 | library(AnnotationHub) 17 | library(ensembldb) 18 | library(scales) 19 | library(cowplot) 20 | library(gridExtra) 21 | 22 | # Set seed for reproducibility 23 | set.seed(1454944673L) 24 | opts_chunk[["set"]]( 25 | autodep = TRUE, 26 | cache = FALSE, 27 | cache.lazy = FALSE, 28 | error = TRUE, 29 | echo = FALSE, 30 | fig.height = 10L, 31 | fig.retina = 2L, 32 | fig.width = 10L, 33 | message = FALSE, 34 | tidy = TRUE, 35 | warning = TRUE 36 | ) 37 | ``` 38 | 39 | # Overview 40 | 41 | - Principal Investigator: 42 | - Researcher: 43 | - Experiment: 44 | 45 | Experimental description: 46 | 47 | 48 | * * * 49 | 50 | # Overview of clusters 51 | 52 | For this marker
identification analysis, we used Seurat to find markers that define clusters via differential expression. 53 | 54 | The TSNE plot below can be used for reference when looking for markers of each of the different clusters. 55 | 56 | 57 | ```{r setting_up, warning=FALSE, message=FALSE} 58 | library(Seurat) 59 | library(tidyverse) 60 | library(annotables) 61 | 62 | # Load data 63 | seurat <- readRDS("../2018_11_clustering_final/seurat_tsne.rds") 64 | 65 | # Plot the TSNE 66 | DimPlot(seurat, 67 | "tsne", 68 | do.label = TRUE, 69 | do.return = TRUE, 70 | label.size = 6, 71 | plot.title = "tSNE") 72 | ``` 73 | 74 | 75 | # Marker identification 76 | 77 | There are a few different types of marker identification that we will explore. Each with their own benefits and drawbacks: 78 | 79 | 1. **Identification of all markers for each cluster:** this analysis compares each cluster against all others and outputs the genes that are differentially expressed/present. 80 | 2. **Identification of conserved markers for each cluster regardless of temperature:** This analysis looks for those genes that are conserved in the cluster across all temperature groups. This analysis will output genes that are consistently differentially expressed/present for all of the sample groups. These genes can help to figure out the identity for the cluster. Therefore, I only performed this analysis for those clusters whose identity was uncertain or novel. 81 | 3. **Marker identification between specific clusters:** this analysis explores differentially expressed genes between specific clusters. This analysis is most useful for determining differences in gene expression between clusters with markers that are similar in the above analyses. 82 | 83 | --- 84 | title: "WAT marker identification res1.4" 85 | author: "Mary Piper" 86 | date: "`r Sys.Date()`" 87 | output: html_document 88 | --- 89 | 90 | ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} 91 | knitr::opts_chunk$set(echo = TRUE) 92 | # Load libraries 93 | library(Seurat) 94 | library(knitr) 95 | library(rmarkdown) 96 | library(tidyverse) 97 | library(Matrix) 98 | library(AnnotationHub) 99 | library(ensembldb) 100 | library(scales) 101 | library(cowplot) 102 | library(gridExtra) 103 | # Set seed for reproducibility 104 | set.seed(1454944673L) 105 | opts_chunk[["set"]]( 106 | audodep = TRUE, 107 | cache = FALSE, 108 | cache.lazy = FALSE, 109 | error = TRUE, 110 | echo = FALSE, 111 | fig.height = 10L, 112 | fig.retina = 2L, 113 | fig.width = 10L, 114 | message = FALSE, 115 | tidy = TRUE, 116 | warning = TRUE 117 | ) 118 | ``` 119 | 120 | # Overview 121 | 122 | - Principal Investigator: Yu-ha Tseng 123 | - Researcher: Farnaz Shamsi 124 | - Experiment: Characterization of adipose tissue niche with changes in temperature 125 | 126 | Exploration of the adipose tissue niche, with the adipocytes removed. Mice were exposed to three different temperatures (cold, room temp, and thermal neutral), with one mouse per temperature for 7 days. From each animal, the white adipose tissue was harvested from the subcutaneous region in the legs and the brown adipose tissue was harvested from the interscapular region in the neck. 127 | 128 | At cold temperatures, the white adipose tissue develops features that are more brown-like, and is referred to as beige. 
The goals of this analysis are: 129 | 130 | - Determine quality of the samples - this is a pilot and can add more samples if good quality 131 | - Compare brown adipose tissue at each temperature and white adipose tissue at each temperature 132 | - Determine the identity and differential abundance of cell clusters between temperatures 133 | - Explore what changes occur in the different cell populations: adipose progenitors, vascular tissue, neurons, immune cells at the different temperatures 134 | 135 | * * * 136 | 137 | # Overview of clusters 138 | 139 | For this marker identification analysis, we used Seurat to find markers that define clusters via differential expression. 140 | 141 | The TSNE plot below can be used for reference when looking for markers of each of the different clusters. 142 | 143 | 144 | ```{r setting_up, cache=TRUE, warning=FALSE, message=FALSE} 145 | library(Seurat) 146 | library(tidyverse) 147 | all_markers <- readRDS("WAT_results/seurat_white_res.1.4_all_markers.rds") 148 | seurat <- readRDS("../clustering_final/data/seurat_white_res.1.4_tsne_assigned.rds") 149 | library(AnnotationHub) 150 | library(ensembldb) 151 | 152 | 153 | ## Connect to AnnotationHub 154 | ah <- AnnotationHub() 155 | ## Access the Ensembl database for organism 156 | ahDb <- query(ah, 157 | pattern = c("Mus musculus", "EnsDb"), 158 | ignore.case = TRUE) 159 | ## Acquire the latest annotation files 160 | id <- ahDb %>% 161 | mcols() %>% 162 | rownames() %>% 163 | tail(n = 1) 164 | ## Download the appropriate Ensembldb database 165 | edb <- ah[[id]] 166 | ## Extract gene-level information from database 167 | annotations <- genes(edb, 168 | return.type = "data.frame") 169 | ## Select annotations of interest 170 | annotations <- annotations %>% 171 | dplyr::select(gene_id, gene_name, gene_biotype, description) 172 | 173 | ann_markers <- left_join(all_markers, annotations, by = c("gene" = "gene_name")) 174 | 175 | 176 | #write.csv(ann_markers, "WAT_results/tseng_all_WAT_markers_res1.4.csv", quote = FALSE, row.names = FALSE) 177 | 178 | # Plot the TSNE 179 | DimPlot(seurat, 180 | "tsne", 181 | do.label = TRUE, 182 | do.return = TRUE, 183 | label.size = 4, 184 | plot.title = "tSNE") 185 | ``` 186 | 187 | 188 | # Marker identification 189 | 190 | There are a few different types of marker identification that we could explore. Each with their own benefits and drawbacks: 191 | 192 | 1. **Identification of all markers for each cluster:** this analysis compares each cluster against all others and outputs the genes that are differentially expressed/present. 193 | 2. **Identification of conserved markers for each cluster regardless of temperature:** This analysis looks for those genes that are conserved in the cluster across all temperature groups. This analysis will output genes that are consistently differentially expressed/present for all of the temperature groups. These genes can help to figure out the identity for the cluster. Therefore, I only performed this analysis for those clusters whose identity was uncertain or novel. 194 | 3. **Marker identification between specific clusters:** this analysis explores differentially expressed genes between specific clusters. This analysis is most useful for determining differences in gene expression between clusters with markers that are similar in the above analyses. 195 | 196 | ## Identification of all markers for each cluster 197 | 198 | For this analysis we are comparing each cluster against all other clusters to identify cluster markers. 
199 | 200 | To be identified as a marker, we specified that a gene needed to be detected at a minimum percentage of 0.25 in either of the two groups of cells and difference in expression is at least 0.25 between the two groups. 201 | 202 | Usually the top markers are relatively trustworthy, but because of inflated p-values, many of the less significant genes are not so trustworthy as markers. 203 | 204 | When looking at the output, we suggest looking for markers with large differences in expression between `pct.1` and `pct.2` and larger fold changes. For instance if `pct.1` = 0.90 and `pct.2` = 0.80, I might not be as excited about that marker. However, if `pct.2` = 0.1 instead, then I would be much more excited about it. Also, I look for the majority of cells expressing marker in my cluster of interest. If `pct.1` is low, such as 0.3, I again might not be as interested in it. 205 | 206 | [Download all marker results](WAT_results/tseng_all_WAT_markers_res1.4.csv) 207 | 208 | The results table contains the following columns: 209 | 210 | - **cluster:** number corresponding to cluster 211 | - **gene:** gene symbol 212 | - **gene_id:** Ensembl gene ID 213 | - **avg_logFC:** average log2 fold change. Positive values indicate that the gene is more highly expressed in the cluster. 214 | - **pct.1**: The percentage of cells where the gene is detected in the cluster 215 | - **pct.2**: The percentage of cells where the gene is detected on average in the other clusters 216 | - **p_val:** p-value not adjusted for multiple test correction 217 | - **p_val_adj:** Adjusted p-value, based on bonferroni correction using all genes in the dataset, used to determine significance 218 | - **gene_biotype:** type of gene 219 | - **description:** gene description 220 | 221 | 222 | 223 | ```{r marker_expression, warning=FALSE, message=FALSE,} 224 | # Top 10 markers for each cluster 225 | top10_anno <- ann_markers %>% 226 | group_by(cluster) %>% 227 | top_n(10, avg_logFC) 228 | 229 | # Function to extract clusters for plotting top 5 markers 230 | extract_cluster_markers <- function(cluster_name){ 231 | top10 <- top10_anno[top10_anno$cluster == cluster_name, c(6:8, 2:4, 1, 5, 9:10)] 232 | 233 | top5_anno <- top10[which(!(duplicated(top10$gene))), ] %>% 234 | top_n(5, avg_logFC) 235 | 236 | return(top5_anno) 237 | 238 | } 239 | 240 | 241 | ``` 242 | 243 | 244 | **The markers for cluster 'cluster1':** 245 | 246 | ```{r expression_plots_cluster1, warning=FALSE, message=FALSE} 247 | 248 | top5_anno <- extract_cluster_markers("cluster1") 249 | 250 | knitr::kable(top5_anno) 251 | 252 | FeaturePlot(object = seurat, 253 | features.plot = top5_anno$gene, 254 | cols.use = c("grey", "blue"), 255 | reduction.use = "tsne") 256 | ``` 257 | 258 | **The markers for cluster 'cluster2':** 259 | 260 | ```{r expression_plots_cluster2, warning=FALSE, message=FALSE} 261 | 262 | top5_anno <- extract_cluster_markers("cluster2") 263 | 264 | knitr::kable(top5_anno) 265 | 266 | FeaturePlot(object = seurat, 267 | features.plot = top5_anno$gene, 268 | cols.use = c("grey", "blue"), 269 | reduction.use = "tsne") 270 | 271 | ``` 272 | 273 | ## Identification of conserved markers across conditions 274 | 275 | This next analysis looks for those genes that are conserved in the cluster across all conditions, which is particularly useful for the clusters that are uncertain or unknown. Please look through these lists to determine whether the markers make sense for the annotated clusters and to help identify the unknown clusters. 
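The `all_markers` object and the per-cluster `*_markers_conserved.rds` files used in this report are precomputed rather than generated here. As a hypothetical sketch of how they could be produced with Seurat v2 (the 0.25 cutoffs follow the description above, and `grouping.var = "sample"` is an assumption; check both against the calls actually used):

```{r marker_generation_sketch, eval=FALSE}
# All markers: compare each cluster against all other cells
all_markers <- FindAllMarkers(seurat,
                              min.pct = 0.25,
                              thresh.use = 0.25)

# Conserved markers for a single cluster across the temperature groups
cluster1_conserved <- FindConservedMarkers(seurat,
                                           ident.1 = "cluster1",
                                           grouping.var = "sample",
                                           min.pct = 0.25)
saveRDS(cluster1_conserved, "results/cluster1_markers_conserved.rds")
```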
276 | 277 | **The conserved markers for cluster 'cluster1':** 278 | 279 | ```{r cons_markers_cluster1} 280 | # Function to extract clusters for plotting 281 | extract_conserved_markers <- function(cluster_name){ 282 | 283 | conserved_markers <- readRDS(paste0("results/", cluster_name, "_markers_conserved.rds" )) %>% 284 | rownames_to_column(var = "gene") %>% 285 | left_join(annotations, by = c("gene" = "gene_name")) 286 | 287 | conserved_markers <- conserved_markers[which(!(duplicated( conserved_markers$gene))), ] 288 | 289 | } 290 | 291 | conserved_markers <- extract_conserved_markers("cluster1") 292 | 293 | write.csv(conserved_markers, paste0("results/cluster1_markers_conserved.csv"), quote = FALSE) 294 | 295 | knitr::kable(head(conserved_markers, n=5)) 296 | 297 | ``` 298 | 299 | [Download all conserved marker results](results/cluster1_markers_conserved.csv) 300 | 301 | 302 | **The conserved markers for cluster 'cluster2':** 303 | 304 | ```{r cons_markers_cluster2} 305 | # Function to extract clusters for plotting 306 | 307 | conserved_markers <- extract_conserved_markers("cluster2") 308 | 309 | write.csv(conserved_markers, paste0("results/cluster2_markers_conserved.csv"), quote = FALSE) 310 | 311 | knitr::kable(head(conserved_markers, n=5)) 312 | 313 | ``` 314 | 315 | [Download all conserved marker results](results/cluster2_markers_conserved.csv) 316 | 317 | 318 | ## Marker identification between specific clusters: 319 | 320 | To further elucidate the different cell types and to identify whether to merge particular clusters, I also performed marker identification between specific clusters as shown below. Note that I included a minimum percent difference threshold of 0.15, which is less stringent than the previous threshold used to detect all markers (0.25) since these cells are more similar to each other. 321 | 322 | **The markers for cluster1 vs cluster2:** 323 | 324 | ```{r cluster1_vs_cluster2, warning=FALSE, message=FALSE} 325 | cluster2vs1_markers <- FindMarkers(object = seurat, ident.1 = "cluster1", ident.2= "cluster2", min.pct = 0.25, min.diff.pct = 0.15) 326 | 327 | ann_cluster2vs1_markers <- cluster2vs1_markers %>% 328 | rownames_to_column(var = "gene") %>% 329 | left_join(annotations, by = c("gene" = "gene_name")) 330 | 331 | ann_cluster2vs1_markers <- ann_cluster2vs1_markers[which(!(duplicated(ann_cluster2vs1_markers$gene))), ] 332 | 333 | knitr::kable(head(ann_cluster2vs1_markers, n=5)) 334 | 335 | #write.csv(ann_cluster2vs1_markers, "results/cluster2vs1_markers.csv", quote = FALSE) 336 | 337 | FeaturePlot(object = seurat, features.plot = head(ann_cluster2vs1_markers, n=5)$gene, cols.use = c("grey", "blue"), 338 | reduction.use = "tsne") 339 | ``` 340 | 341 | [Download all marker results](results/cluster2vs1_markers.csv) 342 | 343 | # Conclusions 344 | 345 | It may also be helpful to explore these markers in the SPRING interface I sent with the clustering report. 
346 | 347 | ```{r sessioninfo} 348 | sessionInfo() 349 | ``` 350 | -------------------------------------------------------------------------------- /scRNAseq/templates/sc_prep_for_DESeq2_analysis.Rmd: -------------------------------------------------------------------------------- 1 | # Preparing for DE analysis 2 | 3 | ```{r creating_DESeq2_object} 4 | # Load libraries 5 | library(SummarizedExperiment) 6 | library(DESeq2) 7 | 8 | # Load clustered seurat object 9 | seurat <- readRDS("/path/to/seurat_tsne.rds") 10 | 11 | #levels(seurat@ident) 12 | 13 | ## Subset and re-assign clusters as needed 14 | 15 | # List of current cluster IDs 16 | current_cluster_ids <- c() 17 | 18 | # List of new cluster IDs - do not use any spaces or symbols - snake_case or camelCase is best 19 | new_cluster_ids <- c() 20 | 21 | seurat@ident <- plyr::mapvalues(x = seurat@ident, 22 | from = current_cluster_ids, 23 | to = new_cluster_ids) 24 | # Re-run TSNE with cell types 25 | TSNEPlot(object = seurat, 26 | do.label = TRUE, 27 | pt.size = 0.5) 28 | 29 | # Remove 'clusterA' - likely junk 30 | seurat <- SubsetData(seurat, cells.use = seurat@ident != "clusterA") 31 | 32 | # Creating DESeq2 object 33 | counts <- as.matrix(seurat@raw.data) 34 | counts <- counts[, colnames(seurat@data)] 35 | counts <- counts[Matrix::rowSums(counts >= 5) >= 5, ] 36 | 37 | metadata <- seurat@meta.data 38 | metadata$ident <- seurat@ident 39 | metadata <- metadata[colnames(counts), ] 40 | 41 | 42 | se = SummarizedExperiment(assays = list(counts = as.matrix(counts)), 43 | colData = metadata) 44 | 45 | design <- ~ nUMI + Phase + condition 46 | dds <- DESeqDataSet(se, design = design) 47 | 48 | save(dds, se, seurat, file = "data/DESeq_object.rda") 49 | 50 | # Use this object as input to the DE analysis scripts (https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/DESeq2_analysis_inner.R and https://github.com/hbc/tutorials/blob/master/scRNAseq/scripts/DESeq2_analysis_outer.R) 51 | ``` 52 | --------------------------------------------------------------------------------
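The `dds` object saved above is the input to the inner/outer DE analysis scripts linked in the final comment. As a rough, hypothetical sketch of the kind of call those scripts perform (the cluster name, the LRT test, and the reduced model below are illustrative assumptions rather than the scripts' actual settings):

```{r de_usage_sketch, eval=FALSE}
library(DESeq2)

# Subset the DESeq2 object to the cells of a single cluster before testing condition
dds_cluster <- dds[, colData(dds)$ident == "someCluster"]

# Fit the model and test the condition term with a likelihood ratio test,
# keeping nUMI and Phase in the reduced model
dds_cluster <- DESeq(dds_cluster, test = "LRT", reduced = ~ nUMI + Phase)

# Extract the results table ordered by adjusted p-value
res <- results(dds_cluster)
res <- res[order(res$padj), ]
head(res)
```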