├── 00_pkgs_docs_shared.R
├── 01_pbmc3k_duided_tutorial.R
├── 02_seurat_multimodal.R
├── 03_seurat_integration.R
├── 04_seurat_mapping.R
├── 05_seurat_integrate_tips.R
├── 05_seurat_rpca.R
├── 0X_cisTarget_dnload.R
├── README.md
├── advancedSingleCell.Rproj
├── git教程.md
└── src
    ├── 00_pkgs_docs_shared.R
    ├── 01_mergeSample.R
    ├── 02.1_cisTarget_dwonload.R
    └── 02_senic.R


/00_pkgs_docs_shared.R:
--------------------------------------------------------------------------------
 1 | suppressMessages(library(tidyverse))
 2 | suppressMessages(library(pacman))
 3 | suppressMessages(library(data.table))
 4 | 
 5 | wkPath <- c('./result', './processData')
 6 | for(i in wkPath){
 7 |   wkPathi <- i
 8 |   # wkPathi = paste0(sectionName, '/', i)
 9 |   # Original note: every sub-project contains plot, result and input folders
10 |   if (!dir.exists(wkPathi)) dir.create(wkPathi, recursive = TRUE)
11 | }
12 | rm(list=c('i', 'wkPathi', 'wkPath'))
13 | 
14 | # install.packages('umap')
15 | # BiocManager::install("glmGamPoi")
16 | # remotes::install_github('satijalab/seurat-data')
17 | 


--------------------------------------------------------------------------------
/01_pbmc3k_duided_tutorial.R:
--------------------------------------------------------------------------------
  1 | suppressMessages(library(tidyverse))
  2 | suppressMessages(library(pacman))
  3 | suppressMessages(library(data.table))
  4 | suppressMessages(library(Seurat))
  5 | suppressMessages(library(patchwork))
  6 | options(stringsAsFactors = F)
  7 | rm(list = ls())
  8 | 
  9 | inDir = '~/BioFiles/pbmc3k/'
 10 | # Load the PBMC dataset
 11 | inFile = paste0(inDir, 'filtered_gene_bc_matrices/hg19/')
 12 | pbmc.data <- Read10X(data.dir = inFile)
 13 | # Initialize the Seurat object with the raw (non-normalized data).
 14 | pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
 15 | pbmc
 16 | # Lets examine a few genes in the first thirty cells
 17 | pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
 18 | 
 19 | #-----标准预处理工作流程-----------
 20 | #下面的步骤包含了 Seurat 的 scRNA-seq 数据的标准预处理流程。
 21 | #这些代表了基于 QC 指标的单元的选择和筛选、数据规范化和缩放以及高度可变特征的检测。
 22 | 
 23 | #-----------QC和选择细胞-----------
 24 | # The [[ operator can add columns to object metadata. This is a great place to stash QC stats
 25 | pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
 26 | pbmc[["percent.ribo"]] <- PercentageFeatureSet(pbmc, pattern = "^RP[SL]")
 27 | # Visualize QC metrics as a violin plot
 28 | VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
 29 | plot1 <- FeatureScatter(pbmc, feature1 = "nFeature_RNA", feature2 = "percent.mt")
 30 | plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
 31 | FeatureScatter(pbmc, feature1 = "nFeature_RNA", feature2 = "percent.ribo")
 32 | plot1 + plot2
 33 | pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
 34 | 
 35 | #----------规范化数据----------
 36 | #从数据集中删除不需要的单元格后，下一步是规范化数据。默认情况下，
 37 | #我们使用一个全局缩放标准化方法“ LogNormalize”，
 38 | #该方法通过总表达式对每个单元格的特征表达式度量值进行标准化，
 39 | #将其乘以一个比例因子(默认为10,000) ，并对结果进行 log-transforms。
 40 | #规范化值存储在 pbmc[["RNA"]]@data。
 41 | pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
 42 | #---------特征选择----------
 43 | pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
 44 | # Identify the 10 most highly variable genes
 45 | top10 <- head(VariableFeatures(pbmc), 10)
 46 | # plot variable features with and without labels
 47 | plot1 <- VariableFeaturePlot(pbmc)
 48 | plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
 49 | plot1 + plot2
 50 | 
 51 | #-----------归一化数据--------
 52 | all.genes <- rownames(pbmc)
 53 | pbmc <- ScaleData(pbmc, features = all.genes)
 54 | # pbmc <- SCTransform(pbmc,variable.features.n = 3000, vars.to.regress = "percent.mt")
 55 | pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
 56 | # Examine and visualize PCA results a few different ways
 57 | print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
 58 | VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
 59 | DimPlot(pbmc, reduction = "pca")
 60 | DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
 61 | 
 62 | #--确定数据集的维数------
 63 | pbmc <- JackStraw(pbmc, num.replicate = 100)
 64 | pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
 65 | JackStrawPlot(pbmc, dims = 1:15)
 66 | ElbowPlot(pbmc)
 67 | 
 68 | #---------细胞聚集---------
 69 | pbmc <- FindNeighbors(pbmc, dims = 1:10)
 70 | pbmc <- FindClusters(pbmc, resolution = 0.5)
 71 | 
 72 | #---------非线性降维----------
 73 | pbmc <- RunUMAP(pbmc, dims = 1:10)
 74 | set.seed(123)
 75 | DimPlot(pbmc, reduction = "umap")
 76 | saveRDS(pbmc, file = "./processData/01_pbmc_tutorial.rds")
 77 | 
 78 | #-------聚类生物标志物--------
 79 | # find all markers of cluster 2
 80 | cluster1.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
 81 | head(cluster1.markers, n = 5)
 82 | # find all markers distinguishing cluster 5 from clusters 0 and 3
 83 | cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
 84 | head(cluster5.markers, n = 5)
 85 | # find markers for every cluster compared to all remaining cells, report only the positive ones
 86 | pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
 87 | pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
 88 | 
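 89 | # Seurat has several tests for differential expression, selected with the test.use parameter (see the DE vignette).
 90 | # For example, the ROC test returns the 'classification power' of each individual marker (from 0 - random to 1 - perfect).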
 91 | cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
 92 | VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
 93 | # you can plot raw counts as well
 94 | VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
 95 | FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", 
 96 |                                "CD8A"))
 97 | top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
 98 | DoHeatmap(pbmc, features = top10$gene) + NoLegend()
 99 | 
100 | FeaturePlot(pbmc, features = c("FCGR3A", "MS4A7", "FCER1A", "CST3", "PPBP"))
101 | #------------cell type identity-------------
102 | new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono", 
103 |                      "NK", "DC", "Platelet")
104 | names(new.cluster.ids) <- levels(pbmc)
105 | pbmc <- RenameIdents(pbmc, new.cluster.ids)
106 | DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
107 | 
108 | saveRDS(pbmc, file = "./processData/01_pbmc3k_final.rds")


--------------------------------------------------------------------------------
/02_seurat_multimodal.R:
--------------------------------------------------------------------------------
  1 | suppressMessages(library(tidyverse))
  2 | suppressMessages(library(pacman))
  3 | suppressMessages(library(data.table))
  4 | suppressMessages(library(Seurat))
  5 | suppressMessages(library(patchwork))
  6 | options(stringsAsFactors = F)
  7 | rm(list = ls())
  8 | # Load in the RNA UMI matrix
  9 | 
 10 | # Note that this dataset also contains ~5% of mouse cells, which we can use as negative controls
 11 | # for the protein measurements. For this reason, the gene expression matrix has HUMAN_ or MOUSE_
 12 | # appended to the beginning of each gene.
 13 | inFile = "~/BioFiles/GSE100866_CBMC/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz"
 14 | cbmc.rna <- as.sparse(read.csv(file = inFile, sep = ",", 
 15 |                                header = TRUE, row.names = 1))
 16 | # To make life a bit easier going forward, we're going to discard all but the top 100 most
 17 | # highly expressed mouse genes, and remove the 'HUMAN_' from the CITE-seq prefix
 18 | cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna)
 19 | # Load in the ADT UMI matrix
 20 | inFile = "~/BioFiles/GSE100866_CBMC/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz"
 21 | cbmc.adt <- as.sparse(read.csv(file = inFile, sep = ",", 
 22 |                                header = TRUE, row.names = 1))
 23 | # Note that since measurements were made in the same cells, the two matrices have identical
 24 | # column names
 25 | all.equal(colnames(cbmc.rna), colnames(cbmc.adt))
 26 | 
 27 | #---------Seurat: 添加 RNA 和蛋白质数据--------
 28 | # creates a Seurat object based on the scRNA-seq data
 29 | cbmc <- CreateSeuratObject(counts = cbmc.rna)
 30 | # We can see that by default, the cbmc object contains an assay storing RNA measurement
 31 | Assays(cbmc)
 32 | # create a new assay to store ADT information
 33 | adt_assay <- CreateAssayObject(counts = cbmc.adt)
 34 | # add this assay to the previously created Seurat object
 35 | cbmc[["ADT"]] <- adt_assay
 36 | Assays(cbmc)
 37 | # Extract a list of features measured in the ADT assay
 38 | rownames(cbmc[["ADT"]])
 39 | # List the current default assay
 40 | DefaultAssay(cbmc)
 41 | # Switch the default to ADT
 42 | DefaultAssay(cbmc) <- "ADT"
 43 | DefaultAssay(cbmc)
 44 | #-------------Cluster cell-----------------
 45 | # Note that all operations below are performed on the RNA assay Set and verify that the default
 46 | # assay is RNA
 47 | DefaultAssay(cbmc) <- "RNA"
 48 | DefaultAssay(cbmc)
 49 | # perform visualization and clustering steps
 50 | cbmc <- NormalizeData(cbmc)
 51 | cbmc <- FindVariableFeatures(cbmc)
 52 | cbmc <- ScaleData(cbmc)
 53 | cbmc <- RunPCA(cbmc, verbose = FALSE)
 54 | cbmc <- FindNeighbors(cbmc, dims = 1:30)
 55 | cbmc <- FindClusters(cbmc, resolution = 0.8, verbose = FALSE)
 56 | cbmc <- RunUMAP(cbmc, dims = 1:30)
 57 | DimPlot(cbmc, label = TRUE)
 58 | 
 59 | #---------------并排查看的多种模式---------
 60 | # Normalize ADT data,
 61 | DefaultAssay(cbmc) <- "ADT"
 62 | cbmc <- NormalizeData(cbmc, normalization.method = "CLR", margin = 2)
 63 | DefaultAssay(cbmc) <- "RNA"
 64 | 
 65 | # Note that the following command is an alternative but returns the same result
 66 | cbmc <- NormalizeData(cbmc, normalization.method = "CLR", margin = 2, assay = "ADT")
 67 | 
 68 | # Now, we will visualize CD19 levels for RNA and protein. By setting the default assay, we can
 69 | # visualize one or the other
 70 | DefaultAssay(cbmc) <- "ADT"
 71 | p1 <- FeaturePlot(cbmc, "CD19", cols = c("lightgrey", "darkgreen")) + ggtitle("CD19 protein")
 72 | DefaultAssay(cbmc) <- "RNA"
 73 | p2 <- FeaturePlot(cbmc, "CD19") + ggtitle("CD19 RNA")
 74 | # place plots side-by-side
 75 | p1 | p2
 76 | # Identify the keys for the RNA and protein assays
 77 | Key(cbmc[["RNA"]])
 78 | Key(cbmc[["ADT"]])
 79 | # Now, we can include the key in the feature name, which overrides the default assay
 80 | p1 <- FeaturePlot(cbmc, "adt_CD19", cols = c("lightgrey", "darkgreen")) + ggtitle("CD19 protein")
 81 | p2 <- FeaturePlot(cbmc, "rna_CD19") + ggtitle("CD19 RNA")
 82 | p1 | p2
 83 | 
 84 | #-----------识别细胞表面标记-----------
 85 | # Look at CD19 surface protein (ADT) levels across clusters
 86 | VlnPlot(cbmc, "adt_CD19")
 87 | # we can also identify alternative protein and RNA markers for this cluster through differential
 88 | # expression
 89 | adt_markers <- FindMarkers(cbmc, ident.1 = 5, assay = "ADT")
 90 | rna_markers <- FindMarkers(cbmc, ident.1 = 5, assay = "RNA")
 91 | head(adt_markers)
 92 | head(rna_markers)
 93 | #-----------更多可视化---------
 94 | # Draw ADT scatter plots (like biaxial plots for FACS). Note that you can even 'gate' cells if
 95 | # desired by using HoverLocator and FeatureLocator
 96 | FeatureScatter(cbmc, feature1 = "adt_CD19", feature2 = "adt_CD3")
 97 | # view relationship between protein and RNA
 98 | FeatureScatter(cbmc, feature1 = "adt_CD3", feature2 = "rna_CD3E")
 99 | FeatureScatter(cbmc, feature1 = "adt_CD4", feature2 = "adt_CD8")
100 | # Same scatter on raw ADT counts; protein copy number in cells is high, which significantly reduces 'drop-out' in ADT data
101 | FeatureScatter(cbmc, feature1 = "adt_CD4", feature2 = "adt_CD8", slot = "counts")
102 | #----------------10X 多模态数据---------------
103 | pbmc10k.data <- Read10X(data.dir = "../data/pbmc10k/filtered_feature_bc_matrix/")
104 | rownames(x = pbmc10k.data[["Antibody Capture"]]) <- gsub(pattern = "_[control_]*TotalSeqB", replacement = "", 
105 |                                                          x = rownames(x = pbmc10k.data[["Antibody Capture"]]))
106 | 
107 | pbmc10k <- CreateSeuratObject(counts = pbmc10k.data[["Gene Expression"]], min.cells = 3, min.features = 200)
108 | pbmc10k <- NormalizeData(pbmc10k)
109 | pbmc10k[["ADT"]] <- CreateAssayObject(pbmc10k.data[["Antibody Capture"]][, colnames(x = pbmc10k)])
110 | pbmc10k <- NormalizeData(pbmc10k, assay = "ADT", normalization.method = "CLR")
111 | 
112 | plot1 <- FeatureScatter(pbmc10k, feature1 = "adt_CD19", feature2 = "adt_CD3", pt.size = 1)
113 | plot2 <- FeatureScatter(pbmc10k, feature1 = "adt_CD4", feature2 = "adt_CD8a", pt.size = 1)
114 | plot3 <- FeatureScatter(pbmc10k, feature1 = "adt_CD3", feature2 = "CD3E", pt.size = 1)
115 | (plot1 + plot2 + plot3) & NoLegend()


--------------------------------------------------------------------------------
/03_seurat_integration.R:
--------------------------------------------------------------------------------
  1 | suppressMessages(library(tidyverse))
  2 | suppressMessages(library(pacman))
  3 | suppressMessages(library(data.table))
  4 | suppressMessages(library(Seurat))
  5 | suppressMessages(library(SeuratData))
  6 | suppressMessages(library(SeuratWrappers))
  7 | suppressMessages(library(patchwork))
  8 | options(stringsAsFactors = F)
  9 | rm(list = ls())
 10 | 
 11 | # load dataset
 12 | LoadData("ifnb")
 13 | # split the dataset into a list of two seurat objects (stim and CTRL)
 14 | ifnb.list <- SplitObject(ifnb, split.by = "stim")
 15 | # normalize and identify variable features for each dataset independently
 16 | ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
 17 |   x <- NormalizeData(x)
 18 |   x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
 19 | })
 20 | # select features that are repeatedly variable across datasets for integration
 21 | features <- SelectIntegrationFeatures(object.list = ifnb.list)
 22 | 
 23 | # 整合
 24 | immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features)
 25 | # this command creates an 'integrated' data assay
 26 | immune.combined <- IntegrateData(anchorset = immune.anchors)
 27 | # 综合分析
 28 | # specify that we will perform downstream analysis on the corrected data note that the original
 29 | # unmodified data still resides in the 'RNA' assay
 30 | DefaultAssay(immune.combined) <- "integrated"
 31 | 
 32 | # Run the standard workflow for visualization and clustering
 33 | set.seed(457865)
 34 | immune.combined <- ScaleData(immune.combined, verbose = FALSE)
 35 | immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
 36 | immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
 37 | immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
 38 | immune.combined <- FindClusters(immune.combined, resolution = 0.5)
 39 | # Visualization
 40 | p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim")
 41 | p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
 42 | p1 + p2
 43 | DimPlot(immune.combined, reduction = "umap", split.by = "stim")
 44 | 
 45 | # 确定保守的细胞类型标记--------------
 46 | # For performing differential expression after integration, we switch back to the original data
 47 | DefaultAssay(immune.combined) <- "RNA"
 48 | nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "stim", verbose = FALSE)
 49 | head(nk.markers)
 50 | 
 51 | 
 52 | FeaturePlot(immune.combined, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", 
 53 |                                           "CCL2", "PPBP"), min.cutoff = "q9")
 54 | 
 55 | immune.combined <- RenameIdents(immune.combined, `0` = "CD14 Mono", `1` = "CD4 Naive T", `2` = "CD4 Memory T", 
 56 |                                 `3` = "CD16 Mono", `4` = "B", `5` = "CD8 T", `6` = "NK", `7` = "T activated", `8` = "DC", `9` = "B Activated", 
 57 |                                 `10` = "Mk", `11` = "pDC", `12` = "Eryth", `13` = "Mono/Mk Doublets", `14` = "HSPC")
 58 | DimPlot(immune.combined, label = TRUE)
 59 | 
 60 | Idents(immune.combined) <- factor(Idents(immune.combined), levels = c("HSPC", "Mono/Mk Doublets", 
 61 |                                                                       "pDC", "Eryth", "Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", 
 62 |                                                                       "CD4 Naive T", "CD4 Memory T"))
 63 | markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5", 
 64 |                      "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1", 
 65 |                      "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ", "PRSS57")
 66 | DotPlot(immune.combined, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "stim") + 
 67 |   RotatedAxis()
 68 | # 识别不同条件下的差异表达基因------------------
 69 | library(ggplot2)
 70 | library(cowplot)
 71 | theme_set(theme_cowplot())
 72 | t.cells <- subset(immune.combined, idents = "CD4 Naive T")
 73 | Idents(t.cells) <- "stim"
 74 | avg.t.cells <- as.data.frame(log1p(AverageExpression(t.cells, verbose = FALSE)$RNA))
 75 | avg.t.cells$gene <- rownames(avg.t.cells)
 76 | 
 77 | cd14.mono <- subset(immune.combined, idents = "CD14 Mono")
 78 | Idents(cd14.mono) <- "stim"
 79 | avg.cd14.mono <- as.data.frame(log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA))
 80 | avg.cd14.mono$gene <- rownames(avg.cd14.mono)
 81 | 
 82 | genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8")
 83 | p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells")
 84 | p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE)
 85 | p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes")
 86 | p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE)
 87 | p1 + p2
 88 | 
 89 | 
 90 | 
 91 | immune.combined$celltype.stim <- paste(Idents(immune.combined), immune.combined$stim, sep = "_")
 92 | immune.combined$celltype <- Idents(immune.combined)
 93 | Idents(immune.combined) <- "celltype.stim"
 94 | b.interferon.response <- FindMarkers(immune.combined, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE)
 95 | head(b.interferon.response, n = 15)
 96 | FeaturePlot(immune.combined, features = c("CD3D", "GNLY", "IFI6"), 
 97 |             split.by = "stim", max.cutoff = 3,cols = c("grey", "red"))
 98 | plots <- VlnPlot(immune.combined, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "celltype", 
 99 |                  pt.size = 0, combine = FALSE)
100 | wrap_plots(plots = plots, ncol = 1)
101 | 
102 | # 使用 SCTransform 对数据集进行规范化集成--------
103 | LoadData("ifnb")
104 | ifnb.list <- SplitObject(ifnb, split.by = "stim")
105 | ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform)
106 | features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000)
107 | ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features)
108 | immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = "SCT", 
109 |                                          anchor.features = features)
110 | immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = "SCT")
111 | set.seed(123)
112 | immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE)
113 | immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30)
114 | p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim")
115 | p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "seurat_annotations", label = TRUE, 
116 |               repel = TRUE)
117 | p1 + p2
118 | 
119 | immune.combined.sct <- FindNeighbors(immune.combined.sct, reduction = "pca", dims = 1:30)
120 | immune.combined.sct <- FindClusters(immune.combined.sct, resolution = 0.5)
121 | # Visualization: UMAP colored by condition (p1) and by the clusters found above (p2)
122 | p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim")
123 | p2 <- DimPlot(immune.combined.sct, reduction = "umap", label = TRUE, repel = TRUE)
124 | p1 + p2
125 | 
126 | DefaultAssay(immune.combined.sct) <- "RNA"
127 | nk.markers <- FindConservedMarkers(immune.combined.sct, ident.1 = 6, grouping.var = "stim", verbose = FALSE)
128 | head(nk.markers)
129 | FeaturePlot(immune.combined.sct, features = c('FCGR3A', 'LST1'), min.cutoff = "q9")
130 | 
131 | 
132 | immune.combined.sct <- RenameIdents(immune.combined.sct, `0` = "CD14 Mono", `1` = "CD4 Naive T", `2` = "CD4 Memory T", 
133 |                                 `3` = "CD16 Mono", `4` = "B", `5` = "CD8 T", `6` = "NK", `7` = "T activated", `8` = "DC", `9` = "B Activated", 
134 |                                 `10` = "Mk", `11` = "pDC", `12` = "Eryth", `13` = "Mono/Mk Doublets", `14` = "HSPC")
135 | DimPlot(immune.combined.sct, label = TRUE)
136 | 


--------------------------------------------------------------------------------
/04_seurat_mapping.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(SeuratData)
 3 | library(tidyverse)
 4 | library(cowplot)
 5 | library(patchwork)
 6 | rm(list = ls())
 7 | # InstallData("panc8")
 8 | # install.packages("../tmp/panc8.SeuratData_3.0.2.tar.gz", repos = NULL
 9 | library(panc8.SeuratData)
10 | data("panc8")
11 | pancreas.list <- SplitObject(panc8, split.by = "tech")
12 | # pancreas.list <- pancreas.list[c("celseq", "celseq2", "fluidigmc1", "smartseq2")]
13 | for (i in seq_along(pancreas.list)) {  # seq_along() does not iterate when the list is empty
14 |   pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE)
15 |   pancreas.list[[i]] <- FindVariableFeatures(pancreas.list[[i]], selection.method = "vst", nfeatures = 2000, 
16 |                                              verbose = FALSE)
17 | }
18 | 
19 | reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")]
20 | pancreas.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
21 | pancreas.integrated <- IntegrateData(anchorset = pancreas.anchors, dims = 1:30)
22 | 
23 | # switch to integrated assay. The variable features of this assay are automatically set during
24 | # IntegrateData
25 | DefaultAssay(pancreas.integrated) <- "integrated"
26 | # Run the standard workflow for visualization and clustering
27 | pancreas.integrated <- ScaleData(pancreas.integrated, verbose = FALSE)
28 | pancreas.integrated <- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE)
29 | pancreas.integrated <- RunUMAP(pancreas.integrated, reduction = "pca", dims = 1:30, verbose = FALSE)
30 | p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "tech")
31 | p2 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", label = TRUE, repel = TRUE) + 
32 |   NoLegend()
33 | p1 + p2
34 | 
35 | # 细胞分类------------
36 | pancreas.query <- pancreas.list[["indrop"]]
37 | pancreas.anchors <- FindTransferAnchors(reference = pancreas.integrated, query = pancreas.query, 
38 |                                         dims = 1:30)
39 | predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.integrated$celltype, 
40 |                             dims = 1:30)
41 | pancreas.query <- AddMetaData(pancreas.query, metadata = predictions)
42 | pancreas.query$prediction.match <- pancreas.query$predicted.id == pancreas.query$celltype
43 | table(pancreas.query$prediction.match)
44 | table(pancreas.query$predicted.id)
45 | VlnPlot(pancreas.query, c("REG1A", "PPY", "SST", "GHRL", "VWF", "SOX10"), group.by = "predicted.id")
46 | 
47 | pancreas.integrated <- RunUMAP(pancreas.integrated, dims = 1:30, reduction = "pca", return.model = TRUE)
48 | pancreas.query <- MapQuery(anchorset = pancreas.anchors, reference = pancreas.integrated, query = pancreas.query, 
49 |                            refdata = list(celltype = "celltype"), reference.reduction = "pca", reduction.model = "umap")
50 | 
51 | p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", label = TRUE, label.size = 3, 
52 |               repel = TRUE) + NoLegend() + ggtitle("Reference annotations")
53 | p2 <- DimPlot(pancreas.query, reduction = "ref.umap", group.by = "predicted.celltype", label = TRUE, 
54 |               label.size = 3, repel = TRUE) + NoLegend() + ggtitle("Query transferred labels")
55 | p1 + p2
56 | 


--------------------------------------------------------------------------------
/05_seurat_integrate_tips.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | 
 3 | library(loomR)
 4 | library(SeuratDisk)
 5 | inFile = paste0('~/BioFiles/immuneCellAtlas/',
 6 |                 'cc95ff89-2e68-4a08-a234-480eca21ce79.homo_sapiens.loom')
 7 | 
 8 | 
 9 | bm280k.loom <- connect(filename = inFile, mode = "r+")
10 | bm280k.loom
11 | bm280k =  as.Seurat(bm280k.loom)
12 | bm280k.loom$close_all()
13 | # bm280k.data <- Read10X_h5("../data/ica_bone_marrow_h5.h5")
14 | # bm280k <- CreateSeuratObject(counts = bm280k.data, min.cells = 100, min.features = 500)
15 | bm280k.list <- SplitObject(bm280k, split.by = "orig.ident")
16 | bm280k.list <- lapply(X = bm280k.list, FUN = function(x) {
17 |   x <- NormalizeData(x, verbose = FALSE)
18 |   x <- FindVariableFeatures(x, verbose = FALSE)
19 | })
20 | 
21 | features <- SelectIntegrationFeatures(object.list = bm280k.list)
22 | bm280k.list <- lapply(X = bm280k.list, FUN = function(x) {
23 |   x <- ScaleData(x, features = features, verbose = FALSE)
24 |   x <- RunPCA(x, features = features, verbose = FALSE)
25 | })
26 | 
27 | anchors <- FindIntegrationAnchors(object.list = bm280k.list, reference = c(1, 2), reduction = "rpca", 
28 |                                   dims = 1:50)
29 | bm280k.integrated <- IntegrateData(anchorset = anchors, dims = 1:50)
30 | 
31 | bm280k.integrated <- ScaleData(bm280k.integrated, verbose = FALSE)
32 | bm280k.integrated <- RunPCA(bm280k.integrated, verbose = FALSE)
33 | bm280k.integrated <- RunUMAP(bm280k.integrated, dims = 1:50)
34 | 
35 | DimPlot(bm280k.integrated, group.by = "orig.ident")


--------------------------------------------------------------------------------
/05_seurat_rpca.R:
--------------------------------------------------------------------------------
 1 | library(SeuratData)
 2 | library(Seurat)  # provides SplitObject(), NormalizeData(), FindIntegrationAnchors(), ... used below
 3 | rm(list = ls())
 4 | library(ifnb.SeuratData)
 5 | LoadData("ifnb")
 6 | # split the dataset into a list of two seurat objects (stim and CTRL)
 7 | ifnb.list <- SplitObject(ifnb, split.by = "stim")
 8 | # normalize and identify variable features for each dataset independently
 9 | ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
10 |   x <- NormalizeData(x)
11 |   x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
12 | })
13 | 
14 | # select features that are repeatedly variable across datasets for integration run PCA on each
15 | # dataset using these features
16 | features <- SelectIntegrationFeatures(object.list = ifnb.list)
17 | ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
18 |   x <- ScaleData(x, features = features, verbose = FALSE)
19 |   x <- RunPCA(x, features = features, verbose = FALSE)
20 | })
21 | 
22 | # 执行集成-----------------
23 | immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features, reduction = "rpca")
24 | # this command creates an 'integrated' data assay
25 | immune.combined <- IntegrateData(anchorset = immune.anchors)
26 | #现在我们可以对所有的细胞进行单一的整合分析了！
27 | 
28 | # specify that we will perform downstream analysis on the corrected data note that the original
29 | # unmodified data still resides in the 'RNA' assay
30 | DefaultAssay(immune.combined) <- "integrated"
31 | 
32 | # Run the standard workflow for visualization and clustering
33 | immune.combined <- ScaleData(immune.combined, verbose = FALSE)
34 | immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
35 | #--确定数据集的维数------
36 | immune.combined <- JackStraw(immune.combined, num.replicate = 100)
37 | immune.combined <- ScoreJackStraw(immune.combined, dims = 1:20)
38 | JackStrawPlot(immune.combined, dims = 1:20)
39 | ElbowPlot(immune.combined)
40 | 
41 | 
42 | immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
43 | immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
44 | immune.combined <- FindClusters(immune.combined, resolution = 0.5)
45 | # Visualization
46 | p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim")
47 | p2 <- DimPlot(immune.combined, reduction = "umap", group.by = "seurat_annotations", label = TRUE, 
48 |               repel = TRUE)
49 | p1 + p2
50 | 
51 | # 调整整合的力度-----------------
52 | immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features, reduction = "rpca", 
53 |                                          k.anchor = 20)
54 | immune.combined <- IntegrateData(anchorset = immune.anchors)
55 | immune.combined <- ScaleData(immune.combined, verbose = FALSE)
56 | immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
57 | immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
58 | immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
59 | immune.combined <- FindClusters(immune.combined, resolution = 0.5)
60 | # Visualization
61 | p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim")
62 | p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
63 | p1 + p2
64 | 
65 | # SCTransform规范化集成----------------------
66 | ifnb.list <- SplitObject(ifnb, split.by = "stim")
67 | ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform, method = "glmGamPoi")
68 | features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000)
69 | ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features)
70 | ifnb.list <- lapply(X = ifnb.list, FUN = RunPCA, features = features)
71 | immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = "SCT", 
72 |                                          anchor.features = features, dims = 1:30, reduction = "rpca", k.anchor = 20)
73 | immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = "SCT", dims = 1:30)
74 | immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE)
75 | immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30)
76 | # Visualization
77 | p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim")
78 | p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "seurat_annotations", label = TRUE, 
79 |               repel = TRUE)
80 | p1 + p2
81 | 
82 | 


--------------------------------------------------------------------------------
/0X_cisTarget_dnload.R:
--------------------------------------------------------------------------------
 1 | ## 1. Human (hg38) cisTarget ranking databases
 2 | dbFiles1 <- c("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather",
 3 |               "https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather")
 4 | ## 2. Mouse (mm10) cisTarget ranking databases
 5 | dbFiles2 <- c("https://resources.aertslab.org/cistarget/databases/mus_musculus/mm10/refseq_r80/mc9nr/gene_based/mm10__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather",
 6 |               "https://resources.aertslab.org/cistarget/databases/mus_musculus/mm10/refseq_r80/mc9nr/gene_based/mm10__refseq-r80__10kb_up_and_down_tss.mc9nr.feather")
 7 | # mc9nr: Motif collection version 9: 24k motifs
 8 | 
 9 | ## 3. Download every listed database into ~/database/cisTarget_databases
10 | dir.create("~/database/cisTarget_databases", recursive = TRUE, showWarnings = FALSE)  # also creates ~/database; silent when the folder already exists
11 | setwd("~/database/cisTarget_databases")
12 | options(timeout = max(3600, getOption("timeout")))  # R's default 60 s download timeout is too short for large files
13 | dbFiles <- c(dbFiles1, dbFiles2)
14 | for (featherURL in dbFiles) {
15 |   download.file(featherURL, destfile = basename(featherURL), mode = "wb")  # binary mode keeps .feather files intact on Windows; saved in current dir
16 | }
17 | # Human and mouse databases are fetched together; trim dbFiles to download only one species.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # advancedSingleCell
 2 | 
 3 | 单细胞测序学习笔记：跟着“CNS图表复现”的流程一步步走。
 4 | 
 5 | [jimmy的单细胞课程](https://mp.weixin.qq.com/s?__biz=MzAxMDkxODM1Ng%3D%3D&mid=2247496154&idx=3&sn=d3cfaa4a5b18235e0192619f64641635&scene=45#wechat_redirect)
 6 | 
 7 | ### 高级分析
 8 | 
 9 | - [单细胞转录组高级分析一：多样本合并与批次校正](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488375&idx=1&sn=a8c73ea647254baab7125babba027071&chksm=ea1f15f5dd689ce3b7c90dd2aeed140b23b83543e7def6094af98c40de58a2d1b08e15e8d2fe&scene=21#wechat_redirect)
10 | - [单细胞转录组高级分析二：转录调控网络分析](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488383&idx=1&sn=7b8504ed4449df3a707d1c83ec0b0a7a&chksm=ea1f15fddd689ceb5edf6635d2c74e9271eac0c30c4d1714403c9057cb3fa187a776e5a4f34b&scene=21#wechat_redirect)
11 | - [单细胞转录组高级分析三：细胞通讯分析](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488392&idx=1&sn=e0aa3d50eb0b1f3251f1ae7cf62c9616&chksm=ea1f150add689c1c0c75f6b1e1e6bf4d3e1faaf230b6b2ef4466d8530f08958bd6196849d61d&scene=21#wechat_redirect)
12 | - [单细胞转录组高级分析四：scRNA数据推断CNV](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488400&idx=1&sn=2cec23311fe972353dec8cbc24c6efbc&chksm=ea1f1512dd689c04ab0e822eabc96158cfd0d437e8cc8721dde77acf5834ccd3d7a26660f8f0&scene=21#wechat_redirect)
13 | - [单细胞转录组高级分析五：GSEA与GSVA分析](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488442&idx=1&sn=cfa26b7e4ee68a6e5a7929a0d5b98595&chksm=ea1f1538dd689c2e2bdfff6bf6956531abd9eee0492efd256f20bcfc877f351646e7e5e74934&scene=21#wechat_redirect)
14 | - [单细胞转录组高级分析六：TCGA生存分析](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488450&idx=1&sn=de7beeb1c144dee1197942cbc2cbe9fc&chksm=ea1f1540dd689c569c6d2a93ec7d4dd707c76a7290f2454ca61c7706799c72199bd5e60d892f&scene=21#wechat_redirect)
15 | - [单细胞转录组高级分析七：整合scATAC数据](http://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488458&idx=1&sn=890de4c0c4f1e286406560e97c3bf356&chksm=ea1f1548dd689c5e112760efc79562b78fae3e0982f4c9a10fdffb507d35b7de86277f21e4f9&scene=21#wechat_redirect)
16 | - [单细胞转录组高级分析八：整合V(D)J数据](https://mp.weixin.qq.com/s?__biz=MzI1Njk4ODE0MQ==&mid=2247488467&idx=1&sn=96407b7817a64b270752792b5e775d34&scene=21#wechat_redirect)
17 | 
18 | 
19 | ## Tips for integrating large datasets
20 | https://satijalab.org/seurat/articles/integration_large_datasets.html
21 | 
22 | 数据下载：
23 | https://data.humancellatlas.org/explore/projects/cc95ff89-2e68-4a08-a234-480eca21ce79/m/expression-matrices?catalog=dcp1


--------------------------------------------------------------------------------
/advancedSingleCell.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/git教程.md:
--------------------------------------------------------------------------------
  1 | **代码版本控制是每个程序员都必须考虑的问题**。`Rstudio`可以借助`GitHub`进行版本控制，下面对整个操作过程进行说明：
  2 | 
  3 | ## 设定目录
  4 | 
  5 | 在`windows`系统下，选择`Tools` --> `Global Options`，然后选择`Git/SVN`，选择`Git executable`，所以安装前提是你要有先安装`Git`，如下图所示
  6 | 
  7 | ![image-20210329122736942](https://gitee.com/cystone2020/document/raw/master/image-20210329122736942.png)
  8 | 
  9 | 选择git目录
 10 | 
 11 | 
 12 |  然后`Create RSA Key`
 13 | 
 14 | ![image-20210329122752182](https://gitee.com/cystone2020/document/raw/master/image-20210329122752182.png)
 15 | 
 16 | ![image-20210329122816918](https://gitee.com/cystone2020/document/raw/master/image-20210329122816918.png)
 17 | 
 18 | ![image-20210329122840750](https://gitee.com/cystone2020/document/raw/master/image-20210329122840750.png)
 19 | 
 20 | 打开`Tools`，选择`shell`，输入命令：
 21 |  `git config --global user.email "youremail@gmail.com"`
 22 |  `git config --global user.name "yourname"`
 23 |  `ssh -T git@github.com`
 24 |  使用`GitHub`上的名字
 25 | 
 26 | ![image-20210329122854117](https://gitee.com/cystone2020/document/raw/master/image-20210329122854117.png)GitHub连接
 27 | 
 28 | ## 新建一个工程
 29 | 
 30 | 新建一个工程，选择`New Directory`
 31 | 
 32 | ![image-20210329122906337](https://gitee.com/cystone2020/document/raw/master/image-20210329122906337.png)
 33 | 
 34 | 新建工程
 35 | 
 36 | 然后勾选`Create a git repository`
 37 | 
 38 | ![image-20210329122936787](https://gitee.com/cystone2020/document/raw/master/image-20210329122936787.png)
 39 | 
 40 | 创建Git
 41 | 
 42 | 这个时候`Rstudio`会出现`git`栏，提交到本地，只需要在`git`栏下面点击`commit`，即可提交至本地
 43 | 
 44 | ![image-20210329122958635](https://gitee.com/cystone2020/document/raw/master/image-20210329122958635.png)GitHub提交
 45 | 
 46 | 
 47 | 
 48 | 可以将代码保存至`GitHub`上，并且创建分支，在`GitHub`上创建一个`New repository`，命名为`test`
 49 | 
 50 | ![image-20210329123014956](https://gitee.com/cystone2020/document/raw/master/image-20210329123014956.png)
 51 | 
 52 | GitHub上创建
 53 | 
 54 | 打开`Rstudio`中的`Shell`窗口，输入`git`命令
 55 | 
 56 | 
 57 | 
 58 | ```bash
 59 | git remote rm origin
 60 | git remote add origin  https://github.com/cystone/advancedSingleCell.git
 61 | git config remote.origin.url git@github.com:cystone/advancedSingleCell.git
 62 | git pull  origin master
 63 | git push  origin master
 64 | ```
 65 | 
 66 | 将`origin`重新定向
 67 |  `git remote set-url origin https://github.com/chengfeifan/test.git`
 68 | 
 69 | ## 在本地新建一个`GitHub`上已经存在的项目
 70 | 
 71 | 首先在`Rstudio`上新建一个`project`，选择`version control`，然后选`Clone Git Repository`，将`GitHub`上`repository`的`url`加入到选项中
 72 | 
 73 | ![image-20210329123033548](https://gitee.com/cystone2020/document/raw/master/image-20210329123033548.png)
 74 | 
 75 | Paste_Image.png
 76 | 
 77 | 然后在`shell`窗口输入
 78 |  `git config remote.origin.url git@github.com:ewenharrison/test.git`
 79 | 
 80 | ## git中设置上游
 81 | 
 82 | 在`git`的时候，我们会建立许多有特性的分支，建立分支的时候，如何使得远端也出现分支，需要用到下面的命令：
 83 | 
 84 | ```bash
 85 |  git push --set-upstream origin master
 86 | ```
 87 | 
 88 | ## 常见错误和解决办法
 89 | 
 90 | $git push
 91 | fatal: unable to access 'https://github.com/cystone/advancedSingleCell.git/': LibreSSL SSL_connect: SSL_ERROR_SYSCALL in connection to github.com:443
 92 | 
 93 | ```bash
 94 | git config --global --unset http.proxy
 95 | git config --global http.sslVerify false
 96 | ```
 97 | 
 98 | 如果有如下：
 99 | Enter passphrase for key '~/.ssh/id_rsa': 
100 | 
101 | ```bash
102 | ssh-add ~/.ssh/id_rsa
103 | ```
104 | 
105 | 
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/src/00_pkgs_docs_shared.R:
--------------------------------------------------------------------------------
 1 | suppressMessages(library(tidyverse))
 2 | suppressMessages(library(pacman))
 3 | suppressMessages(library(data.table))
 4 | 
 5 | wkPath <- c('./src',  './output','./data/raw', './data/processed','./reports')  # project folders to ensure exist
 6 | for(i in wkPath){
 7 |   wkPathi = i
 8 |   # wkPathi = paste0(sectionName, '/', i)
 9 |   # every sub-project contains plot, result and input folders
10 |   if (!dir.exists(wkPathi)) dir.create(wkPathi, recursive=T)  # create only the folders that are missing
11 | }
12 | rm(list=c('i', 'wkPathi', 'wkPath'))  # drop the loop helpers from the workspace
13 | 


--------------------------------------------------------------------------------
/src/01_mergeSample.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | library(tidyverse)
  3 | library(patchwork)
  4 | library(data.table)
  5 | dir.create('./output/01_mergeSample/cluster1', recursive=TRUE)
  6 | dir.create('./output/01_mergeSample/cluster2', recursive=TRUE)
  7 | dir.create('./output/01_mergeSample/cluster3', recursive=TRUE)
  8 | set.seed(123)  # fix the random seed so results are reproducible
  9 | 
 10 | #---------- Read data ----------
 11 | ## One directory per sample; the whole vector is later passed to Read10X()
 12 | dir = c('./data/raw/GSE139324_RAW/GSM4138110',
 13 |         './data/raw/GSE139324_RAW/GSM4138111',
 14 |         './data/raw/GSE139324_RAW/GSM4138128',
 15 |         './data/raw/GSE139324_RAW/GSM4138129',
 16 |         './data/raw/GSE139324_RAW/GSM4138148',
 17 |         './data/raw/GSE139324_RAW/GSM4138149',
 18 |         './data/raw/GSE139324_RAW/GSM4138162',
 19 |         './data/raw/GSE139324_RAW/GSM4138163',
 20 |         './data/raw/GSE139324_RAW/GSM4138168',
 21 |         './data/raw/GSE139324_RAW/GSM4138169')
 22 | # Move each sample's flat files into its own folder, then rename genes.tsv.gz to features.tsv.gz
 23 | for(i in dir){
 24 |   ind = str_split(i, '/',simplify=TRUE)[5]   # sample ID = 5th path component, e.g. 'GSM4138110'
 25 |   indP = paste0(ind, '.')                    # regex: sample ID followed by any one character
 26 |   if(!dir.exists(i)){dir.create(i, recursive=TRUE)}
 27 |   filern = list.files('./data/raw/GSE139324_RAW/', pattern=indP,include.dirs = FALSE)
 28 |   if(length(filern) == 3){                   # compare the file count itself: exactly 3 files expected per sample
 29 |     filname = str_split(filern, '_',simplify=TRUE)[,5]   # 5th '_'-separated field becomes the new file name
 30 |     file.rename(paste0('./data/raw/GSE139324_RAW/', filern),
 31 |                 paste0('./data/raw/GSE139324_RAW/', ind, '/',filname))
 32 |   }else{print(paste0('please check ', ind))}
 33 |   file.rename(paste0('./data/raw/GSE139324_RAW/',ind, '/genes.tsv.gz'),
 34 |               paste0('./data/raw/GSE139324_RAW/', ind, '/features.tsv.gz'))
 35 |   
 36 |   }
 37 | 
 38 | names(dir) = c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
 39 |                'HNC20TIL', 'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')
 40 | counts <- Read10X(data.dir = dir)   # reads all sample folders in one call
 41 | 
 42 | #------- Merge samples -------
 43 | scRNA1 = CreateSeuratObject(counts, min.cells=1)
 44 | dim(scRNA1)   # number of genes and total number of cells
 45 | #[1] 23603 19750 
 46 | table(scRNA1@meta.data$orig.ident)  # number of cells per sample
 47 | #HNC01PBMC  HNC01TIL HNC10PBMC  HNC10TIL HNC20PBMC  HNC20TIL     PBMC1     PBMC2   Tonsil1   Tonsil2 
 48 | #     1725      1298      1750      1384      1530      1148      2445      2436      3325      2709
 49 | 
 50 | # Alternative route: build one Seurat object per sample, then merge()
 51 | scRNAlist <- vector("list", length(dir))
 52 | # Create a Seurat object for each sample and store it in scRNAlist
 53 | for(i in seq_along(dir)){
 54 |   counts <- Read10X(data.dir = dir[i])
 55 |   scRNAlist[[i]] <- CreateSeuratObject(counts, min.cells=1)
 56 | }
 57 | # Merge all per-sample Seurat objects into a single object.
 58 | # merge() accepts a list of objects as `y`, so passing scRNAlist[-1] avoids
 59 | # hard-coding the number of samples.
 60 | scRNA2 <- merge(x = scRNAlist[[1]],
 61 |                 y = scRNAlist[-1])
 62 | #dim(scRNA2)
 63 | # [1] 23603 19750
 64 | table(scRNA2@meta.data$orig.ident)
 65 | #HNC01PBMC  HNC01TIL HNC10PBMC  HNC10TIL HNC20PBMC  HNC20TIL     PBMC1     PBMC2   Tonsil1   Tonsil2 
 66 | #     1725      1298      1750      1384      1530      1148      2445      2436      3325      2709
 67 | 
 68 | # Compute principal components for the merged object scRNA1
 69 | {
 70 |   scRNA1 <- NormalizeData(scRNA1)
 71 |   scRNA1 <- FindVariableFeatures(scRNA1, selection.method = "vst")
 72 |   scRNA1 <- ScaleData(scRNA1, features = VariableFeatures(scRNA1))  # scale only the variable features
 73 |   scRNA1 <- RunPCA(scRNA1, features = VariableFeatures(scRNA1))
 74 |   plot1 <- DimPlot(scRNA1, reduction = "pca", group.by="orig.ident")  # PCA scatter coloured by sample
 75 |   plot2 <- ElbowPlot(scRNA1, ndims=30, reduction="pca") # elbow plot over the first 30 PCs
 76 |   plotc <- plot1+plot2
 77 |   ggsave("./output/01_mergeSample/cluster1/pca.png", plot = plotc, width = 8, 
 78 |          height = 4)
 79 |   print(c("请选择哪些pc轴用于后续分析？示例如下：","pc.num=1:15"))  # prompt: choose which PCs to use downstream
 80 | }
 81 | 
 82 | # Principal components used for the downstream steps
 83 | pc.num=1:30
 84 | # Clustering and non-linear dimensionality reduction
 85 | {
 86 |   ## Cell clustering
 87 |   scRNA1 <- FindNeighbors(scRNA1, dims = pc.num)
 88 |   scRNA1 <- FindClusters(scRNA1, resolution = 0.5)
 89 |   print(table(scRNA1@meta.data$seurat_clusters))  # explicit print: intermediate values inside {} are not auto-printed
 90 |   metadata <- scRNA1@meta.data
 91 |   cell_cluster <- data.frame(cell_ID=rownames(metadata),
 92 |                              cluster_ID=metadata$seurat_clusters)
 93 |   write.csv(cell_cluster,'./output/01_mergeSample/cluster1/cell_cluster.csv',
 94 |             row.names = FALSE)
 95 |   
 96 |   ## Non-linear dimensionality reduction
 97 |   # tSNE
 98 |   scRNA1 = RunTSNE(scRNA1, dims = pc.num)
 99 |   embed_tsne <- Embeddings(scRNA1, 'tsne')   # extract the tSNE coordinates
100 |   write.csv(embed_tsne,'./output/01_mergeSample/cluster1/embed_tsne.csv')
101 |   # coloured by cluster
102 |   plot1 = DimPlot(scRNA1, reduction = "tsne", label=TRUE)
103 |   ggsave("./output/01_mergeSample/cluster1/tSNE.png", plot = plot1, width = 8,
104 |          height = 7)
105 |   # coloured by sample
106 |   plot2 = DimPlot(scRNA1, reduction = "tsne", group.by='orig.ident')
107 |   ggsave("./output/01_mergeSample/cluster1/tSNE_sample.png", plot = plot2,
108 |          width = 8, height = 7)
109 |   # both panels side by side
110 |   plotc <- plot1+plot2
111 |   ggsave("./output/01_mergeSample/cluster1/tSNE_cluster_sample.png", plot = plotc,
112 |          width = 10, height = 5)
113 |   
114 |   # UMAP
115 |   scRNA1 <- RunUMAP(scRNA1, dims = pc.num)
116 |   embed_umap <- Embeddings(scRNA1, 'umap')   # extract the UMAP coordinates
117 |   write.csv(embed_umap,'./output/01_mergeSample/cluster1/embed_umap.csv')
118 |   # coloured by cluster
119 |   plot3 = DimPlot(scRNA1, reduction = "umap", label=TRUE)
120 |   ggsave("./output/01_mergeSample/cluster1/UMAP.png", plot = plot3, width = 8,
121 |          height = 7)
122 |   # coloured by sample; own file name (mirrors tSNE_sample.png) so UMAP.png is not overwritten
123 |   plot4 = DimPlot(scRNA1, reduction = "umap", group.by='orig.ident')
124 |   ggsave("./output/01_mergeSample/cluster1/UMAP_sample.png", plot = plot4, width = 8,
125 |          height = 7)
126 |   # both panels side by side
127 |   plotc <- plot3+plot4
128 |   ggsave("./output/01_mergeSample/cluster1/UMAP_cluster_sample.png", plot = plotc,
129 |          width = 10, height = 5)
130 | }
131 | 
132 | 
133 | # Per-sample tSNE and UMAP side by side with a shared legend
134 | plotc <- plot2+plot4+ plot_layout(guides = 'collect')
135 | ggsave("./output/01_mergeSample/cluster1/tSNE_UMAP.png", plot = plotc,
136 |        width = 10, height = 5)
137 | 
138 | ## Dimensionality reduction / clustering of scRNA2 follows the scRNA1 code above
139 | #------------ Dataset integration ---------------
140 | 
141 | # scRNAlist is the list of per-sample Seurat objects built earlier in this script
142 | # Before integration, each sample is normalised and its variable features are selected
143 | for (i in seq_along(scRNAlist)) {   # seq_along() is safe even if the list is empty
144 |   scRNAlist[[i]] <- NormalizeData(scRNAlist[[i]])
145 |   scRNAlist[[i]] <- FindVariableFeatures(scRNAlist[[i]], selection.method = "vst")
146 | }
147 | ## Find integration anchors across the samples (long-running)
148 | scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist)
149 | ## Integrate the data using the anchors (long-running)
150 | scRNA3 <- IntegrateData(anchorset = scRNA.anchors)
151 | dim(scRNA3)
152 | #[1]  2000 19750    
153 | # Only 2000 genes remain: the integrated assay holds just the 2000 variable features used for integration.
154 | # Dimensionality reduction / clustering code omitted here
155 | 
156 | #----------- Quality control ------------
157 | scRNA <- scRNA3  # later analysis starts from the integrated object
158 | ## Add information to meta.data
159 | dir.create("./output/01_mergeSample/QC", recursive=T)
160 | proj_name <- data.frame(proj_name=rep("demo2",ncol(scRNA)))  # one constant project label per cell
161 | rownames(proj_name) <- row.names(scRNA@meta.data)
162 | scRNA <- AddMetaData(scRNA, proj_name)
163 | 
164 | ## Switch the default assay to RNA
165 | DefaultAssay(scRNA) <- "RNA"
166 | 
167 | ## Percentage of mitochondrial and hemoglobin gene counts
168 | scRNA[["percent.mt"]] <- PercentageFeatureSet(scRNA, pattern = "^MT-")
169 | # Hemoglobin (red blood cell) gene percentage
170 | HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
171 | HB_m <- match(HB.genes, rownames(scRNA@assays$RNA)) 
172 | HB.genes <- rownames(scRNA@assays$RNA)[HB_m] 
173 | HB.genes <- HB.genes[!is.na(HB.genes)] # keep only the HB genes present in the RNA assay
174 | scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes) 
175 | #head(scRNA@meta.data)
176 | col.num <- length(levels(as.factor(scRNA@meta.data$orig.ident)))  # number of distinct samples, used for plot colours
177 | 
178 | ## Violin plots of the QC metrics before filtering
179 | # Use group.by="proj_name" for one violin over all samples, or group.by="orig.ident" for one violin per sample
180 | violin <-VlnPlot(scRNA, group.by = "proj_name",  
181 |                  features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), 
182 |                  cols =rainbow(col.num), 
183 |                  pt.size = 0.01, # set pt.size = 0 to hide the individual points
184 |                  ncol = 4) + # NOTE(review): if VlnPlot returns a patchwork here, `+ theme()` may style only the last panel - confirm whether `&` is intended
185 |   theme(axis.title.x=element_blank(), axis.text.x=element_blank(), 
186 |         axis.ticks.x=element_blank()) 
187 | ggsave("./output/01_mergeSample/QC/vlnplot_before_qc.pdf", plot = violin,
188 |        width = 12, height = 6) 
189 | ggsave("./output/01_mergeSample/QC/vlnplot_before_qc.png", plot = violin, 
190 |        width = 12, height = 6)  
191 | plot1 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
192 | plot2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
193 | plot3 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.HB")
194 | pearplot <- CombinePlots(plots = list(plot1, plot2, plot3), nrow=1, legend="none") # NOTE(review): CombinePlots may be deprecated in newer Seurat releases - confirm
195 | ggsave("./output/01_mergeSample/QC/pearplot_before_qc.pdf", plot = pearplot, 
196 |        width = 12, height = 5) 
197 | ggsave("./output/01_mergeSample/QC/pearplot_before_qc.png", plot = pearplot,
198 |        width = 12, height = 5)
199 | 
200 | ## QC thresholds
201 | print(c("请输入允许基因数和线粒体比例，示例如下：", "minGene=500",
202 |         "maxGene=4000", "pctMT=20"))   # prompt text now names the mitochondrial ratio, which is what pctMT filters on
203 | minGene=500
204 | maxGene=3000
205 | pctMT=10
206 | 
207 | ## Apply the QC filter: keep cells inside the gene-count window and below the percent.mt cut-off
208 | scRNA <- subset(scRNA,
209 |                 subset = nFeature_RNA > minGene & nFeature_RNA < maxGene &
210 |                   percent.mt < pctMT)
211 | col.num <- length(levels(as.factor(scRNA@meta.data$orig.ident)))  # recount samples after filtering
212 | violin <-VlnPlot(scRNA, group.by = "proj_name",
213 |                  features = c("nFeature_RNA", "nCount_RNA",
214 |                               "percent.mt","percent.HB"),
215 |                  cols =rainbow(col.num),
216 |                  pt.size = 0.1,
217 |                  ncol = 4) +
218 |   theme(axis.title.x=element_blank(), axis.text.x=element_blank(),
219 |         axis.ticks.x=element_blank())
220 | ggsave("./output/01_mergeSample/QC/vlnplot_after_qc.pdf", plot = violin,
221 |        width = 12, height = 6)
222 | ggsave("./output/01_mergeSample/QC/vlnplot_after_qc.png", plot = violin,
223 |        width = 12, height = 6)
224 | 
225 | #----------------- Cell type identification -------------------
226 | ##== Identify cell types ==##
227 | library(SingleR)
228 | dir.create("./output/01_mergeSample/CellType")
229 | # Compute principal components for the QC-filtered object
230 | {
231 |   scRNA <- NormalizeData(scRNA)
232 |   scRNA <- FindVariableFeatures(scRNA, selection.method = "vst")
233 |   scRNA <- ScaleData(scRNA, features = VariableFeatures(scRNA))  # scale only the variable features
234 |   scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA))
235 |   plot1 <- DimPlot(scRNA, reduction = "pca", group.by="orig.ident")  # PCA scatter coloured by sample
236 |   plot2 <- ElbowPlot(scRNA, ndims=30, reduction="pca") # elbow plot over the first 30 PCs
237 |   plotc <- plot1+plot2
238 |   ggsave("./output/01_mergeSample/cluster3/pca.png", plot = plotc, width = 8, 
239 |          height = 4)
240 |   print(c("请选择哪些pc轴用于后续分析？示例如下：","pc.num=1:15"))  # prompt: choose which PCs to use downstream
241 | }
242 | 
243 | # Principal components used for the downstream steps
244 | pc.num=1:30
245 | # Clustering and non-linear dimensionality reduction
246 | {
247 |   ## Cell clustering
248 |   scRNA <- FindNeighbors(scRNA, dims = pc.num)
249 |   scRNA <- FindClusters(scRNA, resolution = 0.5)
250 |   print(table(scRNA@meta.data$seurat_clusters))  # explicit print: intermediate values inside {} are not auto-printed
251 |   metadata <- scRNA@meta.data
252 |   cell_cluster <- data.frame(cell_ID=rownames(metadata),
253 |                              cluster_ID=metadata$seurat_clusters)
254 |   write.csv(cell_cluster,'./output/01_mergeSample/cluster3/cell_cluster.csv',
255 |             row.names = FALSE)
256 |   
257 |   ## Non-linear dimensionality reduction
258 |   # tSNE
259 |   scRNA = RunTSNE(scRNA, dims = pc.num)
260 |   embed_tsne <- Embeddings(scRNA, 'tsne')   # extract the tSNE coordinates
261 |   write.csv(embed_tsne,'./output/01_mergeSample/cluster3/embed_tsne.csv')
262 |   # coloured by cluster
263 |   plot1 = DimPlot(scRNA, reduction = "tsne", label=TRUE)
264 |   ggsave("./output/01_mergeSample/cluster3/tSNE.png", plot = plot1, width = 8,
265 |          height = 7)
266 |   # coloured by sample
267 |   plot2 = DimPlot(scRNA, reduction = "tsne", group.by='orig.ident')
268 |   ggsave("./output/01_mergeSample/cluster3/tSNE_sample.png", plot = plot2,
269 |          width = 8, height = 7)
270 |   # both panels side by side
271 |   plotc <- plot1+plot2
272 |   ggsave("./output/01_mergeSample/cluster3/tSNE_cluster_sample.png", plot = plotc,
273 |          width = 10, height = 5)
274 |   
275 |   # UMAP
276 |   scRNA <- RunUMAP(scRNA, dims = pc.num)
277 |   embed_umap <- Embeddings(scRNA, 'umap')   # UMAP coordinates of scRNA, the object analysed in this section
278 |   write.csv(embed_umap,'./output/01_mergeSample/cluster3/embed_umap.csv')
279 |   # coloured by cluster
280 |   plot3 = DimPlot(scRNA, reduction = "umap", label=TRUE)
281 |   ggsave("./output/01_mergeSample/cluster3/UMAP.png", plot = plot3, width = 8,
282 |          height = 7)
283 |   # coloured by sample; own file name (mirrors tSNE_sample.png) so UMAP.png is not overwritten
284 |   plot4 = DimPlot(scRNA, reduction = "umap", group.by='orig.ident')
285 |   ggsave("./output/01_mergeSample/cluster3/UMAP_sample.png", plot = plot4, width = 8,
286 |          height = 7)
287 |   # both panels side by side
288 |   plotc <- plot3+plot4
289 |   ggsave("./output/01_mergeSample/cluster3/UMAP_cluster_sample.png", plot = plotc,
290 |          width = 10, height = 5)
291 | }
292 | 
293 | 
294 | # Per-sample tSNE and UMAP side by side with a shared legend
295 | plotc <- plot2+plot4+ plot_layout(guides = 'collect')
296 | ggsave("./output/01_mergeSample/cluster3/tSNE_UMAP.png", plot = plotc,
297 |        width = 10, height = 5)
298 | 
299 | 
300 | # refdata <- MonacoImmuneData()
301 | # save(refdata, file = '~/database/SingleR_ref/ref_MonacoImmuneData.RData')
302 | load(file = '~/database/SingleR_ref/ref_MonacoImmuneData.RData')   # restores the cached `refdata` object saved above
303 | testdata <- GetAssayData(scRNA, slot="data")
304 | clusters <- scRNA@meta.data$seurat_clusters
305 | # Annotate each cluster against the Monaco reference
306 | cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main,
307 |                     method = "cluster", clusters = clusters,
308 |                     assay.type.test = "logcounts", assay.type.ref = "logcounts")
309 | celltype = data.frame(ClusterID=rownames(cellpred),
310 |                       celltype=cellpred$labels, stringsAsFactors = FALSE)
311 | write.csv(celltype,"./output/01_mergeSample/CellType/celltype_Monaco.csv",
312 |           row.names = FALSE)
313 | scRNA@meta.data$celltype_Monaco = "NA"   # placeholder label, overwritten per cluster below
314 | for(i in seq_len(nrow(celltype))){        # seq_len() is safe when celltype has zero rows
315 |   ind = which(scRNA@meta.data$seurat_clusters == celltype$ClusterID[i])
316 |   scRNA@meta.data[ind,'celltype_Monaco'] <- celltype$celltype[i]}
317 | p1 = DimPlot(scRNA, group.by="celltype_Monaco", repel=TRUE, label=TRUE, label.size=5,
318 |              reduction='tsne')
319 | p2 = DimPlot(scRNA, group.by="celltype_Monaco", repel=TRUE, label=TRUE, label.size=5,
320 |              reduction='umap')
321 | p3 = p1+p2+ plot_layout(guides = 'collect')   # tSNE and UMAP panels with a shared legend
322 | ggsave("./output/01_mergeSample/CellType/tSNE_celltype_Monaco.png", p1,
323 |        width=7 ,height=6)
324 | ggsave("./output/01_mergeSample/CellType/UMAP_celltype_Monaco.png", p2,
325 |        width=7 ,height=6)
326 | ggsave("./output/01_mergeSample/CellType/celltype_Monaco.png", p3, width=10 ,
327 |        height=5)
328 | # Annotate each cluster against the DICE reference
329 | # refdata <- DatabaseImmuneCellExpressionData()
330 | # save(refdata, file='~/database/SingleR_ref/ref_DICE_1561s.RData')
331 | load('~/database/SingleR_ref/ref_DICE_1561s.RData')   # replaces `refdata` with the cached DICE reference saved above
332 | # refdata <- ref_DICE
333 | testdata <- GetAssayData(scRNA, slot="data")
334 | clusters <- scRNA@meta.data$seurat_clusters
335 | # The tSNE embedding computed during clustering is reused as-is: re-running
336 | # RunTSNE here only repeated that work, and its extracted coordinates were
337 | # never used afterwards.
338 | 
339 | cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main,
340 |                     method = "cluster", clusters = clusters,
341 |                     assay.type.test = "logcounts", assay.type.ref = "logcounts")
342 | celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels,
343 |                       stringsAsFactors = FALSE)
344 | write.csv(celltype,"./output/01_mergeSample/CellType/celltype_DICE.csv",
345 |           row.names = FALSE)
346 | scRNA@meta.data$celltype_DICE = "NA"   # placeholder label, overwritten per cluster below
347 | for(i in seq_len(nrow(celltype))){      # seq_len() is safe when celltype has zero rows
348 |   ind = which(scRNA@meta.data$seurat_clusters == celltype$ClusterID[i])
349 |   scRNA@meta.data[ind,'celltype_DICE'] <- celltype$celltype[i]}
350 | p4 = DimPlot(scRNA, group.by="celltype_DICE", repel=TRUE, label=TRUE, label.size=5,
351 |              reduction='tsne')
352 | p5 = DimPlot(scRNA, group.by="celltype_DICE", repel=TRUE, label=TRUE, label.size=5,
353 |              reduction='umap')
354 | p6 = p4+p5+ plot_layout(guides = 'collect')   # DICE tSNE and UMAP panels, same layout as celltype_Monaco.png
355 | ggsave("./output/01_mergeSample/CellType/tSNE_celltype_DICE.png", p4, width=7,
356 |        height=6)
357 | ggsave("./output/01_mergeSample/CellType/UMAP_celltype_DICE.png", p5, width=7,
358 |        height=6)
359 | ggsave("./output/01_mergeSample/CellType/celltype_DICE.png", p6, width=10,
360 |        height=5)
361 | # Compare the labels from the two references on the tSNE embedding
362 | p8 = p1+p4
363 | ggsave("./output/01_mergeSample/CellType/Monaco_DICE.png", p8, width=12,height=5)
364 | 
365 | ## Save the annotated object
366 | saveRDS(scRNA,'./output/01_mergeSample/scRNA.rds')


--------------------------------------------------------------------------------
/src/02.1_cisTarget_dwonload.R:
--------------------------------------------------------------------------------
 1 | ## 1. Human (hg38) cisTarget ranking databases
 2 | dbFiles1 <- c("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather",
 3 |               "https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather")
 4 | ## 2. Mouse (mm10) cisTarget ranking databases
 5 | dbFiles2 <- c("https://resources.aertslab.org/cistarget/databases/mus_musculus/mm10/refseq_r80/mc9nr/gene_based/mm10__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather",
 6 |               "https://resources.aertslab.org/cistarget/databases/mus_musculus/mm10/refseq_r80/mc9nr/gene_based/mm10__refseq-r80__10kb_up_and_down_tss.mc9nr.feather")
 7 | # mc9nr: Motif collection version 9: 24k motifs
 8 | 
 9 | ## 3. Download every listed database into ~/database/cisTarget_databases
10 | dir.create("~/database/cisTarget_databases", recursive = TRUE, showWarnings = FALSE)  # also creates ~/database; silent when the folder already exists
11 | setwd("~/database/cisTarget_databases")
12 | options(timeout = max(3600, getOption("timeout")))  # R's default 60 s download timeout is too short for large files
13 | dbFiles <- c(dbFiles1, dbFiles2)
14 | for (featherURL in dbFiles) {
15 |   download.file(featherURL, destfile = basename(featherURL), mode = "wb")  # binary mode keeps .feather files intact on Windows; saved in current dir
16 | }
17 | # Human and mouse databases are fetched together; trim dbFiles to download only one species.


--------------------------------------------------------------------------------
/src/02_senic.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(tidyverse)
 3 | library(patchwork)
 4 | library(SCENIC)
 5 | rm(list=ls())
 6 | 
 7 | if (!requireNamespace('arrow', quietly = TRUE)) BiocManager::install('arrow')  # install only when missing, not on every run
 8 | 
 9 | setwd('~/Desktop/advancedSingleCell')
10 | ##== Analysis setup ==##
11 | dir.create("./output/02_SCENIC", recursive = TRUE, showWarnings = FALSE)
12 | dir.create("./output/02_SCENIC/int", recursive = TRUE, showWarnings = FALSE)
13 | scRNA <- readRDS("./output/01_mergeSample/scRNA.rds")
14 | ## Per-cell metadata: keep sample, cluster and cell type under short column names
15 | cellInfo <- data.frame(scRNA@meta.data)
16 | colnames(cellInfo)[which(colnames(cellInfo)=="orig.ident")] <- "sample"
17 | colnames(cellInfo)[which(colnames(cellInfo)=="seurat_clusters")] <- "cluster"
18 | colnames(cellInfo)[which(colnames(cellInfo)=="celltype_Monaco")] <- "celltype"
19 | cellInfo <- cellInfo[,c("sample","cluster","celltype")]
20 | saveRDS(cellInfo, file="./output/02_SCENIC/cellInfo.Rds")
21 | 
22 | set.seed(123)  # seed the sampling below so the same cells are drawn on every run
23 | ## Expression matrix
24 | # To save compute, analyse a random subset of at most 1000 cells
25 | subcell <- sample(colnames(scRNA), min(1000, ncol(scRNA)))  # min() avoids an error when fewer than 1000 cells are available
26 | scRNAsub <- scRNA[,subcell]
27 | saveRDS(scRNAsub, "./output/02_SCENIC/scRNAsub.rds")
28 | exprMat <- as.matrix(scRNAsub@assays$RNA@counts)   # raw counts of the subset as a dense matrix
29 | ## SCENIC settings
30 | mydbDIR <- "~/database/cisTarget_databases/"
31 | mydbs <- c("hg38__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather",
32 |            "hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather")
33 | names(mydbs) <- c("500bp", "10kb")
34 | scenicOptions <- initializeScenic(org="hgnc",
35 |                                   nCores=8,
36 |                                   dbDir=mydbDIR,
37 |                                   dbs = mydbs,
38 |                                   datasetTitle = "HNSCC")
39 | saveRDS(scenicOptions, "./output/02_SCENIC/scenicOptions.rds")
40 | 
41 | #---------- Co-expression network ----------
42 | 
43 | ##== Regulatory network inference ==##
44 | ## Gene filtering
45 | # Keep genes whose total counts exceed 3% of the number of cells and that are detected in at least 1% of cells
46 | genesKept <- geneFiltering(exprMat, scenicOptions,
47 |                            minCountsPerGene = 3 * 0.01 * ncol(exprMat),
48 |                            minSamples = ncol(exprMat) * 0.01)
49 | exprMat_filtered <- exprMat[genesKept, ]
50 | ## Correlation matrix
51 | runCorrelation(exprMat_filtered, scenicOptions)
52 | ## TF-target regression (GENIE3) on log2(count + 1) values
53 | exprMat_filtered_log <- log2(exprMat_filtered+1)
54 | runGenie3(exprMat_filtered_log, scenicOptions, nParts = 20)
55 | # Per the original author: this step is very resource-hungry and takes several hours on a personal computer
56 | 


--------------------------------------------------------------------------------