├── .Rbuildignore ├── .Rhistory ├── .gitignore ├── DESCRIPTION ├── Mixscale.Rproj ├── NAMESPACE ├── R ├── .Rhistory ├── decomposition.R ├── enrichment_test.R ├── get_fold_change.R ├── glm_gp_disp_only.R ├── perturbation_scoring.R ├── scoring_de.R └── visualization.R ├── README.md ├── docs ├── index.html └── old │ ├── New_Vignette_2024Jan.Rmd │ ├── index copy 2.Rmd │ ├── index copy 2.html │ ├── index copy 3.html │ ├── index copy.Rmd │ ├── index copy.html │ ├── index.Rmd │ └── index.html └── man ├── DE_heatmap.Rd ├── DEenrich.Rd ├── DEenrich_DotPlot.Rd ├── DEhclust.Rd ├── DEmultiCCA.Rd ├── FoldChange_new.Rd ├── Mixscale_DoHeatmap.Rd ├── Mixscale_RidgePlot.Rd ├── Mixscale_ScatterPlot.Rd ├── PCApermtest.Rd ├── RunMixscale.Rd ├── Run_wmvRegDE.Rd ├── fisher_enrich_test.Rd ├── get_DE_mat.Rd ├── get_fc.Rd ├── get_sig_genes.Rd ├── get_sig_genes_DEhclust.Rd ├── get_sig_genes_DEmultiCCA.Rd ├── glm_gp_disp_only.Rd ├── glm_gp_disp_only_impl.Rd ├── prune_DE_mat.Rd ├── rbo.Rd └── rbo_enrich_test.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | library(roxygen2); # Read in the roxygen2 R package 2 | roxygenise(); 3 | roxygenise() 4 | roxygenise() 5 | ?ScoringDE 6 | ?PRTBScoring 7 | ?FoldChange_new 8 | ?get_fc 9 | ?get_idx 10 | q() 11 | library(roxygen2) 12 | library(devtools) 13 | roxygenise() 14 | roxygenise() 15 | roxygenise() 16 | roxygenise() 17 | ?PCApermtest 18 | roxygenise() 19 | ?DE_heatmap 20 | roxygenise() 21 | ?DE_heatmap 22 | ?PCApermtest 23 | ?DE_heatmap 24 | roxygenise() 25 | ?PCApermtest 26 | library(roxygen2); # Read in the roxygen2 R package 27 | roxygenise(); 28 | library(roxygen2) 29 | roxygenise() 30 | q() 31 | library(roxygen2) 32 | roxygenise() 33 | roxygenise() 34 | library(roxygen2) 35 | roxygenise() 36 | library(roxygen2) 37 | roxygenise() 38 | library(roxygen2) 39 | roxygenise() 40 | install.packages("protoclust") 41 | install.packages("protoclust") 42 | library(PRTBScoring) 43 | library(roxygen2) 44 | roxygenise() 45 | library(roxygen2) 46 | roxygenise() 47 | roxygenise() 48 | library(roxygen2) 49 | roxygenise() 50 | roxygenise() 51 | roxygenise() 52 | library(roxygen2) 53 | roxygenise() 54 | roxygenise() 55 | library(roxygen2) 56 | roxygenise() 57 | library(roxygen2) 58 | roxygenise() 59 | library(roxygen2) 60 | roxygenise() 61 | roxygenise() 62 | library(roxygen2) 63 | roxygenise() 64 | library(roxygen2) 65 | roxygenise() 66 | library(roxygen2) 67 | roxygenise() 68 | roxygenise() 69 | library(roxygen2) 70 | roxygenise() 71 | roxygenise() 72 | library(roxygen2) 73 | roxygenise() 74 | library(roxygen2) 75 | roxygenise() 76 | roxygenise() 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rproj.user 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Mixscale 2 | Type: Package 3 | Title: Quantify the perturbation heterogeneity in Perturb-seq 4 | Version: 0.3.0 5 | RoxygenNote: 7.2.3 6 | Collate: 7 | 'decomposition.R' 8 | 'enrichment_test.R' 9 | 'get_fold_change.R' 10 | 'glm_gp_disp_only.R' 11 | 'perturbation_scoring.R' 12 | 'scoring_de.R' 13 | 'visualization.R' 14 | -------------------------------------------------------------------------------- /Mixscale.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(DE_heatmap) 4 | export(DEenrich) 5 | export(DEenrich_DotPlot) 6 | export(DEhclust) 7 | export(DEmultiCCA) 8 | export(FoldChange_new) 9 | export(Mixscale_DoHeatmap) 10 | export(Mixscale_RidgePlot) 11 | export(Mixscale_ScatterPlot) 12 | export(PCApermtest) 13 | export(RunMixscale) 14 | export(Run_wmvRegDE) 15 | export(fisher_enrich_test) 16 | export(get_DE_mat) 17 | export(get_fc) 18 | export(get_sig_genes) 19 | export(get_sig_genes_DEhclust) 20 | export(get_sig_genes_DEmultiCCA) 21 | export(prune_DE_mat) 22 | export(rbo) 23 | export(rbo_enrich_test) 24 | import(Seurat) 25 | import(SeuratObject) 26 | import(ggplot2) 27 | import(ggridges) 28 | import(glmGamPoi) 29 | importFrom(Matrix,rowMeans) 30 | importFrom(Matrix,rowSums) 31 | importFrom(PMA,MultiCCA) 32 | importFrom(RColorBrewer,brewer.pal) 33 | importFrom(gplots,heatmap.2) 34 | importFrom(protoclust,protoclust) 35 | importFrom(protoclust,protocut) 36 | -------------------------------------------------------------------------------- /R/.Rhistory: -------------------------------------------------------------------------------- 1 | nt.class.name = "NT", 2 | min.de.genes = 5, 3 | split.by = "cell_type", 4 | logfc.threshold = 0.2, 5 | de.assay = "RNA", 6 | max.de.genes = 100, 7 | prtb.type = "P", 8 | new.class.name = "mixscape_v1", 9 | fine.mode = F, 10 | harmonize = T, 11 | seed = 1) 12 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring") 13 | # 3. Perturbation scoring for each cell 14 | seurat_obj = PRTBScoring( 15 | object = seurat_obj, 16 | assay = "PRTB", 17 | slot = "scale.data", 18 | labels = "gene", 19 | nt.class.name = "NT", 20 | min.de.genes = 5, 21 | split.by = "cell_type", 22 | logfc.threshold = 0.2, 23 | de.assay = "RNA", 24 | max.de.genes = 100, 25 | prtb.type = "P", 26 | new.class.name = "mixscape_v1", 27 | fine.mode = F, 28 | harmonize = T, 29 | seed = 1) 30 | # 4. Perform scoring-based DE test using the scores 31 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts", 32 | labels = "gene") 33 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring") 34 | # 4. Perform scoring-based DE test using the scores 35 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts", 36 | PRTB_list = c("RFX5", "ZC3H3", "IFNGR1", "IFNGR2", 37 | "IRF1", "IRF2", "JUN", "MAFF", 38 | "PARP12", "TRAFD1", 39 | "JAK1", "JAK2", 40 | "STAT1", "SP100"), 41 | labels = "gene") 42 | str(de_res) 43 | # and re-arrange the DE results into Z-score matrices 44 | DEG_mat = get_DE_mat(de_res) 45 | DEG_mat = prune_DE_mat(de_res) 46 | DEG_mat = prune_DE_mat(DEG_mat) 47 | str(DEG_mat) 48 | # and re-arrange the DE results into Z-score matrices 49 | DEG_mat = get_DE_mat(de_res) 50 | DEG_mat = prune_DE_mat(DEG_mat, mask_target = T, min_sig_DEG = 3) 51 | str(DEG_mat) 52 | head(DEG_mat$A549) 53 | ######### 54 | # 5.1 within-prtb 55 | celltype_list = names(DEG_mat) 56 | PRTB_list = colnames(DEG_mat[[1]]) 57 | gene_ID = rownames(DEG_mat[[1]]) 58 | ##### 59 | for(i in 1:length(PRTB_list)){ 60 | PRTB = PRTB_list[i] 61 | tmp=list() 62 | for(CELLTYPE in celltype_list){ 63 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 64 | } 65 | tmp = Reduce(cbind, tmp) 66 | colnames(tmp) = celltype_list 67 | rownames(tmp) = gene_ID 68 | # run Permutation test and extract gene signatures 69 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 70 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 71 | # plot Z-score heatmap for the gene signatures 72 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30, 73 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/", 74 | prefix = PRTB) 75 | } 76 | names(DEG_mat) 77 | ##### 78 | for(i in 1:length(PRTB_list)){ 79 | PRTB = PRTB_list[i] 80 | tmp=list() 81 | for(CELLTYPE in celltype_list){ 82 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){ 83 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 84 | } 85 | } 86 | tmp = Reduce(cbind, tmp) 87 | colnames(tmp) = celltype_list 88 | rownames(tmp) = gene_ID 89 | # run Permutation test and extract gene signatures 90 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 91 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 92 | # plot Z-score heatmap for the gene signatures 93 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30, 94 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/", 95 | prefix = PRTB) 96 | } 97 | colnames(DEG_mat[[CELLTYPE]]) 98 | PRTB 99 | PRTB %in% colnames(DEG_mat[[CELLTYPE]]) 100 | head(tmp) 101 | # and re-arrange the DE results into Z-score matrices 102 | DEG_mat = get_DE_mat(de_res) 103 | DEG_mat_main = DEG_mat 104 | ######### 105 | # 5.1 within-prtb 106 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0) 107 | celltype_list = names(DEG_mat) 108 | PRTB_list = colnames(DEG_mat[[1]]) 109 | gene_ID = rownames(DEG_mat[[1]]) 110 | ##### 111 | for(i in 1:length(PRTB_list)){ 112 | PRTB = PRTB_list[i] 113 | tmp=list() 114 | for(CELLTYPE in celltype_list){ 115 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){ 116 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 117 | } 118 | } 119 | tmp = Reduce(cbind, tmp) 120 | colnames(tmp) = celltype_list 121 | rownames(tmp) = gene_ID 122 | # run Permutation test and extract gene signatures 123 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 124 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 125 | # plot Z-score heatmap for the gene signatures 126 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30, 127 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/", 128 | prefix = PRTB) 129 | } 130 | ######### 131 | # 5.2 within-celltype 132 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3) 133 | celltype_list = names(DEG_mat) 134 | PRTB_list = colnames(DEG_mat[[1]]) 135 | gene_ID = rownames(DEG_mat[[1]]) 136 | for(i in 1:length(celltype_list)){ 137 | CELLTYPE = celltype_list[i] 138 | tmp=DEG_mat[[CELLTYPE]] 139 | # run Permutation test and extract gene signatures 140 | res = DEhclust(mat = tmp) 141 | sig_genes = get_sig_genes_DEhclust(obj = res) 142 | # plot Z-score heatmap for the gene signatures 143 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30, 144 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/", 145 | prefix = CELLTYPE) 146 | } 147 | ######### 148 | # 5.3 MultiCCA analysis 149 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3, center = T) 150 | celltype_list = names(DEG_mat) 151 | PRTB_list = colnames(DEG_mat[[1]]) 152 | gene_ID = rownames(DEG_mat[[1]]) 153 | # run Permutation test and extract gene signatures 154 | res = DEmultiCCA(DEG_mat, cor_coef_thres = 0.6, max_k = 3) 155 | str(DEG_mat) 156 | # run Permutation test and extract gene signatures 157 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3) 158 | str(res) 159 | str(DEG_mat) 160 | ######### 161 | # 5.3 MultiCCA analysis 162 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T) 163 | celltype_list = names(DEG_mat) 164 | PRTB_list = colnames(DEG_mat[[1]]) 165 | gene_ID = rownames(DEG_mat[[1]]) 166 | # run Permutation test and extract gene signatures 167 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.6, max_k = 3) 168 | sig_genes = get_sig_genes_DEmultiCCA(res) 169 | str(sig_genes) 170 | str(res) 171 | # run Permutation test and extract gene signatures 172 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3) 173 | str(res) 174 | sig_genes = get_sig_genes_DEmultiCCA(res) 175 | # visualization. 176 | DE_heatmap(obj = res, sig_genes = sig_genes, 177 | type = "multiCCA", direction = "both", 178 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 179 | prefix = "IFNG") 180 | # run Permutation test and extract gene signatures 181 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.6, max_k = 3, standardize = T) 182 | sig_genes = get_sig_genes_DEmultiCCA(res) 183 | # visualization. 184 | DE_heatmap(obj = res, sig_genes = sig_genes, 185 | type = "multiCCA", direction = "both", 186 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 187 | prefix = "IFNG") 188 | # visualization. 189 | DE_heatmap(obj = res, sig_genes = sig_genes, 190 | type = "multiCCA", direction = "both", 191 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 192 | prefix = "IFNG") 193 | # run Permutation test and extract gene signatures 194 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T) 195 | sig_genes = get_sig_genes_DEmultiCCA(res) 196 | # visualization. 197 | DE_heatmap(obj = res, sig_genes = sig_genes, 198 | type = "multiCCA", direction = "both", 199 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 200 | prefix = "IFNG") 201 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0) 202 | celltype_list = names(DEG_mat) 203 | PRTB_list = colnames(DEG_mat[[1]]) 204 | gene_ID = rownames(DEG_mat[[1]]) 205 | PRTB = PRTB_list[i] 206 | tmp=list() 207 | for(CELLTYPE in celltype_list){ 208 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){ 209 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 210 | } 211 | } 212 | tmp = Reduce(cbind, tmp) 213 | colnames(tmp) = celltype_list 214 | rownames(tmp) = gene_ID 215 | # run Permutation test and extract gene signatures 216 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 217 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 218 | str(sig_genes) 219 | PRTB 220 | ######### 221 | # 5.2 within-celltype 222 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3) 223 | celltype_list = names(DEG_mat) 224 | PRTB_list = colnames(DEG_mat[[1]]) 225 | gene_ID = rownames(DEG_mat[[1]]) 226 | CELLTYPE = celltype_list[i] 227 | tmp=DEG_mat[[CELLTYPE]] 228 | # run Permutation test and extract gene signatures 229 | res = DEhclust(mat = tmp) 230 | sig_genes = get_sig_genes_DEhclust(obj = res) 231 | str(sig_genes) 232 | names(sig_genes) 233 | ######### 234 | # 5.3 MultiCCA analysis 235 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T) 236 | celltype_list = names(DEG_mat) 237 | PRTB_list = colnames(DEG_mat[[1]]) 238 | gene_ID = rownames(DEG_mat[[1]]) 239 | # run Permutation test and extract gene signatures 240 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T) 241 | sig_genes = get_sig_genes_DEmultiCCA(res) 242 | str(sig_genes) 243 | names(sig_genes) 244 | ################################## 245 | # Decomposition 246 | go_db = list() 247 | ######### 248 | # 5.1 within-prtb 249 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0) 250 | celltype_list = names(DEG_mat) 251 | PRTB_list = colnames(DEG_mat[[1]]) 252 | gene_ID = rownames(DEG_mat[[1]]) 253 | ##### 254 | for(i in 1:length(PRTB_list)){ 255 | PRTB = PRTB_list[i] 256 | tmp=list() 257 | for(CELLTYPE in celltype_list){ 258 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){ 259 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 260 | } 261 | } 262 | tmp = Reduce(cbind, tmp) 263 | colnames(tmp) = celltype_list 264 | rownames(tmp) = gene_ID 265 | # run Permutation test and extract gene signatures 266 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 267 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 268 | # plot Z-score heatmap for the gene signatures 269 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30, 270 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/", 271 | prefix = PRTB) 272 | # store the gene signatures to the go-term repo 273 | if(length(sig_genes$upDEGs) >= 10){ 274 | go_db[[paste0(PRTB, "_upDEGs")]] = sig_genes$upDEGs 275 | } 276 | if(length(sig_genes$downDEGs) >= 10){ 277 | go_db[[paste0(PRTB, "_downDEGs")]] = sig_genes$downDEGs 278 | } 279 | } 280 | ######### 281 | # 5.2 within-celltype 282 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3) 283 | celltype_list = names(DEG_mat) 284 | PRTB_list = colnames(DEG_mat[[1]]) 285 | gene_ID = rownames(DEG_mat[[1]]) 286 | for(i in 1:length(celltype_list)){ 287 | CELLTYPE = celltype_list[i] 288 | tmp=DEG_mat[[CELLTYPE]] 289 | # run Permutation test and extract gene signatures 290 | res = DEhclust(mat = tmp) 291 | sig_genes = get_sig_genes_DEhclust(obj = res) 292 | # plot Z-score heatmap for the gene signatures 293 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30, 294 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/", 295 | prefix = CELLTYPE) 296 | # store the gene signatures to the go-term repo 297 | for(CLUSTER in names(sig_genes)){ 298 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){ 299 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs 300 | } 301 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){ 302 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs 303 | } 304 | } 305 | } 306 | ######### 307 | # 5.3 MultiCCA analysis 308 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T) 309 | celltype_list = names(DEG_mat) 310 | PRTB_list = colnames(DEG_mat[[1]]) 311 | gene_ID = rownames(DEG_mat[[1]]) 312 | # run Permutation test and extract gene signatures 313 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T) 314 | sig_genes = get_sig_genes_DEmultiCCA(res) 315 | # visualization. 316 | DE_heatmap(obj = res, sig_genes = sig_genes, 317 | type = "multiCCA", direction = "both", 318 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 319 | prefix = "IFNG") 320 | # store the gene signatures to the go-term repo 321 | for(PROGRAM in names(sig_genes)){ 322 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){ 323 | go_db[[paste0("IFNG_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs 324 | } 325 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){ 326 | go_db[[paste0("IFNG_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs 327 | } 328 | } 329 | str(go_db) 330 | head(seurat_obj) 331 | ########## 332 | # 6.1 DE tests 333 | seurat_obj$Condition = paste0(seurat_obj$cell_type, "_", seurat_obj$gene) 334 | Idents(seurat_obj) = "Condition" 335 | table(seurat_obj$Condition) 336 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_IFNGR2") 337 | seurat_obj 338 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_IFNGR2", 339 | slot = "data", logfc.threshold = 0) 340 | head(new_DE_test) 341 | # get the background gene list 342 | background = rownames(new_DE_test) 343 | # get the significant down-reg genes (the input list ) 344 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0, ]) 345 | # get the significant down-reg genes (the input list ) 346 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ]) 347 | # get the significant down-reg genes (the input list ) 348 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ]) 349 | # 6.2 Conventional test (Fisher's exact test) 350 | fisher_enrich_res = fisher_enrich_test(input_list = input_list, 351 | background = background, 352 | go_term_db = go_db) 353 | head(fisher_enrich_res) 354 | fisher_enrich_res = fisher_enrich_res[order(fisher_enrich_res$Pval), ] 355 | head(fisher_enrich_res) 356 | head(fisher_enrich_res, 20) 357 | # 6.3 Rank biased overlap based test 358 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change 359 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ]) 360 | head(new_DE_test[new_DE_test$avg_log2FC > 0, ], 30) 361 | # 6.3 Rank biased overlap based test 362 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change 363 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ]) 364 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 365 | go_term_db = go_db, 366 | p = 0.99) 367 | head(input_list2) 368 | length(input_list2) 369 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 370 | go_term_db = go_db, 371 | p = 0.99, 372 | side = "bottom") 373 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 374 | go_term_db = go_db, 375 | p = 0.99, 376 | k = 300, 377 | side = "bottom") 378 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring") 379 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 380 | go_term_db = go_db, 381 | p = 0.99, 382 | k = 300, 383 | side = "bottom") 384 | head(input_list2) 385 | str(go_db) 386 | go_term_db = go_db 387 | class(go_term_db[[1]]) == "list" 388 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 389 | go_term_db = go_db, 390 | p = 0.99, 391 | k = 300, 392 | side = "bottom") 393 | rbo 394 | input_list, 395 | input_list 396 | go_term_db 397 | p 398 | input_list 399 | go_term_db 400 | p = 0.99 401 | n_iter = 500 402 | k=300 403 | side= "bottom" 404 | mid = NULL 405 | uneven.lengths = TRUE 406 | empirical_test = FALSE 407 | seed = 131415926 408 | if(length(input_list) < 5){ 409 | print("The length of the input list is less than 5, stopping analysis...") 410 | return(NULL) 411 | } 412 | # 0. each vector in the go_term_db is assumed to be an ordered vector of characters (gene names). 413 | # we need to convert each vector in to a named vector of number (number being the rank of each gene). 414 | go_term_db2 = lapply(X = go_term_db, 415 | FUN = function(x) { 416 | if(side == "bottom") tmp = 1:length(x) 417 | else if (side == "top") tmp = length(x):1 418 | names(tmp) = x 419 | return(tmp) 420 | }) 421 | str(go_term_db2 ) 422 | # 1. calculate the true RBO for the input_list and all the GO terms 423 | rbo_real = sapply(X = go_term_db2, 424 | FUN = rbo, 425 | list2 = input_list, 426 | p = p, 427 | k = k, 428 | side = side, 429 | mid = mid, 430 | uneven.lengths = uneven.lengths) 431 | head(input_list) 432 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring") 433 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 434 | go_term_db = go_db, 435 | p = 0.99, 436 | k = 100, 437 | side = "bottom") 438 | head(rbo_enrich_res) 439 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ] 440 | head(rbo_enrich_res, 20) 441 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 442 | go_term_db = go_db, 443 | p = 0.98, 444 | k = 100, 445 | side = "bottom") 446 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ] 447 | head(rbo_enrich_res) 448 | options(Seurat.object.assay.version = 'v3') 449 | library(Seurat) 450 | library(ggridges) 451 | library(stringr) 452 | library(Mixscale) 453 | library(ggplot2) 454 | seurat_obj = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/seurat_obj_GSE132080_QCed_2023Aug10.rds") 455 | seurat_obj 456 | gv.list = Tool(seurat_obj, slot = "RunMixscape") 457 | gv.list = Tool(seurat_obj, slot = "PRTBScoring") 458 | str(gv.list) 459 | load_all("/Users/uqljian5/Documents/github_repo/Mixscale") 460 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/") 461 | seurat_obj 462 | obj2 = RunMixscale( 463 | object = seurat_obj, 464 | assay = "PRTB", 465 | slot = "scale.data", 466 | labels = "gene", 467 | nt.class.name = "neg", 468 | min.de.genes = 5, 469 | logfc.threshold = 0.2, 470 | de.assay = "RNA", 471 | max.de.genes = 100, 472 | fine.mode = F) 473 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/") 474 | obj2 = RunMixscale( 475 | object = seurat_obj, 476 | assay = "PRTB", 477 | slot = "scale.data", 478 | labels = "gene", 479 | nt.class.name = "neg", 480 | min.de.genes = 5, 481 | logfc.threshold = 0.2, 482 | de.assay = "RNA", 483 | max.de.genes = 100, 484 | fine.mode = F) 485 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/") 486 | obj2 = RunMixscale( 487 | object = seurat_obj, 488 | assay = "PRTB", 489 | slot = "scale.data", 490 | labels = "gene", 491 | nt.class.name = "neg", 492 | min.de.genes = 5, 493 | logfc.threshold = 0.2, 494 | de.assay = "RNA", 495 | max.de.genes = 100, 496 | fine.mode = F) 497 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/") 498 | obj2 = RunMixscale( 499 | object = seurat_obj, 500 | assay = "PRTB", 501 | slot = "scale.data", 502 | labels = "gene", 503 | nt.class.name = "neg", 504 | min.de.genes = 5, 505 | logfc.threshold = 0.2, 506 | de.assay = "RNA", 507 | max.de.genes = 100, 508 | fine.mode = F) 509 | head(obj2) 510 | plot(obj2$pvec, obj2$weight) 511 | rm(list=ls()) 512 | gc() 513 | -------------------------------------------------------------------------------- /R/enrichment_test.R: -------------------------------------------------------------------------------- 1 | #' 2 | NULL 3 | 4 | #' Wrapper function for DE and enrichment test 5 | #' 6 | #' This function provides a wrapper of Seurat::FindMarkers() and Mixscale::fisher_enrich_test(). 7 | #' Users can input a Seurat object they want to investigate and a list of gene sets they want to 8 | #' test against, and the wrapper will perform DE tests + Fisher's enrichment test across all the 9 | #' available cell types. It will then return a list of data frames, containing gene set enrichment 10 | #' results for each cell type. 11 | #' 12 | #' @export 13 | #' @param object a seurat object to perform the DE test and the enrichment test 14 | #' @param plist the pathway gene lists to test the DE genes against 15 | #' @param split.by Regroup cells into a different identity class prior to performing differential expression. 16 | #' Default is NULL (so all cells be used simultaneously). 17 | #' @param slct.ct Subset a particular identity class prior to regrouping. Only relevant if group.by is set. 18 | #' @param ident.1 Identity class to define markers for; pass an object of class phylo or 'clustertree' to find markers for a node in a cluster tree; passing 'clustertree' requires BuildClusterTree to have been run 19 | #' @param ident.2 A second identity class for comparison; if NULL, use all other cells for comparison; if an object of class phylo or 'clustertree' is passed to ident.1, must pass a node to find markers for 20 | #' @return a list of data frames containing the gene set enrichment results for each group in "group.by" 21 | #' 22 | 23 | DEenrich <- function(object, 24 | plist = NULL, 25 | ident = NULL, 26 | ident.1 = NULL, 27 | ident.2 = NULL, 28 | split.by = NULL, 29 | slct.ct = NULL, 30 | direction = c("up", "down", "both"), 31 | logfc.threshold = 0.25, 32 | p.val.cutoff = 0.05, 33 | min.pct = 0.1, 34 | assay = NULL, 35 | ...){ 36 | 37 | slct_celltype = sort(unique(object[[split.by]][, 1])) 38 | if(!is.null(split.by) & is.null(slct_celltype)){ 39 | stop("Please check if your split.by is correctly specified.") 40 | } 41 | 42 | if(!is.null(slct.ct)){ 43 | slct_celltype = intersect(slct_celltype, slct.ct) 44 | } 45 | if(length(slct_celltype) == 0){ 46 | slct_celltype = "con1" 47 | } 48 | 49 | enrich_list = list() 50 | for(CELLTYPE in slct_celltype){ 51 | if(!is.null(ident)){ 52 | Idents(object) = ident 53 | } 54 | # 55 | if(is.null(split.by)){ 56 | object[["new_ident"]] = paste0("con1", "_", Idents(object)) 57 | } else { 58 | object[["new_ident"]] = paste0(object[[split.by]][,1], "_", Idents(object)) 59 | } 60 | ident.1.tmp = paste0(CELLTYPE, "_", ident.1) 61 | ident.2.tmp = paste0(CELLTYPE, "_", ident.2) 62 | 63 | # run DE 64 | Idents(object) = "new_ident" 65 | DE_res = FindMarkers(object, 66 | ident.1 = ident.1.tmp, 67 | ident.2 = ident.2.tmp, 68 | min.pct = min.pct, 69 | logfc.threshold = 0, 70 | ...) 71 | 72 | # get the top DEGs separately for up and down regulated genes 73 | upDEG = rownames(DE_res[DE_res$p_val_adj <= p.val.cutoff & DE_res$avg_log2FC > logfc.threshold, ]) 74 | downDEG = rownames(DE_res[DE_res$p_val_adj <= p.val.cutoff & DE_res$avg_log2FC < -logfc.threshold, ]) 75 | background = rownames(DE_res) # the background gene list 76 | 77 | # run enrichment test for the DEGs 78 | if(length(downDEG) < 5 | direction == "up"){ 79 | enrich_res_down = NULL 80 | } else { 81 | enrich_res_down = fisher_enrich_test(input_list = downDEG, 82 | background = background, 83 | go_term_db = plist) 84 | enrich_res_down$num_DEG = length(downDEG) 85 | enrich_res_down$direction_DEG = "downDEG" 86 | enrich_res_down = enrich_res_down[order(enrich_res_down$Pval), ] 87 | enrich_res_down$slct.ct = CELLTYPE 88 | } 89 | 90 | # 91 | if(length(upDEG) < 5 | direction == "down"){ 92 | enrich_res_up = NULL 93 | } else { 94 | enrich_res_up = fisher_enrich_test(input_list = upDEG, 95 | background = background, 96 | go_term_db = plist) 97 | enrich_res_up$num_DEG = length(upDEG) 98 | enrich_res_up$direction_DEG = "upDEG" 99 | enrich_res_up = enrich_res_up[order(enrich_res_up$Pval), ] 100 | enrich_res_up$slct.ct = CELLTYPE 101 | } 102 | 103 | # save the results to the list 104 | enrich_list[[CELLTYPE]] = rbind(enrich_res_up, enrich_res_down) 105 | } 106 | 107 | if(length(enrich_list) == 1){ 108 | return(enrich_list[[1]]) 109 | } else { 110 | return(enrich_list) 111 | } 112 | } 113 | 114 | 115 | 116 | #' Rank biased overlap 117 | #' 118 | #' A function for a new gene-set enrichment test based on the 119 | #' RBO (rank biased overlap) calculation with extropolation (Webber et al., 2010). 120 | #' The core functions of rbo() calculation was modified from the "gespeR" package (original author: Fabian Schmich). 121 | #' We modified it to accomodate our package and data type. We also developed a permutation scheme for 122 | #' RBO to allow for p-value calculations. 123 | #' 124 | #' @author Fabian Schmich ("gespeR" package) 125 | #' @export 126 | #' 127 | #' @param list1 List 1 128 | #' @param list2 List 2 129 | #' @param p Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements 130 | #' @param k Evaluation depth for extrapolation 131 | #' @param side Evaluate similarity between the top or the bottom of the ranked lists 132 | #' @param mid Set the mid point to for example only consider positive or negative scores 133 | #' @param uneven.lengths Indicator if lists have uneven lengths 134 | #' @return a scaler value measuring the rank biased overlap (rbo) 135 | #' 136 | 137 | rbo <- function(list1, list2, p, k=floor(max(length(list1), length(list2))/2), side=c("top", "bottom"), mid = NULL, uneven.lengths = TRUE) { 138 | side <- match.arg(side) 139 | if (!is.numeric(list1) | !is.numeric(list2)) 140 | stop("Input vectors are not numeric.") 141 | if (is.null(names(list1)) | is.null(names(list2))) 142 | stop("Input vectors are not named.") 143 | ids <- switch(side, 144 | "top"=list(list1=.select.ids(list1, "top", mid), list2=.select.ids(list2, "top", mid)), 145 | "bottom"=list(list1=.select.ids(list1, "bottom", mid), list2=.select.ids(list2, "bottom", mid)) 146 | ) 147 | min(1, .rbo.ext(ids$list1, ids$list2, p, k, uneven.lengths = uneven.lengths)) 148 | } 149 | 150 | 151 | # rbo2 <- function(list1, list2, p, k=floor(max(length(list1), length(list2))/2), side=c("top", "bottom"), mid = NULL, uneven.lengths = TRUE) { 152 | # side <- match.arg(side) 153 | # if (!is.numeric(list1) | !is.numeric(list2)) 154 | # stop("Input vectors are not numeric.") 155 | # if (is.null(names(list1)) | is.null(names(list2))) 156 | # stop("Input vectors are not named.") 157 | # ids <- switch(side, 158 | # "top"=list(list1=.select.ids(list1, "top", mid), list2=.select.ids(list2, "top", mid)), 159 | # "bottom"=list(list1=.select.ids(list1, "bottom", mid), list2=.select.ids(list2, "bottom", mid)) 160 | # ) 161 | # min(1, rbo_ext(ids$list1, ids$list2, p, k, uneven_lengths = uneven.lengths)) 162 | # } 163 | 164 | 165 | #' Select top or bottom names of ranked vector 166 | #' 167 | #' @author Fabian Schmich ("gespeR" package) 168 | #' @noRd 169 | #' 170 | #' @param x The ranked list 171 | #' @param side The side to be evaluated ("top" or "bottom" of ranked list) 172 | #' @param mid The mid point to split a list, e.g. to split between positive and negative values choose mid=0 173 | #' @return A vector of selected identifiers 174 | .select.ids <- function(x, side=c("top", "bottom"), mid=NULL) { 175 | side <- match.arg(side) 176 | if (side == "top") { 177 | x <- sort(x, decreasing=TRUE) 178 | if (is.null(mid)) 179 | return(names(x)) 180 | else 181 | return(names(x)[which(x > mid)]) 182 | } else if (side == "bottom") { 183 | x <- sort(x, decreasing=FALSE) 184 | if (is.null(mid)) 185 | return(names(x)) 186 | else 187 | return(names(x)[which(x < mid)]) 188 | } 189 | } 190 | 191 | 192 | #' Rank biased overlap formula based on (32) from "A Similarity Measure for Indefinite Rankings" (Webber et al.) 193 | #' 194 | #' @author Fabian Schmich ("gespeR" package) 195 | #' @noRd 196 | #' 197 | #' @param x List 1 198 | #' @param y List 2 199 | #' @param p The weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements 200 | #' @param k The evaluation depth 201 | #' @param uneven.lengths Indicator if lists have uneven lengths 202 | #' @return The rank biased overlap between x and y 203 | .rbo.ext <- function(x, y, p, k, uneven.lengths = TRUE) { 204 | if (length(x) <= length(y)) { 205 | S <- x 206 | L <- y 207 | } else { 208 | S <- y 209 | L <- x 210 | } 211 | l <- min(k, length(L)) 212 | s <- min(k, length(S)) 213 | 214 | if (uneven.lengths) { 215 | Xd <- sapply(1:l, function(i) length(intersect(S[1:i], L[1:i]))) 216 | ((1-p) / p) * 217 | ((sum(Xd[seq(1, l)] / seq(1, l) * p^seq(1, l))) + 218 | (sum(Xd[s] * (seq(s+1, l) - s) / (s * seq(s+1, l)) * p^seq(s+1, l)))) + 219 | ((Xd[l] - Xd[s]) / l + (Xd[s] / s)) * p^l 220 | } else { 221 | #stopifnot(l == s) 222 | k <- min(s, k) 223 | Xd <- sapply(1:k, function(i) length(intersect(x[1:i], y[1:i]))) 224 | Xk <- Xd[k] 225 | (Xk / k) * p^k + (((1-p)/p) * sum((Xd / seq(1,k)) * p^seq(1,k))) 226 | } 227 | } 228 | 229 | 230 | #' Rank biased overlap (RBO) based enrichment test 231 | #' 232 | #' To perform enrichment test based on rank biased overlap and permutation. 233 | #' 234 | #' @export 235 | #' 236 | #' @param input_list input gene list from user (a named vector) 237 | #' @param go_term_db a list object of multiple gene-ontology (GO) terms to run enrichment test against 238 | #' @param p Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements 239 | #' @param n_iter the number of iteration to perform the permutation to obtain the P-values of the enrichment test 240 | #' @param k Evaluation depth for extrapolation 241 | #' @param side Evaluate similarity between the top or the bottom of the ranked lists 242 | #' @param mid Set the mid point to for example only consider positive or negative scores 243 | #' @param uneven.lengths Indicator if lists have uneven lengths 244 | #' @param empirical_test a boolen value to tell the function is an empirical test should be performed. If TRUE, 245 | #' the exact empirical proportion of the permutated elements that are greater than the true RBO 246 | #' is returned as the p-value (high accuracy usually requires a large n_iter, e.g., 1000). If FALSE, then a standard 247 | #' Z-score test is applied to the RBO based on the mean and standard deviation of all the permuated elements (less accurate 248 | #' but more efficient. A small n_iter is usually enough (e.g., 100 or 200) to get good approximation compared to 249 | #' the true empirical test). 250 | #' 251 | #' @return a data.frame consists of rbo measurement between the inptu gene list and all the GO terms, 252 | #' as well as the P-values based on permutation. Please note that the P-values indicate whether the rank of the input gene 253 | #' list and the GO-term gene set are consistent or not. It does NOT indicate if RBO is significantly different from 0. 254 | #' 255 | 256 | rbo_enrich_test <- function(input_list, 257 | go_term_db, 258 | p, 259 | n_iter = 500, 260 | k=300, 261 | side=c("top", "bottom"), 262 | mid = NULL, 263 | uneven.lengths = TRUE, 264 | empirical_test = FALSE, 265 | seed = 131415926) { 266 | if(length(input_list) < 5){ 267 | print("The length of the input list is less than 5, stopping analysis...") 268 | return(NULL) 269 | } 270 | 271 | # 0. each vector in the go_term_db is assumed to be an ordered vector of characters (gene names). 272 | # we need to convert each vector in to a named vector of number (number being the rank of each gene). 273 | go_term_db2 = lapply(X = go_term_db, 274 | FUN = function(x) { 275 | if(side == "bottom") tmp = 1:length(x) 276 | else if (side == "top") tmp = length(x):1 277 | names(tmp) = x 278 | return(tmp) 279 | }) 280 | 281 | # create a rank vector from input_list 282 | ori_input_list = input_list 283 | if(side == "bottom"){ 284 | input_list = 1:length(ori_input_list) 285 | names(input_list) = ori_input_list 286 | } else if (side == "top") { 287 | input_list = length(ori_input_list):1 288 | names(input_list) = ori_input_list 289 | } 290 | 291 | # 1. calculate the true RBO for the input_list and all the GO terms 292 | rbo_real = sapply(X = go_term_db2, 293 | FUN = rbo, 294 | list2 = input_list, 295 | p = p, 296 | k = k, 297 | side = side, 298 | mid = mid, 299 | uneven.lengths = uneven.lengths) 300 | 301 | # 2. now we need to proceed to the permutation tests 302 | set.seed(seed) 303 | 304 | # Shuffle the input_list for n_iter times (shuffled matrix has n_iter columns) 305 | max_d = ifelse(k >= length(input_list), yes = length(input_list), no = k) 306 | shuffled_matrix <- replicate(n_iter, sample(input_list[1:max_d])) 307 | rownames(shuffled_matrix) = names(input_list[1:max_d]) 308 | 309 | # Function to calculate rbo for each go_term against the shuffled matrix 310 | calculate_rbo <- function(go_term) { 311 | apply(shuffled_matrix, 312 | MARGIN = 2, 313 | FUN = rbo, 314 | list2 = go_term, 315 | p = p, 316 | k = k, 317 | side = side, 318 | mid = mid, 319 | uneven.lengths = uneven.lengths) 320 | } 321 | 322 | # Apply the function to each go_term in go_term_db2 323 | list_perm_vector <- lapply(go_term_db2, calculate_rbo) 324 | 325 | # now calculate the P-values based on empirical_test 326 | if(empirical_test == TRUE){ 327 | calculate_proportion <- function(element, list_vec) { 328 | mean(list_vec > element) 329 | } 330 | p_values <- mapply(calculate_proportion, rbo_real, list_perm_vector) 331 | } else if (empirical_test == FALSE){ 332 | calculate_proportion <- function(element, list_vec) { 333 | pnorm(q = element, 334 | mean = mean(list_vec), 335 | sd = sd(list_vec), 336 | lower.tail = F) 337 | } 338 | p_values <- mapply(calculate_proportion, rbo_real, list_perm_vector) 339 | } 340 | 341 | # merge the results into one data.frame 342 | res_dat = data.frame(GO_term = names(rbo_real), 343 | RBO = rbo_real, 344 | Pval = p_values, 345 | n_GO_term = sapply(X = go_term_db2, FUN = length), 346 | n_intersect = sapply(X = go_term_db, FUN = function(x, y) length(intersect(x, y)), y = names(input_list))) 347 | # 348 | res_dat$Pval[res_dat$n_intersect <= 1 | res_dat$RBO <= 0.01 ] = 1 349 | rownames(res_dat) = NULL 350 | return(res_dat) 351 | } 352 | 353 | 354 | #' get the weight of each depth till 'd' given a weight parameter 'p' 355 | #' @noRd 356 | #' @return a numeric vector of the weights from rank depth 1 to d. 357 | gs_seq = function(d, p){ 358 | gs = function(d, p){ 359 | (1-p)*p^(d-1) 360 | } 361 | return(gs(1:d, p)) 362 | } 363 | 364 | 365 | 366 | #' Standard Fisher's exact test for enrichment analysis 367 | #' 368 | #' This function will perform the strandard Fisher's exact test between the input gene 369 | #' list and a series of gene-ontology gene sets (adopted from DAVID GO analysis). 370 | #' 371 | #' @export 372 | #' 373 | #' @param input_list the input gene list 374 | #' @param background the background gene list (usually the expressed genes where the 375 | #' input gene list is generate from, ). 376 | #' @param go_term_db a list of gene-lists (GO term). It should be a list contain multiple named vector, 377 | #' and each vector should be a vector of multiple marker/signature genes for some biological pathway/process. 378 | #' @param list_gene A Boolen value to indicate if the overlapping genes between the input gene list and 379 | #' the GO-term should be output as well. 380 | #' @param EASE A Boolen value to indicate if the EASE correction should be applied (see 381 | #' https://david.ncifcrf.gov/helps/functional_annotation.html). This is useful to mitigate the 382 | #' small-sample inflation when the input gene list is short (e.g., < 10). 383 | #' 384 | #' @return a data frame contains the enrichment test results. Each row contains the P-value and enrichment odds 385 | #' ratio calculated from a Fisher's exact test for one GO-term in the go_term_db. 386 | 387 | fisher_enrich_test = function(input_list = NULL, 388 | background = NULL, 389 | go_term_db = NULL, 390 | list_gene = F, 391 | EASE = F){ 392 | PT = length(background) 393 | 394 | # if go_term_db is a list of lists, Reduce it down to a list of vector (remove all the intermediate layers) 395 | while(class(go_term_db[[1]]) == "list"){ 396 | go_term_db = Reduce(c, go_term_db) 397 | } 398 | 399 | if(list_gene == F){ 400 | dat = matrix(nrow = length(go_term_db), ncol = 6) 401 | i = 1 402 | for(GO_TERM in names(go_term_db)){ 403 | PH = length(intersect(go_term_db[[GO_TERM]], background)) 404 | LT = length(input_list) 405 | 406 | # LH_list = intersect(input_list, go_term_db[[GO_TERM]]) 407 | LH = length(intersect(input_list, go_term_db[[GO_TERM]])) 408 | 409 | # the Fisher exact test with EASE correction 410 | dat2 = matrix(data = c(LH-1, PH-LH+1, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2) 411 | # print(dat2) 412 | if(LH < 1 | EASE == T){ 413 | dat2 = matrix(data = c(LH, PH-LH, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2) 414 | } 415 | # 416 | res = fisher.test(dat2, alternative = "greater") 417 | # 418 | dat[i, 1] = GO_TERM 419 | dat[i, 2] = res$estimate 420 | dat[i, 3] = res$p.value 421 | dat[i, 4] = -log(res$p.value)*res$estimate 422 | dat[i, 5] = LH 423 | dat[i, 6] = PH 424 | # 425 | i = i + 1 426 | } 427 | dat = as.data.frame(dat) 428 | dat = dat[complete.cases(dat), ] 429 | names(dat) = c("GO_term", "OR", "Pval", "combined_score", "num_LH", "num_PH") 430 | 431 | } else { 432 | dat = matrix(nrow = length(go_term_db), ncol = 7) 433 | i = 1 434 | for(GO_TERM in names(go_term_db)){ 435 | PH = length(intersect(go_term_db[[GO_TERM]], background)) 436 | LT = length(input_list) 437 | 438 | LH_list = intersect(input_list, go_term_db[[GO_TERM]]) 439 | LH = length(LH_list) 440 | 441 | # the Fisher exact test with EASE correction 442 | dat2 = matrix(data = c(LH-1, PH-LH+1, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2) 443 | # print(dat2) 444 | if(LH < 1){ 445 | dat2 = matrix(data = c(LH, PH-LH, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2) 446 | } 447 | # 448 | res = fisher.test(dat2, alternative = "greater") 449 | # 450 | dat[i, 1] = GO_TERM 451 | dat[i, 2] = res$estimate 452 | dat[i, 3] = res$p.value 453 | dat[i, 4] = -log(res$p.value)*res$estimate 454 | dat[i, 5] = LH 455 | dat[i, 6] = PH 456 | dat[i, 7] = paste0(LH_list, collapse = ";") 457 | 458 | # 459 | i = i + 1 460 | } 461 | dat = as.data.frame(dat) 462 | dat = dat[complete.cases(dat), ] 463 | names(dat) = c("GO_term", "OR", "Pval", "combined_score", "num_LH", "num_PH", "overlap_gene") 464 | 465 | } 466 | 467 | dat$OR = as.numeric(dat$OR) 468 | dat$Pval = as.numeric(dat$Pval) 469 | dat$num_LH = as.integer(dat$num_LH) 470 | dat$num_PH = as.integer(dat$num_PH) 471 | dat$n_GO_term = sapply(X = go_term_db, FUN = length) 472 | 473 | return(dat) 474 | } 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | -------------------------------------------------------------------------------- /R/get_fold_change.R: -------------------------------------------------------------------------------- 1 | #' 2 | NULL 3 | 4 | 5 | #' Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells 6 | #' 7 | #' Function to calculate log-fold-change for pooled CRISPR screen datasets. 8 | #' It is just a simple function to calculate the log-fold-change. Users can customise the min.cells, 9 | #' minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)), 10 | #' minimal percentage of cells expression the genes, and the base of the log. 11 | #' 12 | #' @param gene_exp a vector of the gene expression levels 13 | #' @param idx_P a vector of index for the perturbed cells in the gene_exp 14 | #' @param idx_NT a vector of index for the non-target cells (controls) in the gene_exp 15 | #' @param min.cells the minimal number of cells that expresses the gene; if lower than this value, the 16 | #' fold-change will be returned as NA. Default is 3. 17 | #' @param thresh.min the minimal value of expression; any expression value lower than this will be 18 | #' considered as 0. Default is 0. 19 | #' @param pseudocount.use the small value that will be added to the log-transformation to avoid log(0). 20 | #' For example, if a mean expression value is x, the final log- 21 | #' @param min.pct the minimal proportion of cells in either groups that expresses the gene 22 | #' @param base the base for log() 23 | #' @param norm.method the normalization method for the input gene_exp. Default is 'raw', which means 24 | #' the original count value without normalization. The other supported values are 'log.norm', "scale.data". 25 | #' The mean.fxn() will change accordingly. 26 | #' @return Returns a single value of the log-fold-change of the input gene. 27 | #' @export 28 | #' @concept perturbation_scoring 29 | 30 | get_fc = function(gene_exp = NULL, idx_P = NULL, idx_NT = NULL, 31 | min.cells = 3, 32 | thresh.min = 0, 33 | pseudocount.use = 1, 34 | min.pct = 0.1, 35 | base = 2, 36 | norm.method = 'raw' 37 | ){ 38 | # the exp_vec should have been std 39 | 40 | # flag 1: do minimum cell check 41 | if (sum(gene_exp[idx_P] > 0) < min.cells && 42 | sum(gene_exp[idx_NT] > 0) < min.cells) { 43 | return(NA) 44 | } 45 | 46 | # flag 2: do variance check (not 0) 47 | if (var(gene_exp) == 0) { 48 | return(NA) 49 | } 50 | 51 | # flag 3: min.pct check 52 | # calculate fraction of cell with expression > 0 53 | pct.1 <- round( 54 | x = sum(x = gene_exp[idx_NT] > thresh.min) / 55 | length(x = gene_exp[idx_NT]), 56 | digits = 3 57 | ) 58 | pct.2 <- round( 59 | x = sum(x = gene_exp[idx_P] > thresh.min) / 60 | length(x = gene_exp[idx_P]), 61 | digits = 3 62 | ) 63 | 64 | if (pct.1 < min.pct & pct.2 < min.pct) { 65 | return(NA) 66 | } 67 | 68 | # set the mean.fxn() function according to norm.method 69 | default.mean.fxn <- function(x) { 70 | return(log(x = mean(x = x) + pseudocount.use, base = base)) 71 | } 72 | mean.fxn <- switch( 73 | EXPR = norm.method, 74 | 'log.norm' = function(x) { 75 | return(log(x = mean(x = expm1(x = x)) + pseudocount.use, base = base)) 76 | }, 77 | 'scale.data' = mean, 78 | default.mean.fxn 79 | ) 80 | 81 | # flag 4: minimum fold change check 82 | # log(x = mean(x = x) + pseudocount.use, base = base) 83 | data.1 <- mean.fxn(gene_exp[idx_NT]) 84 | data.2 <- mean.fxn(gene_exp[idx_P]) 85 | fc <- - data.1 + data.2 86 | return(fc) 87 | 88 | } 89 | 90 | 91 | 92 | # a function to do the filtering for all the genens 93 | get_idx = function(gene_exp = NULL, idx_P = NULL, idx_NT = NULL, 94 | min.cells = 3, # the minimum cell threshold to perform DE 95 | thresh.min = 0, # the minimum expression level 96 | pseudocount.use = 1, 97 | min.pct = 0.1, 98 | logfc.threshold = 0.1, 99 | base = 2, 100 | norm.method = 'raw' 101 | ){ 102 | 103 | fc <- get_fc(gene_exp = gene_exp, idx_P = idx_P, idx_NT = idx_NT, 104 | min.cells = min.cells, 105 | thresh.min = thresh.min, 106 | pseudocount.use = pseudocount.use, 107 | min.pct = min.pct, 108 | base = base, 109 | norm.method = norm.method 110 | ) 111 | 112 | if (fc < logfc.threshold){ 113 | return(FALSE) 114 | } 115 | 116 | return(TRUE) 117 | } 118 | 119 | 120 | #' Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells 121 | #' 122 | #' Function to calculate log-fold-change for pooled CRISPR screen datasets. 123 | #' It is just a simple function to calculate the log-fold-change. Users can customise the min.cells, 124 | #' minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)), 125 | #' minimal percentage of cells expression the genes, and the base of the log. 126 | #' 127 | #' @inheritParams Seurat::FoldChange 128 | #' @importFrom Matrix rowSums 129 | #' @return Returns a single value of the log-fold-change of the input gene. 130 | #' @export 131 | #' @concept perturbation_scoring 132 | FoldChange_new <- function( 133 | object, 134 | cells.1, 135 | cells.2, 136 | mean.fxn, 137 | fc.name, 138 | features = NULL, 139 | ... 140 | ) { 141 | features <- features %||% rownames(x = object) 142 | 143 | # Calculate percent expressed 144 | thresh.min <- 0 145 | 146 | min.cell.1 = Matrix::rowSums(x = object[features, cells.1, drop = FALSE] > thresh.min) 147 | min.cell.2 = Matrix::rowSums(x = object[features, cells.2, drop = FALSE] > thresh.min) 148 | 149 | pct.1 <- round( 150 | x = Matrix::rowSums(x = object[features, cells.1, drop = FALSE] > thresh.min) / 151 | length(x = cells.1), 152 | digits = 3 153 | ) 154 | pct.2 <- round( 155 | x = Matrix::rowSums(x = object[features, cells.2, drop = FALSE] > thresh.min) / 156 | length(x = cells.2), 157 | digits = 3 158 | ) 159 | # Calculate fold change 160 | data.1 <- mean.fxn(object[features, cells.1, drop = FALSE]) 161 | data.2 <- mean.fxn(object[features, cells.2, drop = FALSE]) 162 | fc <- (data.1 - data.2) 163 | fc.results <- as.data.frame(x = cbind(fc, pct.1, pct.2, min.cell.1, min.cell.2)) 164 | colnames(fc.results) <- c(fc.name, "pct.1", "pct.2", "min.cell.1", "min.cell.2") 165 | return(fc.results) 166 | } 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /R/glm_gp_disp_only.R: -------------------------------------------------------------------------------- 1 | #' 2 | NULL 3 | 4 | 5 | #' Internal Function to Fit a Gamma-Poisson GLM 6 | #' 7 | #' @import glmGamPoi 8 | #' @inheritParams glmGamPoi::glm_gp 9 | #' @inheritParams glmGamPoi::overdispersion_mle 10 | #' @param Y any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with 11 | #' one column per sample and row per gene. 12 | #' 13 | #' @return a list with four elements 14 | #' * `Beta` the coefficient matrix 15 | #' * `overdispersion` the vector with the estimated overdispersions 16 | #' * `Mu` a matrix with the corresponding means for each gene 17 | #' and sample 18 | #' * `size_factors` a vector with the size factor for each 19 | #' sample 20 | #' * `ridge_penalty` a vector with the ridge penalty 21 | #' 22 | #' @seealso [glm_gp()] and [overdispersion_mle()] 23 | #' @keywords internal 24 | glm_gp_disp_only_impl <- function(Y, model_matrix, 25 | offset = 0, 26 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"), 27 | overdispersion = TRUE, 28 | overdispersion_shrinkage = TRUE, 29 | ridge_penalty = 0, 30 | do_cox_reid_adjustment = TRUE, 31 | subsample = FALSE, 32 | verbose = FALSE){ 33 | if(is.vector(Y)){ 34 | Y <- matrix(Y, nrow = 1) 35 | } 36 | # Error conditions 37 | stopifnot(is.matrix(Y) || is(Y, "DelayedArray")) 38 | stopifnot(is.matrix(model_matrix) && nrow(model_matrix) == ncol(Y)) 39 | glmGamPoi:::validate_Y_matrix(Y) 40 | subsample <- glmGamPoi:::handle_subsample_parameter(Y, subsample) 41 | ridge_penalty <- glmGamPoi:::handle_ridge_penalty_parameter(ridge_penalty, model_matrix, verbose = verbose) 42 | 43 | # Combine offset and size factor 44 | off_and_sf <- glmGamPoi:::combine_size_factors_and_offset(offset, size_factors, Y, verbose = verbose) 45 | offset_matrix <- off_and_sf$offset_matrix 46 | size_factors <- off_and_sf$size_factors 47 | 48 | # Check if there distinct groups in model matrix 49 | # returns NULL if there would be more groups than columns 50 | # only_intercept_model <- ncol(model_matrix) == 1 && all(model_matrix == 1) 51 | groups <- glmGamPoi:::get_groups_for_model_matrix(model_matrix) 52 | if(! is.null(groups) && any(ridge_penalty > 1e-10)){ 53 | # Cannot apply ridge penalty in group-wise optimization 54 | groups <- NULL 55 | } 56 | 57 | # If no overdispersion, make rough first estimate 58 | if(isTRUE(overdispersion)){ 59 | if(verbose){ message("Make initial dispersion estimate") } 60 | disp_init <- glmGamPoi:::estimate_dispersions_roughly(Y, model_matrix, offset_matrix = offset_matrix) 61 | }else if(isFALSE(overdispersion)){ 62 | disp_init <- rep(0, times = nrow(Y)) 63 | }else if(is.character(overdispersion) && overdispersion == "global"){ 64 | if(verbose){ message("Make initial dispersion estimate") } 65 | disp_init <- glmGamPoi:::estimate_dispersions_roughly(Y, model_matrix, offset_matrix = offset_matrix) 66 | disp_init <- rep(median(disp_init), nrow(Y)) 67 | }else{ 68 | stopifnot(is.numeric(overdispersion) && (length(overdispersion) == 1 || length(overdispersion) == nrow(Y))) 69 | if(length(overdispersion) == 1){ 70 | disp_init <- rep(overdispersion, times = nrow(Y)) 71 | }else{ 72 | disp_init <- overdispersion 73 | } 74 | } 75 | 76 | 77 | # Estimate the betas 78 | if(! is.null(groups)){ 79 | if(verbose){ message("Make initial beta estimate") } 80 | beta_group_init <- glmGamPoi:::estimate_betas_roughly_group_wise(Y, offset_matrix, groups) 81 | if(verbose){ message("Estimate beta") } 82 | beta_res <- glmGamPoi:::estimate_betas_group_wise(Y, offset_matrix = offset_matrix, 83 | dispersions = disp_init, beta_group_init = beta_group_init, 84 | groups = groups, model_matrix = model_matrix) 85 | }else{ 86 | # Init beta with reasonable values 87 | if(verbose){ message("Make initial beta estimate") } 88 | beta_init <- glmGamPoi:::estimate_betas_roughly(Y, model_matrix, offset_matrix = offset_matrix, ridge_penalty = ridge_penalty) 89 | if(verbose){ message("Estimate beta") } 90 | beta_res <- glmGamPoi:::estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix, 91 | dispersions = disp_init, beta_mat_init = beta_init, ridge_penalty = ridge_penalty) 92 | } 93 | Beta <- beta_res$Beta 94 | 95 | # Calculate corresponding predictions 96 | # Mu <- exp(Beta %*% t(model_matrix) + offset_matrix) 97 | Mu <- glmGamPoi:::calculate_mu(Beta, model_matrix, offset_matrix) 98 | 99 | # Make estimate of over-disperion 100 | if(isTRUE(overdispersion) || (is.character(overdispersion) && overdispersion == "global")){ 101 | if(verbose){ message("Estimate dispersion") } 102 | if(isTRUE(overdispersion)){ 103 | disp_est <- overdispersion_mle(Y, Mu, model_matrix = model_matrix, 104 | do_cox_reid_adjustment = do_cox_reid_adjustment, 105 | subsample = subsample, verbose = verbose)$estimate 106 | }else if(is.character(overdispersion) && overdispersion == "global"){ 107 | disp_est <- overdispersion_mle(Y, Mu, model_matrix = model_matrix, 108 | do_cox_reid_adjustment = do_cox_reid_adjustment, 109 | global_estimate = TRUE, 110 | subsample = subsample, verbose = verbose)$estimate 111 | disp_est <- rep(disp_est, times = nrow(Y)) 112 | } 113 | 114 | if(isTRUE(overdispersion_shrinkage)){ 115 | dispersion_shrinkage <- overdispersion_shrinkage(disp_est, gene_means = DelayedMatrixStats::rowMeans2(Mu), 116 | df = subsample - ncol(model_matrix), 117 | ql_disp_trend = length(disp_est) >= 100, 118 | npoints = max(0.1 * length(disp_est), 100), 119 | verbose = verbose) 120 | disp_latest <- dispersion_shrinkage$dispersion_trend 121 | }else{ 122 | dispersion_shrinkage <- NULL 123 | disp_latest <- disp_est 124 | } 125 | 126 | # Estimate the betas again (only necessary if disp_est has changed) 127 | # if(verbose){ message("Estimate beta again") } 128 | # if(! is.null(groups)){ 129 | # beta_res <- estimate_betas_group_wise(Y, offset_matrix = offset_matrix, 130 | # dispersions = disp_latest, beta_mat_init = Beta, 131 | # groups = groups, model_matrix = model_matrix) 132 | # }else{ 133 | # beta_res <- estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix, 134 | # dispersions = disp_latest, beta_mat_init = Beta, ridge_penalty = ridge_penalty) 135 | # } 136 | # Beta <- beta_res$Beta 137 | # 138 | # # Calculate corresponding predictions 139 | # Mu <- calculate_mu(Beta, model_matrix, offset_matrix) 140 | }else if(isTRUE(overdispersion_shrinkage) || is.numeric(overdispersion_shrinkage)){ 141 | # Given predefined disp_est shrink them 142 | disp_est <- disp_init 143 | dispersion_shrinkage <- overdispersion_shrinkage(disp_est, gene_means = DelayedMatrixStats::rowMeans2(Mu), 144 | df = subsample - ncol(model_matrix), 145 | disp_trend = overdispersion_shrinkage, verbose = verbose) 146 | disp_latest <- dispersion_shrinkage$dispersion_trend 147 | # if(verbose){ message("Estimate beta again") } 148 | # if(! is.null(groups)){ 149 | # beta_res <- estimate_betas_group_wise(Y, offset_matrix = offset_matrix, 150 | # dispersions = disp_latest, beta_mat_init = Beta, 151 | # groups = groups, model_matrix = model_matrix) 152 | # }else{ 153 | # beta_res <- estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix, 154 | # dispersions = disp_latest, beta_mat_init = Beta, ridge_penalty = ridge_penalty) 155 | # } 156 | # Beta <- beta_res$Beta 157 | # # Calculate corresponding predictions 158 | # Mu <- calculate_mu(Beta, model_matrix, offset_matrix) 159 | }else{ 160 | # Use disp_init, because it is already in vector shape 161 | disp_est <- disp_init 162 | dispersion_shrinkage <- NULL 163 | } 164 | 165 | 166 | # Return everything 167 | list(Beta = Beta, 168 | overdispersions = disp_est, 169 | overdispersion_shrinkage_list = dispersion_shrinkage) 170 | } 171 | 172 | 173 | 174 | #' Internal Function to Fit a Gamma-Poisson GLM 175 | #' @import glmGamPoi 176 | #' @inheritParams glmGamPoi::glm_gp 177 | #' @inheritParams glmGamPoi::overdispersion_mle 178 | #' @import glmGamPoi 179 | #' 180 | #' @param Y any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with 181 | #' one column per sample and row per gene. 182 | #' 183 | #' @return a list with four elements 184 | #' * `Beta` the coefficient matrix 185 | #' * `overdispersion` the vector with the estimated overdispersions 186 | #' * `Mu` a matrix with the corresponding means for each gene 187 | #' and sample 188 | #' * `size_factors` a vector with the size factor for each 189 | #' sample 190 | #' * `ridge_penalty` a vector with the ridge penalty 191 | #' 192 | #' @seealso [glm_gp()] and [overdispersion_mle()] 193 | #' @keywords internal 194 | glm_gp_disp_only <- function(data, 195 | design = ~ 1, 196 | col_data = NULL, 197 | reference_level = NULL, 198 | offset = 0, 199 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"), 200 | overdispersion = TRUE, 201 | overdispersion_shrinkage = TRUE, 202 | ridge_penalty = 0, 203 | do_cox_reid_adjustment = TRUE, 204 | subsample = FALSE, 205 | on_disk = NULL, 206 | use_assay = NULL, 207 | verbose = FALSE){ 208 | 209 | # Validate `data` 210 | if(inherits(data, "formula")){ 211 | if(length(design) != 2 || design != ~ 1){ 212 | stop("If the first argument is already a formula, the second argument must not be set. Please call this function like this:\n", 213 | "'glm_gp(data = mat, design = ~ a + b + c, ...)'", call. = FALSE) 214 | } 215 | extr <- glmGamPoi:::extract_data_from_formula(data, col_data, parent.frame()) 216 | data <- extr$data 217 | design <- extr$design 218 | } 219 | if(is.vector(data)){ 220 | data <- matrix(data, nrow = 1) 221 | } 222 | data_mat <- glmGamPoi:::handle_data_parameter(data, on_disk = F) 223 | 224 | # Convert the formula to a model_matrix 225 | col_data <- glmGamPoi:::get_col_data(data, col_data) 226 | des <- glmGamPoi:::handle_design_parameter(design, data, col_data, reference_level) 227 | 228 | # Call glm_gp_impl() 229 | res <- glm_gp_disp_only_impl(data_mat, 230 | model_matrix = des$model_matrix, 231 | offset = offset, 232 | size_factors = size_factors, 233 | overdispersion = overdispersion, 234 | overdispersion_shrinkage = overdispersion_shrinkage, 235 | ridge_penalty = ridge_penalty, 236 | do_cox_reid_adjustment = do_cox_reid_adjustment, 237 | subsample = subsample, 238 | verbose = verbose) 239 | # Make sure that the output is nice and beautiful 240 | names(res$overdispersions) <- rownames(data) 241 | 242 | class(res) <- "glmGamPoi" 243 | res 244 | } 245 | 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /R/perturbation_scoring.R: -------------------------------------------------------------------------------- 1 | #' 2 | NULL 3 | 4 | 5 | #' Mixscale scoring for perturbations 6 | #' 7 | #' Function to calculate perturbation scores for perturbed and non-perturbed gRNA expressing cells. 8 | #' The perturbation score reflects the perturbation strength of each cells (inherited from the RunMixscape() 9 | #' function). It is calculated by using the large-effect DE genes from raw DE tests between the 10 | #' perturbed and non-perturbed gRNA expressing cells. 11 | #' 12 | #' @export 13 | #' 14 | #' @inheritParams Seurat::RunMixscape 15 | #' @import Seurat 16 | #' 17 | #' @param object An object of class Seurat. 18 | #' @param assay Assay to use for mixscape classification. 19 | #' @param slot Assay data slot to use. 20 | #' @param labels metadata column with target gene labels. 21 | #' @param nt.class.name Classification name of non-targeting gRNA cells. 22 | #' @param new.class.name Name of mixscale scores to be stored in 23 | #' metadata. 24 | #' @param min.de.genes Required number of genes that are differentially 25 | #' expressed for method to separate perturbed and non-perturbed cells. 26 | #' @param min.cells Minimum number of cells in target gene class. If fewer than 27 | #' this many cells are assigned to a target gene class during classification, 28 | #' all are assigned NP. 29 | #' @param de.assay Assay to use when performing differential expression analysis. 30 | #' Usually RNA. 31 | #' @param logfc.threshold the log-fold-change threshold to select the large-effect 32 | #' DE genes. Only DE genes with log-fold-change larger than this value will be 33 | #' selected. Default is 0.25. 34 | #' @param verbose Display messages 35 | #' @param split.by metadata column with experimental condition/cell type 36 | #' classification information. This is meant to be used to account for cases a 37 | #' perturbation is condition/cell type -specific. 38 | #' @param fine.mode When this is equal to TRUE, DE genes for each target gene 39 | #' class will be calculated for each gRNA separately and pooled into one DE list 40 | #' for calculating the perturbation score of every cell and their subsequent 41 | #' classification. 42 | #' @param fine.mode.labels metadata column with gRNA ID labels. 43 | #' @param DE.gene specify a list of user-defined large-effect DE genes to calculate the perturbation score. 44 | #' @param max.de.genes the maximum number of top large-effect DE genes to calculate the perturbation score. Default is 100. 45 | #' @param harmonize a boolen value to specify whether a harmonization of the cell-type proportion between the NT cells and 46 | #' the perturbed cells should be performed prior to the DE test. If fine.mode is TRUE, this harmonization step will be 47 | #' performed for each fine.mode gRNA. Default is FALSE. 48 | #' @param min_prop_ntgd a minimal threshold to remove cells if any cell type has a proportion less than this value. It will 49 | #' only be used when harmonize is TRUE. Default is 0.1. 50 | #' @param pval.cutoff specify the DE test p-value cutoff (after Bonferroni correction) to select top large-effect DE genes. 51 | #' Default is 0.05. 52 | #' 53 | #' @return Returns a Seurat object containing the perturbation scores. It is stored in the Tool Data of the object, also 54 | #' the standardized scores are stored in the meta.data (column is specified by new.class.name). 55 | #' @concept perturbation_scoring 56 | 57 | RunMixscale = function (object, assay = "PRTB", slot = "scale.data", labels = "gene", 58 | nt.class.name = "NT", 59 | new.class.name = "mixscale_score", 60 | min.de.genes = 5, min.cells = 5, de.assay = "RNA", logfc.threshold = 0.25, 61 | verbose = FALSE, split.by = NULL, fine.mode = FALSE, 62 | fine.mode.labels = "guide_ID", 63 | DE.gene = NULL, 64 | max.de.genes = 100, harmonize = F, 65 | min_prop_ntgd = 0.1, pval.cutoff = 0.05, 66 | seed = 10282021) 67 | { 68 | message("Calculating Mixscale scores ...") 69 | 70 | assay <- assay %||% DefaultAssay(object = object) 71 | if (!assay %in% names(object@assays)){ 72 | stop(paste0("The 'assay' being specified does not exist! Please check. Have you run CalcPerturbSig() yet?")) 73 | } 74 | 75 | if (is.null(x = labels)) { 76 | stop("Please specify target gene class metadata name") 77 | } 78 | 79 | if (min.de.genes <= 1) { 80 | warning("The min.de.genes should be larger than 1!") 81 | } 82 | 83 | prtb_markers <- list() 84 | prtb_markers2 <- list() 85 | object[[new.class.name]] <- object[[labels]] 86 | object[[new.class.name]][, 1] <- as.character(x = object[[new.class.name]][, 87 | 1]) 88 | gv.list <- list() 89 | if (is.null(x = split.by)) { 90 | split.by <- splits <- "con1" 91 | } else { 92 | splits <- as.character(x = unique(x = object[[split.by]][, 93 | 1])) 94 | } 95 | cells.s.list <- list() 96 | 97 | # 98 | Idents(object = object) <- "con1" 99 | cells.s <- WhichCells(object = object, idents = "con1") 100 | # cells.s.list[[s]] <- cells.s 101 | genes <- setdiff(x = unique(x = object[[labels]][cells.s, 102 | 1]), y = nt.class.name) 103 | # 104 | for (gene in genes) { 105 | Idents(object = object) <- labels 106 | 107 | if (isTRUE(x = verbose)) { 108 | message("Processing ", gene) 109 | } 110 | orig.guide.cells <- intersect(x = WhichCells(object = object, 111 | idents = gene), y = cells.s) 112 | nt.cells <- intersect(x = WhichCells(object = object, 113 | idents = nt.class.name), y = cells.s) 114 | 115 | ############################################# 116 | 117 | if (isTRUE(x = fine.mode)) { 118 | guides <- setdiff(x = unique(x = object[[fine.mode.labels]][orig.guide.cells, 119 | 1]), y = nt.class.name) 120 | all.de.genes <- c() 121 | for (gd in guides) { 122 | gd.cells <- rownames(x = object[[]][orig.guide.cells, 123 | ])[which(x = object[[]][orig.guide.cells, 124 | fine.mode.labels] == gd)] 125 | # we will need to extract the NT cells based on each celltype and do harmonization based on cell comp in PRTB cells 126 | if(harmonize == T & length(split.by) > 1){ 127 | # this is a flag to indicate if the harmonization process is okay (selected_Cell >= 50% of total cell) 128 | flag_good_harm = F 129 | # the initial list of split-groups 130 | splits_list = splits 131 | 132 | while(flag_good_harm == F){ 133 | # get the cell label splitted by splits 134 | cells.s.list.gd = list() 135 | cells.s.list.ntgd = list() 136 | Idents(object = object) <- split.by 137 | 138 | for (s in splits_list) { 139 | cells.s.list.gd[[s]] <- intersect(gd.cells, WhichCells(object = object, idents = s)) 140 | cells.s.list.ntgd[[s]] <- intersect(nt.cells, WhichCells(object = object, idents = s)) 141 | } 142 | 143 | # calculate the desired number of nt cells in each splits; 144 | length.gd = sapply(X = cells.s.list.gd, FUN = length) 145 | length.ntgd = sapply(X = cells.s.list.ntgd, FUN = length) 146 | 147 | prop.gd = length.gd/sum(length.gd, na.rm = T) 148 | # prop.ntgd = length.ntgd/sum(length.ntgd, na.rm = T) 149 | sum.desire.length.ntgd = floor(min(length.ntgd/prop.gd, na.rm = T)) 150 | if(sum.desire.length.ntgd > sum(length.ntgd, na.rm = T)){ 151 | stop("The sum.desire.length.ntgd is greater than the total number of NT cells. Need to check!") 152 | } 153 | 154 | if(sum.desire.length.ntgd >= min_prop_ntgd*sum(length.ntgd, na.rm = T) ){ 155 | flag_good_harm = T 156 | } else { 157 | message(paste("Removing cell from ", splits_list[which.min(length.ntgd)], "due to 50% check during harmonization step.")) 158 | splits_list = splits_list[-which.min(length.ntgd)] 159 | } 160 | } 161 | 162 | # calculate the final number of NT cells to extract from splits_list 163 | desire.length.ntgd = floor(sum.desire.length.ntgd*prop.gd) 164 | 165 | # start to subsample the nt cells based on the desire length: 166 | sub.cells.s.list.ntgd = list() 167 | for (s in splits_list) { 168 | set.seed(seed = seed) 169 | sub.cells.s.list.ntgd[[s]] <- sample(x = cells.s.list.ntgd[[s]], size = desire.length.ntgd[s]) 170 | } 171 | 172 | # collapse the list into a single vectors of sub-sampled NT cells 173 | sub.ntgd.cells = Reduce(c, sub.cells.s.list.ntgd) 174 | rm(cells.s.list.gd, cells.s.list.ntgd, length.gd, length.ntgd, prop.gd, sum.desire.length.ntgd, desire.length.ntgd, sub.cells.s.list.ntgd) 175 | if(verbose){ 176 | message("Done with harmonizing the cell composition in NT cells (fine mode).") 177 | } 178 | 179 | Idents(object = object) <- labels 180 | } else { 181 | sub.ntgd.cells = nt.cells 182 | } 183 | 184 | # run DE 185 | if(!is.null(DE.gene) ){ 186 | if(!is.null(DE.gene[[gene]]) | length(DE.gene[[gene]]) != 0){ 187 | all.de.genes = DE.gene[[gene]] 188 | } else { 189 | all.de.genes = character() 190 | warning(paste("No de.genes are provided for PRTB:", gene, ". Pls check!")) 191 | } 192 | } else { 193 | # run DE 194 | de.genes <- Seurat:::TopDEGenesMixscape(object = object, 195 | ident.1 = gd.cells, ident.2 = sub.ntgd.cells, de.assay = de.assay, 196 | logfc.threshold = logfc.threshold, labels = fine.mode.labels, 197 | verbose = verbose, pval.cutoff = pval.cutoff) 198 | all.de.genes <- c(all.de.genes, de.genes) 199 | } 200 | } 201 | all.de.genes <- unique(all.de.genes) 202 | } else { 203 | # we will need to extract the NT cells based on each celltype and do harmonization based on cell comp in PRTB cells 204 | if(harmonize == T & length(split.by) > 1){ 205 | # this is a flag to indicate if the harmonization process is okay (selected_Cell >= 50% of total cell) 206 | flag_good_harm = F 207 | # the initial list of split-groups 208 | splits_list = splits 209 | 210 | while(flag_good_harm == F){ 211 | # get the cell label splitted by splits 212 | cells.s.list.gene = list() 213 | cells.s.list.nt = list() 214 | Idents(object = object) <- split.by 215 | 216 | for (s in splits_list) { 217 | cells.s.list.gene[[s]] <- intersect(orig.guide.cells, WhichCells(object = object, idents = s)) 218 | cells.s.list.nt[[s]] <- intersect(nt.cells, WhichCells(object = object, idents = s)) 219 | } 220 | 221 | # calculate the desired number of nt cells in each splits; 222 | length.gene = sapply(X = cells.s.list.gene, FUN = length) 223 | length.nt = sapply(X = cells.s.list.nt, FUN = length) 224 | 225 | prop.gene = length.gene/sum(length.gene, na.rm = T) 226 | # prop.nt = length.nt/sum(length.nt, na.rm = T) 227 | sum.desire.length.nt = floor(min(length.nt/prop.gene, na.rm = T)) 228 | if(sum.desire.length.nt > sum(length.nt, na.rm = T)){ 229 | stop("The sum.desire.length.nt is greater than the total number of NT cells. Need to check!") 230 | } 231 | # 232 | if(sum.desire.length.nt >= min_prop_ntgd*sum(length.nt, na.rm = T) ){ 233 | flag_good_harm = T 234 | } else { 235 | message(paste("Removing cell from ", splits_list[which.min(length.nt)], "due to 50% check during harmonization step.")) 236 | splits_list = splits_list[-which.min(length.nt)] 237 | } 238 | } 239 | 240 | ### 241 | desire.length.nt = floor(sum.desire.length.nt*prop.gene) 242 | 243 | # start to subsample the nt cells based on the desire length: 244 | sub.cells.s.list.nt = list() 245 | for (s in splits_list) { 246 | set.seed(seed = seed) 247 | sub.cells.s.list.nt[[s]] <- sample(x = cells.s.list.nt[[s]], size = desire.length.nt[s]) 248 | } 249 | 250 | # collapse the list into a single vectors of sub-sampled NT cells 251 | sub.nt.cells = Reduce(c, sub.cells.s.list.nt) 252 | rm(cells.s.list.gene, cells.s.list.nt, length.gene, length.nt, prop.gene, sum.desire.length.nt, desire.length.nt, sub.cells.s.list.nt) 253 | if(verbose){ 254 | message("Done with harmonizing the cell composition in NT cells.") 255 | } 256 | 257 | Idents(object = object) <- labels 258 | } else { 259 | sub.nt.cells = nt.cells 260 | } 261 | # run DE 262 | if(!is.null(DE.gene) ){ 263 | if(!is.null(DE.gene[[gene]]) | length(DE.gene[[gene]]) != 0){ 264 | all.de.genes = DE.gene[[gene]] 265 | } else { 266 | all.de.genes = character() 267 | message(paste("No de.genes are provided for PRTB:", gene, ". Pls check!")) 268 | } 269 | } else { 270 | all.de.genes <- Seurat:::TopDEGenesMixscape(object = object, 271 | ident.1 = orig.guide.cells, ident.2 = sub.nt.cells, 272 | de.assay = de.assay, logfc.threshold = logfc.threshold, 273 | labels = labels, verbose = verbose, pval.cutoff = pval.cutoff) 274 | } 275 | 276 | 277 | } 278 | # print(gene) 279 | # print(all.de.genes) 280 | 281 | # only keep the top max.de.genes as the de.genes for PRTB score calculation 282 | if(!is.null(max.de.genes) ){ 283 | if(length(all.de.genes) <= max.de.genes){ 284 | if(verbose){ 285 | message(paste("The number of de.genes (", length(all.de.genes), ") is less than max.de.genes (", max.de.genes, ").")) 286 | } 287 | } else { 288 | if(verbose){ 289 | message(paste("The number of de.genes (", length(all.de.genes), ") is larger than max.de.genes (", max.de.genes, ").", 290 | "Restricting to top", max.de.genes, "genes...")) 291 | } 292 | all.de.genes = all.de.genes[1:max.de.genes] 293 | } 294 | } 295 | 296 | # use user-defined DE.gene list 297 | # if(!is.null(DE.gene) ){ 298 | # all.de.genes = DE.gene 299 | # } 300 | 301 | for (s in splits) { 302 | Idents(object = object) <- split.by 303 | cells.s.list[[s]] <- WhichCells(object = object, idents = s) 304 | 305 | prtb_markers[[s]][[gene]] <- all.de.genes 306 | prtb_markers2[[s]][[gene]] <- all.de.genes 307 | if (length(x = all.de.genes) < min.de.genes) { 308 | prtb_markers[[s]][[gene]] <- character() 309 | } 310 | 311 | } 312 | if(verbose){ 313 | message(paste0("Done with extracting top DE genes for ", gene)) 314 | } 315 | } 316 | 317 | all_markers <- unique(x = unlist(x = prtb_markers)) 318 | missing_genes <- all_markers[!all_markers %in% rownames(x = object[[assay]])] 319 | # print(missing_genes) 320 | object <- Seurat:::GetMissingPerturb(object = object, assay = assay, 321 | features = missing_genes, verbose = verbose) 322 | 323 | if(verbose){ 324 | message("Done with getting Missing PRTB") 325 | } 326 | 327 | for (s in splits) { 328 | # print(splits) 329 | cells.s <- cells.s.list[[s]] 330 | genes <- setdiff(x = unique(x = object[[labels]][cells.s, 331 | 1]), y = nt.class.name) 332 | for (gene in genes) { 333 | Idents(object = object) <- labels 334 | post.prob <- 0 335 | orig.guide.cells <- intersect(x = WhichCells(object = object, 336 | idents = gene), y = cells.s) 337 | nt.cells <- intersect(x = WhichCells(object = object, 338 | idents = nt.class.name), y = cells.s) 339 | all.cells <- c(orig.guide.cells, nt.cells) 340 | if (length(x = prtb_markers[[s]][[gene]]) == 0) { 341 | if (verbose) { 342 | message(" Fewer than ", min.de.genes, " DE genes for ", 343 | gene, ". Assigning cells as NP.") 344 | } 345 | object[[new.class.name]][orig.guide.cells, 1] <- paste0(gene, " NP") 346 | } 347 | else { 348 | if (verbose) { 349 | message(" ", gene) 350 | } 351 | de.genes <- prtb_markers[[s]][[gene]] 352 | dat <- GetAssayData(object = object[[assay]], 353 | slot = "data")[de.genes, all.cells, drop = FALSE] 354 | if (slot == "scale.data") { 355 | dat <- ScaleData(object = dat, features = de.genes, 356 | verbose = FALSE) 357 | } 358 | 359 | # the first step to calculate the overall PRTB score 360 | if(verbose){ 361 | cat(paste0("Calculating the overall PRTB score...\n")) 362 | } 363 | Idents(object = object) <- new.class.name 364 | guide.cells <- intersect(x = WhichCells(object = object, 365 | idents = gene), y = cells.s) 366 | vec <- matrixStats::rowMeans2(x = dat[, guide.cells, drop = FALSE]) - 367 | matrixStats::rowMeans2(x = dat[, nt.cells, drop = FALSE]) 368 | 369 | # save the mat and vec to easily calculate the weights by rowSums 370 | pvec_mat = sweep(t(dat), MARGIN=2, vec, `*`) 371 | vec_mat = vec * vec 372 | names(vec_mat) = colnames(pvec_mat) 373 | 374 | # the weights 375 | pvec = matrixStats::rowSums2(pvec_mat)/sum(vec_mat) 376 | names(pvec) = rownames(pvec_mat) 377 | 378 | # create a list to store the PRTB score 379 | gv <- as.data.frame(x = pvec) 380 | gv[, "gene"] <- nt.class.name 381 | gv[intersect(x = rownames(x = gv), y = guide.cells), 382 | "gene"] <- gene 383 | gv.list[[gene]][[s]] <- gv 384 | 385 | # the LOOv2 weights 386 | # for(omit_gene in de.genes){ 387 | # if(verbose){ 388 | # cat(paste0("Calculating the LOO PRTB score by using rowSums2 for ", omit_gene, " in subset ", s, "...\n")) 389 | # } 390 | # remain_gene = de.genes[which(de.genes != omit_gene)] 391 | # pvec2 <- rowSums2(pvec_mat[, remain_gene, drop = F])/sum(vec_mat[remain_gene]) 392 | # # save the LOO weights 393 | # gv.list[[gene]][[s]][, omit_gene] <- pvec2 394 | # } 395 | 396 | # 2023 July 03: substitute the above loop with the following matrix manipulation for speed. 397 | omit_mat <- outer(de.genes, de.genes, `!=`) 398 | 399 | # Function to calculate pvec2 400 | calc_pvec2 <- function(include_gene) { 401 | pvec2 <- matrixStats::rowSums2(pvec_mat[, include_gene, drop = F])/sum(vec_mat[include_gene]) 402 | return(pvec2) 403 | } 404 | 405 | # Apply the function to each column of omit_mat 406 | gv.list[[gene]][[s]][, de.genes] <- apply(omit_mat, 2, calc_pvec2) 407 | 408 | 409 | # the second step to calculate the leave-one-out (LOO) PRTB score 410 | if(verbose){ 411 | message(paste0("Done calculating LOO PRTB score for ", length(de.genes), " genes in ", s, "...\n")) 412 | } 413 | 414 | } 415 | 416 | } 417 | if(verbose){ 418 | message(paste0("Done with calculating scores for ", s)) 419 | } 420 | } 421 | SeuratObject::Tool(object = object) <- gv.list 422 | 423 | # check if gv.list is empty. 424 | if(length(gv.list) == 0){ 425 | warning("Failed to calculate Mixscale scores for any group. \nThis is probably due to insufficient response to perturbation.\nYou may consider lowering the logfc.threshold or min.de.genes when running this function.") 426 | } 427 | 428 | # added Jan 16: calculate the standardized scores and append them to the meta-data 429 | # get the list of PRTBs 430 | wt_PRTB_list = sort(names(gv.list)) 431 | all_PRTB_list = sort(unique(object[[labels]][,1])) 432 | all_PRTB_list = all_PRTB_list[all_PRTB_list != nt.class.name] 433 | wt_PRTB_list = wt_PRTB_list[wt_PRTB_list %in% all_PRTB_list] 434 | 435 | # 436 | mat_B = data.frame(cell_label = colnames(object), 437 | gene = object[[labels]][,1] ) 438 | 439 | # 440 | all_score = data.frame() # to store the scores from each PRTB 441 | for(PRTB in all_PRTB_list){ 442 | mat_A = data.frame() 443 | # check if the scores are calculated successful for this PRTB 444 | if(PRTB %in% wt_PRTB_list){ 445 | celltype_list = names(gv.list[[PRTB]]) 446 | for(celltype in celltype_list){ 447 | # print(gv.list[[PRTB]][[celltype]]) 448 | tmp = gv.list[[PRTB]][[celltype]][, c("pvec", "gene"), drop = FALSE] 449 | 450 | # get the idx for NT cells and PRTBed cells 451 | idx_NT = which(tmp$gene == nt.class.name) 452 | idx_gene = which(tmp$gene == PRTB) 453 | 454 | # 1. calculate the overall weights 455 | # calculate the mean and sd of the PRTB score for the NT cells 456 | mean_NT = mean(tmp$pvec[idx_NT], na.rm = T) 457 | sd_NT = sd(tmp$pvec[idx_NT], na.rm = T) 458 | # standardize the PRTB scores for the PRTBed cells based on the mean and SD from those of the NT cells 459 | std_weight_gene = (tmp$pvec[idx_gene] - mean_NT)/sd_NT 460 | # convert those negative standardised PRTB score to 0 461 | # std_weight_gene[which(std_weight_gene < 0)] = 0 462 | 463 | # create a new column called "weight" in the tmp dataframe. 464 | tmp$weight = 0 465 | tmp$weight[idx_gene] = std_weight_gene 466 | 467 | tmp$cell_label = row.names(tmp) 468 | tmp = tmp[, c("cell_label", "gene", "pvec", "weight")] 469 | 470 | mat_A = rbind(mat_A, tmp) 471 | rm(tmp) 472 | } 473 | } else { 474 | # celltype_list = names(gv.list[[1]]) 475 | # 476 | tmp = mat_B[mat_B$gene %in% c(PRTB, nt.class.name), ] 477 | tmp$weight = 0 478 | tmp[tmp$gene == PRTB, "weight"] = 1 479 | tmp$pvec = tmp$weight 480 | # 481 | tmp = tmp[, c("cell_label", "gene", "pvec", "weight")] 482 | mat_A = tmp 483 | rm(tmp) 484 | } 485 | all_score = rbind(all_score, mat_A) 486 | } 487 | 488 | # some final editing 489 | all_score = all_score[!duplicated(all_score$cell_label), ] 490 | rownames(all_score) = all_score$cell_label 491 | all_score = all_score[, "weight", drop = FALSE] 492 | names(all_score) = new.class.name 493 | # 494 | object[[new.class.name]] = NULL 495 | 496 | # add the standardized scores to the meta-data 497 | object = AddMetaData(object, metadata = all_score) 498 | 499 | return(object) 500 | } 501 | 502 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mixscale 2 | Mixscale is an R package designed to analyze CRISPR interference (CRISPRi) based Perturb-seq data. It can quantify the heterogeneity of perturbation strength in each cell and improve the statistical power when doing differential expression (DE) analysis. It also provides functions for downstream analyses including decomposition, permutation test, gene set enrichment test, etc. A brief vignette is available at https://satijalab.github.io/Mixscale/. 3 | 4 | ## Dependencies 5 | This package depends on several other R packages: 6 | ``` 7 | install.packages("Seurat") 8 | install.packages("PMA") 9 | install.packages("protoclust") 10 | BiocManager::install("glmGamPoi") 11 | ``` 12 | 13 | ## Installation 14 | You can easily install the package by the following command: 15 | ``` 16 | devtools::install_github("satijalab/Mixscale") 17 | ``` 18 | 19 | ## Other resources 20 | * Our preprint is available at https://www.biorxiv.org/content/10.1101/2024.01.29.576933v2. 21 | * If you want to access the data generated in our paper (including the processed scRNA-seq data and the pathway gene signatures), you can download them at https://doi.org/10.5281/zenodo.14518762. 22 | * Raw fastq files are available at the Gene Expression Omnibus (GEO) under the accession code [GSE281048](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE281048). 23 | 24 | -------------------------------------------------------------------------------- /docs/old/New_Vignette_2024Jan.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using Mixscale for Perturb-seq data" 3 | author: "Longda Jiang" 4 | date: "2024-01-09" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Introduction 13 | 14 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\ 15 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \ 16 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \ 17 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \ 18 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \ 19 | 5. identify shared and unique signature between two relevant programs \ 20 | 6. perform gene set enrichment tests using the program signature for new datasets \ 21 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \ 22 | \ 23 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below). 24 | 25 | 26 | ### load the packages 27 | ```{r load_package, message=FALSE, warning=FALSE} 28 | options(Seurat.object.assay.version = 'v3') 29 | 30 | library(Seurat) 31 | library(ggridges) 32 | library(stringr) 33 | library(Mixscale) 34 | ``` 35 | 36 | ## Section 1. 37 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data. 38 | 39 | ### 0. load the demo data 40 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \ 41 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses. 42 | 43 | ```{r load_data, message=FALSE, warning=FALSE, cache=TRUE} 44 | # load the count matrix 45 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx", 46 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv", 47 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv") 48 | # load the meta_data 49 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv") 50 | rownames(meta_data) = meta_data$cell_barcode 51 | 52 | # create a seurat object 53 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data) 54 | rm(ct_mat, meta_data) 55 | 56 | # retrieve the guide information for each cell 57 | txt = seurat_obj$guide_identity 58 | txt2 = str_extract(txt, "^[^_]+") 59 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt) 60 | seurat_obj[['gene']] = txt2 61 | seurat_obj[['gRNA_name']] = txt3 62 | 63 | # remove ambiguous cells 64 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain 65 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain 66 | 67 | seurat_obj 68 | ``` 69 | 70 | ### 1. Pre-processing and calculating the Mixscale score 71 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed. 72 | 73 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE} 74 | # standard pre-processing 75 | seurat_obj = NormalizeData(seurat_obj) 76 | seurat_obj = FindVariableFeatures(seurat_obj) 77 | seurat_obj = ScaleData(seurat_obj) 78 | seurat_obj = RunPCA(seurat_obj) 79 | 80 | # calculate Perturbation signatures 81 | seurat_obj <- CalcPerturbSig( 82 | object = seurat_obj, 83 | assay = "RNA", 84 | slot = "data", 85 | gd.class ="gene", 86 | nt.cell.class = "neg", 87 | reduction = "pca", 88 | ndims = 40, 89 | num.neighbors = 20, 90 | new.assay.name = "PRTB") 91 | 92 | ``` 93 | 94 | Now we will calculate the Mixscale scores for each cell within each perturbation group. 95 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE} 96 | # Mixscale 97 | seurat_obj = RunMixscale( 98 | object = seurat_obj, 99 | assay = "PRTB", 100 | slot = "scale.data", 101 | labels = "gene", 102 | nt.class.name = "neg", 103 | min.de.genes = 5, 104 | logfc.threshold = 0.2, 105 | de.assay = "RNA", 106 | max.de.genes = 100, 107 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F) 108 | 109 | ``` 110 | 111 | 112 | ### 2. Visualizations for the scores 113 | We will now use some plotting functions to explore the perturbation scores that we just calculated. 114 | 115 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE} 116 | # a. Check the distribution of the scores for the first 10 perturbations 117 | Mixscale_RidgePlot(object = seurat_obj, 118 | nt.class.name = "neg", 119 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 120 | facet_wrap = "gene", facet_scale = "fixed") 121 | 122 | # b. Check if the scores correlate with the expression level of the target gene itself 123 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg", 124 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 125 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10) 126 | ``` 127 | 128 | ### 3. Differential expression (DE) analysis 129 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa. 130 | 131 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 132 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation) 133 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts", 134 | labels = "gene", nt.class.name = "neg", 135 | logfc.threshold = 0.1) 136 | 137 | # have a quick look at the DE results 138 | head(de_res[[1]]) 139 | 140 | ``` 141 | 142 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores. 143 | 144 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 145 | # heatmap for the top DE genes 146 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "POLR2H", 147 | slct_condition = "con1", 148 | nt.class.name = "neg", 149 | labels = "gene", 150 | slct_features = rownames(de_res[["POLR2H"]][order(de_res[["POLR2H"]]$p_weight), ])[1:20]) + NoLegend() 151 | 152 | # similar heatmap for the top DE genes, but this time the cells are divided based on gRNA identity using slct_ident 153 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1", 154 | slct_condition = "con1", 155 | nt.class.name = "neg", 156 | labels = "gene", 157 | slct_features = rownames(de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight), ])[1:20], 158 | slct_ident = "gRNA_name") + NoLegend() 159 | ``` 160 | 161 | ### 5. Decomposition analyses to identify correlated perturbations 162 | #### 5.1 Hierarchical clustering 163 | In this section we will perform mainly two types of decomposition analyses for our DE results. The first is a hierarchical clustering analysis based on ([MinMax](https://www.sciencedirect.com/science/article/abs/pii/S0031320314000338)). We will apply it to the DE Z-score matrix of our DE results. 164 | 165 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 166 | # get the Z-score matrix (a list of matrices will be returned, with each matrix correspond to a cell type) 167 | # as for the selection of rows (features), we will use the union set of the top 100 DE genes from each column 168 | DEG_mat_main = get_DE_mat(de_res, p_threshold = 0.05/30000, fc_threshold = 0.2, num_top_DEG = 100) 169 | 170 | # slightly clean up the matrices by removing columns with not enough significant DEGs and replace all the NAs with 0 171 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 5) 172 | 173 | # an empty list to store all the gene sets (a repository of gene sets) 174 | go_db = list() 175 | 176 | # get the cell type (in this example there is only one cell type) 177 | celltype_list = names(DEG_mat) 178 | 179 | # loop through all the cell types to perform Minmax hierarchical clustering 180 | for(i in 1:length(celltype_list)){ 181 | CELLTYPE = celltype_list[i] 182 | tmp=DEG_mat[[CELLTYPE]] 183 | 184 | # run hierarchical clustering using Minmax. Other standard hclust methods are also supported. 185 | # dist_thres defines the height (= 1 - dist_thres) that is used to cut the hclust tree. 186 | # a lower value indicates a more stringent threshold to define clusters. 187 | res = DEhclust(mat = tmp, cor_method = "pearson", hclust_method = "minmax", dist_thres = 0.4) 188 | 189 | # get_sig_genes_DEhclust() is a wrapper function for PCApermtest() and get_sig_genes(), which will 190 | # perform a PCA-based permutation test and extract the top shared DE genes across the perturbations 191 | # in the same cluster 192 | sig_genes = get_sig_genes_DEhclust(obj = res, row_filtering_pval = 0.05) 193 | 194 | # store the extracted top genes as the cluster signature into go_db 195 | for(CLUSTER in names(sig_genes)){ 196 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){ 197 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs 198 | } 199 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){ 200 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs 201 | } 202 | } 203 | 204 | } 205 | 206 | # check the clustering results 207 | res$cluster_assignment 208 | 209 | # generate a correlation matrix plot based on the clustering results 210 | col3 = rev(brewer.pal(11,"RdBu")) 211 | heatmap.2(cor(tmp), 212 | Rowv = as.dendrogram(res$hclust), 213 | Colv = as.dendrogram(res$hclust), 214 | dendrogram = "none", 215 | col = col3) 216 | ``` 217 | 218 | we can also use the following function to generate Z-score heatmap for each perturbation cluster and save them to a user defined directory 219 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 220 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30, 221 | output_path = "output_path/", 222 | prefix = CELLTYPE) 223 | ``` 224 | 225 | #### 5.2 MultiCCA analysis 226 | In our [paper](not_insert_yet), we introduced a novel approach for identifying correlated perturbations both within and between various matrices. This method is applicable when DE Z-scores are organized into a list of Z-score matrices, with each matrix corresponding to a cell type. This is especially useful when multiple cell types/lines are used in a Perturb-seq experiment. The demo dataset only contains one cell type, so we will randomly divide the Z-score matrix above into a list of 3 matrices, and use them to test our MultiCCA method. 227 | 228 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 229 | # first we randomly divide the DEG_mat_main into a list of 3 matrices 230 | set.seed(100) 231 | DEG_mat_main2 = list(type1 = DEG_mat_main[, c(1, sample(2:26)[1:8])], 232 | type2 = DEG_mat_main[, c(1, sample(2:26)[9:16])], 233 | type2 = DEG_mat_main[, c(1, sample(2:26)[17:25])] ) 234 | 235 | # clean up the matrices 236 | DEG_mat = prune_DE_mat(DEG_mat_main2, min_sig_DEG = 5, center = T) 237 | 238 | # run MultiCCA 239 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, mean_cor_thres = 0.2, max_k = 3, standardize = F) 240 | 241 | # get_sig_genes_DEmultiCCA is a wrapper function for PCApermtest() and get_sig_genes() for DEmultiCCA object. 242 | sig_genes = get_sig_genes_DEmultiCCA(res, row_filtering_pval = 0.05) 243 | 244 | # store the gene signatures to the go-term repo 245 | for(PROGRAM in names(sig_genes)){ 246 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){ 247 | go_db[[paste0("MultiCCA_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs 248 | } 249 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){ 250 | go_db[[paste0("MultiCCA_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs 251 | } 252 | } 253 | 254 | ``` 255 | 256 | Again, we can use the following function to generate Z-score heatmap for each perturbation program and save them to a user defined directory 257 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 258 | DE_heatmap(obj = res, sig_genes = sig_genes, 259 | type = "multiCCA", direction = "both", 260 | top_n = 30, labRow = T, 261 | output_path = "output_path/", 262 | prefix = "MultiCCA") 263 | ``` 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /docs/old/index copy 2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Merged_Vignette_2024Jan16" 3 | author: "Longda Jiang" 4 | date: "2024-01-16" 5 | output: html_document 6 | --- 7 | 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(echo = TRUE) 11 | ``` 12 | 13 | ## Introduction 14 | 15 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\ 16 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \ 17 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \ 18 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \ 19 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \ 20 | 5. identify shared and unique signature between two relevant programs \ 21 | 6. perform gene set enrichment tests using the program signature for new datasets \ 22 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \ 23 | \ 24 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below). 25 | 26 | 27 | ### load the packages 28 | ```{r load_package, message=FALSE, warning=FALSE} 29 | options(Seurat.object.assay.version = 'v3') 30 | 31 | library(Seurat) 32 | library(ggridges) 33 | library(stringr) 34 | library(Mixscale) 35 | library(ggplot2) 36 | ``` 37 | 38 | ## Section A. 39 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data. 40 | 41 | ### 0. Description of the demo data 42 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \ 43 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses. 44 | 45 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 46 | # load in the count matrix downloaded from GSE132080 47 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx", 48 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv", 49 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv") 50 | # load the meta_data 51 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv") 52 | rownames(meta_data) = meta_data$cell_barcode 53 | 54 | # create a seurat object 55 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data) 56 | rm(ct_mat, meta_data) 57 | 58 | # retrieve the guide information for each cell 59 | txt = seurat_obj$guide_identity 60 | txt2 = str_extract(txt, "^[^_]+") 61 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt) 62 | seurat_obj[['gene']] = txt2 63 | seurat_obj[['gRNA_name']] = txt3 64 | seurat_obj[['cell_type']] = "K562" 65 | rm(txt, txt2, txt3) 66 | 67 | # remove ambiguous cells 68 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain 69 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain 70 | 71 | ``` 72 | 73 | 74 |
**Click here to see how to generate the Seurat object** 75 | ```{r load_data2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 76 | # load in the count matrix downloaded from GSE132080 77 | ct_mat = ReadMtx(mtx = "GSE132080/GSE132080_10X_matrix.mtx", 78 | cells = "GSE132080/GSE132080_10X_barcodes.tsv", 79 | features = "GSE132080/GSE132080_10X_genes.tsv") 80 | # load the meta_data 81 | meta_data = read.csv("GSE132080/GSE132080_cell_identities.csv") 82 | rownames(meta_data) = meta_data$cell_barcode 83 | 84 | # create a seurat object 85 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data) 86 | rm(ct_mat, meta_data) 87 | 88 | # retrieve the guide information for each cell 89 | txt = seurat_obj$guide_identity 90 | txt2 = str_extract(txt, "^[^_]+") 91 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt) 92 | seurat_obj[['gene']] = txt2 93 | seurat_obj[['gRNA_name']] = txt3 94 | seurat_obj[['cell_type']] = "K562" 95 | rm(txt, txt2, txt3) 96 | 97 | # remove ambiguous cells 98 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain 99 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain 100 | 101 | ``` 102 |
103 | \ 104 | 105 | 106 | ### 1. Pre-processing and calculating the Mixscale score 107 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed. 108 | 109 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 110 | # quick check of the data 111 | head(seurat_obj) 112 | 113 | # standard pre-processing 114 | seurat_obj = NormalizeData(seurat_obj) 115 | seurat_obj = FindVariableFeatures(seurat_obj) 116 | seurat_obj = ScaleData(seurat_obj) 117 | seurat_obj = RunPCA(seurat_obj) 118 | 119 | # calculate Perturbation signatures 120 | seurat_obj <- CalcPerturbSig( 121 | object = seurat_obj, 122 | assay = "RNA", 123 | slot = "data", 124 | gd.class ="gene", 125 | nt.cell.class = "neg", 126 | reduction = "pca", 127 | ndims = 40, 128 | num.neighbors = 20, 129 | new.assay.name = "PRTB") 130 | 131 | ``` 132 | 133 | Now we will calculate the Mixscale scores for each cell within each perturbation group. 134 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 135 | # Mixscale 136 | seurat_obj = RunMixscale( 137 | object = seurat_obj, 138 | assay = "PRTB", 139 | slot = "scale.data", 140 | labels = "gene", 141 | nt.class.name = "neg", 142 | min.de.genes = 5, 143 | logfc.threshold = 0.2, 144 | de.assay = "RNA", 145 | max.de.genes = 100, 146 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F) 147 | 148 | ``` 149 | 150 | 151 | ### 2. Visualizations for the scores 152 | We will now use some plotting functions to explore the perturbation scores that we just calculated. 153 | 154 | ```{r ridge_plot, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 155 | # a. Check the distribution of the scores for the first 10 perturbations 156 | Mixscale_RidgePlot(object = seurat_obj, 157 | nt.class.name = "neg", 158 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 159 | facet_wrap = "gene", facet_scale = "fixed") 160 | 161 | # b. Check if the scores correlate with the expression level of the target gene itself 162 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg", 163 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 164 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10) 165 | ``` 166 | 167 | ### 3. Differential expression (DE) analysis 168 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa. 169 | 170 | ```{r DE, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 171 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation) 172 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts", 173 | labels = "gene", nt.class.name = "neg", 174 | logfc.threshold = 0.1) 175 | 176 | # have a quick look at the DE results 177 | head(de_res[[1]]) 178 | 179 | ``` 180 | 181 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores. 182 | 183 | ```{r DE_heatmap, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 184 | # select the top 20 DE genes from on of the perturbation 185 | top_res = de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight)[1:20], ] 186 | # order the DE genes based on its log-fold-change 187 | top_DEG = rownames(top_res[order(top_res$beta_weight), ]) 188 | 189 | # heatmap for the top DE genes. cells ordered by Mixscale scores 190 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1", 191 | slct_condition = "con1", 192 | nt.class.name = "neg", 193 | labels = "gene", 194 | slct_features = top_DEG, 195 | slct_ident = "gene") 196 | 197 | ``` 198 | 199 | We can also explore the DE results for some other perturbations using similar codes as above. 200 | ```{r DE_heatmap_2, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 201 | # select the top 20 DE genes from on of the perturbation 202 | for(PRTB in c("GINS1", "MTOR", "TUBB")){ 203 | top_res = de_res[[PRTB]][order(de_res[[PRTB]]$p_weight)[1:20], ] 204 | # order the DE genes based on its log-fold-change 205 | top_DEG = rownames(top_res[order(top_res$beta_weight), ]) 206 | 207 | # 208 | p = Mixscale_DoHeatmap(object = seurat_obj, PRTB = PRTB, 209 | slct_condition = "con1", 210 | nt.class.name = "neg", 211 | labels = "gene", 212 | slct_features = top_DEG, 213 | slct_ident = "gene") 214 | print(p) 215 | } 216 | 217 | ``` 218 | 219 | 220 | 221 | ## Section B. 222 | In this section we will focus on how to use the pathway signatures from our study to run gene set enrichment test in external datasets. 223 | 224 | ### 0. Introduction 225 | We will use the interferon-beta (IFNB) stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042) (available via [SeuratData](https://github.com/satijalab/seurat-data) package) to demonstrate how to perform gene set enrichment analyses using the pathway gene sets from our study. We aim to show that by using our pathway gene lists, we can correctly infer the pathway activation of IFNB across different cell types in the human PBMCs. 226 | 227 | ```{r load_ifnb, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 228 | library(SeuratData) 229 | # load dataset 230 | ifnb <- LoadData("ifnb") 231 | ``` 232 | 233 | ```{r load_ifnb2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 234 | # install the ifnb dataset 235 | SeuratData::InstallData("ifnb") 236 | # load dataset 237 | ifnb <- SeuratData::LoadData("ifnb") 238 | ``` 239 | 240 | We can then load the pathway gene sets we generated (can be downloaded from [Zenodo](not_yet_insert)). There are two versions of pathway gene lists provided. One is the standard pathway gene list for different pathway programs we compiled, and the other one is the pathway exclusive gene list that filtered out the shared genes shared with other relevant pathways in the experiment. 241 | ```{r load_geneset, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 242 | plist3 = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/P3_signatures_2023Jun19.rds") 243 | plist = Reduce(c, plist3) 244 | plist = plist[c("IFNB_program1_down", #"IFNB_program1_up", 245 | "IFNB_program2_down", #"IFNB_program2_up", 246 | "IFNG_program1_down", #"IFNG_program1_up", 247 | "IFNG_program2_down", #"IFNG_program2_up", 248 | "TNFA_program1_down", #"TNFA_program1_up", 249 | "TNFA_program2_down", #"TNFA_program2_up", 250 | "TGFB1_program1_down", #"TGFB1_program1_up", 251 | "TGFB1_program2_down")] #"TGFB1_program2_up")] 252 | 253 | exclusive_plist = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/Exclusive_signatures_2023Jun20.rds") 254 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG", 255 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")] 256 | ``` 257 | 258 | ```{r load_geneset2, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE, eval=FALSE} 259 | plist = readRDS("pathway_genelist.rds") 260 | exclusive_plist = readRDS("Exclusive_pathway_genelist.rds") 261 | 262 | # only extract the exclusive gene lists that are relevant to IFNB pathway 263 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG", 264 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")] 265 | ``` 266 | 267 | ### 1. DE tests and Fisher enrichment tests for ifnb dataset 268 | We will first conduct Wilcox DE tests between the control and the IFNB-stimulated cells in each cell types in the ifnb dataset. Then, we will perform Fisher enrichment tests for the DE genes from each of the cell types, testing them against the pathway gene lists we just load. These two steps are merged by a wrapper function. 269 | ```{r ifnb_DE, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 270 | # Normalize the counts 271 | ifnb = NormalizeData(ifnb) 272 | 273 | # A wrapper function to perform both DE and enrichment test 274 | res = Mixscale_DEenrich(object = ifnb, 275 | plist = plist, 276 | labels = "seurat_annotations", 277 | conditions = "stim", 278 | ident.1 = "STIM", 279 | ident.2 = "CTRL", 280 | direction = "up", 281 | logfc.threshold = 0.2, 282 | p.val.cutoff = 0.05, 283 | min.pct = 0.1) 284 | 285 | # check the enrichment results for CD14 Monocytes 286 | head(res$`CD14 Mono`) 287 | ``` 288 | 289 | 290 | ### 2. Enrichment tests using pathway exclusive gene lists 291 | Gene lists from related pathways, such as IFNG, IFNB, and TNFA which are all linked to immune responses, frequently share many genes. This overlap makes it challenging to differentiate the activation of these pathways. For example, as the result above shows, DE genes due to IFNB stimulation are enriched in not just the IFNB pathway, but also in IFNG and TNFA pathways. To overcome this challenge, we have introduced a concept of pathway-exclusive gene lists. Essentially, for any two related pathways, we define the exclusive genes of one pathway as those that are absent from the gene list of the other. To refine this further, we employed a more stringent criterion to exclude genes that, while potentially related, are not explicitly listed in the gene list of the other pathway (For a detailed explanation, please refer to our [paper](not_yet_insert)). Performing enrichment tests using the exclusive gene lists enhances our ability to accurately distinguish activations among closely associated pathways. 292 | 293 | ```{r ifnb_excl_enrich, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 294 | # A wrapper function to perform both DE and enrichment test 295 | res_exclusive = Mixscale_DEenrich(object = ifnb, 296 | plist = exclusive_plist, 297 | labels = "seurat_annotations", 298 | conditions = "stim", 299 | ident.1 = "STIM", 300 | ident.2 = "CTRL", 301 | direction = "up", 302 | logfc.threshold = 0.2, 303 | p.val.cutoff = 0.05, 304 | min.pct = 0.1) 305 | 306 | # check the enrichment results for CD14 Monocytes 307 | head(res_exclusive$`CD14 Mono`) 308 | ``` 309 | 310 | We can see that the exclusive gene lists for IFNB (removing TNFA) and IFNB (removing IFNG) are still enriched for IFNB-stimulated DE genes. But we do not observe signals from IFNG (removing IFNB) or TNFA (removing IFNB), indicating that the underlying activated pathway during IFNB stimulation is indeed IFNB, while IFNG and TNFA are showing enrichment just because of their substantial overlap with IFNB. 311 | 312 | 313 | ### 3. Visualization 314 | We can now visualize the enrichment results across all the cell types in the ifnb dataset. First we will check the results for the standard enrichment test 315 | ```{r plot_standard, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 316 | DEenrich_DotPlot(res, 317 | direction = "up", 318 | plot_title = "Standard pathway gene lists") 319 | 320 | ``` 321 | 322 | 323 | ```{r plot_exclusive, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 324 | DEenrich_DotPlot(res_exclusive, 325 | direction = "up", 326 | plot_title = "Pathway exclusive gene lists", 327 | OR_cutoff = 10) 328 | 329 | ``` 330 | 331 | ### 4. Module score analysis 332 | Apart from performing enrishment test, we can also evaluate the pathway activity by calculating the over all expression level of all the genes within a gene list (the so-called module score analysis). We will use package ["UCell"](https://bioconductor.org/packages/release/bioc/html/UCell.html) for module score analysis. Alternatively, we can use the built-in function [AddModuleScore()](https://satijalab.org/seurat/reference/addmodulescore) from Seurat as well. 333 | 334 | ```{r module_score, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 335 | ifnb = UCell::AddModuleScore_UCell(ifnb, 336 | features = plist[c("IFNB_program1_down", "IFNG_program1_down", 337 | "TNFA_program1_down", "TGFB1_program1_down")] ) 338 | 339 | # using VlnPlot to visualize the score of each cell 340 | VlnPlot(ifnb, 341 | features = grep("_UCell", names(ifnb@meta.data), value = T), 342 | pt.size = 0, 343 | group.by = "seurat_annotations", 344 | split.by = "stim", 345 | ncol = 2) & 346 | theme(legend.position = "NA", 347 | axis.title = element_text(size = 15), 348 | axis.text = element_text(size = 12), 349 | plot.title = element_text(size = 18)) & 350 | ylim(0.1, 0.4) 351 | 352 | ``` 353 | 354 | We can observe very similar results as in our enrichment tests, where all IFNB, IFNG, and TNF pathways show a high activity (and not for TGFB pathway) in the IFNB-stimulated cells compared to the non-stimulated cells. And if we repeat the module score analysis using the pathway exclusive gene lists, we should be able to determine the pathway actually being activated (i.e., IFNB pathway). 355 | 356 | ```{r module_score_excl, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 357 | ifnb = UCell::AddModuleScore_UCell(ifnb, 358 | features = exclusive_plist[c("IFNB_REMOVE_IFNG", "IFNB_REMOVE_TNFA", 359 | "IFNG_REMOVE_IFNB", "TNFA_REMOVE_IFNB")] ) 360 | 361 | # using VlnPlot to visualize the score of each cell 362 | VlnPlot(ifnb, 363 | features = grep("_REMOVE_.*UCell", names(ifnb@meta.data), value = T), 364 | pt.size = 0, 365 | group.by = "seurat_annotations", 366 | split.by = "stim", 367 | ncol = 2) & 368 | theme(legend.position = "NA", 369 | axis.title = element_text(size = 15), 370 | axis.text = element_text(size = 12), 371 | plot.title = element_text(size = 18)) 372 | 373 | ``` 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /docs/old/index copy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Merged_Vignette_2024Jan16" 3 | author: "Longda Jiang" 4 | date: "2024-01-16" 5 | output: html_document 6 | --- 7 | 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(echo = TRUE) 11 | ``` 12 | 13 | ## Introduction 14 | 15 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\ 16 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \ 17 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \ 18 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \ 19 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \ 20 | 5. identify shared and unique signature between two relevant programs \ 21 | 6. perform gene set enrichment tests using the program signature for new datasets \ 22 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \ 23 | \ 24 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below). 25 | 26 | 27 | ### load the packages 28 | ```{r load_package, message=FALSE, warning=FALSE} 29 | options(Seurat.object.assay.version = 'v3') 30 | 31 | library(Seurat) 32 | library(ggridges) 33 | library(stringr) 34 | library(Mixscale) 35 | library(ggplot2) 36 | ``` 37 | 38 | ## Section A. 39 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data. 40 | 41 | ### 0. Description of the demo data 42 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \ 43 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses. 44 | 45 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 46 | # load in the count matrix downloaded from GSE132080 47 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx", 48 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv", 49 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv") 50 | # load the meta_data 51 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv") 52 | rownames(meta_data) = meta_data$cell_barcode 53 | 54 | # create a seurat object 55 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data) 56 | rm(ct_mat, meta_data) 57 | 58 | # retrieve the guide information for each cell 59 | txt = seurat_obj$guide_identity 60 | txt2 = str_extract(txt, "^[^_]+") 61 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt) 62 | seurat_obj[['gene']] = txt2 63 | seurat_obj[['gRNA_name']] = txt3 64 | seurat_obj[['cell_type']] = "K562" 65 | rm(txt, txt2, txt3) 66 | 67 | # remove ambiguous cells 68 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain 69 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain 70 | 71 | ``` 72 | 73 | 74 |
**Click here to see how to generate the Seurat object** 75 | ```{r load_data2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 76 | # load in the count matrix downloaded from GSE132080 77 | ct_mat = ReadMtx(mtx = "GSE132080/GSE132080_10X_matrix.mtx", 78 | cells = "GSE132080/GSE132080_10X_barcodes.tsv", 79 | features = "GSE132080/GSE132080_10X_genes.tsv") 80 | # load the meta_data 81 | meta_data = read.csv("GSE132080/GSE132080_cell_identities.csv") 82 | rownames(meta_data) = meta_data$cell_barcode 83 | 84 | # create a seurat object 85 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data) 86 | rm(ct_mat, meta_data) 87 | 88 | # retrieve the guide information for each cell 89 | txt = seurat_obj$guide_identity 90 | txt2 = str_extract(txt, "^[^_]+") 91 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt) 92 | seurat_obj[['gene']] = txt2 93 | seurat_obj[['gRNA_name']] = txt3 94 | seurat_obj[['cell_type']] = "K562" 95 | rm(txt, txt2, txt3) 96 | 97 | # remove ambiguous cells 98 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain 99 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain 100 | 101 | ``` 102 |
103 | \ 104 | 105 | 106 | ### 1. Pre-processing and calculating the Mixscale score 107 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed. 108 | 109 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 110 | # quick check of the data 111 | head(seurat_obj) 112 | 113 | # standard pre-processing 114 | seurat_obj = NormalizeData(seurat_obj) 115 | seurat_obj = FindVariableFeatures(seurat_obj) 116 | seurat_obj = ScaleData(seurat_obj) 117 | seurat_obj = RunPCA(seurat_obj) 118 | 119 | # calculate Perturbation signatures 120 | seurat_obj <- CalcPerturbSig( 121 | object = seurat_obj, 122 | assay = "RNA", 123 | slot = "data", 124 | gd.class ="gene", 125 | nt.cell.class = "neg", 126 | reduction = "pca", 127 | ndims = 40, 128 | num.neighbors = 20, 129 | new.assay.name = "PRTB") 130 | 131 | ``` 132 | 133 | Now we will calculate the Mixscale scores for each cell within each perturbation group. 134 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 135 | # Mixscale 136 | seurat_obj = RunMixscale( 137 | object = seurat_obj, 138 | assay = "PRTB", 139 | slot = "scale.data", 140 | labels = "gene", 141 | nt.class.name = "neg", 142 | min.de.genes = 5, 143 | logfc.threshold = 0.2, 144 | de.assay = "RNA", 145 | max.de.genes = 100, 146 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F) 147 | 148 | ``` 149 | 150 | 151 | ### 2. Visualizations for the scores 152 | We will now use some plotting functions to explore the perturbation scores that we just calculated. 153 | 154 | ```{r ridge_plot, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 155 | # a. Check the distribution of the scores for the first 10 perturbations 156 | Mixscale_RidgePlot(object = seurat_obj, 157 | nt.class.name = "neg", 158 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 159 | facet_wrap = "gene", facet_scale = "fixed") 160 | 161 | # b. Check if the scores correlate with the expression level of the target gene itself 162 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg", 163 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10], 164 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10) 165 | ``` 166 | 167 | ### 3. Differential expression (DE) analysis 168 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa. 169 | 170 | ```{r DE, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 171 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation) 172 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts", 173 | labels = "gene", nt.class.name = "neg", 174 | logfc.threshold = 0.1) 175 | 176 | # have a quick look at the DE results 177 | head(de_res[[1]]) 178 | 179 | ``` 180 | 181 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores. 182 | 183 | ```{r DE_heatmap, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 184 | # select the top 20 DE genes from on of the perturbation 185 | top_res = de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight)[1:20], ] 186 | # order the DE genes based on its log-fold-change 187 | top_DEG = rownames(top_res[order(top_res$beta_weight), ]) 188 | 189 | # heatmap for the top DE genes. cells ordered by Mixscale scores 190 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1", 191 | slct_condition = "con1", 192 | nt.class.name = "neg", 193 | labels = "gene", 194 | slct_features = top_DEG, 195 | slct_ident = "gene") 196 | ``` 197 | 198 | 199 | ## Section B. 200 | In this section we will focus on how to use the pathway signatures from our study to run gene set enrichment test in external datasets. 201 | 202 | ### 0. Introduction 203 | We will use the interferon-beta (IFNB) stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042) (available via [SeuratData](https://github.com/satijalab/seurat-data) package) to demonstrate how to perform gene set enrichment analyses using the pathway gene sets from our study. We aim to show that by using our pathway gene lists, we can correctly infer the pathway activation of IFNB across different cell types in the human PBMCs. 204 | 205 | ```{r load_ifnb, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 206 | library(SeuratData) 207 | # load dataset 208 | ifnb <- LoadData("ifnb") 209 | ``` 210 | 211 | ```{r load_ifnb2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE} 212 | # install the ifnb dataset 213 | SeuratData::InstallData("ifnb") 214 | # load dataset 215 | ifnb <- SeuratData::LoadData("ifnb") 216 | ``` 217 | 218 | We can then load the pathway gene sets we generated (can be downloaded from [Zenodo](not_yet_insert)). There are two versions of pathway gene lists provided. One is the standard pathway gene list for different pathway programs we compiled, and the other one is the pathway exclusive gene list that filtered out the shared genes shared with other relevant pathways in the experiment. 219 | ```{r load_geneset, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE} 220 | plist3 = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/P3_signatures_2023Jun19.rds") 221 | plist = Reduce(c, plist3) 222 | plist = plist[c("IFNB_program1_down", #"IFNB_program1_up", 223 | "IFNB_program2_down", #"IFNB_program2_up", 224 | "IFNG_program1_down", #"IFNG_program1_up", 225 | "IFNG_program2_down", #"IFNG_program2_up", 226 | "TNFA_program1_down", #"TNFA_program1_up", 227 | "TNFA_program2_down", #"TNFA_program2_up", 228 | "TGFB1_program1_down", #"TGFB1_program1_up", 229 | "TGFB1_program2_down")] #"TGFB1_program2_up")] 230 | 231 | exclusive_plist = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/Exclusive_signatures_2023Jun20.rds") 232 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG", 233 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")] 234 | ``` 235 | 236 | ```{r load_geneset2, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE, eval=FALSE} 237 | plist = readRDS("pathway_genelist.rds") 238 | exclusive_plist = readRDS("Exclusive_pathway_genelist.rds") 239 | 240 | # only extract the exclusive gene lists that are relevant to IFNB pathway 241 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG", 242 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")] 243 | ``` 244 | 245 | ### 1. DE tests and Fisher enrichment tests for ifnb dataset 246 | We will first conduct Wilcox DE tests between the control and the IFNB-stimulated cells in each cell types in the ifnb dataset. Then, we will perform Fisher enrichment tests for the DE genes from each of the cell types, testing them against the pathway gene lists we just load. These two steps are merged by a wrapper function. 247 | ```{r ifnb_DE, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 248 | # Normalize the counts 249 | ifnb = NormalizeData(ifnb) 250 | 251 | # A wrapper function to perform both DE and enrichment test 252 | res = Mixscale_DEenrich(object = ifnb, 253 | plist = plist, 254 | labels = "seurat_annotations", 255 | conditions = "stim", 256 | ident.1 = "STIM", 257 | ident.2 = "CTRL", 258 | direction = "up", 259 | logfc.threshold = 0.2, 260 | p.val.cutoff = 0.05, 261 | min.pct = 0.1) 262 | 263 | # check the enrichment results for CD14 Monocytes 264 | head(res$`CD14 Mono`) 265 | ``` 266 | 267 | 268 | ### 2. Enrichment tests using pathway exclusive gene lists 269 | Gene lists from related pathways, such as IFNG, IFNB, and TNFA which are all linked to immune responses, frequently share many genes. This overlap makes it challenging to differentiate the activation of these pathways. For example, as the result above shows, DE genes due to IFNB stimulation are enriched in not just the IFNB pathway, but also in IFNG and TNFA pathways. To overcome this challenge, we have introduced a concept of pathway-exclusive gene lists. Essentially, for any two related pathways, we define the exclusive genes of one pathway as those that are absent from the gene list of the other. To refine this further, we employed a more stringent criterion to exclude genes that, while potentially related, are not explicitly listed in the gene list of the other pathway (For a detailed explanation, please refer to our [paper](not_yet_insert)). Performing enrichment tests using the exclusive gene lists enhances our ability to accurately distinguish activations among closely associated pathways. 270 | 271 | ```{r ifnb_excl_enrich, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 272 | # A wrapper function to perform both DE and enrichment test 273 | res_exclusive = Mixscale_DEenrich(object = ifnb, 274 | plist = exclusive_plist, 275 | labels = "seurat_annotations", 276 | conditions = "stim", 277 | ident.1 = "STIM", 278 | ident.2 = "CTRL", 279 | direction = "up", 280 | logfc.threshold = 0.2, 281 | p.val.cutoff = 0.05, 282 | min.pct = 0.1) 283 | 284 | # check the enrichment results for CD14 Monocytes 285 | head(res_exclusive$`CD14 Mono`) 286 | ``` 287 | 288 | We can see that the exclusive gene lists for IFNB (removing TNFA) and IFNB (removing IFNG) are still enriched for IFNB-stimulated DE genes. But we do not observe signals from IFNG (removing IFNB) or TNFA (removing IFNB), indicating that the underlying activated pathway during IFNB stimulation is indeed IFNB, while IFNG and TNFA are showing enrichment just because of their substantial overlap with IFNB. 289 | 290 | 291 | ### 3. Visualization 292 | We can now visualize the enrichment results across all the cell types in the ifnb dataset. First we will check the results for the standard enrichment test 293 | ```{r plot_standard, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 294 | DEenrich_DotPlot(res, 295 | direction = "up", 296 | plot_title = "Standard pathway gene lists") 297 | 298 | ``` 299 | 300 | 301 | ```{r plot_exclusive, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 302 | DEenrich_DotPlot(res_exclusive, 303 | direction = "up", 304 | plot_title = "Pathway exclusive gene lists", 305 | OR_cutoff = 10) 306 | 307 | ``` 308 | 309 | ### 4. Module score analysis 310 | Apart from performing enrishment test, we can also evaluate the pathway activity by calculating the over all expression level of all the genes within a gene list (the so-called module score analysis). We will use package ["UCell"](https://bioconductor.org/packages/release/bioc/html/UCell.html) for module score analysis. Alternatively, we can use the built-in function [AddModuleScore()](https://satijalab.org/seurat/reference/addmodulescore) from Seurat as well. 311 | 312 | ```{r module_score, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 313 | ifnb = UCell::AddModuleScore_UCell(ifnb, 314 | features = plist[c("IFNB_program1_down", "IFNG_program1_down", 315 | "TNFA_program1_down", "TGFB1_program1_down")] ) 316 | 317 | # using VlnPlot to visualize the score of each cell 318 | VlnPlot(ifnb, 319 | features = grep("_UCell", names(ifnb@meta.data), value = T), 320 | pt.size = 0, 321 | group.by = "seurat_annotations", 322 | split.by = "stim", 323 | ncol = 2) & 324 | theme(legend.position = "NA", 325 | axis.title = element_text(size = 15), 326 | axis.text = element_text(size = 12), 327 | plot.title = element_text(size = 18)) & 328 | ylim(0.1, 0.4) 329 | 330 | ``` 331 | 332 | We can observe very similar results as in our enrichment tests, where all IFNB, IFNG, and TNF pathways show a high activity (and not for TGFB pathway) in the IFNB-stimulated cells compared to the non-stimulated cells. And if we repeat the module score analysis using the pathway exclusive gene lists, we should be able to determine the pathway actually being activated (i.e., IFNB pathway). 333 | 334 | ```{r module_score_excl, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE} 335 | ifnb = UCell::AddModuleScore_UCell(ifnb, 336 | features = exclusive_plist[c("IFNB_REMOVE_IFNG", "IFNB_REMOVE_TNFA", 337 | "IFNG_REMOVE_IFNB", "TNFA_REMOVE_IFNB")] ) 338 | 339 | # using VlnPlot to visualize the score of each cell 340 | VlnPlot(ifnb, 341 | features = grep("_REMOVE_.*UCell", names(ifnb@meta.data), value = T), 342 | pt.size = 0, 343 | group.by = "seurat_annotations", 344 | split.by = "stim", 345 | ncol = 2) & 346 | theme(legend.position = "NA", 347 | axis.title = element_text(size = 15), 348 | axis.text = element_text(size = 12), 349 | plot.title = element_text(size = 18)) 350 | 351 | ``` 352 | 353 | 354 | 355 | 356 | 357 | -------------------------------------------------------------------------------- /docs/old/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Calculate perturbation scores for Perturb-seq data" 3 | author: "Longda Jiang" 4 | date: "2023-09-12" 5 | output: html_document 6 | --- 7 | 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(warning = FALSE) 11 | ``` 12 | 13 | ## Introduction 14 | 15 | This html file describes a robust computational framework, "perturbation scoring", that allows us to accurately and efficiently capture the wide range of cellular responses to different perturbations in the context of single-cell pooled CRSIPR screens. Instead of treating cells that receive the same gRNA equally, this framework uses a scoring strategy (extended from 'mixscape', see https://satijalab.org/seurat/articles/mixscape_vignette) to capture the variability in each cell's response to the perturbation. Specifically, we introduce functions for: \ 16 | 1. calculating 'perturbation scores' for cells that receives the same perturbation across multiple cell lines \ 17 | 2. performing scoring-based differential expression (DE) tests to identify DE genes for different perturbations \ 18 | 3. performing different levels of decomposition analysis to identify correlated perturbations given their shared DE genes \ 19 | 4. performing PCA-based permutation tests to identify shared gene signatures for correlation perturbations \ 20 | 5. constructing a gene-set repository for each group of correlated perturbations (similar to a gene-ontology database) \ 21 | 5. performing enrichment tests for external gene lists using the gene-set repo generated above \ 22 | \ 23 | 24 | ## load the packages 25 | ```{r load_package, message=FALSE, warning=FALSE} 26 | library(devtools) 27 | library(Seurat) 28 | library(ggridges) 29 | library(PRTBScoring) 30 | ``` 31 | 32 | ## 0. load the demo data 33 | The demo dataset contains CRISPRi Perturb-seq data for the IFN-gamma pathway. The perturbation targets include "IFNGR1", "IRF1", "IRF2", "JAK1", "STAT1", etc.. 34 | 35 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE} 36 | seurat_obj = readRDS(file = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/raw_seq_processing_2021Dec/mixscape_dat10x_NovaSeq_2021Dec07.rds") 37 | seurat_obj$cell_type = seurat_obj$HTO_classification 38 | seurat_obj$cell_type[seurat_obj$cell_type == "Negative"] = "K562" 39 | DefaultAssay(seurat_obj) = "RNA" 40 | seurat_obj[['PRTB']] = NULL 41 | 42 | # select a subset of perturbations for our demo 43 | # the perturbation identity is stored in the "gene" column in the meta.data 44 | seurat_obj = subset(seurat_obj, subset = gene %in% c("NT", "ZC3H3", "IFNGR1", "IFNGR2", 45 | "IRF1", "IRF2", "IRF5", "JUN", "MAFF", 46 | "PARP12", "RUNX1", 47 | "JAK1", "JAK2", 48 | "STAT1", "STAT2", "STAT3")) 49 | table(seurat_obj$gene, seurat_obj$cell_type) 50 | ``` 51 | 52 | 53 | ## 1. Standard processing 54 | To perform standard processing for the dataset. 55 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 56 | seurat_obj = NormalizeData(seurat_obj) 57 | seurat_obj = FindVariableFeatures(seurat_obj) 58 | seurat_obj = ScaleData(seurat_obj) 59 | seurat_obj = RunPCA(seurat_obj) 60 | seurat_obj = RunUMAP(seurat_obj, dims = 1:40) 61 | DimPlot(seurat_obj, group.by = "cell_type") 62 | 63 | ``` 64 | 65 | 66 | ## 2. Calculate perturbation signatures (correcting for confounding) 67 | To use the CalcPerturbSig() function from "Mixscape" to correct for confounding factors in each cell. 68 | ```{r calc_prtb_sig, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 69 | seurat_obj <- CalcPerturbSig( 70 | object = seurat_obj, 71 | assay = "RNA", 72 | slot = "data", 73 | gd.class ="gene", 74 | nt.cell.class = "NT", 75 | reduction = "pca", 76 | ndims = 40, 77 | num.neighbors = 20, 78 | new.assay.name = "PRTB", 79 | split.by = "cell_type") 80 | 81 | ``` 82 | 83 | 84 | ## 3. Perturbation scoring 85 | This part will use the scoring strategy to assign a score to each cell. The score represents the perturbation strength 86 | each cell undergoes. 87 | 88 | ```{r, echo=TRUE, cache=FALSE} 89 | seurat_obj = PRTBScoring( 90 | object = seurat_obj, 91 | assay = "PRTB", 92 | slot = "scale.data", 93 | labels = "gene", 94 | nt.class.name = "NT", 95 | min.de.genes = 5, 96 | split.by = "cell_type", 97 | logfc.threshold = 0.2, 98 | de.assay = "RNA", 99 | max.de.genes = 100, 100 | prtb.type = "P", 101 | new.class.name = "mixscape_v1", 102 | fine.mode = F, 103 | harmonize = T, 104 | seed = 1) 105 | 106 | # take a look at the scores 107 | Tool(seurat_obj, slot = "PRTBScoring")[[1]][[1]][1:5, 1:2] 108 | 109 | ``` 110 | 111 | ## 3.5 Some visualizations for the scores 112 | We will now use some plotting functions to explore the perturbation scores that we just calculated. 113 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 114 | # a. Check the distribution of the scores 115 | PRTBscore_RidgePlot(object = seurat_obj, split.by = "cell_type", 116 | PRTB = c("IFNGR1", "IRF1", "STAT1"), 117 | facet_wrap = "gene", facet_scale = "fixed", 118 | slct_split.by = c("A549", "HT29", "MCF7"), 119 | facet_nrow = 2) 120 | 121 | # b. Check if the scores correlate with the expression level of the target gene itself 122 | PRTBscore_ScatterPlot(object = seurat_obj, split.by = "cell_type", 123 | PRTB = c("IFNGR2", "JAK1", "IRF1", "STAT1"), 124 | facet_wrap = "gene", facet_scale = "free_y") 125 | 126 | # c. Check the single-cell heatmap (stratified by the expression level of the target genes) 127 | PRTBscore_DoHeatmap(object = seurat_obj, PRTB = "STAT1", slct_condition = "A549", 128 | slct_features = c("IRF1", "STAT2", "B2M", "WARS", "JAK1", 129 | "CCR5", "CXCL9", "CXCL10", "CXCL11", "IDO1")) 130 | 131 | ``` 132 | 133 | 134 | ## 4. Perform scoring-based DE test 135 | This step will incorporate the scores we just calculated and use them in the differential expression tests. By using 136 | the scores as a 'weight' for the perturbed cells (instead of universally coding them as 1), we can achieve a higher 137 | statistical power. 138 | 139 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 140 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts", 141 | PRTB_list = c("RFX5", "ZC3H3", "IFNGR1", "IFNGR2", 142 | "IRF1", "IRF2", "JUN", "MAFF", 143 | "PARP12", "TRAFD1", 144 | "JAK1", 145 | "STAT1", "SP100"), 146 | labels = "gene") 147 | 148 | # have a quick look at the DE results 149 | head(de_res[[1]][order(de_res[[1]][, 24]), c(23:31)]) 150 | 151 | # re-arrange the results into a list of DE Z-score matrices, removing non-significant DE genes. 152 | # the function will summarize the number of significant DE genes in the screen output. 153 | DEG_mat_main = get_DE_mat(de_res, p_threshold = 0.05/30000, fc_threshold = 0.2) 154 | 155 | ``` 156 | 157 | 158 | ## 5. Decomposition 159 | In this section we will perform a series of decomposition analyses for our DE results. We will also 160 | perform PCA-based permutation tests to extract the gene signatures for the correlated perturbations. 161 | We will then aggregate all the gene signatures and arrange them into a repository of different 162 | sets of gene signatures. 163 | 164 | ### 5.1 within perturbation decomposition 165 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 166 | # an empty list to store all the gene sets (a repository of gene sets) 167 | go_db = list() 168 | 169 | # 5.1 within-prtb decomposition 170 | # to slightly clean up the matrices before decomposition 171 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0) 172 | 173 | celltype_list = names(DEG_mat) 174 | PRTB_list = colnames(DEG_mat[[1]]) 175 | gene_ID = rownames(DEG_mat[[1]]) 176 | 177 | # loop through all the perturbations in our dataset 178 | for(i in 1:length(PRTB_list)){ 179 | PRTB = PRTB_list[i] 180 | 181 | tmp=list() 182 | for(CELLTYPE in celltype_list){ 183 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){ 184 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB] 185 | } 186 | } 187 | tmp = Reduce(cbind, tmp) 188 | colnames(tmp) = celltype_list 189 | rownames(tmp) = gene_ID 190 | 191 | # run Permutation test and extract gene signatures 192 | # before each permutation test, PCApermtest() will further filter the 193 | # sub-matrix by removing any row (gene) without raw DE P-value <= 0.05 in any column (perturbation). 194 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1) 195 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T) 196 | 197 | ## plot Z-score heatmap for the gene signatures 198 | ## !!! this is currently commented out, but remove the '#' if you decide to run them 199 | ## the figures will be automatically saved to the folder you specify. 200 | # DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30, 201 | # output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/", 202 | # prefix = PRTB) 203 | 204 | # store the gene signatures to the go-term repo 205 | if(length(sig_genes$upDEGs) >= 10){ 206 | go_db[[paste0(PRTB, "_upDEGs")]] = sig_genes$upDEGs 207 | } 208 | if(length(sig_genes$downDEGs) >= 10){ 209 | go_db[[paste0(PRTB, "_downDEGs")]] = sig_genes$downDEGs 210 | } 211 | } 212 | 213 | 214 | ``` 215 | 216 | 217 | ### 5.2 within cell type decomposition 218 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 219 | # clean up the matrices 220 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3) 221 | 222 | celltype_list = names(DEG_mat) 223 | PRTB_list = colnames(DEG_mat[[1]]) 224 | gene_ID = rownames(DEG_mat[[1]]) 225 | 226 | # loop through all the cell types in our datasets 227 | for(i in 1:length(celltype_list)){ 228 | CELLTYPE = celltype_list[i] 229 | tmp=DEG_mat[[CELLTYPE]] 230 | 231 | # run Permutation test and extract gene signatures 232 | res = DEhclust(mat = tmp) 233 | 234 | # get_sig_genes_DEhclust() is a wrapper function for PCApermtest() and get_sig_genes() for DEhclust object. 235 | sig_genes = get_sig_genes_DEhclust(obj = res, row_filtering_pval = 0.05) 236 | 237 | ## plot Z-score heatmap for the gene signatures 238 | ## remove the '#' to generate the figures. 239 | # DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30, 240 | # output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/", 241 | # prefix = CELLTYPE) 242 | 243 | # store the gene signatures to the go-term repo 244 | for(CLUSTER in names(sig_genes)){ 245 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){ 246 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs 247 | } 248 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){ 249 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs 250 | } 251 | } 252 | 253 | } 254 | 255 | ``` 256 | 257 | 258 | ### 5.3 MultiCCA analysis (decomposition across cell types and perturbations) 259 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 260 | # clean up the matrices 261 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T) 262 | 263 | celltype_list = names(DEG_mat) 264 | PRTB_list = colnames(DEG_mat[[1]]) 265 | gene_ID = rownames(DEG_mat[[1]]) 266 | 267 | # run Permutation test and extract gene signatures 268 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T) 269 | 270 | # get_sig_genes_DEmultiCCA is a wrapper function for PCApermtest() and get_sig_genes() for DEmultiCCA object. 271 | sig_genes = get_sig_genes_DEmultiCCA(res, row_filtering_pval = 0.05) 272 | 273 | ## visualization. 274 | ## remove '#' to generate the figures 275 | # DE_heatmap(obj = res, sig_genes = sig_genes, 276 | # type = "multiCCA", direction = "both", 277 | # top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/", 278 | # prefix = "IFNG") 279 | 280 | # store the gene signatures to the go-term repo 281 | for(PROGRAM in names(sig_genes)){ 282 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){ 283 | go_db[[paste0("IFNG_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs 284 | } 285 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){ 286 | go_db[[paste0("IFNG_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs 287 | } 288 | } 289 | 290 | ``` 291 | 292 | 293 | ## 6. Enrichment analysis 294 | After generating the repository of the gene sets, we will now use it to perform enrichment analyses. Enrichment tests 295 | aim to identify if a user input gene list shows any significant overlap with an existing gene-ontology (GO) gene set. In our 296 | case, the GO gene sets are generated as above, each represent the gene signatures of a perturbation or a group of correlated 297 | perturbations. \ 298 | We implemented two different methods for enrichment analyses:\ 299 | 1. a standard enrichment test method based on Fisher's exact test. \ 300 | 2. a novel enrichment test method based on rank biased overlap (RBO). This method has the advantage that it not only 301 | shows the overlap between 2 lists, but also takes the consistency of the rank of each gene into consideration. \ 302 | \ 303 | For demonstration, we will use the DE genes identified for "JAK2" perturbation from the same dataset 304 | (which was not included in our above analyses) as the input gene list, and test it against the gene set 305 | repository we just generated. \ 306 | 307 | 308 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE} 309 | 310 | # a. DE tests for JAK2 in A549 cell line. The DE genes will be used as the input gene list. 311 | seurat_obj$Condition = paste0(seurat_obj$cell_type, "_", seurat_obj$gene) 312 | Idents(seurat_obj) = "Condition" 313 | 314 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_JAK2", 315 | slot = "data", logfc.threshold = 0) 316 | 317 | # get the background gene list for conventional enrich test 318 | background = rownames(new_DE_test) 319 | # get the significant down-reg genes (the input list ) 320 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ]) 321 | 322 | 323 | # b. Conventional enrichment test (Fisher's exact test) 324 | fisher_enrich_res = fisher_enrich_test(input_list = input_list, 325 | background = background, 326 | go_term_db = go_db) 327 | fisher_enrich_res = fisher_enrich_res[order(fisher_enrich_res$Pval), ] 328 | 329 | head(fisher_enrich_res, 10) 330 | 331 | # c. Rank biased overlap based test 332 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change. We can simply input the 333 | # complete list of ordered DE genes as the input gene list. (here it is ordered by P-values) 334 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ]) 335 | 336 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2, 337 | go_term_db = go_db, 338 | p = 0.98, 339 | k = 100, 340 | side = "bottom") 341 | 342 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ] 343 | rownames(rbo_enrich_res) = NULL 344 | 345 | head(rbo_enrich_res, 10) 346 | 347 | ``` 348 | 349 | 350 | -------------------------------------------------------------------------------- /man/DE_heatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{DE_heatmap} 4 | \alias{DE_heatmap} 5 | \title{Draw DE Z-score heatmap for gene signatures} 6 | \usage{ 7 | DE_heatmap( 8 | obj = NULL, 9 | sig_genes = NULL, 10 | type = c("standard", "hclust", "multiCCA"), 11 | direction = c("both", "down", "up"), 12 | top_n = 30, 13 | zscore_cap = 15, 14 | labRow = T, 15 | output_path = "./", 16 | prefix = "heatmap", 17 | height = 15, 18 | width = 12, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{obj}{the object produced by PCApermtest()} 24 | 25 | \item{sig_genes}{the object produced by get_sig_genes()} 26 | 27 | \item{type}{the type of obj and sig_genes that are being input. The "standard" (default) indicates 28 | the results from a standard within-perturbation analysis. The "hclust" indicates the results from 29 | a hierarchical clustering analysis. The "multiCCA" indicates the results from a multiCCA analysis.} 30 | 31 | \item{direction}{to indicate whether gene signatures of both directions should be plotted, or just 32 | the up-regulated genes ("up") or down-regulated genes ("down") should be plotted.} 33 | 34 | \item{top_n}{a positive integer to indicate how many gene signatures should be plotted. If provided, 35 | the top top_n genes will be selected for plotting.} 36 | 37 | \item{zscore_cap}{the cap for Z-scores in the Z-score matrix. Any Z-score that is larger than this 38 | value will be capped to this value.} 39 | 40 | \item{labRow}{a boolen variable to indicate if the row names should be labelled in the heatmap or not.} 41 | 42 | \item{output_path}{the directory where the heatmap will be saved.} 43 | 44 | \item{prefix}{the prefix for how the file of the heatmap should be named} 45 | 46 | \item{height}{the height (in inch) for the figure} 47 | 48 | \item{width}{the width (in inch) for the figure} 49 | } 50 | \value{ 51 | this function returns nothing. It directly output the generated figures to the directory that 52 | a user specifies. 53 | } 54 | \description{ 55 | This function will generate a standard heatmap based on the DE Z-score heatmap. 56 | Only the selected significant gene signatures will be plotted in the rows. 57 | } 58 | -------------------------------------------------------------------------------- /man/DEenrich.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enrichment_test.R 3 | \name{DEenrich} 4 | \alias{DEenrich} 5 | \title{Wrapper function for DE and enrichment test} 6 | \usage{ 7 | DEenrich( 8 | object, 9 | plist = NULL, 10 | ident = NULL, 11 | ident.1 = NULL, 12 | ident.2 = NULL, 13 | split.by = NULL, 14 | slct.ct = NULL, 15 | direction = c("up", "down", "both"), 16 | logfc.threshold = 0.25, 17 | p.val.cutoff = 0.05, 18 | min.pct = 0.1, 19 | assay = NULL, 20 | ... 21 | ) 22 | } 23 | \arguments{ 24 | \item{object}{a seurat object to perform the DE test and the enrichment test} 25 | 26 | \item{plist}{the pathway gene lists to test the DE genes against} 27 | 28 | \item{ident.1}{Identity class to define markers for; pass an object of class phylo or 'clustertree' to find markers for a node in a cluster tree; passing 'clustertree' requires BuildClusterTree to have been run} 29 | 30 | \item{ident.2}{A second identity class for comparison; if NULL, use all other cells for comparison; if an object of class phylo or 'clustertree' is passed to ident.1, must pass a node to find markers for} 31 | 32 | \item{split.by}{Regroup cells into a different identity class prior to performing differential expression. 33 | Default is NULL (so all cells be used simultaneously).} 34 | 35 | \item{slct.ct}{Subset a particular identity class prior to regrouping. Only relevant if group.by is set.} 36 | } 37 | \value{ 38 | a list of data frames containing the gene set enrichment results for each group in "group.by" 39 | } 40 | \description{ 41 | This function provides a wrapper of Seurat::FindMarkers() and Mixscale::fisher_enrich_test(). 42 | Users can input a Seurat object they want to investigate and a list of gene sets they want to 43 | test against, and the wrapper will perform DE tests + Fisher's enrichment test across all the 44 | available cell types. It will then return a list of data frames, containing gene set enrichment 45 | results for each cell type. 46 | } 47 | -------------------------------------------------------------------------------- /man/DEenrich_DotPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{DEenrich_DotPlot} 4 | \alias{DEenrich_DotPlot} 5 | \title{Dot plot for enrichment results across multiple cell types} 6 | \usage{ 7 | DEenrich_DotPlot( 8 | obj, 9 | adjust.methods = "BH", 10 | direction = c("up", "down", "both"), 11 | log10P_cutoff = 10, 12 | OR_cutoff = 20, 13 | slct_labels = NULL, 14 | plot_title = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{obj}{the list of data frames generated by Mixscale_DEenrich()} 19 | 20 | \item{adjust.methods}{the method for multiple testing correction method (see 'p.adjust.methods')} 21 | 22 | \item{direction}{a character to specify what to plot: to plot the enrichment results for 23 | the up-regulated DE genes ("up") or the down-regulated DE genes ("down").} 24 | 25 | \item{log10P_cutoff}{the maximum value of -log10(adjusted P-value) for plotting the size of the dot (any dot with a p-value 26 | larger than this will be set to the same size)} 27 | 28 | \item{OR_cutoff}{the odds ratio (of the Fisher's exact test) cutoff for plotting the color gradient 29 | of the dot (any dot with a OR larger than this will be set to the same color)} 30 | 31 | \item{slct_labels}{the selected labels (cell types) that need to be plotted} 32 | 33 | \item{adjust.split}{TRUR/FALSE to specify if the multiple testing correction should be done for 34 | each group separately (TRUE) or all together (FALSE)} 35 | } 36 | \value{ 37 | a ggplot2 object 38 | } 39 | \description{ 40 | This function will generate a Dot plot for the enrichment results generated by Mixscale_DEenrich(). 41 | It will also perform multiple testing for the P-values for all the cell types taken together 42 | (or within each cell type). 43 | } 44 | -------------------------------------------------------------------------------- /man/DEhclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{DEhclust} 4 | \alias{DEhclust} 5 | \title{Run Hierarchical clustering for a matrix} 6 | \usage{ 7 | DEhclust( 8 | mat = NULL, 9 | cor_method = c("pearson", "kendall", "spearman"), 10 | hclust_method = "minmax", 11 | dist_thres = 0.6, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{mat}{the Z-score matrix to perform the permutation test. 17 | Rows are the gene and columns are the conditions/samples.} 18 | 19 | \item{cor_method}{the method to calculate the correlation matrix. see cor() 20 | function for details.} 21 | 22 | \item{hclust_method}{the method to perform the hierarchical clustering. The default method 23 | is MinMax clustering (package 'protoclust' required). Other methods available in the hclust() 24 | function is also allowed.} 25 | 26 | \item{dist_thres}{The distance to cut the hierarchical clustering tree ("tree height"). See cutree() 27 | function for details. Default is 0.6.} 28 | } 29 | \value{ 30 | return a list of two object: 1. a list of the cluster assignment of the 31 | columns (only those got successfully assigned to a multi-member cluster will be stored). 32 | 2. a object of the output object from protocut() (if MinMax hclust method is selected) or 33 | from cutree() (if other hclust method is selected) 34 | } 35 | \description{ 36 | A wrapper for different hierarchical clustering methods to be applied to the within-cell-type 37 | cross-conditions Z-score matrix (input). Highly similar conditions (columns) will be grouped together 38 | given the DE Z-scores of rows (genes). 39 | } 40 | -------------------------------------------------------------------------------- /man/DEmultiCCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{DEmultiCCA} 4 | \alias{DEmultiCCA} 5 | \title{Run MultiCCA for a list of matrices} 6 | \usage{ 7 | DEmultiCCA( 8 | mat_list = NULL, 9 | penalty = FALSE, 10 | standardize = FALSE, 11 | max_k = 5, 12 | cor_number = "all", 13 | mean_cor_thres = 0.2, 14 | flag_cor_num = T, 15 | flag_loose = F, 16 | pval_thres = 0.05, 17 | cor_coef_thres = 0.6, 18 | cor_coef_mean_thres = 0.3, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{mat_list}{the list of >= 2 DE Z-score matrices for the multiCCA analysis. Each matrix 24 | should have the same named rows, but can have different number of columns (samples).} 25 | 26 | \item{penalty}{to indicate if penalty should be applied during the multiCCA process. When set 27 | to "FALSE", no penalty will be applied so that the results are not forced to be "sparse". However, 28 | if a single value or a vector of k values (k should be the same as the number of matrices in the 29 | input list), then L1 penalty will be applied to each matrix to force the output CVs to be sparse. 30 | See MultiCCA() from the PMA package for details.} 31 | 32 | \item{standardize}{a boolen value to indicate whether to standardize each column before running 33 | the MultiCCA. Default is FALSE.} 34 | 35 | \item{max_k}{the maximum number of MultiCCA runs. MultiCCA will be repeated until this number is 36 | reached or the CVs across the matrices have very low correlation coefficients.} 37 | 38 | \item{mean_cor_thres}{During the multiCCA analysis, if any of the input matrix has significantly 39 | low(er) correlation with other matrices, it will impact the MultiCCA process. This is the threshold 40 | to remove such matrices. If (the CV of) any matrix has a mean correlation coefficient <= 0.2, it will 41 | be removed and the MultiCCA will be repeated for this round (k-th). Note that this matrix will be appended 42 | back to the list for the next round (k+1-th) of MultiCCA, and the same filtering will be repeated. 43 | Default is 0.2. Set it to 0 to avoid such filtering. Vector is also accepted and will be sequentially used 44 | for each iteration of MultiCCA.} 45 | 46 | \item{cor_num}{for each column of a matrix, the number of CVs that it needs to be significantly correlated 47 | with to be selected as a member of the program. Default is "all", meaning it needs to be significantly 48 | correlated with all other CVs. Alternatively, users may input a integer (e.g., 2, 3, ...).} 49 | } 50 | \value{ 51 | a list of MultiCCA results for each program identified. 52 | } 53 | \description{ 54 | A function to perform MultiCCA analysis (main function imported from package "PMA", 55 | see PMID 19377034 for details of the algorithm) that takes in a list of multiple 56 | Z-score matrices to find the canonical variates (CVs) that maximize the cross-matrices 57 | correlation. The MultiCCA process is modified so that it is not completed in one 58 | run when multiple rounds of CVs are desired. Instead, after each MultiCCA run, we will 59 | identify the columns (samples) from each matrix that highly correlate with the CV of that 60 | matrix, and extract + remove them from the matrix. The next MultiCCA is performed to the 61 | list of such "filtered" matrices. This process is repeated until the desired number of 62 | runs is reached (set by users) or the CVs across the matrices have very low correlation 63 | coefficients. 64 | } 65 | \seealso{ 66 | [MultiCCA()] 67 | } 68 | -------------------------------------------------------------------------------- /man/FoldChange_new.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_fold_change.R 3 | \name{FoldChange_new} 4 | \alias{FoldChange_new} 5 | \title{Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells} 6 | \usage{ 7 | FoldChange_new( 8 | object, 9 | cells.1, 10 | cells.2, 11 | mean.fxn, 12 | fc.name, 13 | features = NULL, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{A Seurat object} 19 | 20 | \item{cells.1}{Vector of cell names belonging to group 1} 21 | 22 | \item{cells.2}{Vector of cell names belonging to group 2} 23 | 24 | \item{mean.fxn}{Function to use for fold change or average difference calculation} 25 | 26 | \item{fc.name}{Name of the fold change, average difference, or custom function column 27 | in the output data.frame} 28 | 29 | \item{features}{Features to calculate fold change for. 30 | If NULL, use all features} 31 | 32 | \item{...}{Arguments passed to other methods} 33 | } 34 | \value{ 35 | Returns a single value of the log-fold-change of the input gene. 36 | } 37 | \description{ 38 | Function to calculate log-fold-change for pooled CRISPR screen datasets. 39 | It is just a simple function to calculate the log-fold-change. Users can customise the min.cells, 40 | minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)), 41 | minimal percentage of cells expression the genes, and the base of the log. 42 | } 43 | \concept{perturbation_scoring} 44 | -------------------------------------------------------------------------------- /man/Mixscale_DoHeatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{Mixscale_DoHeatmap} 4 | \alias{Mixscale_DoHeatmap} 5 | \title{Single-cell heatmap for selected DE genes stratified by target expression} 6 | \usage{ 7 | Mixscale_DoHeatmap( 8 | object = NULL, 9 | assay = "RNA", 10 | slot = "data", 11 | labels = "gene", 12 | nt.class.name = "NT", 13 | slct.ident = NULL, 14 | mixscale.score.name = "mixscale_score", 15 | features = NULL, 16 | group.by = NULL, 17 | ct.class = NULL, 18 | slct.ct = NULL, 19 | max.num.cell = 300, 20 | ... 21 | ) 22 | } 23 | \arguments{ 24 | \item{object}{a seurat object returned by RunMixscale()} 25 | 26 | \item{assay}{the assay name to extract the expression level data from for plotting} 27 | 28 | \item{slot}{the slot name to extract the expression level data from for plotting} 29 | 30 | \item{labels}{the column name in the object's meta.data that contains the target 31 | gene labels} 32 | 33 | \item{nt.class.name}{the classification name of non-targeting gRNA cells} 34 | 35 | \item{slct.ident}{the name of the perturbation target in 'labels' to be plotted.} 36 | 37 | \item{mixscale.score.name}{Name of mixscale scores to be stored in metadata. Default is "mixscale_score".} 38 | 39 | \item{features}{A vector of features to plot.} 40 | 41 | \item{group.by}{A vector of variables to group cells by; pass 'ident' to group by cell identity classes. 42 | Default is the same as 'labels'.} 43 | 44 | \item{ct.class}{the metadata colname that stores the cell type (or other conditions) information. 45 | If this is set and the correct slct.ct is provided, only cells with the corresponding slct.ct will 46 | be plotted. Default is NULL (so cells from all the ct.class will be plotted).} 47 | 48 | \item{slct.ct}{a character of the group of cells in ct.class to be plotted. Default is NULL (so cells from all the ct.class will be plotted).} 49 | } 50 | \value{ 51 | a ggplot2 object of the single-cell heatmap. 52 | } 53 | \description{ 54 | This function will generate single-cell expression heatmap for selected DE genes 55 | in cells of the same perturbation target (gRNA). This function is basically a 56 | wrapper function of the Seurat::DoHeatmap(), but with easier usage to select the 57 | cells based on given gRNA identity. Cells will be ordered in each stratification based 58 | on their perturbation scores. 59 | } 60 | -------------------------------------------------------------------------------- /man/Mixscale_RidgePlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{Mixscale_RidgePlot} 4 | \alias{Mixscale_RidgePlot} 5 | \title{Plot to show the distribution of perturbation scores} 6 | \usage{ 7 | Mixscale_RidgePlot( 8 | object = NULL, 9 | labels = "gene", 10 | nt.class.name = "NT", 11 | split.by = NULL, 12 | PRTB = NULL, 13 | slct_split.by = NULL, 14 | facet_wrap = c(NULL, "gene", "split.by"), 15 | facet_scale = c("fixed", "free_y"), 16 | facet_nrow = 1, 17 | ... 18 | ) 19 | } 20 | \arguments{ 21 | \item{object}{a seurat object returned by RunMixscale()} 22 | 23 | \item{labels}{the column name in the object's meta.data that contains the target 24 | gene labels} 25 | 26 | \item{nt.class.name}{the classification name of non-targeting gRNA cells} 27 | 28 | \item{split.by}{metadata column with experimental condition/cell type classification 29 | information. This is meant to be used to account for cases a perturbation is 30 | condition/cell type -specific.} 31 | 32 | \item{PRTB}{the perturbation target genes to extract for plotting. Multiple values are 33 | allowed.} 34 | 35 | \item{slct_split.by}{if only a subset of the conditions/cell-types in the split.by column need 36 | to be plotted, users can specify them as a character vector here. Default is NULL, meaning all the 37 | conditions/cell-types need to be plotted.} 38 | 39 | \item{facet_wrap}{whether to divide the plot into multiple facets based on either the 40 | perturbation targets ("gene") or conditions/cell types ("split.by"). Default is NULL, meaning 41 | no facet.} 42 | 43 | \item{facet_scale}{whether to use a fixed scale for y-axis across all facets or allow 44 | y axis to vary.} 45 | 46 | \item{facet_nrow}{the number of rows to plot the different panels when facet_wrap is set.} 47 | } 48 | \value{ 49 | a ggplot2 object that contains the ridge plot. 50 | } 51 | \description{ 52 | This function will generate a density (ridge) plot for the perturbation scores across 53 | different cell types or different perturbation targets. 54 | } 55 | -------------------------------------------------------------------------------- /man/Mixscale_ScatterPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{Mixscale_ScatterPlot} 4 | \alias{Mixscale_ScatterPlot} 5 | \title{Plot to compare the expression level of the perturbation target and the 6 | perturbation scores} 7 | \usage{ 8 | Mixscale_ScatterPlot( 9 | object = NULL, 10 | assay = "RNA", 11 | slot = "data", 12 | nt.class.name = "NT", 13 | split.by = NULL, 14 | slct.ident = NULL, 15 | nbin = 10, 16 | facet_wrap = c(NULL, "gene", "split.by"), 17 | facet_scale = "free_y", 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{object}{a seurat object returned by RunMixscale()} 23 | 24 | \item{assay}{the assay name to extract the expression level data from for plotting} 25 | 26 | \item{slot}{the slot name to extract the expression level data from for plotting} 27 | 28 | \item{nt.class.name}{the classification name of non-targeting gRNA cells} 29 | 30 | \item{split.by}{metadata column with experimental condition/cell type classification 31 | information. This is meant to be used to account for cases a perturbation is 32 | condition/cell type -specific.} 33 | 34 | \item{slct.ident}{the perturbation target genes to extract for plotting from 'labels'. Multiple values are 35 | allowed. Default is NULL (every class will be plotted).} 36 | 37 | \item{nbin}{the number of bins to divide the perturbation scores into.} 38 | 39 | \item{facet_wrap}{whether to divide the plot into multiple facets based on either the 40 | perturbation targets ("gene") or conditions/cell types ("split.by"). Default is NULL, meaning 41 | no facet.} 42 | 43 | \item{facet_scale}{whether to use a fixed scale for y-axis across all facets or allow 44 | y axis to vary.} 45 | } 46 | \value{ 47 | a ggplot2 object that contains the connected scatterplot. 48 | } 49 | \description{ 50 | This function will generate a connected scatterplot to compare the mean 51 | expression level of the perturbation target gene within different perturbation 52 | percentile bins. After running the RunMixscale() function, user can specify the 53 | gene name of the perturbation target and the number of bins to divide the scores 54 | into, and this function will sutomatically generate a connected scatterplot. 55 | Multiple perturbation targets and cell types are allowed. 56 | } 57 | -------------------------------------------------------------------------------- /man/PCApermtest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{PCApermtest} 4 | \alias{PCApermtest} 5 | \title{Run PCA-based permutation test for a matrix} 6 | \usage{ 7 | PCApermtest( 8 | mat = NULL, 9 | k = 1, 10 | var_prop = NULL, 11 | var_prop_total = NULL, 12 | center = T, 13 | scale = T, 14 | row_filtering_pval = 0.05, 15 | num_iter = 200, 16 | seed = 123124125, 17 | ... 18 | ) 19 | } 20 | \arguments{ 21 | \item{mat}{the Z-score matrix to perform the permutation test. 22 | Rows are the gene and columns are the conditions/samples.} 23 | 24 | \item{k}{the number of top PCs to extract. If set to NULL, the 25 | function will determine the number of PCs to extract based on 26 | the var_prop, which is the cut-off on the proportion of variance 27 | explained for each PC. Only PCs with prop_var greater than this 28 | value will be removed extracted.} 29 | 30 | \item{var_prop}{if k is not set, then use this value as a cutoff for 31 | %var explained to select the PCs. Only the PCs with %var larger than this 32 | value will be selected (the top k PCs).} 33 | 34 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var 35 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the 36 | accumulated sum larger than this value to be selected.} 37 | 38 | \item{center}{a boolen value to indicate whether the column will be centered.} 39 | 40 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.} 41 | 42 | \item{num_iter}{the number of iteration for the permutation test.} 43 | 44 | \item{seed}{seed for random number generator.} 45 | } 46 | \value{ 47 | a list object consists of 3 elements: the original input Z-score matrix, 48 | the p-value matrix produced by the permutation test for each genes (same dimension 49 | as the original input matrix), and the prcomp() object done for the input Z-score 50 | matrix. 51 | } 52 | \description{ 53 | This function will load in a DE test Z-score matrix, and perform PCA to it to get the 1st to k-th PCs. 54 | Then it will permuate the matrix and then the same PCA analysis will be performed. By default, this process 55 | will be repeated for 200 times (default), so that we will have a decent number of null values 56 | in the top k PCs. The proportion of extreme values greater or smaller than the actual 57 | value will be used as the P-values. 58 | } 59 | -------------------------------------------------------------------------------- /man/RunMixscale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perturbation_scoring.R 3 | \name{RunMixscale} 4 | \alias{RunMixscale} 5 | \title{Mixscale scoring for perturbations} 6 | \usage{ 7 | RunMixscale( 8 | object, 9 | assay = "PRTB", 10 | slot = "scale.data", 11 | labels = "gene", 12 | nt.class.name = "NT", 13 | new.class.name = "mixscale_score", 14 | min.de.genes = 5, 15 | min.cells = 5, 16 | de.assay = "RNA", 17 | logfc.threshold = 0.25, 18 | verbose = FALSE, 19 | split.by = NULL, 20 | fine.mode = FALSE, 21 | fine.mode.labels = "guide_ID", 22 | DE.gene = NULL, 23 | max.de.genes = 100, 24 | harmonize = F, 25 | min_prop_ntgd = 0.1, 26 | pval.cutoff = 0.05, 27 | seed = 10282021 28 | ) 29 | } 30 | \arguments{ 31 | \item{object}{An object of class Seurat.} 32 | 33 | \item{assay}{Assay to use for mixscape classification.} 34 | 35 | \item{slot}{Assay data slot to use.} 36 | 37 | \item{labels}{metadata column with target gene labels.} 38 | 39 | \item{nt.class.name}{Classification name of non-targeting gRNA cells.} 40 | 41 | \item{new.class.name}{Name of mixscale scores to be stored in 42 | metadata.} 43 | 44 | \item{min.de.genes}{Required number of genes that are differentially 45 | expressed for method to separate perturbed and non-perturbed cells.} 46 | 47 | \item{min.cells}{Minimum number of cells in target gene class. If fewer than 48 | this many cells are assigned to a target gene class during classification, 49 | all are assigned NP.} 50 | 51 | \item{de.assay}{Assay to use when performing differential expression analysis. 52 | Usually RNA.} 53 | 54 | \item{logfc.threshold}{the log-fold-change threshold to select the large-effect 55 | DE genes. Only DE genes with log-fold-change larger than this value will be 56 | selected. Default is 0.25.} 57 | 58 | \item{verbose}{Display messages} 59 | 60 | \item{split.by}{metadata column with experimental condition/cell type 61 | classification information. This is meant to be used to account for cases a 62 | perturbation is condition/cell type -specific.} 63 | 64 | \item{fine.mode}{When this is equal to TRUE, DE genes for each target gene 65 | class will be calculated for each gRNA separately and pooled into one DE list 66 | for calculating the perturbation score of every cell and their subsequent 67 | classification.} 68 | 69 | \item{fine.mode.labels}{metadata column with gRNA ID labels.} 70 | 71 | \item{DE.gene}{specify a list of user-defined large-effect DE genes to calculate the perturbation score.} 72 | 73 | \item{max.de.genes}{the maximum number of top large-effect DE genes to calculate the perturbation score. Default is 100.} 74 | 75 | \item{harmonize}{a boolen value to specify whether a harmonization of the cell-type proportion between the NT cells and 76 | the perturbed cells should be performed prior to the DE test. If fine.mode is TRUE, this harmonization step will be 77 | performed for each fine.mode gRNA. Default is FALSE.} 78 | 79 | \item{min_prop_ntgd}{a minimal threshold to remove cells if any cell type has a proportion less than this value. It will 80 | only be used when harmonize is TRUE. Default is 0.1.} 81 | 82 | \item{pval.cutoff}{specify the DE test p-value cutoff (after Bonferroni correction) to select top large-effect DE genes. 83 | Default is 0.05.} 84 | } 85 | \value{ 86 | Returns a Seurat object containing the perturbation scores. It is stored in the Tool Data of the object, also 87 | the standardized scores are stored in the meta.data (column is specified by new.class.name). 88 | } 89 | \description{ 90 | Function to calculate perturbation scores for perturbed and non-perturbed gRNA expressing cells. 91 | The perturbation score reflects the perturbation strength of each cells (inherited from the RunMixscape() 92 | function). It is calculated by using the large-effect DE genes from raw DE tests between the 93 | perturbed and non-perturbed gRNA expressing cells. 94 | } 95 | \concept{perturbation_scoring} 96 | -------------------------------------------------------------------------------- /man/Run_wmvRegDE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scoring_de.R 3 | \name{Run_wmvRegDE} 4 | \alias{Run_wmvRegDE} 5 | \title{Scoring-based weighted DE test} 6 | \usage{ 7 | Run_wmvRegDE( 8 | object, 9 | assay = "RNA", 10 | slot = "counts", 11 | labels = "gene", 12 | nt.class.name = "NT", 13 | verbose = FALSE, 14 | PRTB_list = NULL, 15 | split.by = NULL, 16 | logfc.threshold = 0, 17 | min.pct = 0.1, 18 | min.cells.group = 10, 19 | total_ct_labels = "nCount_RNA", 20 | pseudocount.use = 1, 21 | base = 2, 22 | full.results = FALSE 23 | ) 24 | } 25 | \arguments{ 26 | \item{object}{An object of class Seurat.} 27 | 28 | \item{assay}{Assay to use for mixscape classification.} 29 | 30 | \item{slot}{Assay data slot to use.} 31 | 32 | \item{labels}{metadata column with target gene labels.} 33 | 34 | \item{nt.class.name}{Classification name of non-targeting gRNA cells.} 35 | 36 | \item{verbose}{Print a progress bar once expression testing begins} 37 | 38 | \item{PRTB_list}{provide a vector of perturbations that the DE tests are restricted 39 | to. Default is NULL (DE tests will be performed for all available perturbations).} 40 | 41 | \item{split.by}{metadata column with experimental condition/cell type 42 | classification information. This is used to account for cases where 43 | perturbations are done for multiple condition/cell type. Default is NULL (only one 44 | cell type).} 45 | 46 | \item{logfc.threshold}{the log-fold-change threshold to select genes for DE 47 | test. Genes with log2-fold-change larger than this value will be selected for DE test. 48 | Note that if split.by is set and more than 1 split.by group exists, this 49 | logfc.threashold will be applied to each group and if any of them satisfies this criteria, the 50 | gene will be selected. Default is 0 (no filtering based on log2-fold-change).} 51 | 52 | \item{min.pct}{only test genes that are detected in a minimum fraction of min.pct cells in either 53 | of the two populations. Meant to speed up the function by not testing genes that are very 54 | infrequently expressed. Default is 0.1. Same as logfc.threshold, if split.by is set and more than 1 split.by 55 | group exists, thiswill be applied to each group and if any of them satisfies this criteria, the 56 | gene will be selected.} 57 | 58 | \item{min.cells.group}{Minimum number of cells in one of the groups} 59 | 60 | \item{total_ct_labels}{metadata column for the total RNA counts of each cell. The default is 61 | nCount_RNA, which is the default names used by Seurat package.} 62 | 63 | \item{pseudocount.use}{Pseudocount to add to averaged expression values when calculating logFC. 1 by default.} 64 | 65 | \item{base}{The base with respect to which logarithms are computed.} 66 | 67 | \item{full.results}{A boolen value to indicate if the full DE results should be output. Default is 68 | FALSE (only the regression coefficients/P-values of the mixscale scores will be output).} 69 | } 70 | \value{ 71 | a list of DE results, one for each perturbation 72 | } 73 | \description{ 74 | A function to perform differential expression (DE) tests based on the perturbation scores from the 75 | RunMixscale() function. It is a multivariate negative binomial based model that incorporates both the heterogeneity 76 | of perturbation strength in each cell, as well as their cell type background. 77 | } 78 | \concept{perturbation_scoring} 79 | -------------------------------------------------------------------------------- /man/fisher_enrich_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enrichment_test.R 3 | \name{fisher_enrich_test} 4 | \alias{fisher_enrich_test} 5 | \title{Standard Fisher's exact test for enrichment analysis} 6 | \usage{ 7 | fisher_enrich_test( 8 | input_list = NULL, 9 | background = NULL, 10 | go_term_db = NULL, 11 | list_gene = F, 12 | EASE = F 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_list}{the input gene list} 17 | 18 | \item{background}{the background gene list (usually the expressed genes where the 19 | input gene list is generate from, ).} 20 | 21 | \item{go_term_db}{a list of gene-lists (GO term). It should be a list contain multiple named vector, 22 | and each vector should be a vector of multiple marker/signature genes for some biological pathway/process.} 23 | 24 | \item{list_gene}{A Boolen value to indicate if the overlapping genes between the input gene list and 25 | the GO-term should be output as well.} 26 | 27 | \item{EASE}{A Boolen value to indicate if the EASE correction should be applied (see 28 | https://david.ncifcrf.gov/helps/functional_annotation.html). This is useful to mitigate the 29 | small-sample inflation when the input gene list is short (e.g., < 10).} 30 | } 31 | \value{ 32 | a data frame contains the enrichment test results. Each row contains the P-value and enrichment odds 33 | ratio calculated from a Fisher's exact test for one GO-term in the go_term_db. 34 | } 35 | \description{ 36 | This function will perform the strandard Fisher's exact test between the input gene 37 | list and a series of gene-ontology gene sets (adopted from DAVID GO analysis). 38 | } 39 | -------------------------------------------------------------------------------- /man/get_DE_mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scoring_de.R 3 | \name{get_DE_mat} 4 | \alias{get_DE_mat} 5 | \title{Rearrange the DE results into a list of Z-score matrices} 6 | \usage{ 7 | get_DE_mat( 8 | de_res = NULL, 9 | p_threshold = 0.05/30000, 10 | fc_threshold = 0.2, 11 | num_top_DEG = 50 12 | ) 13 | } 14 | \arguments{ 15 | \item{de_res}{the DE results produced by Run_wmvRegDE(), which is a list of data frames.} 16 | 17 | \item{p_threshold}{the DE P-value threshold to define statistically significant DE genes.} 18 | 19 | \item{fc_threshold}{the log-fold-change threhsold to define statistically significant DE genes.} 20 | 21 | \item{num_top_DEG}{for each perturbation, only the top num_top_DEG DEG within each condition/cell line 22 | will be selected as the rows (features). This helps avoid the feature space is dominated by one or a few perturbations 23 | that have a huge number of DEGs. The default is 50. Set it to NULL to avoid filtering.} 24 | } 25 | \description{ 26 | A function to re-arrange the DE results produced by Run_wmvRegDE() into a list of Z-score matrices. 27 | Each matrix represents one cell type (if multiple cell types were included), and contains the 28 | DE test Z-scores for each valid gene being tested (rows) and each perturbation target (columns). 29 | } 30 | -------------------------------------------------------------------------------- /man/get_fc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_fold_change.R 3 | \name{get_fc} 4 | \alias{get_fc} 5 | \title{Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells} 6 | \usage{ 7 | get_fc( 8 | gene_exp = NULL, 9 | idx_P = NULL, 10 | idx_NT = NULL, 11 | min.cells = 3, 12 | thresh.min = 0, 13 | pseudocount.use = 1, 14 | min.pct = 0.1, 15 | base = 2, 16 | norm.method = "raw" 17 | ) 18 | } 19 | \arguments{ 20 | \item{gene_exp}{a vector of the gene expression levels} 21 | 22 | \item{idx_P}{a vector of index for the perturbed cells in the gene_exp} 23 | 24 | \item{idx_NT}{a vector of index for the non-target cells (controls) in the gene_exp} 25 | 26 | \item{min.cells}{the minimal number of cells that expresses the gene; if lower than this value, the 27 | fold-change will be returned as NA. Default is 3.} 28 | 29 | \item{thresh.min}{the minimal value of expression; any expression value lower than this will be 30 | considered as 0. Default is 0.} 31 | 32 | \item{pseudocount.use}{the small value that will be added to the log-transformation to avoid log(0). 33 | For example, if a mean expression value is x, the final log-} 34 | 35 | \item{min.pct}{the minimal proportion of cells in either groups that expresses the gene} 36 | 37 | \item{base}{the base for log()} 38 | 39 | \item{norm.method}{the normalization method for the input gene_exp. Default is 'raw', which means 40 | the original count value without normalization. The other supported values are 'log.norm', "scale.data". 41 | The mean.fxn() will change accordingly.} 42 | } 43 | \value{ 44 | Returns a single value of the log-fold-change of the input gene. 45 | } 46 | \description{ 47 | Function to calculate log-fold-change for pooled CRISPR screen datasets. 48 | It is just a simple function to calculate the log-fold-change. Users can customise the min.cells, 49 | minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)), 50 | minimal percentage of cells expression the genes, and the base of the log. 51 | } 52 | \concept{perturbation_scoring} 53 | -------------------------------------------------------------------------------- /man/get_sig_genes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{get_sig_genes} 4 | \alias{get_sig_genes} 5 | \title{Extract significant genes from PCApermtest} 6 | \usage{ 7 | get_sig_genes( 8 | perm_obj = NULL, 9 | k = 1, 10 | var_prop = NULL, 11 | var_prop_total = NULL, 12 | perm_pval_thres = 0.05, 13 | ori_pval_thres = 1.666667e-06, 14 | cor_threshold = 0.2, 15 | collapse = T, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{perm_obj}{the list object produces by the PCApermtest() function.} 21 | 22 | \item{k}{the number of top PCs to extract. If set to NULL, the 23 | function will determine the number of PCs to extract based on 24 | the var_prop, which is the cut-off on the proportion of variance 25 | explained for each PC. Only PCs with prop_var greater than this 26 | value will be removed extracted.} 27 | 28 | \item{var_prop}{if k is not set, then use this value as a cutoff for 29 | %var explained to select the PCs. Only the PCs with %var larger than this 30 | value will be selected (the top k PCs).} 31 | 32 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var 33 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the 34 | accumulated sum larger than this value to be selected.} 35 | 36 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows 37 | (genes) with permutation p-value lower then permtest_pval_thres or greater than 38 | (1 - permtest_pval_thres) will be selected. Default is 0.05.} 39 | 40 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original 41 | Z-score matrix we store all the Z-score from the DE test. When selecting significant 42 | rows (genes), we might wish them to be significant in the original DE test as well. 43 | By setting this value, we force the rows (genes) to be both significant in the 44 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is 45 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.} 46 | 47 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC 48 | might not be the same as the orientation of its correlated columns in the original 49 | matrix. We need to do correlation test between each PC and all the columns in the 50 | original matrix, and then use this cor_threshold to define and extract the 51 | correlated columns and then use them to determine the actual orientation of the PC.} 52 | 53 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant 54 | genes from the top k PCs should be return together as one gene list (collapse = T) 55 | or separately for each k (collapse = F). Default is T.} 56 | } 57 | \value{ 58 | a list of significant genes (gene IDs) selected from the output of PCApermtest. 59 | The order of genes in each gene list indicates the significance of P-value (high 60 | significance at the top). 61 | } 62 | \description{ 63 | The function will load the pval matrix calculated from PCApermtest() and 64 | return the significant rows (usually genes) given the threshold that users 65 | provide. 66 | } 67 | -------------------------------------------------------------------------------- /man/get_sig_genes_DEhclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{get_sig_genes_DEhclust} 4 | \alias{get_sig_genes_DEhclust} 5 | \title{Run PCApermtest and get significant genes from DEhclust} 6 | \usage{ 7 | get_sig_genes_DEhclust( 8 | obj = NULL, 9 | k = 1, 10 | var_prop = NULL, 11 | center = T, 12 | scale = T, 13 | num_iter = 200, 14 | row_filtering_pval = 0.05, 15 | var_prop_total = NULL, 16 | perm_pval_thres = 0.05, 17 | ori_pval_thres = 1.666667e-06, 18 | cor_threshold = 0.2, 19 | collapse = T, 20 | ... 21 | ) 22 | } 23 | \arguments{ 24 | \item{obj}{The results object produced by DEhclust() function.} 25 | 26 | \item{k}{the number of top PCs to extract. If set to NULL, the 27 | function will determine the number of PCs to extract based on 28 | the var_prop, which is the cut-off on the proportion of variance 29 | explained for each PC. Only PCs with prop_var greater than this 30 | value will be removed extracted.} 31 | 32 | \item{var_prop}{if k is not set, then use this value as a cutoff for 33 | %var explained to select the PCs. Only the PCs with %var larger than this 34 | value will be selected (the top k PCs).} 35 | 36 | \item{center}{a boolen value to indicate whether the column will be centered.} 37 | 38 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.} 39 | 40 | \item{num_iter}{the number of iteration for the permutation test.} 41 | 42 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var 43 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the 44 | accumulated sum larger than this value to be selected.} 45 | 46 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows 47 | (genes) with permutation p-value lower then permtest_pval_thres or greater than 48 | (1 - permtest_pval_thres) will be selected. Default is 0.05.} 49 | 50 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original 51 | Z-score matrix we store all the Z-score from the DE test. When selecting significant 52 | rows (genes), we might wish them to be significant in the original DE test as well. 53 | By setting this value, we force the rows (genes) to be both significant in the 54 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is 55 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.} 56 | 57 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC 58 | might not be the same as the orientation of its correlated columns in the original 59 | matrix. We need to do correlation test between each PC and all the columns in the 60 | original matrix, and then use this cor_threshold to define and extract the 61 | correlated columns and then use them to determine the actual orientation of the PC.} 62 | 63 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant 64 | genes from the top k PCs should be return together as one gene list (collapse = T) 65 | or separately for each k (collapse = F). Default is T.} 66 | } 67 | \value{ 68 | return a list of vectors, and each vector contains the signature genes identified for each cluster. 69 | } 70 | \description{ 71 | This function will use the output from the DEhclust() and get the necessary elements for PCApermtest(): 72 | For each cluster of columns being identified, this function will create a truncated sub-matrix given 73 | the original Z-score matrix. The sub-matrix will only contains the selected columns, and they will be input 74 | to the PCApermtest() and get_sig_genes() to get the gene signatures for this cluster. This process will 75 | be repeated for each cluster. 76 | } 77 | -------------------------------------------------------------------------------- /man/get_sig_genes_DEmultiCCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomposition.R 3 | \name{get_sig_genes_DEmultiCCA} 4 | \alias{get_sig_genes_DEmultiCCA} 5 | \title{Run PCApermtest and get significant genes from DEmultiCCA} 6 | \usage{ 7 | get_sig_genes_DEmultiCCA( 8 | obj = NULL, 9 | k = 1, 10 | var_prop = NULL, 11 | center = T, 12 | scale = T, 13 | num_iter = 200, 14 | row_filtering_pval = 0.05, 15 | var_prop_total = NULL, 16 | perm_pval_thres = 0.05, 17 | ori_pval_thres = 1.666667e-06, 18 | cor_threshold = 0.2, 19 | collapse = T, 20 | ... 21 | ) 22 | } 23 | \arguments{ 24 | \item{obj}{The results object produced by DEmultiCCA() function.} 25 | 26 | \item{k}{the number of top PCs to extract. If set to NULL, the 27 | function will determine the number of PCs to extract based on 28 | the var_prop, which is the cut-off on the proportion of variance 29 | explained for each PC. Only PCs with prop_var greater than this 30 | value will be removed extracted.} 31 | 32 | \item{var_prop}{if k is not set, then use this value as a cutoff for 33 | %var explained to select the PCs. Only the PCs with %var larger than this 34 | value will be selected (the top k PCs).} 35 | 36 | \item{center}{a boolen value to indicate whether the column will be centered.} 37 | 38 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.} 39 | 40 | \item{num_iter}{the number of iteration for the permutation test.} 41 | 42 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var 43 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the 44 | accumulated sum larger than this value to be selected.} 45 | 46 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows 47 | (genes) with permutation p-value lower then permtest_pval_thres or greater than 48 | (1 - permtest_pval_thres) will be selected. Default is 0.05.} 49 | 50 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original 51 | Z-score matrix we store all the Z-score from the DE test. When selecting significant 52 | rows (genes), we might wish them to be significant in the original DE test as well. 53 | By setting this value, we force the rows (genes) to be both significant in the 54 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is 55 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.} 56 | 57 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC 58 | might not be the same as the orientation of its correlated columns in the original 59 | matrix. We need to do correlation test between each PC and all the columns in the 60 | original matrix, and then use this cor_threshold to define and extract the 61 | correlated columns and then use them to determine the actual orientation of the PC.} 62 | 63 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant 64 | genes from the top k PCs should be return together as one gene list (collapse = T) 65 | or separately for each k (collapse = F). Default is T.} 66 | } 67 | \value{ 68 | return a list of vectors, and each vector contains the signature genes identified for each MultiCCA program. 69 | } 70 | \description{ 71 | This function will use the output from the DEmultiCCA() and get the neccessary elements for PCApermtest and 72 | get the gene signatures for each perturbation program that DEmultiCCA() identifies. It works in a similar way 73 | as get_sig_genes_DEhclust() . 74 | } 75 | -------------------------------------------------------------------------------- /man/glm_gp_disp_only.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/glm_gp_disp_only.R 3 | \name{glm_gp_disp_only} 4 | \alias{glm_gp_disp_only} 5 | \title{Internal Function to Fit a Gamma-Poisson GLM} 6 | \usage{ 7 | glm_gp_disp_only( 8 | data, 9 | design = ~1, 10 | col_data = NULL, 11 | reference_level = NULL, 12 | offset = 0, 13 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"), 14 | overdispersion = TRUE, 15 | overdispersion_shrinkage = TRUE, 16 | ridge_penalty = 0, 17 | do_cox_reid_adjustment = TRUE, 18 | subsample = FALSE, 19 | on_disk = NULL, 20 | use_assay = NULL, 21 | verbose = FALSE 22 | ) 23 | } 24 | \arguments{ 25 | \item{data}{any matrix-like object (e.g. \link{matrix}, \link{DelayedArray}, \link{HDF5Matrix}) or 26 | anything that can be cast to a \link{SummarizedExperiment} (e.g. \code{MSnSet}, \code{eSet} etc.) with 27 | one column per sample and row per gene.} 28 | 29 | \item{design}{a specification of the experimental design used to fit the Gamma-Poisson GLM. 30 | It can be a \code{\link[=model.matrix]{model.matrix()}} with one row for each sample and one column for each 31 | coefficient. \cr 32 | Alternatively, \code{design} can be a \code{formula}. The entries in the 33 | formula can refer to global objects, columns in the \code{col_data} parameter, or the \code{colData(data)} 34 | of \code{data} if it is a \code{SummarizedExperiment}. \cr 35 | The third option is that \code{design} is a vector where each element specifies to which 36 | condition a sample belongs. \cr 37 | Default: \code{design = ~ 1}, which means that all samples are treated as if they belong to the 38 | same condition. Note that this is the fasted option.} 39 | 40 | \item{col_data}{a dataframe with one row for each sample in \code{data}. Default: \code{NULL}.} 41 | 42 | \item{reference_level}{a single string that specifies which level is used as reference 43 | when the model matrix is created. The reference level becomes the intercept and all 44 | other coefficients are calculated with respect to the \code{reference_level}. 45 | Default: \code{NULL}.} 46 | 47 | \item{offset}{Constant offset in the model in addition to \code{log(size_factors)}. It can 48 | either be a single number, a vector of length \code{ncol(data)} or a matrix with the 49 | same dimensions as \code{dim(data)}. Note that if data is a \link{DelayedArray} or \link{HDF5Matrix}, 50 | \code{offset} must be as well. Default: \code{0}.} 51 | 52 | \item{size_factors}{in large scale experiments, each sample is typically of different size 53 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to 54 | correct for this effect.\cr 55 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data 56 | that specifies the size factors that are used. 57 | Or it can be a string that species the method that is used to estimate the size factors 58 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}). 59 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly 60 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use 61 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need 62 | to separately install the \code{scran} package from Bioconductor for this method to work. 63 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given, 64 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.} 65 | 66 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model 67 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson 68 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr 69 | \code{overdispersion} can either be 70 | \itemize{ 71 | \item a single boolean that indicates if an overdispersion is estimated for each gene. 72 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values. 73 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes. 74 | } 75 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce 76 | the Gamma-Poisson to the classical Poisson model. Default: \code{TRUE}.} 77 | 78 | \item{overdispersion_shrinkage}{the overdispersion can be difficult to estimate with few replicates. To 79 | improve the overdispersion estimates, we can share information across genes and shrink each individual 80 | overdispersion estimate towards a global overdispersion estimate. Empirical studies show however that 81 | the overdispersion varies based on the mean expression level (lower expression level => higher 82 | dispersion). If \code{overdispersion_shrinkage = TRUE}, a median trend of dispersion and expression level is 83 | fit and used to estimate the variances of a quasi Gamma Poisson model (Lund et al. 2012). Default: \code{TRUE}.} 84 | 85 | \item{ridge_penalty}{to avoid overfitting, we can penalize fits with large coefficient estimates. Instead 86 | of directly minimizing the deviance per gene (\eqn{Sum dev(y_i, X_i b)}), we will minimize 87 | \eqn{Sum dev(y_i, X_i b) + N * Sum (penalty_p * b_p)^2}.\cr 88 | \code{ridge_penalty} can be 89 | \itemize{ 90 | \item a scalar in which case all parameters except the intercept are penalized. 91 | \item a vector which has to have the same length as columns in the model matrix 92 | \item a matrix with the same number of columns as columns in the model matrix. This gives 93 | maximum flexibility for expert users and allows for full Tikhonov regularization. 94 | } 95 | Default: \code{ridge_penalty = 0}, which is internally replaced with a small positive number for numerical stability.} 96 | 97 | \item{do_cox_reid_adjustment}{the classical maximum likelihood estimator of the \code{overdisperion} is biased 98 | towards small values. McCarthy \emph{et al.} (2012) showed that it is preferable to optimize the Cox-Reid 99 | adjusted profile likelihood.\cr 100 | \code{do_cox_reid_adjustment} can be either be \code{TRUE} or \code{FALSE} to indicate if the adjustment is 101 | added during the optimization of the \code{overdispersion} parameter. Default: \code{TRUE}.} 102 | 103 | \item{subsample}{the estimation of the overdispersion is the slowest step when fitting 104 | a Gamma-Poisson GLM. For datasets with many samples, the estimation can be considerably sped up 105 | without loosing much precision by fitting the overdispersion only on a random subset of the samples. 106 | Default: \code{FALSE} which means that the data is not subsampled. If set to \code{TRUE}, at most 1,000 samples 107 | are considered. Otherwise the parameter just specifies the number of samples that are considered 108 | for each gene to estimate the overdispersion.} 109 | 110 | \item{on_disk}{a boolean that indicates if the dataset is loaded into memory or if it is kept on disk 111 | to reduce the memory usage. Processing in memory can be significantly faster than on disk. 112 | Default: \code{NULL} which means that the data is only processed in memory if \code{data} is an in-memory 113 | data structure.} 114 | 115 | \item{verbose}{a boolean that indicates if information about the individual steps are printed 116 | while fitting the GLM. Default: \code{FALSE}.} 117 | 118 | \item{Y}{any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with 119 | one column per sample and row per gene.} 120 | } 121 | \value{ 122 | a list with four elements 123 | * `Beta` the coefficient matrix 124 | * `overdispersion` the vector with the estimated overdispersions 125 | * `Mu` a matrix with the corresponding means for each gene 126 | and sample 127 | * `size_factors` a vector with the size factor for each 128 | sample 129 | * `ridge_penalty` a vector with the ridge penalty 130 | } 131 | \description{ 132 | Internal Function to Fit a Gamma-Poisson GLM 133 | } 134 | \seealso{ 135 | [glm_gp()] and [overdispersion_mle()] 136 | } 137 | \keyword{internal} 138 | -------------------------------------------------------------------------------- /man/glm_gp_disp_only_impl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/glm_gp_disp_only.R 3 | \name{glm_gp_disp_only_impl} 4 | \alias{glm_gp_disp_only_impl} 5 | \title{Internal Function to Fit a Gamma-Poisson GLM} 6 | \usage{ 7 | glm_gp_disp_only_impl( 8 | Y, 9 | model_matrix, 10 | offset = 0, 11 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"), 12 | overdispersion = TRUE, 13 | overdispersion_shrinkage = TRUE, 14 | ridge_penalty = 0, 15 | do_cox_reid_adjustment = TRUE, 16 | subsample = FALSE, 17 | verbose = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{Y}{any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with 22 | one column per sample and row per gene.} 23 | 24 | \item{model_matrix}{a numeric matrix that specifies the experimental 25 | design. It can be produced using \code{stats::model.matrix()}. 26 | Default: \code{NULL}} 27 | 28 | \item{offset}{Constant offset in the model in addition to \code{log(size_factors)}. It can 29 | either be a single number, a vector of length \code{ncol(data)} or a matrix with the 30 | same dimensions as \code{dim(data)}. Note that if data is a \link{DelayedArray} or \link{HDF5Matrix}, 31 | \code{offset} must be as well. Default: \code{0}.} 32 | 33 | \item{size_factors}{in large scale experiments, each sample is typically of different size 34 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to 35 | correct for this effect.\cr 36 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data 37 | that specifies the size factors that are used. 38 | Or it can be a string that species the method that is used to estimate the size factors 39 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}). 40 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly 41 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use 42 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need 43 | to separately install the \code{scran} package from Bioconductor for this method to work. 44 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given, 45 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.} 46 | 47 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model 48 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson 49 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr 50 | \code{overdispersion} can either be 51 | \itemize{ 52 | \item a single boolean that indicates if an overdispersion is estimated for each gene. 53 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values. 54 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes. 55 | } 56 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce 57 | the Gamma-Poisson to the classical Poisson model. Default: \code{TRUE}.} 58 | 59 | \item{overdispersion_shrinkage}{the overdispersion can be difficult to estimate with few replicates. To 60 | improve the overdispersion estimates, we can share information across genes and shrink each individual 61 | overdispersion estimate towards a global overdispersion estimate. Empirical studies show however that 62 | the overdispersion varies based on the mean expression level (lower expression level => higher 63 | dispersion). If \code{overdispersion_shrinkage = TRUE}, a median trend of dispersion and expression level is 64 | fit and used to estimate the variances of a quasi Gamma Poisson model (Lund et al. 2012). Default: \code{TRUE}.} 65 | 66 | \item{ridge_penalty}{to avoid overfitting, we can penalize fits with large coefficient estimates. Instead 67 | of directly minimizing the deviance per gene (\eqn{Sum dev(y_i, X_i b)}), we will minimize 68 | \eqn{Sum dev(y_i, X_i b) + N * Sum (penalty_p * b_p)^2}.\cr 69 | \code{ridge_penalty} can be 70 | \itemize{ 71 | \item a scalar in which case all parameters except the intercept are penalized. 72 | \item a vector which has to have the same length as columns in the model matrix 73 | \item a matrix with the same number of columns as columns in the model matrix. This gives 74 | maximum flexibility for expert users and allows for full Tikhonov regularization. 75 | } 76 | Default: \code{ridge_penalty = 0}, which is internally replaced with a small positive number for numerical stability.} 77 | 78 | \item{do_cox_reid_adjustment}{the classical maximum likelihood estimator of the \code{overdisperion} is biased 79 | towards small values. McCarthy \emph{et al.} (2012) showed that it is preferable to optimize the Cox-Reid 80 | adjusted profile likelihood.\cr 81 | \code{do_cox_reid_adjustment} can be either be \code{TRUE} or \code{FALSE} to indicate if the adjustment is 82 | added during the optimization of the \code{overdispersion} parameter. Default: \code{TRUE}.} 83 | 84 | \item{subsample}{the estimation of the overdispersion is the slowest step when fitting 85 | a Gamma-Poisson GLM. For datasets with many samples, the estimation can be considerably sped up 86 | without loosing much precision by fitting the overdispersion only on a random subset of the samples. 87 | Default: \code{FALSE} which means that the data is not subsampled. If set to \code{TRUE}, at most 1,000 samples 88 | are considered. Otherwise the parameter just specifies the number of samples that are considered 89 | for each gene to estimate the overdispersion.} 90 | 91 | \item{verbose}{a boolean that indicates if information about the individual steps are printed 92 | while fitting the GLM. Default: \code{FALSE}.} 93 | } 94 | \value{ 95 | a list with four elements 96 | * `Beta` the coefficient matrix 97 | * `overdispersion` the vector with the estimated overdispersions 98 | * `Mu` a matrix with the corresponding means for each gene 99 | and sample 100 | * `size_factors` a vector with the size factor for each 101 | sample 102 | * `ridge_penalty` a vector with the ridge penalty 103 | } 104 | \description{ 105 | Internal Function to Fit a Gamma-Poisson GLM 106 | } 107 | \seealso{ 108 | [glm_gp()] and [overdispersion_mle()] 109 | } 110 | \keyword{internal} 111 | -------------------------------------------------------------------------------- /man/prune_DE_mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scoring_de.R 3 | \name{prune_DE_mat} 4 | \alias{prune_DE_mat} 5 | \title{QC for the list of DE Z-score matrices} 6 | \usage{ 7 | prune_DE_mat( 8 | DEG_mat = NULL, 9 | zscore_cap = 37, 10 | mask_target = FALSE, 11 | p_threshold = 0.05/30000, 12 | min_sig_DEG = 0, 13 | center = FALSE, 14 | scale = FALSE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{DEG_mat}{the list of DE Z-score matrices that produced by get_DE_mat()} 20 | 21 | \item{zscore_cap}{the cap value for the Z-scores. Any absolute(Z-score) larger than this value will be 22 | set to this value to avoid extreme values affecting the downstream analyses. Default is 37 23 | which is the machine precision limit for a Z-score to produce a non-zero P-value (= ~1e-300).} 24 | 25 | \item{mask_target}{a boolen value to indicate if the Z-score of the perturbation target (labelled 26 | by the column names) should be masked as 0. Default is FALSE so no masking will happen.} 27 | 28 | \item{p_threshold}{the P-value threshold to define the 'significant' DE genes. Default is 0.05/30000, which 29 | is approximately the Bonferroni correction threshold for genome-wide DE tests (assuming 30,000 genes in total).} 30 | 31 | \item{min_sig_DEG}{the minimal number of significant DE genes that each column must contain. Any 32 | column with sig DE genes less than this value will be removed from the matrix. Default is 0 so no 33 | column will be removed.} 34 | 35 | \item{center}{a boolen value to indicate whether we should center each column to 0. Default is FALSE.} 36 | 37 | \item{scale}{a boolen value to indicate whether we should scale each column to have variance = 1. Default is FALSE.} 38 | } 39 | \value{ 40 | The function will return a list of QCed DE Z-score matrices. This list can directly be the input for 41 | DEmultiCCA(). 42 | } 43 | \description{ 44 | A function to perform QC/filtering for the DE Z-score matrices that produced by get_DE_mat(). 45 | } 46 | -------------------------------------------------------------------------------- /man/rbo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enrichment_test.R 3 | \name{rbo} 4 | \alias{rbo} 5 | \title{Rank biased overlap} 6 | \usage{ 7 | rbo( 8 | list1, 9 | list2, 10 | p, 11 | k = floor(max(length(list1), length(list2))/2), 12 | side = c("top", "bottom"), 13 | mid = NULL, 14 | uneven.lengths = TRUE 15 | ) 16 | } 17 | \arguments{ 18 | \item{list1}{List 1} 19 | 20 | \item{list2}{List 2} 21 | 22 | \item{p}{Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements} 23 | 24 | \item{k}{Evaluation depth for extrapolation} 25 | 26 | \item{side}{Evaluate similarity between the top or the bottom of the ranked lists} 27 | 28 | \item{mid}{Set the mid point to for example only consider positive or negative scores} 29 | 30 | \item{uneven.lengths}{Indicator if lists have uneven lengths} 31 | } 32 | \value{ 33 | a scaler value measuring the rank biased overlap (rbo) 34 | } 35 | \description{ 36 | A function for a new gene-set enrichment test based on the 37 | RBO (rank biased overlap) calculation with extropolation (Webber et al., 2010). 38 | The core functions of rbo() calculation was modified from the "gespeR" package (original author: Fabian Schmich). 39 | We modified it to accomodate our package and data type. We also developed a permutation scheme for 40 | RBO to allow for p-value calculations. 41 | } 42 | \author{ 43 | Fabian Schmich ("gespeR" package) 44 | } 45 | -------------------------------------------------------------------------------- /man/rbo_enrich_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enrichment_test.R 3 | \name{rbo_enrich_test} 4 | \alias{rbo_enrich_test} 5 | \title{Rank biased overlap (RBO) based enrichment test} 6 | \usage{ 7 | rbo_enrich_test( 8 | input_list, 9 | go_term_db, 10 | p, 11 | n_iter = 500, 12 | k = 300, 13 | side = c("top", "bottom"), 14 | mid = NULL, 15 | uneven.lengths = TRUE, 16 | empirical_test = FALSE, 17 | seed = 131415926 18 | ) 19 | } 20 | \arguments{ 21 | \item{input_list}{input gene list from user (a named vector)} 22 | 23 | \item{go_term_db}{a list object of multiple gene-ontology (GO) terms to run enrichment test against} 24 | 25 | \item{p}{Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements} 26 | 27 | \item{n_iter}{the number of iteration to perform the permutation to obtain the P-values of the enrichment test} 28 | 29 | \item{k}{Evaluation depth for extrapolation} 30 | 31 | \item{side}{Evaluate similarity between the top or the bottom of the ranked lists} 32 | 33 | \item{mid}{Set the mid point to for example only consider positive or negative scores} 34 | 35 | \item{uneven.lengths}{Indicator if lists have uneven lengths} 36 | 37 | \item{empirical_test}{a boolen value to tell the function is an empirical test should be performed. If TRUE, 38 | the exact empirical proportion of the permutated elements that are greater than the true RBO 39 | is returned as the p-value (high accuracy usually requires a large n_iter, e.g., 1000). If FALSE, then a standard 40 | Z-score test is applied to the RBO based on the mean and standard deviation of all the permuated elements (less accurate 41 | but more efficient. A small n_iter is usually enough (e.g., 100 or 200) to get good approximation compared to 42 | the true empirical test).} 43 | } 44 | \value{ 45 | a data.frame consists of rbo measurement between the inptu gene list and all the GO terms, 46 | as well as the P-values based on permutation. Please note that the P-values indicate whether the rank of the input gene 47 | list and the GO-term gene set are consistent or not. It does NOT indicate if RBO is significantly different from 0. 48 | } 49 | \description{ 50 | To perform enrichment test based on rank biased overlap and permutation. 51 | } 52 | --------------------------------------------------------------------------------