├── .Rbuildignore
├── .Rhistory
├── .gitignore
├── DESCRIPTION
├── Mixscale.Rproj
├── NAMESPACE
├── R
├── .Rhistory
├── decomposition.R
├── enrichment_test.R
├── get_fold_change.R
├── glm_gp_disp_only.R
├── perturbation_scoring.R
├── scoring_de.R
└── visualization.R
├── README.md
├── docs
├── index.html
└── old
│ ├── New_Vignette_2024Jan.Rmd
│ ├── index copy 2.Rmd
│ ├── index copy 2.html
│ ├── index copy 3.html
│ ├── index copy.Rmd
│ ├── index copy.html
│ ├── index.Rmd
│ └── index.html
└── man
├── DE_heatmap.Rd
├── DEenrich.Rd
├── DEenrich_DotPlot.Rd
├── DEhclust.Rd
├── DEmultiCCA.Rd
├── FoldChange_new.Rd
├── Mixscale_DoHeatmap.Rd
├── Mixscale_RidgePlot.Rd
├── Mixscale_ScatterPlot.Rd
├── PCApermtest.Rd
├── RunMixscale.Rd
├── Run_wmvRegDE.Rd
├── fisher_enrich_test.Rd
├── get_DE_mat.Rd
├── get_fc.Rd
├── get_sig_genes.Rd
├── get_sig_genes_DEhclust.Rd
├── get_sig_genes_DEmultiCCA.Rd
├── glm_gp_disp_only.Rd
├── glm_gp_disp_only_impl.Rd
├── prune_DE_mat.Rd
├── rbo.Rd
└── rbo_enrich_test.Rd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 |
--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
1 | library(roxygen2); # Read in the roxygen2 R package
2 | roxygenise();
3 | roxygenise()
4 | roxygenise()
5 | ?ScoringDE
6 | ?PRTBScoring
7 | ?FoldChange_new
8 | ?get_fc
9 | ?get_idx
10 | q()
11 | library(roxygen2)
12 | library(devtools)
13 | roxygenise()
14 | roxygenise()
15 | roxygenise()
16 | roxygenise()
17 | ?PCApermtest
18 | roxygenise()
19 | ?DE_heatmap
20 | roxygenise()
21 | ?DE_heatmap
22 | ?PCApermtest
23 | ?DE_heatmap
24 | roxygenise()
25 | ?PCApermtest
26 | library(roxygen2); # Read in the roxygen2 R package
27 | roxygenise();
28 | library(roxygen2)
29 | roxygenise()
30 | q()
31 | library(roxygen2)
32 | roxygenise()
33 | roxygenise()
34 | library(roxygen2)
35 | roxygenise()
36 | library(roxygen2)
37 | roxygenise()
38 | library(roxygen2)
39 | roxygenise()
40 | install.packages("protoclust")
41 | install.packages("protoclust")
42 | library(PRTBScoring)
43 | library(roxygen2)
44 | roxygenise()
45 | library(roxygen2)
46 | roxygenise()
47 | roxygenise()
48 | library(roxygen2)
49 | roxygenise()
50 | roxygenise()
51 | roxygenise()
52 | library(roxygen2)
53 | roxygenise()
54 | roxygenise()
55 | library(roxygen2)
56 | roxygenise()
57 | library(roxygen2)
58 | roxygenise()
59 | library(roxygen2)
60 | roxygenise()
61 | roxygenise()
62 | library(roxygen2)
63 | roxygenise()
64 | library(roxygen2)
65 | roxygenise()
66 | library(roxygen2)
67 | roxygenise()
68 | roxygenise()
69 | library(roxygen2)
70 | roxygenise()
71 | roxygenise()
72 | library(roxygen2)
73 | roxygenise()
74 | library(roxygen2)
75 | roxygenise()
76 | roxygenise()
77 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .Rproj.user
3 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: Mixscale
2 | Type: Package
3 | Title: Quantify the perturbation heterogeneity in Perturb-seq
4 | Version: 0.3.0
5 | RoxygenNote: 7.2.3
6 | Collate:
7 | 'decomposition.R'
8 | 'enrichment_test.R'
9 | 'get_fold_change.R'
10 | 'glm_gp_disp_only.R'
11 | 'perturbation_scoring.R'
12 | 'scoring_de.R'
13 | 'visualization.R'
14 |
--------------------------------------------------------------------------------
/Mixscale.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(DE_heatmap)
4 | export(DEenrich)
5 | export(DEenrich_DotPlot)
6 | export(DEhclust)
7 | export(DEmultiCCA)
8 | export(FoldChange_new)
9 | export(Mixscale_DoHeatmap)
10 | export(Mixscale_RidgePlot)
11 | export(Mixscale_ScatterPlot)
12 | export(PCApermtest)
13 | export(RunMixscale)
14 | export(Run_wmvRegDE)
15 | export(fisher_enrich_test)
16 | export(get_DE_mat)
17 | export(get_fc)
18 | export(get_sig_genes)
19 | export(get_sig_genes_DEhclust)
20 | export(get_sig_genes_DEmultiCCA)
21 | export(prune_DE_mat)
22 | export(rbo)
23 | export(rbo_enrich_test)
24 | import(Seurat)
25 | import(SeuratObject)
26 | import(ggplot2)
27 | import(ggridges)
28 | import(glmGamPoi)
29 | importFrom(Matrix,rowMeans)
30 | importFrom(Matrix,rowSums)
31 | importFrom(PMA,MultiCCA)
32 | importFrom(RColorBrewer,brewer.pal)
33 | importFrom(gplots,heatmap.2)
34 | importFrom(protoclust,protoclust)
35 | importFrom(protoclust,protocut)
36 |
--------------------------------------------------------------------------------
/R/.Rhistory:
--------------------------------------------------------------------------------
1 | nt.class.name = "NT",
2 | min.de.genes = 5,
3 | split.by = "cell_type",
4 | logfc.threshold = 0.2,
5 | de.assay = "RNA",
6 | max.de.genes = 100,
7 | prtb.type = "P",
8 | new.class.name = "mixscape_v1",
9 | fine.mode = F,
10 | harmonize = T,
11 | seed = 1)
12 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring")
13 | # 3. Perturbation scoring for each cell
14 | seurat_obj = PRTBScoring(
15 | object = seurat_obj,
16 | assay = "PRTB",
17 | slot = "scale.data",
18 | labels = "gene",
19 | nt.class.name = "NT",
20 | min.de.genes = 5,
21 | split.by = "cell_type",
22 | logfc.threshold = 0.2,
23 | de.assay = "RNA",
24 | max.de.genes = 100,
25 | prtb.type = "P",
26 | new.class.name = "mixscape_v1",
27 | fine.mode = F,
28 | harmonize = T,
29 | seed = 1)
30 | # 4. Perform scoring-based DE test using the scores
31 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts",
32 | labels = "gene")
33 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring")
34 | # 4. Perform scoring-based DE test using the scores
35 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts",
36 | PRTB_list = c("RFX5", "ZC3H3", "IFNGR1", "IFNGR2",
37 | "IRF1", "IRF2", "JUN", "MAFF",
38 | "PARP12", "TRAFD1",
39 | "JAK1", "JAK2",
40 | "STAT1", "SP100"),
41 | labels = "gene")
42 | str(de_res)
43 | # and re-arrange the DE results into Z-score matrices
44 | DEG_mat = get_DE_mat(de_res)
45 | DEG_mat = prune_DE_mat(de_res)
46 | DEG_mat = prune_DE_mat(DEG_mat)
47 | str(DEG_mat)
48 | # and re-arrange the DE results into Z-score matrices
49 | DEG_mat = get_DE_mat(de_res)
50 | DEG_mat = prune_DE_mat(DEG_mat, mask_target = T, min_sig_DEG = 3)
51 | str(DEG_mat)
52 | head(DEG_mat$A549)
53 | #########
54 | # 5.1 within-prtb
55 | celltype_list = names(DEG_mat)
56 | PRTB_list = colnames(DEG_mat[[1]])
57 | gene_ID = rownames(DEG_mat[[1]])
58 | #####
59 | for(i in 1:length(PRTB_list)){
60 | PRTB = PRTB_list[i]
61 | tmp=list()
62 | for(CELLTYPE in celltype_list){
63 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
64 | }
65 | tmp = Reduce(cbind, tmp)
66 | colnames(tmp) = celltype_list
67 | rownames(tmp) = gene_ID
68 | # run Permutation test and extract gene signatures
69 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
70 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
71 | # plot Z-score heatmap for the gene signatures
72 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30,
73 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/",
74 | prefix = PRTB)
75 | }
76 | names(DEG_mat)
77 | #####
78 | for(i in 1:length(PRTB_list)){
79 | PRTB = PRTB_list[i]
80 | tmp=list()
81 | for(CELLTYPE in celltype_list){
82 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){
83 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
84 | }
85 | }
86 | tmp = Reduce(cbind, tmp)
87 | colnames(tmp) = celltype_list
88 | rownames(tmp) = gene_ID
89 | # run Permutation test and extract gene signatures
90 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
91 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
92 | # plot Z-score heatmap for the gene signatures
93 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30,
94 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/",
95 | prefix = PRTB)
96 | }
97 | colnames(DEG_mat[[CELLTYPE]])
98 | PRTB
99 | PRTB %in% colnames(DEG_mat[[CELLTYPE]])
100 | head(tmp)
101 | # and re-arrange the DE results into Z-score matrices
102 | DEG_mat = get_DE_mat(de_res)
103 | DEG_mat_main = DEG_mat
104 | #########
105 | # 5.1 within-prtb
106 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0)
107 | celltype_list = names(DEG_mat)
108 | PRTB_list = colnames(DEG_mat[[1]])
109 | gene_ID = rownames(DEG_mat[[1]])
110 | #####
111 | for(i in 1:length(PRTB_list)){
112 | PRTB = PRTB_list[i]
113 | tmp=list()
114 | for(CELLTYPE in celltype_list){
115 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){
116 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
117 | }
118 | }
119 | tmp = Reduce(cbind, tmp)
120 | colnames(tmp) = celltype_list
121 | rownames(tmp) = gene_ID
122 | # run Permutation test and extract gene signatures
123 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
124 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
125 | # plot Z-score heatmap for the gene signatures
126 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30,
127 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/",
128 | prefix = PRTB)
129 | }
130 | #########
131 | # 5.2 within-celltype
132 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3)
133 | celltype_list = names(DEG_mat)
134 | PRTB_list = colnames(DEG_mat[[1]])
135 | gene_ID = rownames(DEG_mat[[1]])
136 | for(i in 1:length(celltype_list)){
137 | CELLTYPE = celltype_list[i]
138 | tmp=DEG_mat[[CELLTYPE]]
139 | # run Permutation test and extract gene signatures
140 | res = DEhclust(mat = tmp)
141 | sig_genes = get_sig_genes_DEhclust(obj = res)
142 | # plot Z-score heatmap for the gene signatures
143 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30,
144 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/",
145 | prefix = CELLTYPE)
146 | }
147 | #########
148 | # 5.3 MultiCCA analysis
149 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3, center = T)
150 | celltype_list = names(DEG_mat)
151 | PRTB_list = colnames(DEG_mat[[1]])
152 | gene_ID = rownames(DEG_mat[[1]])
153 | # run Permutation test and extract gene signatures
154 | res = DEmultiCCA(DEG_mat, cor_coef_thres = 0.6, max_k = 3)
155 | str(DEG_mat)
156 | # run Permutation test and extract gene signatures
157 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3)
158 | str(res)
159 | str(DEG_mat)
160 | #########
161 | # 5.3 MultiCCA analysis
162 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T)
163 | celltype_list = names(DEG_mat)
164 | PRTB_list = colnames(DEG_mat[[1]])
165 | gene_ID = rownames(DEG_mat[[1]])
166 | # run Permutation test and extract gene signatures
167 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.6, max_k = 3)
168 | sig_genes = get_sig_genes_DEmultiCCA(res)
169 | str(sig_genes)
170 | str(res)
171 | # run Permutation test and extract gene signatures
172 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3)
173 | str(res)
174 | sig_genes = get_sig_genes_DEmultiCCA(res)
175 | # visualization.
176 | DE_heatmap(obj = res, sig_genes = sig_genes,
177 | type = "multiCCA", direction = "both",
178 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
179 | prefix = "IFNG")
180 | # run Permutation test and extract gene signatures
181 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.6, max_k = 3, standardize = T)
182 | sig_genes = get_sig_genes_DEmultiCCA(res)
183 | # visualization.
184 | DE_heatmap(obj = res, sig_genes = sig_genes,
185 | type = "multiCCA", direction = "both",
186 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
187 | prefix = "IFNG")
188 | # visualization.
189 | DE_heatmap(obj = res, sig_genes = sig_genes,
190 | type = "multiCCA", direction = "both",
191 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
192 | prefix = "IFNG")
193 | # run Permutation test and extract gene signatures
194 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T)
195 | sig_genes = get_sig_genes_DEmultiCCA(res)
196 | # visualization.
197 | DE_heatmap(obj = res, sig_genes = sig_genes,
198 | type = "multiCCA", direction = "both",
199 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
200 | prefix = "IFNG")
201 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0)
202 | celltype_list = names(DEG_mat)
203 | PRTB_list = colnames(DEG_mat[[1]])
204 | gene_ID = rownames(DEG_mat[[1]])
205 | PRTB = PRTB_list[i]
206 | tmp=list()
207 | for(CELLTYPE in celltype_list){
208 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){
209 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
210 | }
211 | }
212 | tmp = Reduce(cbind, tmp)
213 | colnames(tmp) = celltype_list
214 | rownames(tmp) = gene_ID
215 | # run Permutation test and extract gene signatures
216 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
217 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
218 | str(sig_genes)
219 | PRTB
220 | #########
221 | # 5.2 within-celltype
222 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3)
223 | celltype_list = names(DEG_mat)
224 | PRTB_list = colnames(DEG_mat[[1]])
225 | gene_ID = rownames(DEG_mat[[1]])
226 | CELLTYPE = celltype_list[i]
227 | tmp=DEG_mat[[CELLTYPE]]
228 | # run Permutation test and extract gene signatures
229 | res = DEhclust(mat = tmp)
230 | sig_genes = get_sig_genes_DEhclust(obj = res)
231 | str(sig_genes)
232 | names(sig_genes)
233 | #########
234 | # 5.3 MultiCCA analysis
235 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T)
236 | celltype_list = names(DEG_mat)
237 | PRTB_list = colnames(DEG_mat[[1]])
238 | gene_ID = rownames(DEG_mat[[1]])
239 | # run Permutation test and extract gene signatures
240 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T)
241 | sig_genes = get_sig_genes_DEmultiCCA(res)
242 | str(sig_genes)
243 | names(sig_genes)
244 | ##################################
245 | # Decomposition
246 | go_db = list()
247 | #########
248 | # 5.1 within-prtb
249 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0)
250 | celltype_list = names(DEG_mat)
251 | PRTB_list = colnames(DEG_mat[[1]])
252 | gene_ID = rownames(DEG_mat[[1]])
253 | #####
254 | for(i in 1:length(PRTB_list)){
255 | PRTB = PRTB_list[i]
256 | tmp=list()
257 | for(CELLTYPE in celltype_list){
258 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){
259 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
260 | }
261 | }
262 | tmp = Reduce(cbind, tmp)
263 | colnames(tmp) = celltype_list
264 | rownames(tmp) = gene_ID
265 | # run Permutation test and extract gene signatures
266 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
267 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
268 | # plot Z-score heatmap for the gene signatures
269 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30,
270 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/",
271 | prefix = PRTB)
272 | # store the gene signatures to the go-term repo
273 | if(length(sig_genes$upDEGs) >= 10){
274 | go_db[[paste0(PRTB, "_upDEGs")]] = sig_genes$upDEGs
275 | }
276 | if(length(sig_genes$downDEGs) >= 10){
277 | go_db[[paste0(PRTB, "_downDEGs")]] = sig_genes$downDEGs
278 | }
279 | }
280 | #########
281 | # 5.2 within-celltype
282 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3)
283 | celltype_list = names(DEG_mat)
284 | PRTB_list = colnames(DEG_mat[[1]])
285 | gene_ID = rownames(DEG_mat[[1]])
286 | for(i in 1:length(celltype_list)){
287 | CELLTYPE = celltype_list[i]
288 | tmp=DEG_mat[[CELLTYPE]]
289 | # run Permutation test and extract gene signatures
290 | res = DEhclust(mat = tmp)
291 | sig_genes = get_sig_genes_DEhclust(obj = res)
292 | # plot Z-score heatmap for the gene signatures
293 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30,
294 | output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/",
295 | prefix = CELLTYPE)
296 | # store the gene signatures to the go-term repo
297 | for(CLUSTER in names(sig_genes)){
298 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){
299 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs
300 | }
301 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){
302 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs
303 | }
304 | }
305 | }
306 | #########
307 | # 5.3 MultiCCA analysis
308 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T)
309 | celltype_list = names(DEG_mat)
310 | PRTB_list = colnames(DEG_mat[[1]])
311 | gene_ID = rownames(DEG_mat[[1]])
312 | # run Permutation test and extract gene signatures
313 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T)
314 | sig_genes = get_sig_genes_DEmultiCCA(res)
315 | # visualization.
316 | DE_heatmap(obj = res, sig_genes = sig_genes,
317 | type = "multiCCA", direction = "both",
318 | top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
319 | prefix = "IFNG")
320 | # store the gene signatures to the go-term repo
321 | for(PROGRAM in names(sig_genes)){
322 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){
323 | go_db[[paste0("IFNG_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs
324 | }
325 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){
326 | go_db[[paste0("IFNG_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs
327 | }
328 | }
329 | str(go_db)
330 | head(seurat_obj)
331 | ##########
332 | # 6.1 DE tests
333 | seurat_obj$Condition = paste0(seurat_obj$cell_type, "_", seurat_obj$gene)
334 | Idents(seurat_obj) = "Condition"
335 | table(seurat_obj$Condition)
336 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_IFNGR2")
337 | seurat_obj
338 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_IFNGR2",
339 | slot = "data", logfc.threshold = 0)
340 | head(new_DE_test)
341 | # get the background gene list
342 | background = rownames(new_DE_test)
343 | # get the significant down-reg genes (the input list )
344 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0, ])
345 | # get the significant down-reg genes (the input list )
346 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ])
347 | # get the significant down-reg genes (the input list )
348 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ])
349 | # 6.2 Conventional test (Fisher's exact test)
350 | fisher_enrich_res = fisher_enrich_test(input_list = input_list,
351 | background = background,
352 | go_term_db = go_db)
353 | head(fisher_enrich_res)
354 | fisher_enrich_res = fisher_enrich_res[order(fisher_enrich_res$Pval), ]
355 | head(fisher_enrich_res)
356 | head(fisher_enrich_res, 20)
357 | # 6.3 Rank biased overlap based test
358 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change
359 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ])
360 | head(new_DE_test[new_DE_test$avg_log2FC > 0, ], 30)
361 | # 6.3 Rank biased overlap based test
362 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change
363 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ])
364 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
365 | go_term_db = go_db,
366 | p = 0.99)
367 | head(input_list2)
368 | length(input_list2)
369 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
370 | go_term_db = go_db,
371 | p = 0.99,
372 | side = "bottom")
373 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
374 | go_term_db = go_db,
375 | p = 0.99,
376 | k = 300,
377 | side = "bottom")
378 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring")
379 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
380 | go_term_db = go_db,
381 | p = 0.99,
382 | k = 300,
383 | side = "bottom")
384 | head(input_list2)
385 | str(go_db)
386 | go_term_db = go_db
387 | class(go_term_db[[1]]) == "list"
388 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
389 | go_term_db = go_db,
390 | p = 0.99,
391 | k = 300,
392 | side = "bottom")
393 | rbo
394 | input_list,
395 | input_list
396 | go_term_db
397 | p
398 | input_list
399 | go_term_db
400 | p = 0.99
401 | n_iter = 500
402 | k=300
403 | side= "bottom"
404 | mid = NULL
405 | uneven.lengths = TRUE
406 | empirical_test = FALSE
407 | seed = 131415926
408 | if(length(input_list) < 5){
409 | print("The length of the input list is less than 5, stopping analysis...")
410 | return(NULL)
411 | }
412 | # 0. each vector in the go_term_db is assumed to be an ordered vector of characters (gene names).
413 | # we need to convert each vector in to a named vector of number (number being the rank of each gene).
414 | go_term_db2 = lapply(X = go_term_db,
415 | FUN = function(x) {
416 | if(side == "bottom") tmp = 1:length(x)
417 | else if (side == "top") tmp = length(x):1
418 | names(tmp) = x
419 | return(tmp)
420 | })
421 | str(go_term_db2 )
422 | # 1. calculate the true RBO for the input_list and all the GO terms
423 | rbo_real = sapply(X = go_term_db2,
424 | FUN = rbo,
425 | list2 = input_list,
426 | p = p,
427 | k = k,
428 | side = side,
429 | mid = mid,
430 | uneven.lengths = uneven.lengths)
431 | head(input_list)
432 | load_all("/Users/uqljian5/Documents/github_repo/Perturbation_Scoring")
433 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
434 | go_term_db = go_db,
435 | p = 0.99,
436 | k = 100,
437 | side = "bottom")
438 | head(rbo_enrich_res)
439 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ]
440 | head(rbo_enrich_res, 20)
441 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
442 | go_term_db = go_db,
443 | p = 0.98,
444 | k = 100,
445 | side = "bottom")
446 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ]
447 | head(rbo_enrich_res)
448 | options(Seurat.object.assay.version = 'v3')
449 | library(Seurat)
450 | library(ggridges)
451 | library(stringr)
452 | library(Mixscale)
453 | library(ggplot2)
454 | seurat_obj = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/seurat_obj_GSE132080_QCed_2023Aug10.rds")
455 | seurat_obj
456 | gv.list = Tool(seurat_obj, slot = "RunMixscape")
457 | gv.list = Tool(seurat_obj, slot = "PRTBScoring")
458 | str(gv.list)
459 | load_all("/Users/uqljian5/Documents/github_repo/Mixscale")
460 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/")
461 | seurat_obj
462 | obj2 = RunMixscale(
463 | object = seurat_obj,
464 | assay = "PRTB",
465 | slot = "scale.data",
466 | labels = "gene",
467 | nt.class.name = "neg",
468 | min.de.genes = 5,
469 | logfc.threshold = 0.2,
470 | de.assay = "RNA",
471 | max.de.genes = 100,
472 | fine.mode = F)
473 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/")
474 | obj2 = RunMixscale(
475 | object = seurat_obj,
476 | assay = "PRTB",
477 | slot = "scale.data",
478 | labels = "gene",
479 | nt.class.name = "neg",
480 | min.de.genes = 5,
481 | logfc.threshold = 0.2,
482 | de.assay = "RNA",
483 | max.de.genes = 100,
484 | fine.mode = F)
485 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/")
486 | obj2 = RunMixscale(
487 | object = seurat_obj,
488 | assay = "PRTB",
489 | slot = "scale.data",
490 | labels = "gene",
491 | nt.class.name = "neg",
492 | min.de.genes = 5,
493 | logfc.threshold = 0.2,
494 | de.assay = "RNA",
495 | max.de.genes = 100,
496 | fine.mode = F)
497 | devtools::load_all("/Users/uqljian5/Documents/github_repo/Mixscale/")
498 | obj2 = RunMixscale(
499 | object = seurat_obj,
500 | assay = "PRTB",
501 | slot = "scale.data",
502 | labels = "gene",
503 | nt.class.name = "neg",
504 | min.de.genes = 5,
505 | logfc.threshold = 0.2,
506 | de.assay = "RNA",
507 | max.de.genes = 100,
508 | fine.mode = F)
509 | head(obj2)
510 | plot(obj2$pvec, obj2$weight)
511 | rm(list=ls())
512 | gc()
513 |
--------------------------------------------------------------------------------
/R/enrichment_test.R:
--------------------------------------------------------------------------------
1 | #'
2 | NULL
3 |
4 | #' Wrapper function for DE and enrichment test
5 | #'
6 | #' This function provides a wrapper of Seurat::FindMarkers() and Mixscale::fisher_enrich_test().
7 | #' Users can input a Seurat object they want to investigate and a list of gene sets they want to
8 | #' test against, and the wrapper will perform DE tests + Fisher's enrichment test across all the
9 | #' available cell types. It will then return a list of data frames, containing gene set enrichment
10 | #' results for each cell type.
11 | #'
12 | #' @export
13 | #' @param object a seurat object to perform the DE test and the enrichment test
14 | #' @param plist the pathway gene lists to test the DE genes against
15 | #' @param split.by Regroup cells into a different identity class prior to performing differential expression.
16 | #' Default is NULL (so all cells be used simultaneously).
17 | #' @param slct.ct Subset a particular identity class prior to regrouping. Only relevant if group.by is set.
18 | #' @param ident.1 Identity class to define markers for; pass an object of class phylo or 'clustertree' to find markers for a node in a cluster tree; passing 'clustertree' requires BuildClusterTree to have been run
19 | #' @param ident.2 A second identity class for comparison; if NULL, use all other cells for comparison; if an object of class phylo or 'clustertree' is passed to ident.1, must pass a node to find markers for
20 | #' @return a list of data frames containing the gene set enrichment results for each group in "group.by"
21 | #'
22 |
23 | DEenrich <- function(object,
24 | plist = NULL,
25 | ident = NULL,
26 | ident.1 = NULL,
27 | ident.2 = NULL,
28 | split.by = NULL,
29 | slct.ct = NULL,
30 | direction = c("up", "down", "both"),
31 | logfc.threshold = 0.25,
32 | p.val.cutoff = 0.05,
33 | min.pct = 0.1,
34 | assay = NULL,
35 | ...){
36 |
37 | slct_celltype = sort(unique(object[[split.by]][, 1]))
38 | if(!is.null(split.by) & is.null(slct_celltype)){
39 | stop("Please check if your split.by is correctly specified.")
40 | }
41 |
42 | if(!is.null(slct.ct)){
43 | slct_celltype = intersect(slct_celltype, slct.ct)
44 | }
45 | if(length(slct_celltype) == 0){
46 | slct_celltype = "con1"
47 | }
48 |
49 | enrich_list = list()
50 | for(CELLTYPE in slct_celltype){
51 | if(!is.null(ident)){
52 | Idents(object) = ident
53 | }
54 | #
55 | if(is.null(split.by)){
56 | object[["new_ident"]] = paste0("con1", "_", Idents(object))
57 | } else {
58 | object[["new_ident"]] = paste0(object[[split.by]][,1], "_", Idents(object))
59 | }
60 | ident.1.tmp = paste0(CELLTYPE, "_", ident.1)
61 | ident.2.tmp = paste0(CELLTYPE, "_", ident.2)
62 |
63 | # run DE
64 | Idents(object) = "new_ident"
65 | DE_res = FindMarkers(object,
66 | ident.1 = ident.1.tmp,
67 | ident.2 = ident.2.tmp,
68 | min.pct = min.pct,
69 | logfc.threshold = 0,
70 | ...)
71 |
72 | # get the top DEGs separately for up and down regulated genes
73 | upDEG = rownames(DE_res[DE_res$p_val_adj <= p.val.cutoff & DE_res$avg_log2FC > logfc.threshold, ])
74 | downDEG = rownames(DE_res[DE_res$p_val_adj <= p.val.cutoff & DE_res$avg_log2FC < -logfc.threshold, ])
75 | background = rownames(DE_res) # the background gene list
76 |
77 | # run enrichment test for the DEGs
78 | if(length(downDEG) < 5 | direction == "up"){
79 | enrich_res_down = NULL
80 | } else {
81 | enrich_res_down = fisher_enrich_test(input_list = downDEG,
82 | background = background,
83 | go_term_db = plist)
84 | enrich_res_down$num_DEG = length(downDEG)
85 | enrich_res_down$direction_DEG = "downDEG"
86 | enrich_res_down = enrich_res_down[order(enrich_res_down$Pval), ]
87 | enrich_res_down$slct.ct = CELLTYPE
88 | }
89 |
90 | #
91 | if(length(upDEG) < 5 | direction == "down"){
92 | enrich_res_up = NULL
93 | } else {
94 | enrich_res_up = fisher_enrich_test(input_list = upDEG,
95 | background = background,
96 | go_term_db = plist)
97 | enrich_res_up$num_DEG = length(upDEG)
98 | enrich_res_up$direction_DEG = "upDEG"
99 | enrich_res_up = enrich_res_up[order(enrich_res_up$Pval), ]
100 | enrich_res_up$slct.ct = CELLTYPE
101 | }
102 |
103 | # save the results to the list
104 | enrich_list[[CELLTYPE]] = rbind(enrich_res_up, enrich_res_down)
105 | }
106 |
107 | if(length(enrich_list) == 1){
108 | return(enrich_list[[1]])
109 | } else {
110 | return(enrich_list)
111 | }
112 | }
113 |
114 |
115 |
116 | #' Rank biased overlap
117 | #'
118 | #' A function for a new gene-set enrichment test based on the
119 | #' RBO (rank biased overlap) calculation with extropolation (Webber et al., 2010).
120 | #' The core functions of rbo() calculation was modified from the "gespeR" package (original author: Fabian Schmich).
121 | #' We modified it to accomodate our package and data type. We also developed a permutation scheme for
122 | #' RBO to allow for p-value calculations.
123 | #'
124 | #' @author Fabian Schmich ("gespeR" package)
125 | #' @export
126 | #'
127 | #' @param list1 List 1
128 | #' @param list2 List 2
129 | #' @param p Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements
130 | #' @param k Evaluation depth for extrapolation
131 | #' @param side Evaluate similarity between the top or the bottom of the ranked lists
132 | #' @param mid Set the mid point to for example only consider positive or negative scores
133 | #' @param uneven.lengths Indicator if lists have uneven lengths
134 | #' @return a scaler value measuring the rank biased overlap (rbo)
135 | #'
136 |
137 | rbo <- function(list1, list2, p, k=floor(max(length(list1), length(list2))/2), side=c("top", "bottom"), mid = NULL, uneven.lengths = TRUE) {
138 | side <- match.arg(side)
139 | if (!is.numeric(list1) | !is.numeric(list2))
140 | stop("Input vectors are not numeric.")
141 | if (is.null(names(list1)) | is.null(names(list2)))
142 | stop("Input vectors are not named.")
143 | ids <- switch(side,
144 | "top"=list(list1=.select.ids(list1, "top", mid), list2=.select.ids(list2, "top", mid)),
145 | "bottom"=list(list1=.select.ids(list1, "bottom", mid), list2=.select.ids(list2, "bottom", mid))
146 | )
147 | min(1, .rbo.ext(ids$list1, ids$list2, p, k, uneven.lengths = uneven.lengths))
148 | }
149 |
150 |
151 | # rbo2 <- function(list1, list2, p, k=floor(max(length(list1), length(list2))/2), side=c("top", "bottom"), mid = NULL, uneven.lengths = TRUE) {
152 | # side <- match.arg(side)
153 | # if (!is.numeric(list1) | !is.numeric(list2))
154 | # stop("Input vectors are not numeric.")
155 | # if (is.null(names(list1)) | is.null(names(list2)))
156 | # stop("Input vectors are not named.")
157 | # ids <- switch(side,
158 | # "top"=list(list1=.select.ids(list1, "top", mid), list2=.select.ids(list2, "top", mid)),
159 | # "bottom"=list(list1=.select.ids(list1, "bottom", mid), list2=.select.ids(list2, "bottom", mid))
160 | # )
161 | # min(1, rbo_ext(ids$list1, ids$list2, p, k, uneven_lengths = uneven.lengths))
162 | # }
163 |
164 |
165 | #' Select top or bottom names of ranked vector
166 | #'
167 | #' @author Fabian Schmich ("gespeR" package)
168 | #' @noRd
169 | #'
170 | #' @param x The ranked list
171 | #' @param side The side to be evaluated ("top" or "bottom" of ranked list)
172 | #' @param mid The mid point to split a list, e.g. to split between positive and negative values choose mid=0
173 | #' @return A vector of selected identifiers
174 | .select.ids <- function(x, side=c("top", "bottom"), mid=NULL) {
175 | side <- match.arg(side)
176 | if (side == "top") {
177 | x <- sort(x, decreasing=TRUE)
178 | if (is.null(mid))
179 | return(names(x))
180 | else
181 | return(names(x)[which(x > mid)])
182 | } else if (side == "bottom") {
183 | x <- sort(x, decreasing=FALSE)
184 | if (is.null(mid))
185 | return(names(x))
186 | else
187 | return(names(x)[which(x < mid)])
188 | }
189 | }
190 |
191 |
192 | #' Rank biased overlap formula based on (32) from "A Similarity Measure for Indefinite Rankings" (Webber et al.)
193 | #'
194 | #' @author Fabian Schmich ("gespeR" package)
195 | #' @noRd
196 | #'
197 | #' @param x List 1
198 | #' @param y List 2
199 | #' @param p The weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements
200 | #' @param k The evaluation depth
201 | #' @param uneven.lengths Indicator if lists have uneven lengths
202 | #' @return The rank biased overlap between x and y
203 | .rbo.ext <- function(x, y, p, k, uneven.lengths = TRUE) {
204 | if (length(x) <= length(y)) {
205 | S <- x
206 | L <- y
207 | } else {
208 | S <- y
209 | L <- x
210 | }
211 | l <- min(k, length(L))
212 | s <- min(k, length(S))
213 |
214 | if (uneven.lengths) {
215 | Xd <- sapply(1:l, function(i) length(intersect(S[1:i], L[1:i])))
216 | ((1-p) / p) *
217 | ((sum(Xd[seq(1, l)] / seq(1, l) * p^seq(1, l))) +
218 | (sum(Xd[s] * (seq(s+1, l) - s) / (s * seq(s+1, l)) * p^seq(s+1, l)))) +
219 | ((Xd[l] - Xd[s]) / l + (Xd[s] / s)) * p^l
220 | } else {
221 | #stopifnot(l == s)
222 | k <- min(s, k)
223 | Xd <- sapply(1:k, function(i) length(intersect(x[1:i], y[1:i])))
224 | Xk <- Xd[k]
225 | (Xk / k) * p^k + (((1-p)/p) * sum((Xd / seq(1,k)) * p^seq(1,k)))
226 | }
227 | }
228 |
229 |
230 | #' Rank biased overlap (RBO) based enrichment test
231 | #'
232 | #' To perform enrichment test based on rank biased overlap and permutation.
233 | #'
234 | #' @export
235 | #'
236 | #' @param input_list input gene list from user (a named vector)
237 | #' @param go_term_db a list object of multiple gene-ontology (GO) terms to run enrichment test against
238 | #' @param p Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements
239 | #' @param n_iter the number of iteration to perform the permutation to obtain the P-values of the enrichment test
240 | #' @param k Evaluation depth for extrapolation
241 | #' @param side Evaluate similarity between the top or the bottom of the ranked lists
242 | #' @param mid Set the mid point to for example only consider positive or negative scores
243 | #' @param uneven.lengths Indicator if lists have uneven lengths
244 | #' @param empirical_test a boolen value to tell the function is an empirical test should be performed. If TRUE,
245 | #' the exact empirical proportion of the permutated elements that are greater than the true RBO
246 | #' is returned as the p-value (high accuracy usually requires a large n_iter, e.g., 1000). If FALSE, then a standard
247 | #' Z-score test is applied to the RBO based on the mean and standard deviation of all the permuated elements (less accurate
248 | #' but more efficient. A small n_iter is usually enough (e.g., 100 or 200) to get good approximation compared to
249 | #' the true empirical test).
250 | #'
251 | #' @return a data.frame consists of rbo measurement between the inptu gene list and all the GO terms,
252 | #' as well as the P-values based on permutation. Please note that the P-values indicate whether the rank of the input gene
253 | #' list and the GO-term gene set are consistent or not. It does NOT indicate if RBO is significantly different from 0.
254 | #'
255 |
256 | rbo_enrich_test <- function(input_list,
257 | go_term_db,
258 | p,
259 | n_iter = 500,
260 | k=300,
261 | side=c("top", "bottom"),
262 | mid = NULL,
263 | uneven.lengths = TRUE,
264 | empirical_test = FALSE,
265 | seed = 131415926) {
266 | if(length(input_list) < 5){
267 | print("The length of the input list is less than 5, stopping analysis...")
268 | return(NULL)
269 | }
270 |
271 | # 0. each vector in the go_term_db is assumed to be an ordered vector of characters (gene names).
272 | # we need to convert each vector in to a named vector of number (number being the rank of each gene).
273 | go_term_db2 = lapply(X = go_term_db,
274 | FUN = function(x) {
275 | if(side == "bottom") tmp = 1:length(x)
276 | else if (side == "top") tmp = length(x):1
277 | names(tmp) = x
278 | return(tmp)
279 | })
280 |
281 | # create a rank vector from input_list
282 | ori_input_list = input_list
283 | if(side == "bottom"){
284 | input_list = 1:length(ori_input_list)
285 | names(input_list) = ori_input_list
286 | } else if (side == "top") {
287 | input_list = length(ori_input_list):1
288 | names(input_list) = ori_input_list
289 | }
290 |
291 | # 1. calculate the true RBO for the input_list and all the GO terms
292 | rbo_real = sapply(X = go_term_db2,
293 | FUN = rbo,
294 | list2 = input_list,
295 | p = p,
296 | k = k,
297 | side = side,
298 | mid = mid,
299 | uneven.lengths = uneven.lengths)
300 |
301 | # 2. now we need to proceed to the permutation tests
302 | set.seed(seed)
303 |
304 | # Shuffle the input_list for n_iter times (shuffled matrix has n_iter columns)
305 | max_d = ifelse(k >= length(input_list), yes = length(input_list), no = k)
306 | shuffled_matrix <- replicate(n_iter, sample(input_list[1:max_d]))
307 | rownames(shuffled_matrix) = names(input_list[1:max_d])
308 |
309 | # Function to calculate rbo for each go_term against the shuffled matrix
310 | calculate_rbo <- function(go_term) {
311 | apply(shuffled_matrix,
312 | MARGIN = 2,
313 | FUN = rbo,
314 | list2 = go_term,
315 | p = p,
316 | k = k,
317 | side = side,
318 | mid = mid,
319 | uneven.lengths = uneven.lengths)
320 | }
321 |
322 | # Apply the function to each go_term in go_term_db2
323 | list_perm_vector <- lapply(go_term_db2, calculate_rbo)
324 |
325 | # now calculate the P-values based on empirical_test
326 | if(empirical_test == TRUE){
327 | calculate_proportion <- function(element, list_vec) {
328 | mean(list_vec > element)
329 | }
330 | p_values <- mapply(calculate_proportion, rbo_real, list_perm_vector)
331 | } else if (empirical_test == FALSE){
332 | calculate_proportion <- function(element, list_vec) {
333 | pnorm(q = element,
334 | mean = mean(list_vec),
335 | sd = sd(list_vec),
336 | lower.tail = F)
337 | }
338 | p_values <- mapply(calculate_proportion, rbo_real, list_perm_vector)
339 | }
340 |
341 | # merge the results into one data.frame
342 | res_dat = data.frame(GO_term = names(rbo_real),
343 | RBO = rbo_real,
344 | Pval = p_values,
345 | n_GO_term = sapply(X = go_term_db2, FUN = length),
346 | n_intersect = sapply(X = go_term_db, FUN = function(x, y) length(intersect(x, y)), y = names(input_list)))
347 | #
348 | res_dat$Pval[res_dat$n_intersect <= 1 | res_dat$RBO <= 0.01 ] = 1
349 | rownames(res_dat) = NULL
350 | return(res_dat)
351 | }
352 |
353 |
354 | #' get the weight of each depth till 'd' given a weight parameter 'p'
355 | #' @noRd
356 | #' @return a numeric vector of the weights from rank depth 1 to d.
357 | gs_seq = function(d, p){
358 | gs = function(d, p){
359 | (1-p)*p^(d-1)
360 | }
361 | return(gs(1:d, p))
362 | }
363 |
364 |
365 |
366 | #' Standard Fisher's exact test for enrichment analysis
367 | #'
368 | #' This function will perform the strandard Fisher's exact test between the input gene
369 | #' list and a series of gene-ontology gene sets (adopted from DAVID GO analysis).
370 | #'
371 | #' @export
372 | #'
373 | #' @param input_list the input gene list
374 | #' @param background the background gene list (usually the expressed genes where the
375 | #' input gene list is generate from, ).
376 | #' @param go_term_db a list of gene-lists (GO term). It should be a list contain multiple named vector,
377 | #' and each vector should be a vector of multiple marker/signature genes for some biological pathway/process.
378 | #' @param list_gene A Boolen value to indicate if the overlapping genes between the input gene list and
379 | #' the GO-term should be output as well.
380 | #' @param EASE A Boolen value to indicate if the EASE correction should be applied (see
381 | #' https://david.ncifcrf.gov/helps/functional_annotation.html). This is useful to mitigate the
382 | #' small-sample inflation when the input gene list is short (e.g., < 10).
383 | #'
384 | #' @return a data frame contains the enrichment test results. Each row contains the P-value and enrichment odds
385 | #' ratio calculated from a Fisher's exact test for one GO-term in the go_term_db.
386 |
387 | fisher_enrich_test = function(input_list = NULL,
388 | background = NULL,
389 | go_term_db = NULL,
390 | list_gene = F,
391 | EASE = F){
392 | PT = length(background)
393 |
394 | # if go_term_db is a list of lists, Reduce it down to a list of vector (remove all the intermediate layers)
395 | while(class(go_term_db[[1]]) == "list"){
396 | go_term_db = Reduce(c, go_term_db)
397 | }
398 |
399 | if(list_gene == F){
400 | dat = matrix(nrow = length(go_term_db), ncol = 6)
401 | i = 1
402 | for(GO_TERM in names(go_term_db)){
403 | PH = length(intersect(go_term_db[[GO_TERM]], background))
404 | LT = length(input_list)
405 |
406 | # LH_list = intersect(input_list, go_term_db[[GO_TERM]])
407 | LH = length(intersect(input_list, go_term_db[[GO_TERM]]))
408 |
409 | # the Fisher exact test with EASE correction
410 | dat2 = matrix(data = c(LH-1, PH-LH+1, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2)
411 | # print(dat2)
412 | if(LH < 1 | EASE == T){
413 | dat2 = matrix(data = c(LH, PH-LH, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2)
414 | }
415 | #
416 | res = fisher.test(dat2, alternative = "greater")
417 | #
418 | dat[i, 1] = GO_TERM
419 | dat[i, 2] = res$estimate
420 | dat[i, 3] = res$p.value
421 | dat[i, 4] = -log(res$p.value)*res$estimate
422 | dat[i, 5] = LH
423 | dat[i, 6] = PH
424 | #
425 | i = i + 1
426 | }
427 | dat = as.data.frame(dat)
428 | dat = dat[complete.cases(dat), ]
429 | names(dat) = c("GO_term", "OR", "Pval", "combined_score", "num_LH", "num_PH")
430 |
431 | } else {
432 | dat = matrix(nrow = length(go_term_db), ncol = 7)
433 | i = 1
434 | for(GO_TERM in names(go_term_db)){
435 | PH = length(intersect(go_term_db[[GO_TERM]], background))
436 | LT = length(input_list)
437 |
438 | LH_list = intersect(input_list, go_term_db[[GO_TERM]])
439 | LH = length(LH_list)
440 |
441 | # the Fisher exact test with EASE correction
442 | dat2 = matrix(data = c(LH-1, PH-LH+1, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2)
443 | # print(dat2)
444 | if(LH < 1){
445 | dat2 = matrix(data = c(LH, PH-LH, LT-LH, PT-LT-(PH-LH)), byrow = T, ncol = 2)
446 | }
447 | #
448 | res = fisher.test(dat2, alternative = "greater")
449 | #
450 | dat[i, 1] = GO_TERM
451 | dat[i, 2] = res$estimate
452 | dat[i, 3] = res$p.value
453 | dat[i, 4] = -log(res$p.value)*res$estimate
454 | dat[i, 5] = LH
455 | dat[i, 6] = PH
456 | dat[i, 7] = paste0(LH_list, collapse = ";")
457 |
458 | #
459 | i = i + 1
460 | }
461 | dat = as.data.frame(dat)
462 | dat = dat[complete.cases(dat), ]
463 | names(dat) = c("GO_term", "OR", "Pval", "combined_score", "num_LH", "num_PH", "overlap_gene")
464 |
465 | }
466 |
467 | dat$OR = as.numeric(dat$OR)
468 | dat$Pval = as.numeric(dat$Pval)
469 | dat$num_LH = as.integer(dat$num_LH)
470 | dat$num_PH = as.integer(dat$num_PH)
471 | dat$n_GO_term = sapply(X = go_term_db, FUN = length)
472 |
473 | return(dat)
474 | }
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
--------------------------------------------------------------------------------
/R/get_fold_change.R:
--------------------------------------------------------------------------------
1 | #'
2 | NULL
3 |
4 |
5 | #' Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells
6 | #'
7 | #' Function to calculate log-fold-change for pooled CRISPR screen datasets.
8 | #' It is just a simple function to calculate the log-fold-change. Users can customise the min.cells,
9 | #' minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)),
10 | #' minimal percentage of cells expression the genes, and the base of the log.
11 | #'
12 | #' @param gene_exp a vector of the gene expression levels
13 | #' @param idx_P a vector of index for the perturbed cells in the gene_exp
14 | #' @param idx_NT a vector of index for the non-target cells (controls) in the gene_exp
15 | #' @param min.cells the minimal number of cells that expresses the gene; if lower than this value, the
16 | #' fold-change will be returned as NA. Default is 3.
17 | #' @param thresh.min the minimal value of expression; any expression value lower than this will be
18 | #' considered as 0. Default is 0.
19 | #' @param pseudocount.use the small value that will be added to the log-transformation to avoid log(0).
20 | #' For example, if a mean expression value is x, the final log-
21 | #' @param min.pct the minimal proportion of cells in either groups that expresses the gene
22 | #' @param base the base for log()
23 | #' @param norm.method the normalization method for the input gene_exp. Default is 'raw', which means
24 | #' the original count value without normalization. The other supported values are 'log.norm', "scale.data".
25 | #' The mean.fxn() will change accordingly.
26 | #' @return Returns a single value of the log-fold-change of the input gene.
27 | #' @export
28 | #' @concept perturbation_scoring
29 |
30 | get_fc = function(gene_exp = NULL, idx_P = NULL, idx_NT = NULL,
31 | min.cells = 3,
32 | thresh.min = 0,
33 | pseudocount.use = 1,
34 | min.pct = 0.1,
35 | base = 2,
36 | norm.method = 'raw'
37 | ){
38 | # the exp_vec should have been std
39 |
40 | # flag 1: do minimum cell check
41 | if (sum(gene_exp[idx_P] > 0) < min.cells &&
42 | sum(gene_exp[idx_NT] > 0) < min.cells) {
43 | return(NA)
44 | }
45 |
46 | # flag 2: do variance check (not 0)
47 | if (var(gene_exp) == 0) {
48 | return(NA)
49 | }
50 |
51 | # flag 3: min.pct check
52 | # calculate fraction of cell with expression > 0
53 | pct.1 <- round(
54 | x = sum(x = gene_exp[idx_NT] > thresh.min) /
55 | length(x = gene_exp[idx_NT]),
56 | digits = 3
57 | )
58 | pct.2 <- round(
59 | x = sum(x = gene_exp[idx_P] > thresh.min) /
60 | length(x = gene_exp[idx_P]),
61 | digits = 3
62 | )
63 |
64 | if (pct.1 < min.pct & pct.2 < min.pct) {
65 | return(NA)
66 | }
67 |
68 | # set the mean.fxn() function according to norm.method
69 | default.mean.fxn <- function(x) {
70 | return(log(x = mean(x = x) + pseudocount.use, base = base))
71 | }
72 | mean.fxn <- switch(
73 | EXPR = norm.method,
74 | 'log.norm' = function(x) {
75 | return(log(x = mean(x = expm1(x = x)) + pseudocount.use, base = base))
76 | },
77 | 'scale.data' = mean,
78 | default.mean.fxn
79 | )
80 |
81 | # flag 4: minimum fold change check
82 | # log(x = mean(x = x) + pseudocount.use, base = base)
83 | data.1 <- mean.fxn(gene_exp[idx_NT])
84 | data.2 <- mean.fxn(gene_exp[idx_P])
85 | fc <- - data.1 + data.2
86 | return(fc)
87 |
88 | }
89 |
90 |
91 |
92 | # a function to do the filtering for all the genens
93 | get_idx = function(gene_exp = NULL, idx_P = NULL, idx_NT = NULL,
94 | min.cells = 3, # the minimum cell threshold to perform DE
95 | thresh.min = 0, # the minimum expression level
96 | pseudocount.use = 1,
97 | min.pct = 0.1,
98 | logfc.threshold = 0.1,
99 | base = 2,
100 | norm.method = 'raw'
101 | ){
102 |
103 | fc <- get_fc(gene_exp = gene_exp, idx_P = idx_P, idx_NT = idx_NT,
104 | min.cells = min.cells,
105 | thresh.min = thresh.min,
106 | pseudocount.use = pseudocount.use,
107 | min.pct = min.pct,
108 | base = base,
109 | norm.method = norm.method
110 | )
111 |
112 | if (fc < logfc.threshold){
113 | return(FALSE)
114 | }
115 |
116 | return(TRUE)
117 | }
118 |
119 |
120 | #' Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells
121 | #'
122 | #' Function to calculate log-fold-change for pooled CRISPR screen datasets.
123 | #' It is just a simple function to calculate the log-fold-change. Users can customise the min.cells,
124 | #' minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)),
125 | #' minimal percentage of cells expression the genes, and the base of the log.
126 | #'
127 | #' @inheritParams Seurat::FoldChange
128 | #' @importFrom Matrix rowSums
129 | #' @return Returns a single value of the log-fold-change of the input gene.
130 | #' @export
131 | #' @concept perturbation_scoring
132 | FoldChange_new <- function(
133 | object,
134 | cells.1,
135 | cells.2,
136 | mean.fxn,
137 | fc.name,
138 | features = NULL,
139 | ...
140 | ) {
141 | features <- features %||% rownames(x = object)
142 |
143 | # Calculate percent expressed
144 | thresh.min <- 0
145 |
146 | min.cell.1 = Matrix::rowSums(x = object[features, cells.1, drop = FALSE] > thresh.min)
147 | min.cell.2 = Matrix::rowSums(x = object[features, cells.2, drop = FALSE] > thresh.min)
148 |
149 | pct.1 <- round(
150 | x = Matrix::rowSums(x = object[features, cells.1, drop = FALSE] > thresh.min) /
151 | length(x = cells.1),
152 | digits = 3
153 | )
154 | pct.2 <- round(
155 | x = Matrix::rowSums(x = object[features, cells.2, drop = FALSE] > thresh.min) /
156 | length(x = cells.2),
157 | digits = 3
158 | )
159 | # Calculate fold change
160 | data.1 <- mean.fxn(object[features, cells.1, drop = FALSE])
161 | data.2 <- mean.fxn(object[features, cells.2, drop = FALSE])
162 | fc <- (data.1 - data.2)
163 | fc.results <- as.data.frame(x = cbind(fc, pct.1, pct.2, min.cell.1, min.cell.2))
164 | colnames(fc.results) <- c(fc.name, "pct.1", "pct.2", "min.cell.1", "min.cell.2")
165 | return(fc.results)
166 | }
167 |
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/R/glm_gp_disp_only.R:
--------------------------------------------------------------------------------
1 | #'
2 | NULL
3 |
4 |
5 | #' Internal Function to Fit a Gamma-Poisson GLM
6 | #'
7 | #' @import glmGamPoi
8 | #' @inheritParams glmGamPoi::glm_gp
9 | #' @inheritParams glmGamPoi::overdispersion_mle
10 | #' @param Y any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with
11 | #' one column per sample and row per gene.
12 | #'
13 | #' @return a list with four elements
14 | #' * `Beta` the coefficient matrix
15 | #' * `overdispersion` the vector with the estimated overdispersions
16 | #' * `Mu` a matrix with the corresponding means for each gene
17 | #' and sample
18 | #' * `size_factors` a vector with the size factor for each
19 | #' sample
20 | #' * `ridge_penalty` a vector with the ridge penalty
21 | #'
22 | #' @seealso [glm_gp()] and [overdispersion_mle()]
23 | #' @keywords internal
24 | glm_gp_disp_only_impl <- function(Y, model_matrix,
25 | offset = 0,
26 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"),
27 | overdispersion = TRUE,
28 | overdispersion_shrinkage = TRUE,
29 | ridge_penalty = 0,
30 | do_cox_reid_adjustment = TRUE,
31 | subsample = FALSE,
32 | verbose = FALSE){
33 | if(is.vector(Y)){
34 | Y <- matrix(Y, nrow = 1)
35 | }
36 | # Error conditions
37 | stopifnot(is.matrix(Y) || is(Y, "DelayedArray"))
38 | stopifnot(is.matrix(model_matrix) && nrow(model_matrix) == ncol(Y))
39 | glmGamPoi:::validate_Y_matrix(Y)
40 | subsample <- glmGamPoi:::handle_subsample_parameter(Y, subsample)
41 | ridge_penalty <- glmGamPoi:::handle_ridge_penalty_parameter(ridge_penalty, model_matrix, verbose = verbose)
42 |
43 | # Combine offset and size factor
44 | off_and_sf <- glmGamPoi:::combine_size_factors_and_offset(offset, size_factors, Y, verbose = verbose)
45 | offset_matrix <- off_and_sf$offset_matrix
46 | size_factors <- off_and_sf$size_factors
47 |
48 | # Check if there distinct groups in model matrix
49 | # returns NULL if there would be more groups than columns
50 | # only_intercept_model <- ncol(model_matrix) == 1 && all(model_matrix == 1)
51 | groups <- glmGamPoi:::get_groups_for_model_matrix(model_matrix)
52 | if(! is.null(groups) && any(ridge_penalty > 1e-10)){
53 | # Cannot apply ridge penalty in group-wise optimization
54 | groups <- NULL
55 | }
56 |
57 | # If no overdispersion, make rough first estimate
58 | if(isTRUE(overdispersion)){
59 | if(verbose){ message("Make initial dispersion estimate") }
60 | disp_init <- glmGamPoi:::estimate_dispersions_roughly(Y, model_matrix, offset_matrix = offset_matrix)
61 | }else if(isFALSE(overdispersion)){
62 | disp_init <- rep(0, times = nrow(Y))
63 | }else if(is.character(overdispersion) && overdispersion == "global"){
64 | if(verbose){ message("Make initial dispersion estimate") }
65 | disp_init <- glmGamPoi:::estimate_dispersions_roughly(Y, model_matrix, offset_matrix = offset_matrix)
66 | disp_init <- rep(median(disp_init), nrow(Y))
67 | }else{
68 | stopifnot(is.numeric(overdispersion) && (length(overdispersion) == 1 || length(overdispersion) == nrow(Y)))
69 | if(length(overdispersion) == 1){
70 | disp_init <- rep(overdispersion, times = nrow(Y))
71 | }else{
72 | disp_init <- overdispersion
73 | }
74 | }
75 |
76 |
77 | # Estimate the betas
78 | if(! is.null(groups)){
79 | if(verbose){ message("Make initial beta estimate") }
80 | beta_group_init <- glmGamPoi:::estimate_betas_roughly_group_wise(Y, offset_matrix, groups)
81 | if(verbose){ message("Estimate beta") }
82 | beta_res <- glmGamPoi:::estimate_betas_group_wise(Y, offset_matrix = offset_matrix,
83 | dispersions = disp_init, beta_group_init = beta_group_init,
84 | groups = groups, model_matrix = model_matrix)
85 | }else{
86 | # Init beta with reasonable values
87 | if(verbose){ message("Make initial beta estimate") }
88 | beta_init <- glmGamPoi:::estimate_betas_roughly(Y, model_matrix, offset_matrix = offset_matrix, ridge_penalty = ridge_penalty)
89 | if(verbose){ message("Estimate beta") }
90 | beta_res <- glmGamPoi:::estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix,
91 | dispersions = disp_init, beta_mat_init = beta_init, ridge_penalty = ridge_penalty)
92 | }
93 | Beta <- beta_res$Beta
94 |
95 | # Calculate corresponding predictions
96 | # Mu <- exp(Beta %*% t(model_matrix) + offset_matrix)
97 | Mu <- glmGamPoi:::calculate_mu(Beta, model_matrix, offset_matrix)
98 |
99 | # Make estimate of over-disperion
100 | if(isTRUE(overdispersion) || (is.character(overdispersion) && overdispersion == "global")){
101 | if(verbose){ message("Estimate dispersion") }
102 | if(isTRUE(overdispersion)){
103 | disp_est <- overdispersion_mle(Y, Mu, model_matrix = model_matrix,
104 | do_cox_reid_adjustment = do_cox_reid_adjustment,
105 | subsample = subsample, verbose = verbose)$estimate
106 | }else if(is.character(overdispersion) && overdispersion == "global"){
107 | disp_est <- overdispersion_mle(Y, Mu, model_matrix = model_matrix,
108 | do_cox_reid_adjustment = do_cox_reid_adjustment,
109 | global_estimate = TRUE,
110 | subsample = subsample, verbose = verbose)$estimate
111 | disp_est <- rep(disp_est, times = nrow(Y))
112 | }
113 |
114 | if(isTRUE(overdispersion_shrinkage)){
115 | dispersion_shrinkage <- overdispersion_shrinkage(disp_est, gene_means = DelayedMatrixStats::rowMeans2(Mu),
116 | df = subsample - ncol(model_matrix),
117 | ql_disp_trend = length(disp_est) >= 100,
118 | npoints = max(0.1 * length(disp_est), 100),
119 | verbose = verbose)
120 | disp_latest <- dispersion_shrinkage$dispersion_trend
121 | }else{
122 | dispersion_shrinkage <- NULL
123 | disp_latest <- disp_est
124 | }
125 |
126 | # Estimate the betas again (only necessary if disp_est has changed)
127 | # if(verbose){ message("Estimate beta again") }
128 | # if(! is.null(groups)){
129 | # beta_res <- estimate_betas_group_wise(Y, offset_matrix = offset_matrix,
130 | # dispersions = disp_latest, beta_mat_init = Beta,
131 | # groups = groups, model_matrix = model_matrix)
132 | # }else{
133 | # beta_res <- estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix,
134 | # dispersions = disp_latest, beta_mat_init = Beta, ridge_penalty = ridge_penalty)
135 | # }
136 | # Beta <- beta_res$Beta
137 | #
138 | # # Calculate corresponding predictions
139 | # Mu <- calculate_mu(Beta, model_matrix, offset_matrix)
140 | }else if(isTRUE(overdispersion_shrinkage) || is.numeric(overdispersion_shrinkage)){
141 | # Given predefined disp_est shrink them
142 | disp_est <- disp_init
143 | dispersion_shrinkage <- overdispersion_shrinkage(disp_est, gene_means = DelayedMatrixStats::rowMeans2(Mu),
144 | df = subsample - ncol(model_matrix),
145 | disp_trend = overdispersion_shrinkage, verbose = verbose)
146 | disp_latest <- dispersion_shrinkage$dispersion_trend
147 | # if(verbose){ message("Estimate beta again") }
148 | # if(! is.null(groups)){
149 | # beta_res <- estimate_betas_group_wise(Y, offset_matrix = offset_matrix,
150 | # dispersions = disp_latest, beta_mat_init = Beta,
151 | # groups = groups, model_matrix = model_matrix)
152 | # }else{
153 | # beta_res <- estimate_betas_fisher_scoring(Y, model_matrix = model_matrix, offset_matrix = offset_matrix,
154 | # dispersions = disp_latest, beta_mat_init = Beta, ridge_penalty = ridge_penalty)
155 | # }
156 | # Beta <- beta_res$Beta
157 | # # Calculate corresponding predictions
158 | # Mu <- calculate_mu(Beta, model_matrix, offset_matrix)
159 | }else{
160 | # Use disp_init, because it is already in vector shape
161 | disp_est <- disp_init
162 | dispersion_shrinkage <- NULL
163 | }
164 |
165 |
166 | # Return everything
167 | list(Beta = Beta,
168 | overdispersions = disp_est,
169 | overdispersion_shrinkage_list = dispersion_shrinkage)
170 | }
171 |
172 |
173 |
174 | #' Internal Function to Fit a Gamma-Poisson GLM
175 | #' @import glmGamPoi
176 | #' @inheritParams glmGamPoi::glm_gp
177 | #' @inheritParams glmGamPoi::overdispersion_mle
178 | #' @import glmGamPoi
179 | #'
180 | #' @param Y any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with
181 | #' one column per sample and row per gene.
182 | #'
183 | #' @return a list with four elements
184 | #' * `Beta` the coefficient matrix
185 | #' * `overdispersion` the vector with the estimated overdispersions
186 | #' * `Mu` a matrix with the corresponding means for each gene
187 | #' and sample
188 | #' * `size_factors` a vector with the size factor for each
189 | #' sample
190 | #' * `ridge_penalty` a vector with the ridge penalty
191 | #'
192 | #' @seealso [glm_gp()] and [overdispersion_mle()]
193 | #' @keywords internal
194 | glm_gp_disp_only <- function(data,
195 | design = ~ 1,
196 | col_data = NULL,
197 | reference_level = NULL,
198 | offset = 0,
199 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"),
200 | overdispersion = TRUE,
201 | overdispersion_shrinkage = TRUE,
202 | ridge_penalty = 0,
203 | do_cox_reid_adjustment = TRUE,
204 | subsample = FALSE,
205 | on_disk = NULL,
206 | use_assay = NULL,
207 | verbose = FALSE){
208 |
209 | # Validate `data`
210 | if(inherits(data, "formula")){
211 | if(length(design) != 2 || design != ~ 1){
212 | stop("If the first argument is already a formula, the second argument must not be set. Please call this function like this:\n",
213 | "'glm_gp(data = mat, design = ~ a + b + c, ...)'", call. = FALSE)
214 | }
215 | extr <- glmGamPoi:::extract_data_from_formula(data, col_data, parent.frame())
216 | data <- extr$data
217 | design <- extr$design
218 | }
219 | if(is.vector(data)){
220 | data <- matrix(data, nrow = 1)
221 | }
222 | data_mat <- glmGamPoi:::handle_data_parameter(data, on_disk = F)
223 |
224 | # Convert the formula to a model_matrix
225 | col_data <- glmGamPoi:::get_col_data(data, col_data)
226 | des <- glmGamPoi:::handle_design_parameter(design, data, col_data, reference_level)
227 |
228 | # Call glm_gp_impl()
229 | res <- glm_gp_disp_only_impl(data_mat,
230 | model_matrix = des$model_matrix,
231 | offset = offset,
232 | size_factors = size_factors,
233 | overdispersion = overdispersion,
234 | overdispersion_shrinkage = overdispersion_shrinkage,
235 | ridge_penalty = ridge_penalty,
236 | do_cox_reid_adjustment = do_cox_reid_adjustment,
237 | subsample = subsample,
238 | verbose = verbose)
239 | # Make sure that the output is nice and beautiful
240 | names(res$overdispersions) <- rownames(data)
241 |
242 | class(res) <- "glmGamPoi"
243 | res
244 | }
245 |
246 |
247 |
248 |
249 |
250 |
--------------------------------------------------------------------------------
/R/perturbation_scoring.R:
--------------------------------------------------------------------------------
1 | #'
2 | NULL
3 |
4 |
5 | #' Mixscale scoring for perturbations
6 | #'
7 | #' Function to calculate perturbation scores for perturbed and non-perturbed gRNA expressing cells.
8 | #' The perturbation score reflects the perturbation strength of each cells (inherited from the RunMixscape()
9 | #' function). It is calculated by using the large-effect DE genes from raw DE tests between the
10 | #' perturbed and non-perturbed gRNA expressing cells.
11 | #'
12 | #' @export
13 | #'
14 | #' @inheritParams Seurat::RunMixscape
15 | #' @import Seurat
16 | #'
17 | #' @param object An object of class Seurat.
18 | #' @param assay Assay to use for mixscape classification.
19 | #' @param slot Assay data slot to use.
20 | #' @param labels metadata column with target gene labels.
21 | #' @param nt.class.name Classification name of non-targeting gRNA cells.
22 | #' @param new.class.name Name of mixscale scores to be stored in
23 | #' metadata.
24 | #' @param min.de.genes Required number of genes that are differentially
25 | #' expressed for method to separate perturbed and non-perturbed cells.
26 | #' @param min.cells Minimum number of cells in target gene class. If fewer than
27 | #' this many cells are assigned to a target gene class during classification,
28 | #' all are assigned NP.
29 | #' @param de.assay Assay to use when performing differential expression analysis.
30 | #' Usually RNA.
31 | #' @param logfc.threshold the log-fold-change threshold to select the large-effect
32 | #' DE genes. Only DE genes with log-fold-change larger than this value will be
33 | #' selected. Default is 0.25.
34 | #' @param verbose Display messages
35 | #' @param split.by metadata column with experimental condition/cell type
36 | #' classification information. This is meant to be used to account for cases a
37 | #' perturbation is condition/cell type -specific.
38 | #' @param fine.mode When this is equal to TRUE, DE genes for each target gene
39 | #' class will be calculated for each gRNA separately and pooled into one DE list
40 | #' for calculating the perturbation score of every cell and their subsequent
41 | #' classification.
42 | #' @param fine.mode.labels metadata column with gRNA ID labels.
43 | #' @param DE.gene specify a list of user-defined large-effect DE genes to calculate the perturbation score.
44 | #' @param max.de.genes the maximum number of top large-effect DE genes to calculate the perturbation score. Default is 100.
45 | #' @param harmonize a boolen value to specify whether a harmonization of the cell-type proportion between the NT cells and
46 | #' the perturbed cells should be performed prior to the DE test. If fine.mode is TRUE, this harmonization step will be
47 | #' performed for each fine.mode gRNA. Default is FALSE.
48 | #' @param min_prop_ntgd a minimal threshold to remove cells if any cell type has a proportion less than this value. It will
49 | #' only be used when harmonize is TRUE. Default is 0.1.
50 | #' @param pval.cutoff specify the DE test p-value cutoff (after Bonferroni correction) to select top large-effect DE genes.
51 | #' Default is 0.05.
52 | #'
53 | #' @return Returns a Seurat object containing the perturbation scores. It is stored in the Tool Data of the object, also
54 | #' the standardized scores are stored in the meta.data (column is specified by new.class.name).
55 | #' @concept perturbation_scoring
56 |
57 | RunMixscale = function (object, assay = "PRTB", slot = "scale.data", labels = "gene",
58 | nt.class.name = "NT",
59 | new.class.name = "mixscale_score",
60 | min.de.genes = 5, min.cells = 5, de.assay = "RNA", logfc.threshold = 0.25,
61 | verbose = FALSE, split.by = NULL, fine.mode = FALSE,
62 | fine.mode.labels = "guide_ID",
63 | DE.gene = NULL,
64 | max.de.genes = 100, harmonize = F,
65 | min_prop_ntgd = 0.1, pval.cutoff = 0.05,
66 | seed = 10282021)
67 | {
68 | message("Calculating Mixscale scores ...")
69 |
70 | assay <- assay %||% DefaultAssay(object = object)
71 | if (!assay %in% names(object@assays)){
72 | stop(paste0("The 'assay' being specified does not exist! Please check. Have you run CalcPerturbSig() yet?"))
73 | }
74 |
75 | if (is.null(x = labels)) {
76 | stop("Please specify target gene class metadata name")
77 | }
78 |
79 | if (min.de.genes <= 1) {
80 | warning("The min.de.genes should be larger than 1!")
81 | }
82 |
83 | prtb_markers <- list()
84 | prtb_markers2 <- list()
85 | object[[new.class.name]] <- object[[labels]]
86 | object[[new.class.name]][, 1] <- as.character(x = object[[new.class.name]][,
87 | 1])
88 | gv.list <- list()
89 | if (is.null(x = split.by)) {
90 | split.by <- splits <- "con1"
91 | } else {
92 | splits <- as.character(x = unique(x = object[[split.by]][,
93 | 1]))
94 | }
95 | cells.s.list <- list()
96 |
97 | #
98 | Idents(object = object) <- "con1"
99 | cells.s <- WhichCells(object = object, idents = "con1")
100 | # cells.s.list[[s]] <- cells.s
101 | genes <- setdiff(x = unique(x = object[[labels]][cells.s,
102 | 1]), y = nt.class.name)
103 | #
104 | for (gene in genes) {
105 | Idents(object = object) <- labels
106 |
107 | if (isTRUE(x = verbose)) {
108 | message("Processing ", gene)
109 | }
110 | orig.guide.cells <- intersect(x = WhichCells(object = object,
111 | idents = gene), y = cells.s)
112 | nt.cells <- intersect(x = WhichCells(object = object,
113 | idents = nt.class.name), y = cells.s)
114 |
115 | #############################################
116 |
117 | if (isTRUE(x = fine.mode)) {
118 | guides <- setdiff(x = unique(x = object[[fine.mode.labels]][orig.guide.cells,
119 | 1]), y = nt.class.name)
120 | all.de.genes <- c()
121 | for (gd in guides) {
122 | gd.cells <- rownames(x = object[[]][orig.guide.cells,
123 | ])[which(x = object[[]][orig.guide.cells,
124 | fine.mode.labels] == gd)]
125 | # we will need to extract the NT cells based on each celltype and do harmonization based on cell comp in PRTB cells
126 | if(harmonize == T & length(split.by) > 1){
127 | # this is a flag to indicate if the harmonization process is okay (selected_Cell >= 50% of total cell)
128 | flag_good_harm = F
129 | # the initial list of split-groups
130 | splits_list = splits
131 |
132 | while(flag_good_harm == F){
133 | # get the cell label splitted by splits
134 | cells.s.list.gd = list()
135 | cells.s.list.ntgd = list()
136 | Idents(object = object) <- split.by
137 |
138 | for (s in splits_list) {
139 | cells.s.list.gd[[s]] <- intersect(gd.cells, WhichCells(object = object, idents = s))
140 | cells.s.list.ntgd[[s]] <- intersect(nt.cells, WhichCells(object = object, idents = s))
141 | }
142 |
143 | # calculate the desired number of nt cells in each splits;
144 | length.gd = sapply(X = cells.s.list.gd, FUN = length)
145 | length.ntgd = sapply(X = cells.s.list.ntgd, FUN = length)
146 |
147 | prop.gd = length.gd/sum(length.gd, na.rm = T)
148 | # prop.ntgd = length.ntgd/sum(length.ntgd, na.rm = T)
149 | sum.desire.length.ntgd = floor(min(length.ntgd/prop.gd, na.rm = T))
150 | if(sum.desire.length.ntgd > sum(length.ntgd, na.rm = T)){
151 | stop("The sum.desire.length.ntgd is greater than the total number of NT cells. Need to check!")
152 | }
153 |
154 | if(sum.desire.length.ntgd >= min_prop_ntgd*sum(length.ntgd, na.rm = T) ){
155 | flag_good_harm = T
156 | } else {
157 | message(paste("Removing cell from ", splits_list[which.min(length.ntgd)], "due to 50% check during harmonization step."))
158 | splits_list = splits_list[-which.min(length.ntgd)]
159 | }
160 | }
161 |
162 | # calculate the final number of NT cells to extract from splits_list
163 | desire.length.ntgd = floor(sum.desire.length.ntgd*prop.gd)
164 |
165 | # start to subsample the nt cells based on the desire length:
166 | sub.cells.s.list.ntgd = list()
167 | for (s in splits_list) {
168 | set.seed(seed = seed)
169 | sub.cells.s.list.ntgd[[s]] <- sample(x = cells.s.list.ntgd[[s]], size = desire.length.ntgd[s])
170 | }
171 |
172 | # collapse the list into a single vectors of sub-sampled NT cells
173 | sub.ntgd.cells = Reduce(c, sub.cells.s.list.ntgd)
174 | rm(cells.s.list.gd, cells.s.list.ntgd, length.gd, length.ntgd, prop.gd, sum.desire.length.ntgd, desire.length.ntgd, sub.cells.s.list.ntgd)
175 | if(verbose){
176 | message("Done with harmonizing the cell composition in NT cells (fine mode).")
177 | }
178 |
179 | Idents(object = object) <- labels
180 | } else {
181 | sub.ntgd.cells = nt.cells
182 | }
183 |
184 | # run DE
185 | if(!is.null(DE.gene) ){
186 | if(!is.null(DE.gene[[gene]]) | length(DE.gene[[gene]]) != 0){
187 | all.de.genes = DE.gene[[gene]]
188 | } else {
189 | all.de.genes = character()
190 | warning(paste("No de.genes are provided for PRTB:", gene, ". Pls check!"))
191 | }
192 | } else {
193 | # run DE
194 | de.genes <- Seurat:::TopDEGenesMixscape(object = object,
195 | ident.1 = gd.cells, ident.2 = sub.ntgd.cells, de.assay = de.assay,
196 | logfc.threshold = logfc.threshold, labels = fine.mode.labels,
197 | verbose = verbose, pval.cutoff = pval.cutoff)
198 | all.de.genes <- c(all.de.genes, de.genes)
199 | }
200 | }
201 | all.de.genes <- unique(all.de.genes)
202 | } else {
203 | # we will need to extract the NT cells based on each celltype and do harmonization based on cell comp in PRTB cells
204 | if(harmonize == T & length(split.by) > 1){
205 | # this is a flag to indicate if the harmonization process is okay (selected_Cell >= 50% of total cell)
206 | flag_good_harm = F
207 | # the initial list of split-groups
208 | splits_list = splits
209 |
210 | while(flag_good_harm == F){
211 | # get the cell label splitted by splits
212 | cells.s.list.gene = list()
213 | cells.s.list.nt = list()
214 | Idents(object = object) <- split.by
215 |
216 | for (s in splits_list) {
217 | cells.s.list.gene[[s]] <- intersect(orig.guide.cells, WhichCells(object = object, idents = s))
218 | cells.s.list.nt[[s]] <- intersect(nt.cells, WhichCells(object = object, idents = s))
219 | }
220 |
221 | # calculate the desired number of nt cells in each splits;
222 | length.gene = sapply(X = cells.s.list.gene, FUN = length)
223 | length.nt = sapply(X = cells.s.list.nt, FUN = length)
224 |
225 | prop.gene = length.gene/sum(length.gene, na.rm = T)
226 | # prop.nt = length.nt/sum(length.nt, na.rm = T)
227 | sum.desire.length.nt = floor(min(length.nt/prop.gene, na.rm = T))
228 | if(sum.desire.length.nt > sum(length.nt, na.rm = T)){
229 | stop("The sum.desire.length.nt is greater than the total number of NT cells. Need to check!")
230 | }
231 | #
232 | if(sum.desire.length.nt >= min_prop_ntgd*sum(length.nt, na.rm = T) ){
233 | flag_good_harm = T
234 | } else {
235 | message(paste("Removing cell from ", splits_list[which.min(length.nt)], "due to 50% check during harmonization step."))
236 | splits_list = splits_list[-which.min(length.nt)]
237 | }
238 | }
239 |
240 | ###
241 | desire.length.nt = floor(sum.desire.length.nt*prop.gene)
242 |
243 | # start to subsample the nt cells based on the desire length:
244 | sub.cells.s.list.nt = list()
245 | for (s in splits_list) {
246 | set.seed(seed = seed)
247 | sub.cells.s.list.nt[[s]] <- sample(x = cells.s.list.nt[[s]], size = desire.length.nt[s])
248 | }
249 |
250 | # collapse the list into a single vectors of sub-sampled NT cells
251 | sub.nt.cells = Reduce(c, sub.cells.s.list.nt)
252 | rm(cells.s.list.gene, cells.s.list.nt, length.gene, length.nt, prop.gene, sum.desire.length.nt, desire.length.nt, sub.cells.s.list.nt)
253 | if(verbose){
254 | message("Done with harmonizing the cell composition in NT cells.")
255 | }
256 |
257 | Idents(object = object) <- labels
258 | } else {
259 | sub.nt.cells = nt.cells
260 | }
261 | # run DE
262 | if(!is.null(DE.gene) ){
263 | if(!is.null(DE.gene[[gene]]) | length(DE.gene[[gene]]) != 0){
264 | all.de.genes = DE.gene[[gene]]
265 | } else {
266 | all.de.genes = character()
267 | message(paste("No de.genes are provided for PRTB:", gene, ". Pls check!"))
268 | }
269 | } else {
270 | all.de.genes <- Seurat:::TopDEGenesMixscape(object = object,
271 | ident.1 = orig.guide.cells, ident.2 = sub.nt.cells,
272 | de.assay = de.assay, logfc.threshold = logfc.threshold,
273 | labels = labels, verbose = verbose, pval.cutoff = pval.cutoff)
274 | }
275 |
276 |
277 | }
278 | # print(gene)
279 | # print(all.de.genes)
280 |
281 | # only keep the top max.de.genes as the de.genes for PRTB score calculation
282 | if(!is.null(max.de.genes) ){
283 | if(length(all.de.genes) <= max.de.genes){
284 | if(verbose){
285 | message(paste("The number of de.genes (", length(all.de.genes), ") is less than max.de.genes (", max.de.genes, ")."))
286 | }
287 | } else {
288 | if(verbose){
289 | message(paste("The number of de.genes (", length(all.de.genes), ") is larger than max.de.genes (", max.de.genes, ").",
290 | "Restricting to top", max.de.genes, "genes..."))
291 | }
292 | all.de.genes = all.de.genes[1:max.de.genes]
293 | }
294 | }
295 |
296 | # use user-defined DE.gene list
297 | # if(!is.null(DE.gene) ){
298 | # all.de.genes = DE.gene
299 | # }
300 |
301 | for (s in splits) {
302 | Idents(object = object) <- split.by
303 | cells.s.list[[s]] <- WhichCells(object = object, idents = s)
304 |
305 | prtb_markers[[s]][[gene]] <- all.de.genes
306 | prtb_markers2[[s]][[gene]] <- all.de.genes
307 | if (length(x = all.de.genes) < min.de.genes) {
308 | prtb_markers[[s]][[gene]] <- character()
309 | }
310 |
311 | }
312 | if(verbose){
313 | message(paste0("Done with extracting top DE genes for ", gene))
314 | }
315 | }
316 |
317 | all_markers <- unique(x = unlist(x = prtb_markers))
318 | missing_genes <- all_markers[!all_markers %in% rownames(x = object[[assay]])]
319 | # print(missing_genes)
320 | object <- Seurat:::GetMissingPerturb(object = object, assay = assay,
321 | features = missing_genes, verbose = verbose)
322 |
323 | if(verbose){
324 | message("Done with getting Missing PRTB")
325 | }
326 |
327 | for (s in splits) {
328 | # print(splits)
329 | cells.s <- cells.s.list[[s]]
330 | genes <- setdiff(x = unique(x = object[[labels]][cells.s,
331 | 1]), y = nt.class.name)
332 | for (gene in genes) {
333 | Idents(object = object) <- labels
334 | post.prob <- 0
335 | orig.guide.cells <- intersect(x = WhichCells(object = object,
336 | idents = gene), y = cells.s)
337 | nt.cells <- intersect(x = WhichCells(object = object,
338 | idents = nt.class.name), y = cells.s)
339 | all.cells <- c(orig.guide.cells, nt.cells)
340 | if (length(x = prtb_markers[[s]][[gene]]) == 0) {
341 | if (verbose) {
342 | message(" Fewer than ", min.de.genes, " DE genes for ",
343 | gene, ". Assigning cells as NP.")
344 | }
345 | object[[new.class.name]][orig.guide.cells, 1] <- paste0(gene, " NP")
346 | }
347 | else {
348 | if (verbose) {
349 | message(" ", gene)
350 | }
351 | de.genes <- prtb_markers[[s]][[gene]]
352 | dat <- GetAssayData(object = object[[assay]],
353 | slot = "data")[de.genes, all.cells, drop = FALSE]
354 | if (slot == "scale.data") {
355 | dat <- ScaleData(object = dat, features = de.genes,
356 | verbose = FALSE)
357 | }
358 |
359 | # the first step to calculate the overall PRTB score
360 | if(verbose){
361 | cat(paste0("Calculating the overall PRTB score...\n"))
362 | }
363 | Idents(object = object) <- new.class.name
364 | guide.cells <- intersect(x = WhichCells(object = object,
365 | idents = gene), y = cells.s)
366 | vec <- matrixStats::rowMeans2(x = dat[, guide.cells, drop = FALSE]) -
367 | matrixStats::rowMeans2(x = dat[, nt.cells, drop = FALSE])
368 |
369 | # save the mat and vec to easily calculate the weights by rowSums
370 | pvec_mat = sweep(t(dat), MARGIN=2, vec, `*`)
371 | vec_mat = vec * vec
372 | names(vec_mat) = colnames(pvec_mat)
373 |
374 | # the weights
375 | pvec = matrixStats::rowSums2(pvec_mat)/sum(vec_mat)
376 | names(pvec) = rownames(pvec_mat)
377 |
378 | # create a list to store the PRTB score
379 | gv <- as.data.frame(x = pvec)
380 | gv[, "gene"] <- nt.class.name
381 | gv[intersect(x = rownames(x = gv), y = guide.cells),
382 | "gene"] <- gene
383 | gv.list[[gene]][[s]] <- gv
384 |
385 | # the LOOv2 weights
386 | # for(omit_gene in de.genes){
387 | # if(verbose){
388 | # cat(paste0("Calculating the LOO PRTB score by using rowSums2 for ", omit_gene, " in subset ", s, "...\n"))
389 | # }
390 | # remain_gene = de.genes[which(de.genes != omit_gene)]
391 | # pvec2 <- rowSums2(pvec_mat[, remain_gene, drop = F])/sum(vec_mat[remain_gene])
392 | # # save the LOO weights
393 | # gv.list[[gene]][[s]][, omit_gene] <- pvec2
394 | # }
395 |
396 | # 2023 July 03: substitute the above loop with the following matrix manipulation for speed.
397 | omit_mat <- outer(de.genes, de.genes, `!=`)
398 |
399 | # Function to calculate pvec2
400 | calc_pvec2 <- function(include_gene) {
401 | pvec2 <- matrixStats::rowSums2(pvec_mat[, include_gene, drop = F])/sum(vec_mat[include_gene])
402 | return(pvec2)
403 | }
404 |
405 | # Apply the function to each column of omit_mat
406 | gv.list[[gene]][[s]][, de.genes] <- apply(omit_mat, 2, calc_pvec2)
407 |
408 |
409 | # the second step to calculate the leave-one-out (LOO) PRTB score
410 | if(verbose){
411 | message(paste0("Done calculating LOO PRTB score for ", length(de.genes), " genes in ", s, "...\n"))
412 | }
413 |
414 | }
415 |
416 | }
417 | if(verbose){
418 | message(paste0("Done with calculating scores for ", s))
419 | }
420 | }
421 | SeuratObject::Tool(object = object) <- gv.list
422 |
423 | # check if gv.list is empty.
424 | if(length(gv.list) == 0){
425 | warning("Failed to calculate Mixscale scores for any group. \nThis is probably due to insufficient response to perturbation.\nYou may consider lowering the logfc.threshold or min.de.genes when running this function.")
426 | }
427 |
428 | # added Jan 16: calculate the standardized scores and append them to the meta-data
429 | # get the list of PRTBs
430 | wt_PRTB_list = sort(names(gv.list))
431 | all_PRTB_list = sort(unique(object[[labels]][,1]))
432 | all_PRTB_list = all_PRTB_list[all_PRTB_list != nt.class.name]
433 | wt_PRTB_list = wt_PRTB_list[wt_PRTB_list %in% all_PRTB_list]
434 |
435 | #
436 | mat_B = data.frame(cell_label = colnames(object),
437 | gene = object[[labels]][,1] )
438 |
439 | #
440 | all_score = data.frame() # to store the scores from each PRTB
441 | for(PRTB in all_PRTB_list){
442 | mat_A = data.frame()
443 | # check if the scores are calculated successful for this PRTB
444 | if(PRTB %in% wt_PRTB_list){
445 | celltype_list = names(gv.list[[PRTB]])
446 | for(celltype in celltype_list){
447 | # print(gv.list[[PRTB]][[celltype]])
448 | tmp = gv.list[[PRTB]][[celltype]][, c("pvec", "gene"), drop = FALSE]
449 |
450 | # get the idx for NT cells and PRTBed cells
451 | idx_NT = which(tmp$gene == nt.class.name)
452 | idx_gene = which(tmp$gene == PRTB)
453 |
454 | # 1. calculate the overall weights
455 | # calculate the mean and sd of the PRTB score for the NT cells
456 | mean_NT = mean(tmp$pvec[idx_NT], na.rm = T)
457 | sd_NT = sd(tmp$pvec[idx_NT], na.rm = T)
458 | # standardize the PRTB scores for the PRTBed cells based on the mean and SD from those of the NT cells
459 | std_weight_gene = (tmp$pvec[idx_gene] - mean_NT)/sd_NT
460 | # convert those negative standardised PRTB score to 0
461 | # std_weight_gene[which(std_weight_gene < 0)] = 0
462 |
463 | # create a new column called "weight" in the tmp dataframe.
464 | tmp$weight = 0
465 | tmp$weight[idx_gene] = std_weight_gene
466 |
467 | tmp$cell_label = row.names(tmp)
468 | tmp = tmp[, c("cell_label", "gene", "pvec", "weight")]
469 |
470 | mat_A = rbind(mat_A, tmp)
471 | rm(tmp)
472 | }
473 | } else {
474 | # celltype_list = names(gv.list[[1]])
475 | #
476 | tmp = mat_B[mat_B$gene %in% c(PRTB, nt.class.name), ]
477 | tmp$weight = 0
478 | tmp[tmp$gene == PRTB, "weight"] = 1
479 | tmp$pvec = tmp$weight
480 | #
481 | tmp = tmp[, c("cell_label", "gene", "pvec", "weight")]
482 | mat_A = tmp
483 | rm(tmp)
484 | }
485 | all_score = rbind(all_score, mat_A)
486 | }
487 |
488 | # some final editing
489 | all_score = all_score[!duplicated(all_score$cell_label), ]
490 | rownames(all_score) = all_score$cell_label
491 | all_score = all_score[, "weight", drop = FALSE]
492 | names(all_score) = new.class.name
493 | #
494 | object[[new.class.name]] = NULL
495 |
496 | # add the standardized scores to the meta-data
497 | object = AddMetaData(object, metadata = all_score)
498 |
499 | return(object)
500 | }
501 |
502 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mixscale
2 | Mixscale is an R package designed to analyze CRISPR interference (CRISPRi) based Perturb-seq data. It can quantify the heterogeneity of perturbation strength in each cell and improve the statistical power when doing differential expression (DE) analysis. It also provides functions for downstream analyses including decomposition, permutation test, gene set enrichment test, etc. A brief vignette is available at https://satijalab.github.io/Mixscale/.
3 |
4 | ## Dependencies
5 | This package depends on several other R packages:
6 | ```
7 | install.packages("Seurat")
8 | install.packages("PMA")
9 | install.packages("protoclust")
10 | BiocManager::install("glmGamPoi")
11 | ```
12 |
13 | ## Installation
14 | You can easily install the package by the following command:
15 | ```
16 | devtools::install_github("satijalab/Mixscale")
17 | ```
18 |
19 | ## Other resources
20 | * Our preprint is available at https://www.biorxiv.org/content/10.1101/2024.01.29.576933v2.
21 | * If you want to access the data generated in our paper (including the processed scRNA-seq data and the pathway gene signatures), you can download them at https://doi.org/10.5281/zenodo.14518762.
22 | * Raw fastq files are available at the Gene Expression Omnibus (GEO) under the accession code [GSE281048](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE281048).
23 |
24 |
--------------------------------------------------------------------------------
/docs/old/New_Vignette_2024Jan.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using Mixscale for Perturb-seq data"
3 | author: "Longda Jiang"
4 | date: "2024-01-09"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Introduction
13 |
14 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\
15 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \
16 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \
17 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \
18 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \
19 | 5. identify shared and unique signature between two relevant programs \
20 | 6. perform gene set enrichment tests using the program signature for new datasets \
21 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \
22 | \
23 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below).
24 |
25 |
26 | ### load the packages
27 | ```{r load_package, message=FALSE, warning=FALSE}
28 | options(Seurat.object.assay.version = 'v3')
29 |
30 | library(Seurat)
31 | library(ggridges)
32 | library(stringr)
33 | library(Mixscale)
34 | ```
35 |
36 | ## Section 1.
37 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data.
38 |
39 | ### 0. load the demo data
40 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \
41 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses.
42 |
43 | ```{r load_data, message=FALSE, warning=FALSE, cache=TRUE}
44 | # load the count matrix
45 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx",
46 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv",
47 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv")
48 | # load the meta_data
49 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv")
50 | rownames(meta_data) = meta_data$cell_barcode
51 |
52 | # create a seurat object
53 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data)
54 | rm(ct_mat, meta_data)
55 |
56 | # retrieve the guide information for each cell
57 | txt = seurat_obj$guide_identity
58 | txt2 = str_extract(txt, "^[^_]+")
59 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt)
60 | seurat_obj[['gene']] = txt2
61 | seurat_obj[['gRNA_name']] = txt3
62 |
63 | # remove ambiguous cells
64 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain
65 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain
66 |
67 | seurat_obj
68 | ```
69 |
70 | ### 1. Pre-processing and calculating the Mixscale score
71 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed.
72 |
73 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE}
74 | # standard pre-processing
75 | seurat_obj = NormalizeData(seurat_obj)
76 | seurat_obj = FindVariableFeatures(seurat_obj)
77 | seurat_obj = ScaleData(seurat_obj)
78 | seurat_obj = RunPCA(seurat_obj)
79 |
80 | # calculate Perturbation signatures
81 | seurat_obj <- CalcPerturbSig(
82 | object = seurat_obj,
83 | assay = "RNA",
84 | slot = "data",
85 | gd.class ="gene",
86 | nt.cell.class = "neg",
87 | reduction = "pca",
88 | ndims = 40,
89 | num.neighbors = 20,
90 | new.assay.name = "PRTB")
91 |
92 | ```
93 |
94 | Now we will calculate the Mixscale scores for each cell within each perturbation group.
95 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE}
96 | # Mixscale
97 | seurat_obj = RunMixscale(
98 | object = seurat_obj,
99 | assay = "PRTB",
100 | slot = "scale.data",
101 | labels = "gene",
102 | nt.class.name = "neg",
103 | min.de.genes = 5,
104 | logfc.threshold = 0.2,
105 | de.assay = "RNA",
106 | max.de.genes = 100,
107 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F)
108 |
109 | ```
110 |
111 |
112 | ### 2. Visualizations for the scores
113 | We will now use some plotting functions to explore the perturbation scores that we just calculated.
114 |
115 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE}
116 | # a. Check the distribution of the scores for the first 10 perturbations
117 | Mixscale_RidgePlot(object = seurat_obj,
118 | nt.class.name = "neg",
119 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
120 | facet_wrap = "gene", facet_scale = "fixed")
121 |
122 | # b. Check if the scores correlate with the expression level of the target gene itself
123 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg",
124 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
125 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10)
126 | ```
127 |
128 | ### 3. Differential expression (DE) analysis
129 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa.
130 |
131 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
132 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation)
133 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts",
134 | labels = "gene", nt.class.name = "neg",
135 | logfc.threshold = 0.1)
136 |
137 | # have a quick look at the DE results
138 | head(de_res[[1]])
139 |
140 | ```
141 |
142 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores.
143 |
144 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
145 | # heatmap for the top DE genes
146 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "POLR2H",
147 | slct_condition = "con1",
148 | nt.class.name = "neg",
149 | labels = "gene",
150 | slct_features = rownames(de_res[["POLR2H"]][order(de_res[["POLR2H"]]$p_weight), ])[1:20]) + NoLegend()
151 |
152 | # similar heatmap for the top DE genes, but this time the cells are divided based on gRNA identity using slct_ident
153 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1",
154 | slct_condition = "con1",
155 | nt.class.name = "neg",
156 | labels = "gene",
157 | slct_features = rownames(de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight), ])[1:20],
158 | slct_ident = "gRNA_name") + NoLegend()
159 | ```
160 |
161 | ### 5. Decomposition analyses to identify correlated perturbations
162 | #### 5.1 Hierarchical clustering
163 | In this section we will perform mainly two types of decomposition analyses for our DE results. The first is a hierarchical clustering analysis based on ([MinMax](https://www.sciencedirect.com/science/article/abs/pii/S0031320314000338)). We will apply it to the DE Z-score matrix of our DE results.
164 |
165 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
166 | # get the Z-score matrix (a list of matrices will be returned, with each matrix correspond to a cell type)
167 | # as for the selection of rows (features), we will use the union set of the top 100 DE genes from each column
168 | DEG_mat_main = get_DE_mat(de_res, p_threshold = 0.05/30000, fc_threshold = 0.2, num_top_DEG = 100)
169 |
170 | # slightly clean up the matrices by removing columns with not enough significant DEGs and replace all the NAs with 0
171 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 5)
172 |
173 | # an empty list to store all the gene sets (a repository of gene sets)
174 | go_db = list()
175 |
176 | # get the cell type (in this example there is only one cell type)
177 | celltype_list = names(DEG_mat)
178 |
179 | # loop through all the cell types to perform Minmax hierarchical clustering
180 | for(i in 1:length(celltype_list)){
181 | CELLTYPE = celltype_list[i]
182 | tmp=DEG_mat[[CELLTYPE]]
183 |
184 | # run hierarchical clustering using Minmax. Other standard hclust methods are also supported.
185 | # dist_thres defines the height (= 1 - dist_thres) that is used to cut the hclust tree.
186 | # a lower value indicates a more stringent threshold to define clusters.
187 | res = DEhclust(mat = tmp, cor_method = "pearson", hclust_method = "minmax", dist_thres = 0.4)
188 |
189 | # get_sig_genes_DEhclust() is a wrapper function for PCApermtest() and get_sig_genes(), which will
190 | # perform a PCA-based permutation test and extract the top shared DE genes across the perturbations
191 | # in the same cluster
192 | sig_genes = get_sig_genes_DEhclust(obj = res, row_filtering_pval = 0.05)
193 |
194 | # store the extracted top genes as the cluster signature into go_db
195 | for(CLUSTER in names(sig_genes)){
196 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){
197 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs
198 | }
199 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){
200 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs
201 | }
202 | }
203 |
204 | }
205 |
206 | # check the clustering results
207 | res$cluster_assignment
208 |
209 | # generate a correlation matrix plot based on the clustering results
210 | col3 = rev(brewer.pal(11,"RdBu"))
211 | heatmap.2(cor(tmp),
212 | Rowv = as.dendrogram(res$hclust),
213 | Colv = as.dendrogram(res$hclust),
214 | dendrogram = "none",
215 | col = col3)
216 | ```
217 |
218 | we can also use the following function to generate Z-score heatmap for each perturbation cluster and save them to a user defined directory
219 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
220 | DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30,
221 | output_path = "output_path/",
222 | prefix = CELLTYPE)
223 | ```
224 |
225 | #### 5.2 MultiCCA analysis
226 | In our [paper](not_insert_yet), we introduced a novel approach for identifying correlated perturbations both within and between various matrices. This method is applicable when DE Z-scores are organized into a list of Z-score matrices, with each matrix corresponding to a cell type. This is especially useful when multiple cell types/lines are used in a Perturb-seq experiment. The demo dataset only contains one cell type, so we will randomly divide the Z-score matrix above into a list of 3 matrices, and use them to test our MultiCCA method.
227 |
228 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
229 | # first we randomly divide the DEG_mat_main into a list of 3 matrices
230 | set.seed(100)
231 | DEG_mat_main2 = list(type1 = DEG_mat_main[, c(1, sample(2:26)[1:8])],
232 | type2 = DEG_mat_main[, c(1, sample(2:26)[9:16])],
233 | type2 = DEG_mat_main[, c(1, sample(2:26)[17:25])] )
234 |
235 | # clean up the matrices
236 | DEG_mat = prune_DE_mat(DEG_mat_main2, min_sig_DEG = 5, center = T)
237 |
238 | # run MultiCCA
239 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, mean_cor_thres = 0.2, max_k = 3, standardize = F)
240 |
241 | # get_sig_genes_DEmultiCCA is a wrapper function for PCApermtest() and get_sig_genes() for DEmultiCCA object.
242 | sig_genes = get_sig_genes_DEmultiCCA(res, row_filtering_pval = 0.05)
243 |
244 | # store the gene signatures to the go-term repo
245 | for(PROGRAM in names(sig_genes)){
246 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){
247 | go_db[[paste0("MultiCCA_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs
248 | }
249 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){
250 | go_db[[paste0("MultiCCA_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs
251 | }
252 | }
253 |
254 | ```
255 |
256 | Again, we can use the following function to generate Z-score heatmap for each perturbation program and save them to a user defined directory
257 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
258 | DE_heatmap(obj = res, sig_genes = sig_genes,
259 | type = "multiCCA", direction = "both",
260 | top_n = 30, labRow = T,
261 | output_path = "output_path/",
262 | prefix = "MultiCCA")
263 | ```
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
--------------------------------------------------------------------------------
/docs/old/index copy 2.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Merged_Vignette_2024Jan16"
3 | author: "Longda Jiang"
4 | date: "2024-01-16"
5 | output: html_document
6 | ---
7 |
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE)
11 | ```
12 |
13 | ## Introduction
14 |
15 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\
16 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \
17 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \
18 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \
19 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \
20 | 5. identify shared and unique signature between two relevant programs \
21 | 6. perform gene set enrichment tests using the program signature for new datasets \
22 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \
23 | \
24 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below).
25 |
26 |
27 | ### load the packages
28 | ```{r load_package, message=FALSE, warning=FALSE}
29 | options(Seurat.object.assay.version = 'v3')
30 |
31 | library(Seurat)
32 | library(ggridges)
33 | library(stringr)
34 | library(Mixscale)
35 | library(ggplot2)
36 | ```
37 |
38 | ## Section A.
39 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data.
40 |
41 | ### 0. Description of the demo data
42 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \
43 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses.
44 |
45 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
46 | # load in the count matrix downloaded from GSE132080
47 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx",
48 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv",
49 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv")
50 | # load the meta_data
51 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv")
52 | rownames(meta_data) = meta_data$cell_barcode
53 |
54 | # create a seurat object
55 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data)
56 | rm(ct_mat, meta_data)
57 |
58 | # retrieve the guide information for each cell
59 | txt = seurat_obj$guide_identity
60 | txt2 = str_extract(txt, "^[^_]+")
61 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt)
62 | seurat_obj[['gene']] = txt2
63 | seurat_obj[['gRNA_name']] = txt3
64 | seurat_obj[['cell_type']] = "K562"
65 | rm(txt, txt2, txt3)
66 |
67 | # remove ambiguous cells
68 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain
69 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain
70 |
71 | ```
72 |
73 |
74 | **Click here to see how to generate the Seurat object**
75 | ```{r load_data2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
76 | # load in the count matrix downloaded from GSE132080
77 | ct_mat = ReadMtx(mtx = "GSE132080/GSE132080_10X_matrix.mtx",
78 | cells = "GSE132080/GSE132080_10X_barcodes.tsv",
79 | features = "GSE132080/GSE132080_10X_genes.tsv")
80 | # load the meta_data
81 | meta_data = read.csv("GSE132080/GSE132080_cell_identities.csv")
82 | rownames(meta_data) = meta_data$cell_barcode
83 |
84 | # create a seurat object
85 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data)
86 | rm(ct_mat, meta_data)
87 |
88 | # retrieve the guide information for each cell
89 | txt = seurat_obj$guide_identity
90 | txt2 = str_extract(txt, "^[^_]+")
91 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt)
92 | seurat_obj[['gene']] = txt2
93 | seurat_obj[['gRNA_name']] = txt3
94 | seurat_obj[['cell_type']] = "K562"
95 | rm(txt, txt2, txt3)
96 |
97 | # remove ambiguous cells
98 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain
99 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain
100 |
101 | ```
102 |
103 | \
104 |
105 |
106 | ### 1. Pre-processing and calculating the Mixscale score
107 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed.
108 |
109 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
110 | # quick check of the data
111 | head(seurat_obj)
112 |
113 | # standard pre-processing
114 | seurat_obj = NormalizeData(seurat_obj)
115 | seurat_obj = FindVariableFeatures(seurat_obj)
116 | seurat_obj = ScaleData(seurat_obj)
117 | seurat_obj = RunPCA(seurat_obj)
118 |
119 | # calculate Perturbation signatures
120 | seurat_obj <- CalcPerturbSig(
121 | object = seurat_obj,
122 | assay = "RNA",
123 | slot = "data",
124 | gd.class ="gene",
125 | nt.cell.class = "neg",
126 | reduction = "pca",
127 | ndims = 40,
128 | num.neighbors = 20,
129 | new.assay.name = "PRTB")
130 |
131 | ```
132 |
133 | Now we will calculate the Mixscale scores for each cell within each perturbation group.
134 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
135 | # Mixscale
136 | seurat_obj = RunMixscale(
137 | object = seurat_obj,
138 | assay = "PRTB",
139 | slot = "scale.data",
140 | labels = "gene",
141 | nt.class.name = "neg",
142 | min.de.genes = 5,
143 | logfc.threshold = 0.2,
144 | de.assay = "RNA",
145 | max.de.genes = 100,
146 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F)
147 |
148 | ```
149 |
150 |
151 | ### 2. Visualizations for the scores
152 | We will now use some plotting functions to explore the perturbation scores that we just calculated.
153 |
154 | ```{r ridge_plot, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
155 | # a. Check the distribution of the scores for the first 10 perturbations
156 | Mixscale_RidgePlot(object = seurat_obj,
157 | nt.class.name = "neg",
158 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
159 | facet_wrap = "gene", facet_scale = "fixed")
160 |
161 | # b. Check if the scores correlate with the expression level of the target gene itself
162 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg",
163 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
164 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10)
165 | ```
166 |
167 | ### 3. Differential expression (DE) analysis
168 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa.
169 |
170 | ```{r DE, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
171 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation)
172 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts",
173 | labels = "gene", nt.class.name = "neg",
174 | logfc.threshold = 0.1)
175 |
176 | # have a quick look at the DE results
177 | head(de_res[[1]])
178 |
179 | ```
180 |
181 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores.
182 |
183 | ```{r DE_heatmap, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
184 | # select the top 20 DE genes from on of the perturbation
185 | top_res = de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight)[1:20], ]
186 | # order the DE genes based on its log-fold-change
187 | top_DEG = rownames(top_res[order(top_res$beta_weight), ])
188 |
189 | # heatmap for the top DE genes. cells ordered by Mixscale scores
190 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1",
191 | slct_condition = "con1",
192 | nt.class.name = "neg",
193 | labels = "gene",
194 | slct_features = top_DEG,
195 | slct_ident = "gene")
196 |
197 | ```
198 |
199 | We can also explore the DE results for some other perturbations using similar codes as above.
200 | ```{r DE_heatmap_2, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
201 | # select the top 20 DE genes from on of the perturbation
202 | for(PRTB in c("GINS1", "MTOR", "TUBB")){
203 | top_res = de_res[[PRTB]][order(de_res[[PRTB]]$p_weight)[1:20], ]
204 | # order the DE genes based on its log-fold-change
205 | top_DEG = rownames(top_res[order(top_res$beta_weight), ])
206 |
207 | #
208 | p = Mixscale_DoHeatmap(object = seurat_obj, PRTB = PRTB,
209 | slct_condition = "con1",
210 | nt.class.name = "neg",
211 | labels = "gene",
212 | slct_features = top_DEG,
213 | slct_ident = "gene")
214 | print(p)
215 | }
216 |
217 | ```
218 |
219 |
220 |
221 | ## Section B.
222 | In this section we will focus on how to use the pathway signatures from our study to run gene set enrichment test in external datasets.
223 |
224 | ### 0. Introduction
225 | We will use the interferon-beta (IFNB) stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042) (available via [SeuratData](https://github.com/satijalab/seurat-data) package) to demonstrate how to perform gene set enrichment analyses using the pathway gene sets from our study. We aim to show that by using our pathway gene lists, we can correctly infer the pathway activation of IFNB across different cell types in the human PBMCs.
226 |
227 | ```{r load_ifnb, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
228 | library(SeuratData)
229 | # load dataset
230 | ifnb <- LoadData("ifnb")
231 | ```
232 |
233 | ```{r load_ifnb2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
234 | # install the ifnb dataset
235 | SeuratData::InstallData("ifnb")
236 | # load dataset
237 | ifnb <- SeuratData::LoadData("ifnb")
238 | ```
239 |
240 | We can then load the pathway gene sets we generated (can be downloaded from [Zenodo](not_yet_insert)). There are two versions of pathway gene lists provided. One is the standard pathway gene list for different pathway programs we compiled, and the other one is the pathway exclusive gene list that filtered out the shared genes shared with other relevant pathways in the experiment.
241 | ```{r load_geneset, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
242 | plist3 = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/P3_signatures_2023Jun19.rds")
243 | plist = Reduce(c, plist3)
244 | plist = plist[c("IFNB_program1_down", #"IFNB_program1_up",
245 | "IFNB_program2_down", #"IFNB_program2_up",
246 | "IFNG_program1_down", #"IFNG_program1_up",
247 | "IFNG_program2_down", #"IFNG_program2_up",
248 | "TNFA_program1_down", #"TNFA_program1_up",
249 | "TNFA_program2_down", #"TNFA_program2_up",
250 | "TGFB1_program1_down", #"TGFB1_program1_up",
251 | "TGFB1_program2_down")] #"TGFB1_program2_up")]
252 |
253 | exclusive_plist = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/Exclusive_signatures_2023Jun20.rds")
254 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG",
255 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")]
256 | ```
257 |
258 | ```{r load_geneset2, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE, eval=FALSE}
259 | plist = readRDS("pathway_genelist.rds")
260 | exclusive_plist = readRDS("Exclusive_pathway_genelist.rds")
261 |
262 | # only extract the exclusive gene lists that are relevant to IFNB pathway
263 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG",
264 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")]
265 | ```
266 |
267 | ### 1. DE tests and Fisher enrichment tests for ifnb dataset
268 | We will first conduct Wilcox DE tests between the control and the IFNB-stimulated cells in each cell types in the ifnb dataset. Then, we will perform Fisher enrichment tests for the DE genes from each of the cell types, testing them against the pathway gene lists we just load. These two steps are merged by a wrapper function.
269 | ```{r ifnb_DE, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
270 | # Normalize the counts
271 | ifnb = NormalizeData(ifnb)
272 |
273 | # A wrapper function to perform both DE and enrichment test
274 | res = Mixscale_DEenrich(object = ifnb,
275 | plist = plist,
276 | labels = "seurat_annotations",
277 | conditions = "stim",
278 | ident.1 = "STIM",
279 | ident.2 = "CTRL",
280 | direction = "up",
281 | logfc.threshold = 0.2,
282 | p.val.cutoff = 0.05,
283 | min.pct = 0.1)
284 |
285 | # check the enrichment results for CD14 Monocytes
286 | head(res$`CD14 Mono`)
287 | ```
288 |
289 |
290 | ### 2. Enrichment tests using pathway exclusive gene lists
291 | Gene lists from related pathways, such as IFNG, IFNB, and TNFA which are all linked to immune responses, frequently share many genes. This overlap makes it challenging to differentiate the activation of these pathways. For example, as the result above shows, DE genes due to IFNB stimulation are enriched in not just the IFNB pathway, but also in IFNG and TNFA pathways. To overcome this challenge, we have introduced a concept of pathway-exclusive gene lists. Essentially, for any two related pathways, we define the exclusive genes of one pathway as those that are absent from the gene list of the other. To refine this further, we employed a more stringent criterion to exclude genes that, while potentially related, are not explicitly listed in the gene list of the other pathway (For a detailed explanation, please refer to our [paper](not_yet_insert)). Performing enrichment tests using the exclusive gene lists enhances our ability to accurately distinguish activations among closely associated pathways.
292 |
293 | ```{r ifnb_excl_enrich, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
294 | # A wrapper function to perform both DE and enrichment test
295 | res_exclusive = Mixscale_DEenrich(object = ifnb,
296 | plist = exclusive_plist,
297 | labels = "seurat_annotations",
298 | conditions = "stim",
299 | ident.1 = "STIM",
300 | ident.2 = "CTRL",
301 | direction = "up",
302 | logfc.threshold = 0.2,
303 | p.val.cutoff = 0.05,
304 | min.pct = 0.1)
305 |
306 | # check the enrichment results for CD14 Monocytes
307 | head(res_exclusive$`CD14 Mono`)
308 | ```
309 |
310 | We can see that the exclusive gene lists for IFNB (removing TNFA) and IFNB (removing IFNG) are still enriched for IFNB-stimulated DE genes. But we do not observe signals from IFNG (removing IFNB) or TNFA (removing IFNB), indicating that the underlying activated pathway during IFNB stimulation is indeed IFNB, while IFNG and TNFA are showing enrichment just because of their substantial overlap with IFNB.
311 |
312 |
313 | ### 3. Visualization
314 | We can now visualize the enrichment results across all the cell types in the ifnb dataset. First we will check the results for the standard enrichment test
315 | ```{r plot_standard, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
316 | DEenrich_DotPlot(res,
317 | direction = "up",
318 | plot_title = "Standard pathway gene lists")
319 |
320 | ```
321 |
322 |
323 | ```{r plot_exclusive, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
324 | DEenrich_DotPlot(res_exclusive,
325 | direction = "up",
326 | plot_title = "Pathway exclusive gene lists",
327 | OR_cutoff = 10)
328 |
329 | ```
330 |
331 | ### 4. Module score analysis
332 | Apart from performing enrishment test, we can also evaluate the pathway activity by calculating the over all expression level of all the genes within a gene list (the so-called module score analysis). We will use package ["UCell"](https://bioconductor.org/packages/release/bioc/html/UCell.html) for module score analysis. Alternatively, we can use the built-in function [AddModuleScore()](https://satijalab.org/seurat/reference/addmodulescore) from Seurat as well.
333 |
334 | ```{r module_score, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
335 | ifnb = UCell::AddModuleScore_UCell(ifnb,
336 | features = plist[c("IFNB_program1_down", "IFNG_program1_down",
337 | "TNFA_program1_down", "TGFB1_program1_down")] )
338 |
339 | # using VlnPlot to visualize the score of each cell
340 | VlnPlot(ifnb,
341 | features = grep("_UCell", names(ifnb@meta.data), value = T),
342 | pt.size = 0,
343 | group.by = "seurat_annotations",
344 | split.by = "stim",
345 | ncol = 2) &
346 | theme(legend.position = "NA",
347 | axis.title = element_text(size = 15),
348 | axis.text = element_text(size = 12),
349 | plot.title = element_text(size = 18)) &
350 | ylim(0.1, 0.4)
351 |
352 | ```
353 |
354 | We can observe very similar results as in our enrichment tests, where all IFNB, IFNG, and TNF pathways show a high activity (and not for TGFB pathway) in the IFNB-stimulated cells compared to the non-stimulated cells. And if we repeat the module score analysis using the pathway exclusive gene lists, we should be able to determine the pathway actually being activated (i.e., IFNB pathway).
355 |
356 | ```{r module_score_excl, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
357 | ifnb = UCell::AddModuleScore_UCell(ifnb,
358 | features = exclusive_plist[c("IFNB_REMOVE_IFNG", "IFNB_REMOVE_TNFA",
359 | "IFNG_REMOVE_IFNB", "TNFA_REMOVE_IFNB")] )
360 |
361 | # using VlnPlot to visualize the score of each cell
362 | VlnPlot(ifnb,
363 | features = grep("_REMOVE_.*UCell", names(ifnb@meta.data), value = T),
364 | pt.size = 0,
365 | group.by = "seurat_annotations",
366 | split.by = "stim",
367 | ncol = 2) &
368 | theme(legend.position = "NA",
369 | axis.title = element_text(size = 15),
370 | axis.text = element_text(size = 12),
371 | plot.title = element_text(size = 18))
372 |
373 | ```
374 |
375 |
376 |
377 |
378 |
379 |
--------------------------------------------------------------------------------
/docs/old/index copy.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Merged_Vignette_2024Jan16"
3 | author: "Longda Jiang"
4 | date: "2024-01-16"
5 | output: html_document
6 | ---
7 |
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE)
11 | ```
12 |
13 | ## Introduction
14 |
15 | In this tutorial, we will describe an R package "Mixscale" for analyzing Perturb-seq data. Mixscale contains functions designed to tackle the following tasks:\
16 | 1. calculate 'Mixscale scores' for cells that receives the same perturbation to quantify the heterogeneity in perturbation strength \
17 | 2. perform a scoring-based weighted differential expression (DE) tests to identify DE genes for each perturbation \
18 | 3. perform different levels of decomposition analysis to identify correlated perturbations and group them into a program \
19 | 4. perform a PCA-based permutation test to extract shared genes for the perturbation programs (program gene signature) \
20 | 5. identify shared and unique signature between two relevant programs \
21 | 6. perform gene set enrichment tests using the program signature for new datasets \
22 | 7. perform module score analyses using the program signature to quantify the program activity in new datasets \
23 | \
24 | The tutorial is divided into two sections. The first section will describe task 1 to 5 using a public Perturb-seq dataset from the Weissman Lab ([Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5)), which can be downloaded from [GSE132080](https://0-www-ncbi-nlm-nih-gov.brum.beds.ac.uk/geo/query/acc.cgi?acc=GSE132080). The second section will describe task 6 and 7 using the pathway gene lists generated from our study (available at [Zenodo](not_inserted_yet)) and an interferon-beta stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042). This ifnb dataset is available via the SeuratData package (see section 2 below).
25 |
26 |
27 | ### load the packages
28 | ```{r load_package, message=FALSE, warning=FALSE}
29 | options(Seurat.object.assay.version = 'v3')
30 |
31 | library(Seurat)
32 | library(ggridges)
33 | library(stringr)
34 | library(Mixscale)
35 | library(ggplot2)
36 | ```
37 |
38 | ## Section A.
39 | In this section we will focus on how to use Mixscale to analyze Perturb-seq data.
40 |
41 | ### 0. Description of the demo data
42 | The demo dataset from [Jost et al 2020](https://www.nature.com/articles/s41587-019-0387-5) contains CRISPRi Perturb-seq data targeting 25 key genes involved in essential cell biological processes. We will load in the count matrix to create a Seurat object and append the provided meta data to it. \
43 | One special feature of this dataset is that, for each perturbation target gene, there are five different gRNAs designed to target it. One of the gRNA has the perfectly matched sequence for the target region (labelled with "_00"), while the others contain 1~3 nucleotide mismatches so that their perturbation stength is "titrated". We will treat the cells that have the same target gene as the same group in our downstream analyses.
44 |
45 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
46 | # load in the count matrix downloaded from GSE132080
47 | ct_mat = ReadMtx(mtx = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_matrix.mtx",
48 | cells = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_barcodes.tsv",
49 | features = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_10X_genes.tsv")
50 | # load the meta_data
51 | meta_data = read.csv("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/GSE132080/GSE132080_cell_identities.csv")
52 | rownames(meta_data) = meta_data$cell_barcode
53 |
54 | # create a seurat object
55 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data)
56 | rm(ct_mat, meta_data)
57 |
58 | # retrieve the guide information for each cell
59 | txt = seurat_obj$guide_identity
60 | txt2 = str_extract(txt, "^[^_]+")
61 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt)
62 | seurat_obj[['gene']] = txt2
63 | seurat_obj[['gRNA_name']] = txt3
64 | seurat_obj[['cell_type']] = "K562"
65 | rm(txt, txt2, txt3)
66 |
67 | # remove ambiguous cells
68 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain
69 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain
70 |
71 | ```
72 |
73 |
74 | **Click here to see how to generate the Seurat object**
75 | ```{r load_data2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
76 | # load in the count matrix downloaded from GSE132080
77 | ct_mat = ReadMtx(mtx = "GSE132080/GSE132080_10X_matrix.mtx",
78 | cells = "GSE132080/GSE132080_10X_barcodes.tsv",
79 | features = "GSE132080/GSE132080_10X_genes.tsv")
80 | # load the meta_data
81 | meta_data = read.csv("GSE132080/GSE132080_cell_identities.csv")
82 | rownames(meta_data) = meta_data$cell_barcode
83 |
84 | # create a seurat object
85 | seurat_obj = CreateSeuratObject(counts = ct_mat, meta.data = meta_data)
86 | rm(ct_mat, meta_data)
87 |
88 | # retrieve the guide information for each cell
89 | txt = seurat_obj$guide_identity
90 | txt2 = str_extract(txt, "^[^_]+")
91 | txt3 = gsub(pattern = "^[^_]+_", replacement = "", txt)
92 | seurat_obj[['gene']] = txt2
93 | seurat_obj[['gRNA_name']] = txt3
94 | seurat_obj[['cell_type']] = "K562"
95 | rm(txt, txt2, txt3)
96 |
97 | # remove ambiguous cells
98 | seurat_obj = subset(seurat_obj, subset = number_of_cells == 1) # 19594 cells remain
99 | seurat_obj = subset(seurat_obj, subset = guide_identity != '*') # 19587 cells remain
100 |
101 | ```
102 |
103 | \
104 |
105 |
106 | ### 1. Pre-processing and calculating the Mixscale score
107 | We will first run standard pre-processing (normalization, find variable features, etc) for the dataset. Then, we will follow the standard [Mixscape analysis](https://satijalab.org/seurat/articles/mixscape_vignette) to calculate local perturbation signatures that mitigate confounding effects. Briefly speaking, for each cell we will search for its 20 nearest neighbors from the non-targeted (NT) cells, and then remove all technical variation so that perturbation-specific effect can be revealed.
108 |
109 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
110 | # quick check of the data
111 | head(seurat_obj)
112 |
113 | # standard pre-processing
114 | seurat_obj = NormalizeData(seurat_obj)
115 | seurat_obj = FindVariableFeatures(seurat_obj)
116 | seurat_obj = ScaleData(seurat_obj)
117 | seurat_obj = RunPCA(seurat_obj)
118 |
119 | # calculate Perturbation signatures
120 | seurat_obj <- CalcPerturbSig(
121 | object = seurat_obj,
122 | assay = "RNA",
123 | slot = "data",
124 | gd.class ="gene",
125 | nt.cell.class = "neg",
126 | reduction = "pca",
127 | ndims = 40,
128 | num.neighbors = 20,
129 | new.assay.name = "PRTB")
130 |
131 | ```
132 |
133 | Now we will calculate the Mixscale scores for each cell within each perturbation group.
134 | ```{r scoring, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
135 | # Mixscale
136 | seurat_obj = RunMixscale(
137 | object = seurat_obj,
138 | assay = "PRTB",
139 | slot = "scale.data",
140 | labels = "gene",
141 | nt.class.name = "neg",
142 | min.de.genes = 5,
143 | logfc.threshold = 0.2,
144 | de.assay = "RNA",
145 | max.de.genes = 100,
146 | prtb.type = "P", new.class.name = "mixscale_id", fine.mode = F)
147 |
148 | ```
149 |
150 |
151 | ### 2. Visualizations for the scores
152 | We will now use some plotting functions to explore the perturbation scores that we just calculated.
153 |
154 | ```{r ridge_plot, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
155 | # a. Check the distribution of the scores for the first 10 perturbations
156 | Mixscale_RidgePlot(object = seurat_obj,
157 | nt.class.name = "neg",
158 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
159 | facet_wrap = "gene", facet_scale = "fixed")
160 |
161 | # b. Check if the scores correlate with the expression level of the target gene itself
162 | Mixscale_ScatterPlot(object = seurat_obj, nt.class.name = "neg",
163 | PRTB = unique(seurat_obj$gene)[unique(seurat_obj$gene) != "neg"][1:10],
164 | facet_wrap = "gene", facet_scale = "free_y", nbin = 10)
165 | ```
166 |
167 | ### 3. Differential expression (DE) analysis
168 | After calculating the scores, we can use the scores to enhance the statistical power of DE analysis by using them as a "weights" in the regression model. Briefly speaking, instead of coding the NT cells as 0 and the targeted cells as 1, we used the standardized scores to code the targeted cells in the regression, so that cells with stronger perturbation strength will have higher "weights" and vice versa.
169 |
170 | ```{r DE, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
171 | # run score-based weighted DE test for 12 selected perturbations. It will return a list of data frames (one for each perturbation)
172 | de_res = Run_wtDE(object = seurat_obj, assay = "RNA", slot = "counts",
173 | labels = "gene", nt.class.name = "neg",
174 | logfc.threshold = 0.1)
175 |
176 | # have a quick look at the DE results
177 | head(de_res[[1]])
178 |
179 | ```
180 |
181 | We can now explore the top DE genes for each perturbations using the customized DoHeatmap function, where cells are ordered by Mixscale scores.
182 |
183 | ```{r DE_heatmap, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
184 | # select the top 20 DE genes from on of the perturbation
185 | top_res = de_res[["GATA1"]][order(de_res[["GATA1"]]$p_weight)[1:20], ]
186 | # order the DE genes based on its log-fold-change
187 | top_DEG = rownames(top_res[order(top_res$beta_weight), ])
188 |
189 | # heatmap for the top DE genes. cells ordered by Mixscale scores
190 | Mixscale_DoHeatmap(object = seurat_obj, PRTB = "GATA1",
191 | slct_condition = "con1",
192 | nt.class.name = "neg",
193 | labels = "gene",
194 | slct_features = top_DEG,
195 | slct_ident = "gene")
196 | ```
197 |
198 |
199 | ## Section B.
200 | In this section we will focus on how to use the pathway signatures from our study to run gene set enrichment test in external datasets.
201 |
202 | ### 0. Introduction
203 | We will use the interferon-beta (IFNB) stimulated human PBMCs dataset (ifnb) from [Kang et al 2017](https://www.nature.com/articles/nbt.4042) (available via [SeuratData](https://github.com/satijalab/seurat-data) package) to demonstrate how to perform gene set enrichment analyses using the pathway gene sets from our study. We aim to show that by using our pathway gene lists, we can correctly infer the pathway activation of IFNB across different cell types in the human PBMCs.
204 |
205 | ```{r load_ifnb, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
206 | library(SeuratData)
207 | # load dataset
208 | ifnb <- LoadData("ifnb")
209 | ```
210 |
211 | ```{r load_ifnb2, message=FALSE, warning=FALSE, cache=FALSE, eval=FALSE}
212 | # install the ifnb dataset
213 | SeuratData::InstallData("ifnb")
214 | # load dataset
215 | ifnb <- SeuratData::LoadData("ifnb")
216 | ```
217 |
218 | We can then load the pathway gene sets we generated (can be downloaded from [Zenodo](not_yet_insert)). There are two versions of pathway gene lists provided. One is the standard pathway gene list for different pathway programs we compiled, and the other one is the pathway exclusive gene list that filtered out the shared genes shared with other relevant pathways in the experiment.
219 | ```{r load_geneset, message=FALSE, warning=FALSE, cache=FALSE, echo=FALSE}
220 | plist3 = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/P3_signatures_2023Jun19.rds")
221 | plist = Reduce(c, plist3)
222 | plist = plist[c("IFNB_program1_down", #"IFNB_program1_up",
223 | "IFNB_program2_down", #"IFNB_program2_up",
224 | "IFNG_program1_down", #"IFNG_program1_up",
225 | "IFNG_program2_down", #"IFNG_program2_up",
226 | "TNFA_program1_down", #"TNFA_program1_up",
227 | "TNFA_program2_down", #"TNFA_program2_up",
228 | "TGFB1_program1_down", #"TGFB1_program1_up",
229 | "TGFB1_program2_down")] #"TGFB1_program2_up")]
230 |
231 | exclusive_plist = readRDS("/Users/uqljian5/Desktop/Lab_stuffs_NYGC/Paper_information_gathering/results/gene_set_database/inhouse_database/Exclusive_signatures_2023Jun20.rds")
232 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG",
233 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")]
234 | ```
235 |
236 | ```{r load_geneset2, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE, eval=FALSE}
237 | plist = readRDS("pathway_genelist.rds")
238 | exclusive_plist = readRDS("Exclusive_pathway_genelist.rds")
239 |
240 | # only extract the exclusive gene lists that are relevant to IFNB pathway
241 | exclusive_plist = exclusive_plist[c("IFNG_REMOVE_IFNB", "IFNB_REMOVE_IFNG",
242 | "IFNB_REMOVE_TNFA", "TNFA_REMOVE_IFNB")]
243 | ```
244 |
245 | ### 1. DE tests and Fisher enrichment tests for ifnb dataset
246 | We will first conduct Wilcox DE tests between the control and the IFNB-stimulated cells in each cell types in the ifnb dataset. Then, we will perform Fisher enrichment tests for the DE genes from each of the cell types, testing them against the pathway gene lists we just load. These two steps are merged by a wrapper function.
247 | ```{r ifnb_DE, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
248 | # Normalize the counts
249 | ifnb = NormalizeData(ifnb)
250 |
251 | # A wrapper function to perform both DE and enrichment test
252 | res = Mixscale_DEenrich(object = ifnb,
253 | plist = plist,
254 | labels = "seurat_annotations",
255 | conditions = "stim",
256 | ident.1 = "STIM",
257 | ident.2 = "CTRL",
258 | direction = "up",
259 | logfc.threshold = 0.2,
260 | p.val.cutoff = 0.05,
261 | min.pct = 0.1)
262 |
263 | # check the enrichment results for CD14 Monocytes
264 | head(res$`CD14 Mono`)
265 | ```
266 |
267 |
268 | ### 2. Enrichment tests using pathway exclusive gene lists
269 | Gene lists from related pathways, such as IFNG, IFNB, and TNFA which are all linked to immune responses, frequently share many genes. This overlap makes it challenging to differentiate the activation of these pathways. For example, as the result above shows, DE genes due to IFNB stimulation are enriched in not just the IFNB pathway, but also in IFNG and TNFA pathways. To overcome this challenge, we have introduced a concept of pathway-exclusive gene lists. Essentially, for any two related pathways, we define the exclusive genes of one pathway as those that are absent from the gene list of the other. To refine this further, we employed a more stringent criterion to exclude genes that, while potentially related, are not explicitly listed in the gene list of the other pathway (For a detailed explanation, please refer to our [paper](not_yet_insert)). Performing enrichment tests using the exclusive gene lists enhances our ability to accurately distinguish activations among closely associated pathways.
270 |
271 | ```{r ifnb_excl_enrich, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
272 | # A wrapper function to perform both DE and enrichment test
273 | res_exclusive = Mixscale_DEenrich(object = ifnb,
274 | plist = exclusive_plist,
275 | labels = "seurat_annotations",
276 | conditions = "stim",
277 | ident.1 = "STIM",
278 | ident.2 = "CTRL",
279 | direction = "up",
280 | logfc.threshold = 0.2,
281 | p.val.cutoff = 0.05,
282 | min.pct = 0.1)
283 |
284 | # check the enrichment results for CD14 Monocytes
285 | head(res_exclusive$`CD14 Mono`)
286 | ```
287 |
288 | We can see that the exclusive gene lists for IFNB (removing TNFA) and IFNB (removing IFNG) are still enriched for IFNB-stimulated DE genes. But we do not observe signals from IFNG (removing IFNB) or TNFA (removing IFNB), indicating that the underlying activated pathway during IFNB stimulation is indeed IFNB, while IFNG and TNFA are showing enrichment just because of their substantial overlap with IFNB.
289 |
290 |
291 | ### 3. Visualization
292 | We can now visualize the enrichment results across all the cell types in the ifnb dataset. First we will check the results for the standard enrichment test
293 | ```{r plot_standard, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
294 | DEenrich_DotPlot(res,
295 | direction = "up",
296 | plot_title = "Standard pathway gene lists")
297 |
298 | ```
299 |
300 |
301 | ```{r plot_exclusive, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
302 | DEenrich_DotPlot(res_exclusive,
303 | direction = "up",
304 | plot_title = "Pathway exclusive gene lists",
305 | OR_cutoff = 10)
306 |
307 | ```
308 |
309 | ### 4. Module score analysis
310 | Apart from performing enrishment test, we can also evaluate the pathway activity by calculating the over all expression level of all the genes within a gene list (the so-called module score analysis). We will use package ["UCell"](https://bioconductor.org/packages/release/bioc/html/UCell.html) for module score analysis. Alternatively, we can use the built-in function [AddModuleScore()](https://satijalab.org/seurat/reference/addmodulescore) from Seurat as well.
311 |
312 | ```{r module_score, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
313 | ifnb = UCell::AddModuleScore_UCell(ifnb,
314 | features = plist[c("IFNB_program1_down", "IFNG_program1_down",
315 | "TNFA_program1_down", "TGFB1_program1_down")] )
316 |
317 | # using VlnPlot to visualize the score of each cell
318 | VlnPlot(ifnb,
319 | features = grep("_UCell", names(ifnb@meta.data), value = T),
320 | pt.size = 0,
321 | group.by = "seurat_annotations",
322 | split.by = "stim",
323 | ncol = 2) &
324 | theme(legend.position = "NA",
325 | axis.title = element_text(size = 15),
326 | axis.text = element_text(size = 12),
327 | plot.title = element_text(size = 18)) &
328 | ylim(0.1, 0.4)
329 |
330 | ```
331 |
332 | We can observe very similar results as in our enrichment tests, where all IFNB, IFNG, and TNF pathways show a high activity (and not for TGFB pathway) in the IFNB-stimulated cells compared to the non-stimulated cells. And if we repeat the module score analysis using the pathway exclusive gene lists, we should be able to determine the pathway actually being activated (i.e., IFNB pathway).
333 |
334 | ```{r module_score_excl, message=FALSE, warning=FALSE, cache=FALSE, echo=TRUE}
335 | ifnb = UCell::AddModuleScore_UCell(ifnb,
336 | features = exclusive_plist[c("IFNB_REMOVE_IFNG", "IFNB_REMOVE_TNFA",
337 | "IFNG_REMOVE_IFNB", "TNFA_REMOVE_IFNB")] )
338 |
339 | # using VlnPlot to visualize the score of each cell
340 | VlnPlot(ifnb,
341 | features = grep("_REMOVE_.*UCell", names(ifnb@meta.data), value = T),
342 | pt.size = 0,
343 | group.by = "seurat_annotations",
344 | split.by = "stim",
345 | ncol = 2) &
346 | theme(legend.position = "NA",
347 | axis.title = element_text(size = 15),
348 | axis.text = element_text(size = 12),
349 | plot.title = element_text(size = 18))
350 |
351 | ```
352 |
353 |
354 |
355 |
356 |
357 |
--------------------------------------------------------------------------------
/docs/old/index.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Calculate perturbation scores for Perturb-seq data"
3 | author: "Longda Jiang"
4 | date: "2023-09-12"
5 | output: html_document
6 | ---
7 |
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(warning = FALSE)
11 | ```
12 |
13 | ## Introduction
14 |
15 | This html file describes a robust computational framework, "perturbation scoring", that allows us to accurately and efficiently capture the wide range of cellular responses to different perturbations in the context of single-cell pooled CRSIPR screens. Instead of treating cells that receive the same gRNA equally, this framework uses a scoring strategy (extended from 'mixscape', see https://satijalab.org/seurat/articles/mixscape_vignette) to capture the variability in each cell's response to the perturbation. Specifically, we introduce functions for: \
16 | 1. calculating 'perturbation scores' for cells that receives the same perturbation across multiple cell lines \
17 | 2. performing scoring-based differential expression (DE) tests to identify DE genes for different perturbations \
18 | 3. performing different levels of decomposition analysis to identify correlated perturbations given their shared DE genes \
19 | 4. performing PCA-based permutation tests to identify shared gene signatures for correlation perturbations \
20 | 5. constructing a gene-set repository for each group of correlated perturbations (similar to a gene-ontology database) \
21 | 5. performing enrichment tests for external gene lists using the gene-set repo generated above \
22 | \
23 |
24 | ## load the packages
25 | ```{r load_package, message=FALSE, warning=FALSE}
26 | library(devtools)
27 | library(Seurat)
28 | library(ggridges)
29 | library(PRTBScoring)
30 | ```
31 |
32 | ## 0. load the demo data
33 | The demo dataset contains CRISPRi Perturb-seq data for the IFN-gamma pathway. The perturbation targets include "IFNGR1", "IRF1", "IRF2", "JAK1", "STAT1", etc..
34 |
35 | ```{r load_data, message=FALSE, warning=FALSE, cache=FALSE}
36 | seurat_obj = readRDS(file = "/Users/uqljian5/Desktop/Lab_stuffs_NYGC/raw_seq_processing_2021Dec/mixscape_dat10x_NovaSeq_2021Dec07.rds")
37 | seurat_obj$cell_type = seurat_obj$HTO_classification
38 | seurat_obj$cell_type[seurat_obj$cell_type == "Negative"] = "K562"
39 | DefaultAssay(seurat_obj) = "RNA"
40 | seurat_obj[['PRTB']] = NULL
41 |
42 | # select a subset of perturbations for our demo
43 | # the perturbation identity is stored in the "gene" column in the meta.data
44 | seurat_obj = subset(seurat_obj, subset = gene %in% c("NT", "ZC3H3", "IFNGR1", "IFNGR2",
45 | "IRF1", "IRF2", "IRF5", "JUN", "MAFF",
46 | "PARP12", "RUNX1",
47 | "JAK1", "JAK2",
48 | "STAT1", "STAT2", "STAT3"))
49 | table(seurat_obj$gene, seurat_obj$cell_type)
50 | ```
51 |
52 |
53 | ## 1. Standard processing
54 | To perform standard processing for the dataset.
55 | ```{r standard_process, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
56 | seurat_obj = NormalizeData(seurat_obj)
57 | seurat_obj = FindVariableFeatures(seurat_obj)
58 | seurat_obj = ScaleData(seurat_obj)
59 | seurat_obj = RunPCA(seurat_obj)
60 | seurat_obj = RunUMAP(seurat_obj, dims = 1:40)
61 | DimPlot(seurat_obj, group.by = "cell_type")
62 |
63 | ```
64 |
65 |
66 | ## 2. Calculate perturbation signatures (correcting for confounding)
67 | To use the CalcPerturbSig() function from "Mixscape" to correct for confounding factors in each cell.
68 | ```{r calc_prtb_sig, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
69 | seurat_obj <- CalcPerturbSig(
70 | object = seurat_obj,
71 | assay = "RNA",
72 | slot = "data",
73 | gd.class ="gene",
74 | nt.cell.class = "NT",
75 | reduction = "pca",
76 | ndims = 40,
77 | num.neighbors = 20,
78 | new.assay.name = "PRTB",
79 | split.by = "cell_type")
80 |
81 | ```
82 |
83 |
84 | ## 3. Perturbation scoring
85 | This part will use the scoring strategy to assign a score to each cell. The score represents the perturbation strength
86 | each cell undergoes.
87 |
88 | ```{r, echo=TRUE, cache=FALSE}
89 | seurat_obj = PRTBScoring(
90 | object = seurat_obj,
91 | assay = "PRTB",
92 | slot = "scale.data",
93 | labels = "gene",
94 | nt.class.name = "NT",
95 | min.de.genes = 5,
96 | split.by = "cell_type",
97 | logfc.threshold = 0.2,
98 | de.assay = "RNA",
99 | max.de.genes = 100,
100 | prtb.type = "P",
101 | new.class.name = "mixscape_v1",
102 | fine.mode = F,
103 | harmonize = T,
104 | seed = 1)
105 |
106 | # take a look at the scores
107 | Tool(seurat_obj, slot = "PRTBScoring")[[1]][[1]][1:5, 1:2]
108 |
109 | ```
110 |
111 | ## 3.5 Some visualizations for the scores
112 | We will now use some plotting functions to explore the perturbation scores that we just calculated.
113 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
114 | # a. Check the distribution of the scores
115 | PRTBscore_RidgePlot(object = seurat_obj, split.by = "cell_type",
116 | PRTB = c("IFNGR1", "IRF1", "STAT1"),
117 | facet_wrap = "gene", facet_scale = "fixed",
118 | slct_split.by = c("A549", "HT29", "MCF7"),
119 | facet_nrow = 2)
120 |
121 | # b. Check if the scores correlate with the expression level of the target gene itself
122 | PRTBscore_ScatterPlot(object = seurat_obj, split.by = "cell_type",
123 | PRTB = c("IFNGR2", "JAK1", "IRF1", "STAT1"),
124 | facet_wrap = "gene", facet_scale = "free_y")
125 |
126 | # c. Check the single-cell heatmap (stratified by the expression level of the target genes)
127 | PRTBscore_DoHeatmap(object = seurat_obj, PRTB = "STAT1", slct_condition = "A549",
128 | slct_features = c("IRF1", "STAT2", "B2M", "WARS", "JAK1",
129 | "CCR5", "CXCL9", "CXCL10", "CXCL11", "IDO1"))
130 |
131 | ```
132 |
133 |
134 | ## 4. Perform scoring-based DE test
135 | This step will incorporate the scores we just calculated and use them in the differential expression tests. By using
136 | the scores as a 'weight' for the perturbed cells (instead of universally coding them as 1), we can achieve a higher
137 | statistical power.
138 |
139 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
140 | de_res = scoringDE(object = seurat_obj, assay = "RNA", slot = "counts",
141 | PRTB_list = c("RFX5", "ZC3H3", "IFNGR1", "IFNGR2",
142 | "IRF1", "IRF2", "JUN", "MAFF",
143 | "PARP12", "TRAFD1",
144 | "JAK1",
145 | "STAT1", "SP100"),
146 | labels = "gene")
147 |
148 | # have a quick look at the DE results
149 | head(de_res[[1]][order(de_res[[1]][, 24]), c(23:31)])
150 |
151 | # re-arrange the results into a list of DE Z-score matrices, removing non-significant DE genes.
152 | # the function will summarize the number of significant DE genes in the screen output.
153 | DEG_mat_main = get_DE_mat(de_res, p_threshold = 0.05/30000, fc_threshold = 0.2)
154 |
155 | ```
156 |
157 |
158 | ## 5. Decomposition
159 | In this section we will perform a series of decomposition analyses for our DE results. We will also
160 | perform PCA-based permutation tests to extract the gene signatures for the correlated perturbations.
161 | We will then aggregate all the gene signatures and arrange them into a repository of different
162 | sets of gene signatures.
163 |
164 | ### 5.1 within perturbation decomposition
165 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
166 | # an empty list to store all the gene sets (a repository of gene sets)
167 | go_db = list()
168 |
169 | # 5.1 within-prtb decomposition
170 | # to slightly clean up the matrices before decomposition
171 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 0)
172 |
173 | celltype_list = names(DEG_mat)
174 | PRTB_list = colnames(DEG_mat[[1]])
175 | gene_ID = rownames(DEG_mat[[1]])
176 |
177 | # loop through all the perturbations in our dataset
178 | for(i in 1:length(PRTB_list)){
179 | PRTB = PRTB_list[i]
180 |
181 | tmp=list()
182 | for(CELLTYPE in celltype_list){
183 | if(PRTB %in% colnames(DEG_mat[[CELLTYPE]])){
184 | tmp[[CELLTYPE]] = DEG_mat[[CELLTYPE]][, PRTB]
185 | }
186 | }
187 | tmp = Reduce(cbind, tmp)
188 | colnames(tmp) = celltype_list
189 | rownames(tmp) = gene_ID
190 |
191 | # run Permutation test and extract gene signatures
192 | # before each permutation test, PCApermtest() will further filter the
193 | # sub-matrix by removing any row (gene) without raw DE P-value <= 0.05 in any column (perturbation).
194 | res = PCApermtest(mat = tmp, row_filtering_pval = 0.05, k = 1)
195 | sig_genes = get_sig_genes(perm_obj = res, k = 1, collapse = T)
196 |
197 | ## plot Z-score heatmap for the gene signatures
198 | ## !!! this is currently commented out, but remove the '#' if you decide to run them
199 | ## the figures will be automatically saved to the folder you specify.
200 | # DE_heatmap(obj = res, sig_genes = sig_genes, type = "standard", direction = "both", top_n = 30,
201 | # output_path = "/Users/uqljian5/Desktop/test_multiCCA/level1/",
202 | # prefix = PRTB)
203 |
204 | # store the gene signatures to the go-term repo
205 | if(length(sig_genes$upDEGs) >= 10){
206 | go_db[[paste0(PRTB, "_upDEGs")]] = sig_genes$upDEGs
207 | }
208 | if(length(sig_genes$downDEGs) >= 10){
209 | go_db[[paste0(PRTB, "_downDEGs")]] = sig_genes$downDEGs
210 | }
211 | }
212 |
213 |
214 | ```
215 |
216 |
217 | ### 5.2 within cell type decomposition
218 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
219 | # clean up the matrices
220 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 3)
221 |
222 | celltype_list = names(DEG_mat)
223 | PRTB_list = colnames(DEG_mat[[1]])
224 | gene_ID = rownames(DEG_mat[[1]])
225 |
226 | # loop through all the cell types in our datasets
227 | for(i in 1:length(celltype_list)){
228 | CELLTYPE = celltype_list[i]
229 | tmp=DEG_mat[[CELLTYPE]]
230 |
231 | # run Permutation test and extract gene signatures
232 | res = DEhclust(mat = tmp)
233 |
234 | # get_sig_genes_DEhclust() is a wrapper function for PCApermtest() and get_sig_genes() for DEhclust object.
235 | sig_genes = get_sig_genes_DEhclust(obj = res, row_filtering_pval = 0.05)
236 |
237 | ## plot Z-score heatmap for the gene signatures
238 | ## remove the '#' to generate the figures.
239 | # DE_heatmap(obj = res, sig_genes = sig_genes, type = "hclust", direction = "both", top_n = 30,
240 | # output_path = "/Users/uqljian5/Desktop/test_multiCCA/level2/",
241 | # prefix = CELLTYPE)
242 |
243 | # store the gene signatures to the go-term repo
244 | for(CLUSTER in names(sig_genes)){
245 | if(length(sig_genes[[CLUSTER]]$sig_genes$upDEGs) >= 10){
246 | go_db[[paste0(CELLTYPE, "_", CLUSTER, "_upDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$upDEGs
247 | }
248 | if(length(sig_genes[[CLUSTER]]$sig_genes$downDEGs) >= 10){
249 | go_db[[paste0(CELLTYPE, "_", CLUSTER,"_downDEGs")]] = sig_genes[[CLUSTER]]$sig_genes$downDEGs
250 | }
251 | }
252 |
253 | }
254 |
255 | ```
256 |
257 |
258 | ### 5.3 MultiCCA analysis (decomposition across cell types and perturbations)
259 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
260 | # clean up the matrices
261 | DEG_mat = prune_DE_mat(DEG_mat_main, mask_target = T, min_sig_DEG = 1, center = T)
262 |
263 | celltype_list = names(DEG_mat)
264 | PRTB_list = colnames(DEG_mat[[1]])
265 | gene_ID = rownames(DEG_mat[[1]])
266 |
267 | # run Permutation test and extract gene signatures
268 | res = DEmultiCCA(mat_list = DEG_mat, cor_coef_thres = 0.8, max_k = 3, standardize = T)
269 |
270 | # get_sig_genes_DEmultiCCA is a wrapper function for PCApermtest() and get_sig_genes() for DEmultiCCA object.
271 | sig_genes = get_sig_genes_DEmultiCCA(res, row_filtering_pval = 0.05)
272 |
273 | ## visualization.
274 | ## remove '#' to generate the figures
275 | # DE_heatmap(obj = res, sig_genes = sig_genes,
276 | # type = "multiCCA", direction = "both",
277 | # top_n = 30, labRow = T, output_path = "/Users/uqljian5/Desktop/test_multiCCA/level3/",
278 | # prefix = "IFNG")
279 |
280 | # store the gene signatures to the go-term repo
281 | for(PROGRAM in names(sig_genes)){
282 | if(length(sig_genes[[PROGRAM]]$sig_genes$upDEGs) >= 10){
283 | go_db[[paste0("IFNG_", PROGRAM, "_upDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$upDEGs
284 | }
285 | if(length(sig_genes[[PROGRAM]]$sig_genes$downDEGs) >= 10){
286 | go_db[[paste0("IFNG_", PROGRAM, "_downDEGs")]] = sig_genes[[PROGRAM]]$sig_genes$downDEGs
287 | }
288 | }
289 |
290 | ```
291 |
292 |
293 | ## 6. Enrichment analysis
294 | After generating the repository of the gene sets, we will now use it to perform enrichment analyses. Enrichment tests
295 | aim to identify if a user input gene list shows any significant overlap with an existing gene-ontology (GO) gene set. In our
296 | case, the GO gene sets are generated as above, each represent the gene signatures of a perturbation or a group of correlated
297 | perturbations. \
298 | We implemented two different methods for enrichment analyses:\
299 | 1. a standard enrichment test method based on Fisher's exact test. \
300 | 2. a novel enrichment test method based on rank biased overlap (RBO). This method has the advantage that it not only
301 | shows the overlap between 2 lists, but also takes the consistency of the rank of each gene into consideration. \
302 | \
303 | For demonstration, we will use the DE genes identified for "JAK2" perturbation from the same dataset
304 | (which was not included in our above analyses) as the input gene list, and test it against the gene set
305 | repository we just generated. \
306 |
307 |
308 | ```{r, echo=TRUE, message=FALSE, warning=FALSE, cache=FALSE}
309 |
310 | # a. DE tests for JAK2 in A549 cell line. The DE genes will be used as the input gene list.
311 | seurat_obj$Condition = paste0(seurat_obj$cell_type, "_", seurat_obj$gene)
312 | Idents(seurat_obj) = "Condition"
313 |
314 | new_DE_test = FindMarkers(seurat_obj, ident.1 = "A549_NT", ident.2 = "A549_JAK2",
315 | slot = "data", logfc.threshold = 0)
316 |
317 | # get the background gene list for conventional enrich test
318 | background = rownames(new_DE_test)
319 | # get the significant down-reg genes (the input list )
320 | input_list = rownames(new_DE_test[new_DE_test$p_val_adj <= 0.05 & new_DE_test$avg_log2FC > 0.2, ])
321 |
322 |
323 | # b. Conventional enrichment test (Fisher's exact test)
324 | fisher_enrich_res = fisher_enrich_test(input_list = input_list,
325 | background = background,
326 | go_term_db = go_db)
327 | fisher_enrich_res = fisher_enrich_res[order(fisher_enrich_res$Pval), ]
328 |
329 | head(fisher_enrich_res, 10)
330 |
331 | # c. Rank biased overlap based test
332 | # RBO test does NOT require pre-select DEGs based on P-value or log-fold-change. We can simply input the
333 | # complete list of ordered DE genes as the input gene list. (here it is ordered by P-values)
334 | input_list2 = rownames(new_DE_test[new_DE_test$avg_log2FC > 0, ])
335 |
336 | rbo_enrich_res = rbo_enrich_test(input_list = input_list2,
337 | go_term_db = go_db,
338 | p = 0.98,
339 | k = 100,
340 | side = "bottom")
341 |
342 | rbo_enrich_res = rbo_enrich_res[order(rbo_enrich_res$RBO, decreasing = T), ]
343 | rownames(rbo_enrich_res) = NULL
344 |
345 | head(rbo_enrich_res, 10)
346 |
347 | ```
348 |
349 |
350 |
--------------------------------------------------------------------------------
/man/DE_heatmap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/visualization.R
3 | \name{DE_heatmap}
4 | \alias{DE_heatmap}
5 | \title{Draw DE Z-score heatmap for gene signatures}
6 | \usage{
7 | DE_heatmap(
8 | obj = NULL,
9 | sig_genes = NULL,
10 | type = c("standard", "hclust", "multiCCA"),
11 | direction = c("both", "down", "up"),
12 | top_n = 30,
13 | zscore_cap = 15,
14 | labRow = T,
15 | output_path = "./",
16 | prefix = "heatmap",
17 | height = 15,
18 | width = 12,
19 | ...
20 | )
21 | }
22 | \arguments{
23 | \item{obj}{the object produced by PCApermtest()}
24 |
25 | \item{sig_genes}{the object produced by get_sig_genes()}
26 |
27 | \item{type}{the type of obj and sig_genes that are being input. The "standard" (default) indicates
28 | the results from a standard within-perturbation analysis. The "hclust" indicates the results from
29 | a hierarchical clustering analysis. The "multiCCA" indicates the results from a multiCCA analysis.}
30 |
31 | \item{direction}{to indicate whether gene signatures of both directions should be plotted, or just
32 | the up-regulated genes ("up") or down-regulated genes ("down") should be plotted.}
33 |
34 | \item{top_n}{a positive integer to indicate how many gene signatures should be plotted. If provided,
35 | the top top_n genes will be selected for plotting.}
36 |
37 | \item{zscore_cap}{the cap for Z-scores in the Z-score matrix. Any Z-score that is larger than this
38 | value will be capped to this value.}
39 |
40 | \item{labRow}{a boolen variable to indicate if the row names should be labelled in the heatmap or not.}
41 |
42 | \item{output_path}{the directory where the heatmap will be saved.}
43 |
44 | \item{prefix}{the prefix for how the file of the heatmap should be named}
45 |
46 | \item{height}{the height (in inch) for the figure}
47 |
48 | \item{width}{the width (in inch) for the figure}
49 | }
50 | \value{
51 | this function returns nothing. It directly output the generated figures to the directory that
52 | a user specifies.
53 | }
54 | \description{
55 | This function will generate a standard heatmap based on the DE Z-score heatmap.
56 | Only the selected significant gene signatures will be plotted in the rows.
57 | }
58 |
--------------------------------------------------------------------------------
/man/DEenrich.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/enrichment_test.R
3 | \name{DEenrich}
4 | \alias{DEenrich}
5 | \title{Wrapper function for DE and enrichment test}
6 | \usage{
7 | DEenrich(
8 | object,
9 | plist = NULL,
10 | ident = NULL,
11 | ident.1 = NULL,
12 | ident.2 = NULL,
13 | split.by = NULL,
14 | slct.ct = NULL,
15 | direction = c("up", "down", "both"),
16 | logfc.threshold = 0.25,
17 | p.val.cutoff = 0.05,
18 | min.pct = 0.1,
19 | assay = NULL,
20 | ...
21 | )
22 | }
23 | \arguments{
24 | \item{object}{a seurat object to perform the DE test and the enrichment test}
25 |
26 | \item{plist}{the pathway gene lists to test the DE genes against}
27 |
28 | \item{ident.1}{Identity class to define markers for; pass an object of class phylo or 'clustertree' to find markers for a node in a cluster tree; passing 'clustertree' requires BuildClusterTree to have been run}
29 |
30 | \item{ident.2}{A second identity class for comparison; if NULL, use all other cells for comparison; if an object of class phylo or 'clustertree' is passed to ident.1, must pass a node to find markers for}
31 |
32 | \item{split.by}{Regroup cells into a different identity class prior to performing differential expression.
33 | Default is NULL (so all cells be used simultaneously).}
34 |
35 | \item{slct.ct}{Subset a particular identity class prior to regrouping. Only relevant if group.by is set.}
36 | }
37 | \value{
38 | a list of data frames containing the gene set enrichment results for each group in "group.by"
39 | }
40 | \description{
41 | This function provides a wrapper of Seurat::FindMarkers() and Mixscale::fisher_enrich_test().
42 | Users can input a Seurat object they want to investigate and a list of gene sets they want to
43 | test against, and the wrapper will perform DE tests + Fisher's enrichment test across all the
44 | available cell types. It will then return a list of data frames, containing gene set enrichment
45 | results for each cell type.
46 | }
47 |
--------------------------------------------------------------------------------
/man/DEenrich_DotPlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/visualization.R
3 | \name{DEenrich_DotPlot}
4 | \alias{DEenrich_DotPlot}
5 | \title{Dot plot for enrichment results across multiple cell types}
6 | \usage{
7 | DEenrich_DotPlot(
8 | obj,
9 | adjust.methods = "BH",
10 | direction = c("up", "down", "both"),
11 | log10P_cutoff = 10,
12 | OR_cutoff = 20,
13 | slct_labels = NULL,
14 | plot_title = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{obj}{the list of data frames generated by Mixscale_DEenrich()}
19 |
20 | \item{adjust.methods}{the method for multiple testing correction method (see 'p.adjust.methods')}
21 |
22 | \item{direction}{a character to specify what to plot: to plot the enrichment results for
23 | the up-regulated DE genes ("up") or the down-regulated DE genes ("down").}
24 |
25 | \item{log10P_cutoff}{the maximum value of -log10(adjusted P-value) for plotting the size of the dot (any dot with a p-value
26 | larger than this will be set to the same size)}
27 |
28 | \item{OR_cutoff}{the odds ratio (of the Fisher's exact test) cutoff for plotting the color gradient
29 | of the dot (any dot with a OR larger than this will be set to the same color)}
30 |
31 | \item{slct_labels}{the selected labels (cell types) that need to be plotted}
32 |
33 | \item{adjust.split}{TRUR/FALSE to specify if the multiple testing correction should be done for
34 | each group separately (TRUE) or all together (FALSE)}
35 | }
36 | \value{
37 | a ggplot2 object
38 | }
39 | \description{
40 | This function will generate a Dot plot for the enrichment results generated by Mixscale_DEenrich().
41 | It will also perform multiple testing for the P-values for all the cell types taken together
42 | (or within each cell type).
43 | }
44 |
--------------------------------------------------------------------------------
/man/DEhclust.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{DEhclust}
4 | \alias{DEhclust}
5 | \title{Run Hierarchical clustering for a matrix}
6 | \usage{
7 | DEhclust(
8 | mat = NULL,
9 | cor_method = c("pearson", "kendall", "spearman"),
10 | hclust_method = "minmax",
11 | dist_thres = 0.6,
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{mat}{the Z-score matrix to perform the permutation test.
17 | Rows are the gene and columns are the conditions/samples.}
18 |
19 | \item{cor_method}{the method to calculate the correlation matrix. see cor()
20 | function for details.}
21 |
22 | \item{hclust_method}{the method to perform the hierarchical clustering. The default method
23 | is MinMax clustering (package 'protoclust' required). Other methods available in the hclust()
24 | function is also allowed.}
25 |
26 | \item{dist_thres}{The distance to cut the hierarchical clustering tree ("tree height"). See cutree()
27 | function for details. Default is 0.6.}
28 | }
29 | \value{
30 | return a list of two object: 1. a list of the cluster assignment of the
31 | columns (only those got successfully assigned to a multi-member cluster will be stored).
32 | 2. a object of the output object from protocut() (if MinMax hclust method is selected) or
33 | from cutree() (if other hclust method is selected)
34 | }
35 | \description{
36 | A wrapper for different hierarchical clustering methods to be applied to the within-cell-type
37 | cross-conditions Z-score matrix (input). Highly similar conditions (columns) will be grouped together
38 | given the DE Z-scores of rows (genes).
39 | }
40 |
--------------------------------------------------------------------------------
/man/DEmultiCCA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{DEmultiCCA}
4 | \alias{DEmultiCCA}
5 | \title{Run MultiCCA for a list of matrices}
6 | \usage{
7 | DEmultiCCA(
8 | mat_list = NULL,
9 | penalty = FALSE,
10 | standardize = FALSE,
11 | max_k = 5,
12 | cor_number = "all",
13 | mean_cor_thres = 0.2,
14 | flag_cor_num = T,
15 | flag_loose = F,
16 | pval_thres = 0.05,
17 | cor_coef_thres = 0.6,
18 | cor_coef_mean_thres = 0.3,
19 | ...
20 | )
21 | }
22 | \arguments{
23 | \item{mat_list}{the list of >= 2 DE Z-score matrices for the multiCCA analysis. Each matrix
24 | should have the same named rows, but can have different number of columns (samples).}
25 |
26 | \item{penalty}{to indicate if penalty should be applied during the multiCCA process. When set
27 | to "FALSE", no penalty will be applied so that the results are not forced to be "sparse". However,
28 | if a single value or a vector of k values (k should be the same as the number of matrices in the
29 | input list), then L1 penalty will be applied to each matrix to force the output CVs to be sparse.
30 | See MultiCCA() from the PMA package for details.}
31 |
32 | \item{standardize}{a boolen value to indicate whether to standardize each column before running
33 | the MultiCCA. Default is FALSE.}
34 |
35 | \item{max_k}{the maximum number of MultiCCA runs. MultiCCA will be repeated until this number is
36 | reached or the CVs across the matrices have very low correlation coefficients.}
37 |
38 | \item{mean_cor_thres}{During the multiCCA analysis, if any of the input matrix has significantly
39 | low(er) correlation with other matrices, it will impact the MultiCCA process. This is the threshold
40 | to remove such matrices. If (the CV of) any matrix has a mean correlation coefficient <= 0.2, it will
41 | be removed and the MultiCCA will be repeated for this round (k-th). Note that this matrix will be appended
42 | back to the list for the next round (k+1-th) of MultiCCA, and the same filtering will be repeated.
43 | Default is 0.2. Set it to 0 to avoid such filtering. Vector is also accepted and will be sequentially used
44 | for each iteration of MultiCCA.}
45 |
46 | \item{cor_num}{for each column of a matrix, the number of CVs that it needs to be significantly correlated
47 | with to be selected as a member of the program. Default is "all", meaning it needs to be significantly
48 | correlated with all other CVs. Alternatively, users may input a integer (e.g., 2, 3, ...).}
49 | }
50 | \value{
51 | a list of MultiCCA results for each program identified.
52 | }
53 | \description{
54 | A function to perform MultiCCA analysis (main function imported from package "PMA",
55 | see PMID 19377034 for details of the algorithm) that takes in a list of multiple
56 | Z-score matrices to find the canonical variates (CVs) that maximize the cross-matrices
57 | correlation. The MultiCCA process is modified so that it is not completed in one
58 | run when multiple rounds of CVs are desired. Instead, after each MultiCCA run, we will
59 | identify the columns (samples) from each matrix that highly correlate with the CV of that
60 | matrix, and extract + remove them from the matrix. The next MultiCCA is performed to the
61 | list of such "filtered" matrices. This process is repeated until the desired number of
62 | runs is reached (set by users) or the CVs across the matrices have very low correlation
63 | coefficients.
64 | }
65 | \seealso{
66 | [MultiCCA()]
67 | }
68 |
--------------------------------------------------------------------------------
/man/FoldChange_new.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_fold_change.R
3 | \name{FoldChange_new}
4 | \alias{FoldChange_new}
5 | \title{Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells}
6 | \usage{
7 | FoldChange_new(
8 | object,
9 | cells.1,
10 | cells.2,
11 | mean.fxn,
12 | fc.name,
13 | features = NULL,
14 | ...
15 | )
16 | }
17 | \arguments{
18 | \item{object}{A Seurat object}
19 |
20 | \item{cells.1}{Vector of cell names belonging to group 1}
21 |
22 | \item{cells.2}{Vector of cell names belonging to group 2}
23 |
24 | \item{mean.fxn}{Function to use for fold change or average difference calculation}
25 |
26 | \item{fc.name}{Name of the fold change, average difference, or custom function column
27 | in the output data.frame}
28 |
29 | \item{features}{Features to calculate fold change for.
30 | If NULL, use all features}
31 |
32 | \item{...}{Arguments passed to other methods}
33 | }
34 | \value{
35 | Returns a single value of the log-fold-change of the input gene.
36 | }
37 | \description{
38 | Function to calculate log-fold-change for pooled CRISPR screen datasets.
39 | It is just a simple function to calculate the log-fold-change. Users can customise the min.cells,
40 | minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)),
41 | minimal percentage of cells expression the genes, and the base of the log.
42 | }
43 | \concept{perturbation_scoring}
44 |
--------------------------------------------------------------------------------
/man/Mixscale_DoHeatmap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/visualization.R
3 | \name{Mixscale_DoHeatmap}
4 | \alias{Mixscale_DoHeatmap}
5 | \title{Single-cell heatmap for selected DE genes stratified by target expression}
6 | \usage{
7 | Mixscale_DoHeatmap(
8 | object = NULL,
9 | assay = "RNA",
10 | slot = "data",
11 | labels = "gene",
12 | nt.class.name = "NT",
13 | slct.ident = NULL,
14 | mixscale.score.name = "mixscale_score",
15 | features = NULL,
16 | group.by = NULL,
17 | ct.class = NULL,
18 | slct.ct = NULL,
19 | max.num.cell = 300,
20 | ...
21 | )
22 | }
23 | \arguments{
24 | \item{object}{a seurat object returned by RunMixscale()}
25 |
26 | \item{assay}{the assay name to extract the expression level data from for plotting}
27 |
28 | \item{slot}{the slot name to extract the expression level data from for plotting}
29 |
30 | \item{labels}{the column name in the object's meta.data that contains the target
31 | gene labels}
32 |
33 | \item{nt.class.name}{the classification name of non-targeting gRNA cells}
34 |
35 | \item{slct.ident}{the name of the perturbation target in 'labels' to be plotted.}
36 |
37 | \item{mixscale.score.name}{Name of mixscale scores to be stored in metadata. Default is "mixscale_score".}
38 |
39 | \item{features}{A vector of features to plot.}
40 |
41 | \item{group.by}{A vector of variables to group cells by; pass 'ident' to group by cell identity classes.
42 | Default is the same as 'labels'.}
43 |
44 | \item{ct.class}{the metadata colname that stores the cell type (or other conditions) information.
45 | If this is set and the correct slct.ct is provided, only cells with the corresponding slct.ct will
46 | be plotted. Default is NULL (so cells from all the ct.class will be plotted).}
47 |
48 | \item{slct.ct}{a character of the group of cells in ct.class to be plotted. Default is NULL (so cells from all the ct.class will be plotted).}
49 | }
50 | \value{
51 | a ggplot2 object of the single-cell heatmap.
52 | }
53 | \description{
54 | This function will generate single-cell expression heatmap for selected DE genes
55 | in cells of the same perturbation target (gRNA). This function is basically a
56 | wrapper function of the Seurat::DoHeatmap(), but with easier usage to select the
57 | cells based on given gRNA identity. Cells will be ordered in each stratification based
58 | on their perturbation scores.
59 | }
60 |
--------------------------------------------------------------------------------
/man/Mixscale_RidgePlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/visualization.R
3 | \name{Mixscale_RidgePlot}
4 | \alias{Mixscale_RidgePlot}
5 | \title{Plot to show the distribution of perturbation scores}
6 | \usage{
7 | Mixscale_RidgePlot(
8 | object = NULL,
9 | labels = "gene",
10 | nt.class.name = "NT",
11 | split.by = NULL,
12 | PRTB = NULL,
13 | slct_split.by = NULL,
14 | facet_wrap = c(NULL, "gene", "split.by"),
15 | facet_scale = c("fixed", "free_y"),
16 | facet_nrow = 1,
17 | ...
18 | )
19 | }
20 | \arguments{
21 | \item{object}{a seurat object returned by RunMixscale()}
22 |
23 | \item{labels}{the column name in the object's meta.data that contains the target
24 | gene labels}
25 |
26 | \item{nt.class.name}{the classification name of non-targeting gRNA cells}
27 |
28 | \item{split.by}{metadata column with experimental condition/cell type classification
29 | information. This is meant to be used to account for cases a perturbation is
30 | condition/cell type -specific.}
31 |
32 | \item{PRTB}{the perturbation target genes to extract for plotting. Multiple values are
33 | allowed.}
34 |
35 | \item{slct_split.by}{if only a subset of the conditions/cell-types in the split.by column need
36 | to be plotted, users can specify them as a character vector here. Default is NULL, meaning all the
37 | conditions/cell-types need to be plotted.}
38 |
39 | \item{facet_wrap}{whether to divide the plot into multiple facets based on either the
40 | perturbation targets ("gene") or conditions/cell types ("split.by"). Default is NULL, meaning
41 | no facet.}
42 |
43 | \item{facet_scale}{whether to use a fixed scale for y-axis across all facets or allow
44 | y axis to vary.}
45 |
46 | \item{facet_nrow}{the number of rows to plot the different panels when facet_wrap is set.}
47 | }
48 | \value{
49 | a ggplot2 object that contains the ridge plot.
50 | }
51 | \description{
52 | This function will generate a density (ridge) plot for the perturbation scores across
53 | different cell types or different perturbation targets.
54 | }
55 |
--------------------------------------------------------------------------------
/man/Mixscale_ScatterPlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/visualization.R
3 | \name{Mixscale_ScatterPlot}
4 | \alias{Mixscale_ScatterPlot}
5 | \title{Plot to compare the expression level of the perturbation target and the
6 | perturbation scores}
7 | \usage{
8 | Mixscale_ScatterPlot(
9 | object = NULL,
10 | assay = "RNA",
11 | slot = "data",
12 | nt.class.name = "NT",
13 | split.by = NULL,
14 | slct.ident = NULL,
15 | nbin = 10,
16 | facet_wrap = c(NULL, "gene", "split.by"),
17 | facet_scale = "free_y",
18 | ...
19 | )
20 | }
21 | \arguments{
22 | \item{object}{a seurat object returned by RunMixscale()}
23 |
24 | \item{assay}{the assay name to extract the expression level data from for plotting}
25 |
26 | \item{slot}{the slot name to extract the expression level data from for plotting}
27 |
28 | \item{nt.class.name}{the classification name of non-targeting gRNA cells}
29 |
30 | \item{split.by}{metadata column with experimental condition/cell type classification
31 | information. This is meant to be used to account for cases a perturbation is
32 | condition/cell type -specific.}
33 |
34 | \item{slct.ident}{the perturbation target genes to extract for plotting from 'labels'. Multiple values are
35 | allowed. Default is NULL (every class will be plotted).}
36 |
37 | \item{nbin}{the number of bins to divide the perturbation scores into.}
38 |
39 | \item{facet_wrap}{whether to divide the plot into multiple facets based on either the
40 | perturbation targets ("gene") or conditions/cell types ("split.by"). Default is NULL, meaning
41 | no facet.}
42 |
43 | \item{facet_scale}{whether to use a fixed scale for y-axis across all facets or allow
44 | y axis to vary.}
45 | }
46 | \value{
47 | a ggplot2 object that contains the connected scatterplot.
48 | }
49 | \description{
50 | This function will generate a connected scatterplot to compare the mean
51 | expression level of the perturbation target gene within different perturbation
52 | percentile bins. After running the RunMixscale() function, user can specify the
53 | gene name of the perturbation target and the number of bins to divide the scores
54 | into, and this function will sutomatically generate a connected scatterplot.
55 | Multiple perturbation targets and cell types are allowed.
56 | }
57 |
--------------------------------------------------------------------------------
/man/PCApermtest.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{PCApermtest}
4 | \alias{PCApermtest}
5 | \title{Run PCA-based permutation test for a matrix}
6 | \usage{
7 | PCApermtest(
8 | mat = NULL,
9 | k = 1,
10 | var_prop = NULL,
11 | var_prop_total = NULL,
12 | center = T,
13 | scale = T,
14 | row_filtering_pval = 0.05,
15 | num_iter = 200,
16 | seed = 123124125,
17 | ...
18 | )
19 | }
20 | \arguments{
21 | \item{mat}{the Z-score matrix to perform the permutation test.
22 | Rows are the gene and columns are the conditions/samples.}
23 |
24 | \item{k}{the number of top PCs to extract. If set to NULL, the
25 | function will determine the number of PCs to extract based on
26 | the var_prop, which is the cut-off on the proportion of variance
27 | explained for each PC. Only PCs with prop_var greater than this
28 | value will be removed extracted.}
29 |
30 | \item{var_prop}{if k is not set, then use this value as a cutoff for
31 | %var explained to select the PCs. Only the PCs with %var larger than this
32 | value will be selected (the top k PCs).}
33 |
34 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var
35 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the
36 | accumulated sum larger than this value to be selected.}
37 |
38 | \item{center}{a boolen value to indicate whether the column will be centered.}
39 |
40 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.}
41 |
42 | \item{num_iter}{the number of iteration for the permutation test.}
43 |
44 | \item{seed}{seed for random number generator.}
45 | }
46 | \value{
47 | a list object consists of 3 elements: the original input Z-score matrix,
48 | the p-value matrix produced by the permutation test for each genes (same dimension
49 | as the original input matrix), and the prcomp() object done for the input Z-score
50 | matrix.
51 | }
52 | \description{
53 | This function will load in a DE test Z-score matrix, and perform PCA to it to get the 1st to k-th PCs.
54 | Then it will permuate the matrix and then the same PCA analysis will be performed. By default, this process
55 | will be repeated for 200 times (default), so that we will have a decent number of null values
56 | in the top k PCs. The proportion of extreme values greater or smaller than the actual
57 | value will be used as the P-values.
58 | }
59 |
--------------------------------------------------------------------------------
/man/RunMixscale.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/perturbation_scoring.R
3 | \name{RunMixscale}
4 | \alias{RunMixscale}
5 | \title{Mixscale scoring for perturbations}
6 | \usage{
7 | RunMixscale(
8 | object,
9 | assay = "PRTB",
10 | slot = "scale.data",
11 | labels = "gene",
12 | nt.class.name = "NT",
13 | new.class.name = "mixscale_score",
14 | min.de.genes = 5,
15 | min.cells = 5,
16 | de.assay = "RNA",
17 | logfc.threshold = 0.25,
18 | verbose = FALSE,
19 | split.by = NULL,
20 | fine.mode = FALSE,
21 | fine.mode.labels = "guide_ID",
22 | DE.gene = NULL,
23 | max.de.genes = 100,
24 | harmonize = F,
25 | min_prop_ntgd = 0.1,
26 | pval.cutoff = 0.05,
27 | seed = 10282021
28 | )
29 | }
30 | \arguments{
31 | \item{object}{An object of class Seurat.}
32 |
33 | \item{assay}{Assay to use for mixscape classification.}
34 |
35 | \item{slot}{Assay data slot to use.}
36 |
37 | \item{labels}{metadata column with target gene labels.}
38 |
39 | \item{nt.class.name}{Classification name of non-targeting gRNA cells.}
40 |
41 | \item{new.class.name}{Name of mixscale scores to be stored in
42 | metadata.}
43 |
44 | \item{min.de.genes}{Required number of genes that are differentially
45 | expressed for method to separate perturbed and non-perturbed cells.}
46 |
47 | \item{min.cells}{Minimum number of cells in target gene class. If fewer than
48 | this many cells are assigned to a target gene class during classification,
49 | all are assigned NP.}
50 |
51 | \item{de.assay}{Assay to use when performing differential expression analysis.
52 | Usually RNA.}
53 |
54 | \item{logfc.threshold}{the log-fold-change threshold to select the large-effect
55 | DE genes. Only DE genes with log-fold-change larger than this value will be
56 | selected. Default is 0.25.}
57 |
58 | \item{verbose}{Display messages}
59 |
60 | \item{split.by}{metadata column with experimental condition/cell type
61 | classification information. This is meant to be used to account for cases a
62 | perturbation is condition/cell type -specific.}
63 |
64 | \item{fine.mode}{When this is equal to TRUE, DE genes for each target gene
65 | class will be calculated for each gRNA separately and pooled into one DE list
66 | for calculating the perturbation score of every cell and their subsequent
67 | classification.}
68 |
69 | \item{fine.mode.labels}{metadata column with gRNA ID labels.}
70 |
71 | \item{DE.gene}{specify a list of user-defined large-effect DE genes to calculate the perturbation score.}
72 |
73 | \item{max.de.genes}{the maximum number of top large-effect DE genes to calculate the perturbation score. Default is 100.}
74 |
75 | \item{harmonize}{a boolen value to specify whether a harmonization of the cell-type proportion between the NT cells and
76 | the perturbed cells should be performed prior to the DE test. If fine.mode is TRUE, this harmonization step will be
77 | performed for each fine.mode gRNA. Default is FALSE.}
78 |
79 | \item{min_prop_ntgd}{a minimal threshold to remove cells if any cell type has a proportion less than this value. It will
80 | only be used when harmonize is TRUE. Default is 0.1.}
81 |
82 | \item{pval.cutoff}{specify the DE test p-value cutoff (after Bonferroni correction) to select top large-effect DE genes.
83 | Default is 0.05.}
84 | }
85 | \value{
86 | Returns a Seurat object containing the perturbation scores. It is stored in the Tool Data of the object, also
87 | the standardized scores are stored in the meta.data (column is specified by new.class.name).
88 | }
89 | \description{
90 | Function to calculate perturbation scores for perturbed and non-perturbed gRNA expressing cells.
91 | The perturbation score reflects the perturbation strength of each cells (inherited from the RunMixscape()
92 | function). It is calculated by using the large-effect DE genes from raw DE tests between the
93 | perturbed and non-perturbed gRNA expressing cells.
94 | }
95 | \concept{perturbation_scoring}
96 |
--------------------------------------------------------------------------------
/man/Run_wmvRegDE.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/scoring_de.R
3 | \name{Run_wmvRegDE}
4 | \alias{Run_wmvRegDE}
5 | \title{Scoring-based weighted DE test}
6 | \usage{
7 | Run_wmvRegDE(
8 | object,
9 | assay = "RNA",
10 | slot = "counts",
11 | labels = "gene",
12 | nt.class.name = "NT",
13 | verbose = FALSE,
14 | PRTB_list = NULL,
15 | split.by = NULL,
16 | logfc.threshold = 0,
17 | min.pct = 0.1,
18 | min.cells.group = 10,
19 | total_ct_labels = "nCount_RNA",
20 | pseudocount.use = 1,
21 | base = 2,
22 | full.results = FALSE
23 | )
24 | }
25 | \arguments{
26 | \item{object}{An object of class Seurat.}
27 |
28 | \item{assay}{Assay to use for mixscape classification.}
29 |
30 | \item{slot}{Assay data slot to use.}
31 |
32 | \item{labels}{metadata column with target gene labels.}
33 |
34 | \item{nt.class.name}{Classification name of non-targeting gRNA cells.}
35 |
36 | \item{verbose}{Print a progress bar once expression testing begins}
37 |
38 | \item{PRTB_list}{provide a vector of perturbations that the DE tests are restricted
39 | to. Default is NULL (DE tests will be performed for all available perturbations).}
40 |
41 | \item{split.by}{metadata column with experimental condition/cell type
42 | classification information. This is used to account for cases where
43 | perturbations are done for multiple condition/cell type. Default is NULL (only one
44 | cell type).}
45 |
46 | \item{logfc.threshold}{the log-fold-change threshold to select genes for DE
47 | test. Genes with log2-fold-change larger than this value will be selected for DE test.
48 | Note that if split.by is set and more than 1 split.by group exists, this
49 | logfc.threashold will be applied to each group and if any of them satisfies this criteria, the
50 | gene will be selected. Default is 0 (no filtering based on log2-fold-change).}
51 |
52 | \item{min.pct}{only test genes that are detected in a minimum fraction of min.pct cells in either
53 | of the two populations. Meant to speed up the function by not testing genes that are very
54 | infrequently expressed. Default is 0.1. Same as logfc.threshold, if split.by is set and more than 1 split.by
55 | group exists, thiswill be applied to each group and if any of them satisfies this criteria, the
56 | gene will be selected.}
57 |
58 | \item{min.cells.group}{Minimum number of cells in one of the groups}
59 |
60 | \item{total_ct_labels}{metadata column for the total RNA counts of each cell. The default is
61 | nCount_RNA, which is the default names used by Seurat package.}
62 |
63 | \item{pseudocount.use}{Pseudocount to add to averaged expression values when calculating logFC. 1 by default.}
64 |
65 | \item{base}{The base with respect to which logarithms are computed.}
66 |
67 | \item{full.results}{A boolen value to indicate if the full DE results should be output. Default is
68 | FALSE (only the regression coefficients/P-values of the mixscale scores will be output).}
69 | }
70 | \value{
71 | a list of DE results, one for each perturbation
72 | }
73 | \description{
74 | A function to perform differential expression (DE) tests based on the perturbation scores from the
75 | RunMixscale() function. It is a multivariate negative binomial based model that incorporates both the heterogeneity
76 | of perturbation strength in each cell, as well as their cell type background.
77 | }
78 | \concept{perturbation_scoring}
79 |
--------------------------------------------------------------------------------
/man/fisher_enrich_test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/enrichment_test.R
3 | \name{fisher_enrich_test}
4 | \alias{fisher_enrich_test}
5 | \title{Standard Fisher's exact test for enrichment analysis}
6 | \usage{
7 | fisher_enrich_test(
8 | input_list = NULL,
9 | background = NULL,
10 | go_term_db = NULL,
11 | list_gene = F,
12 | EASE = F
13 | )
14 | }
15 | \arguments{
16 | \item{input_list}{the input gene list}
17 |
18 | \item{background}{the background gene list (usually the expressed genes where the
19 | input gene list is generate from, ).}
20 |
21 | \item{go_term_db}{a list of gene-lists (GO term). It should be a list contain multiple named vector,
22 | and each vector should be a vector of multiple marker/signature genes for some biological pathway/process.}
23 |
24 | \item{list_gene}{A Boolen value to indicate if the overlapping genes between the input gene list and
25 | the GO-term should be output as well.}
26 |
27 | \item{EASE}{A Boolen value to indicate if the EASE correction should be applied (see
28 | https://david.ncifcrf.gov/helps/functional_annotation.html). This is useful to mitigate the
29 | small-sample inflation when the input gene list is short (e.g., < 10).}
30 | }
31 | \value{
32 | a data frame contains the enrichment test results. Each row contains the P-value and enrichment odds
33 | ratio calculated from a Fisher's exact test for one GO-term in the go_term_db.
34 | }
35 | \description{
36 | This function will perform the strandard Fisher's exact test between the input gene
37 | list and a series of gene-ontology gene sets (adopted from DAVID GO analysis).
38 | }
39 |
--------------------------------------------------------------------------------
/man/get_DE_mat.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/scoring_de.R
3 | \name{get_DE_mat}
4 | \alias{get_DE_mat}
5 | \title{Rearrange the DE results into a list of Z-score matrices}
6 | \usage{
7 | get_DE_mat(
8 | de_res = NULL,
9 | p_threshold = 0.05/30000,
10 | fc_threshold = 0.2,
11 | num_top_DEG = 50
12 | )
13 | }
14 | \arguments{
15 | \item{de_res}{the DE results produced by Run_wmvRegDE(), which is a list of data frames.}
16 |
17 | \item{p_threshold}{the DE P-value threshold to define statistically significant DE genes.}
18 |
19 | \item{fc_threshold}{the log-fold-change threhsold to define statistically significant DE genes.}
20 |
21 | \item{num_top_DEG}{for each perturbation, only the top num_top_DEG DEG within each condition/cell line
22 | will be selected as the rows (features). This helps avoid the feature space is dominated by one or a few perturbations
23 | that have a huge number of DEGs. The default is 50. Set it to NULL to avoid filtering.}
24 | }
25 | \description{
26 | A function to re-arrange the DE results produced by Run_wmvRegDE() into a list of Z-score matrices.
27 | Each matrix represents one cell type (if multiple cell types were included), and contains the
28 | DE test Z-scores for each valid gene being tested (rows) and each perturbation target (columns).
29 | }
30 |
--------------------------------------------------------------------------------
/man/get_fc.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_fold_change.R
3 | \name{get_fc}
4 | \alias{get_fc}
5 | \title{Calculate log-fold-change given a vector of gene expression and the indices of perturbed cells and non-target cells}
6 | \usage{
7 | get_fc(
8 | gene_exp = NULL,
9 | idx_P = NULL,
10 | idx_NT = NULL,
11 | min.cells = 3,
12 | thresh.min = 0,
13 | pseudocount.use = 1,
14 | min.pct = 0.1,
15 | base = 2,
16 | norm.method = "raw"
17 | )
18 | }
19 | \arguments{
20 | \item{gene_exp}{a vector of the gene expression levels}
21 |
22 | \item{idx_P}{a vector of index for the perturbed cells in the gene_exp}
23 |
24 | \item{idx_NT}{a vector of index for the non-target cells (controls) in the gene_exp}
25 |
26 | \item{min.cells}{the minimal number of cells that expresses the gene; if lower than this value, the
27 | fold-change will be returned as NA. Default is 3.}
28 |
29 | \item{thresh.min}{the minimal value of expression; any expression value lower than this will be
30 | considered as 0. Default is 0.}
31 |
32 | \item{pseudocount.use}{the small value that will be added to the log-transformation to avoid log(0).
33 | For example, if a mean expression value is x, the final log-}
34 |
35 | \item{min.pct}{the minimal proportion of cells in either groups that expresses the gene}
36 |
37 | \item{base}{the base for log()}
38 |
39 | \item{norm.method}{the normalization method for the input gene_exp. Default is 'raw', which means
40 | the original count value without normalization. The other supported values are 'log.norm', "scale.data".
41 | The mean.fxn() will change accordingly.}
42 | }
43 | \value{
44 | Returns a single value of the log-fold-change of the input gene.
45 | }
46 | \description{
47 | Function to calculate log-fold-change for pooled CRISPR screen datasets.
48 | It is just a simple function to calculate the log-fold-change. Users can customise the min.cells,
49 | minimal expression threshold, pseudo-count (the small value added to the expression level to avoid log(0)),
50 | minimal percentage of cells expression the genes, and the base of the log.
51 | }
52 | \concept{perturbation_scoring}
53 |
--------------------------------------------------------------------------------
/man/get_sig_genes.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{get_sig_genes}
4 | \alias{get_sig_genes}
5 | \title{Extract significant genes from PCApermtest}
6 | \usage{
7 | get_sig_genes(
8 | perm_obj = NULL,
9 | k = 1,
10 | var_prop = NULL,
11 | var_prop_total = NULL,
12 | perm_pval_thres = 0.05,
13 | ori_pval_thres = 1.666667e-06,
14 | cor_threshold = 0.2,
15 | collapse = T,
16 | ...
17 | )
18 | }
19 | \arguments{
20 | \item{perm_obj}{the list object produces by the PCApermtest() function.}
21 |
22 | \item{k}{the number of top PCs to extract. If set to NULL, the
23 | function will determine the number of PCs to extract based on
24 | the var_prop, which is the cut-off on the proportion of variance
25 | explained for each PC. Only PCs with prop_var greater than this
26 | value will be removed extracted.}
27 |
28 | \item{var_prop}{if k is not set, then use this value as a cutoff for
29 | %var explained to select the PCs. Only the PCs with %var larger than this
30 | value will be selected (the top k PCs).}
31 |
32 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var
33 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the
34 | accumulated sum larger than this value to be selected.}
35 |
36 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows
37 | (genes) with permutation p-value lower then permtest_pval_thres or greater than
38 | (1 - permtest_pval_thres) will be selected. Default is 0.05.}
39 |
40 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original
41 | Z-score matrix we store all the Z-score from the DE test. When selecting significant
42 | rows (genes), we might wish them to be significant in the original DE test as well.
43 | By setting this value, we force the rows (genes) to be both significant in the
44 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is
45 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.}
46 |
47 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC
48 | might not be the same as the orientation of its correlated columns in the original
49 | matrix. We need to do correlation test between each PC and all the columns in the
50 | original matrix, and then use this cor_threshold to define and extract the
51 | correlated columns and then use them to determine the actual orientation of the PC.}
52 |
53 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant
54 | genes from the top k PCs should be return together as one gene list (collapse = T)
55 | or separately for each k (collapse = F). Default is T.}
56 | }
57 | \value{
58 | a list of significant genes (gene IDs) selected from the output of PCApermtest.
59 | The order of genes in each gene list indicates the significance of P-value (high
60 | significance at the top).
61 | }
62 | \description{
63 | The function will load the pval matrix calculated from PCApermtest() and
64 | return the significant rows (usually genes) given the threshold that users
65 | provide.
66 | }
67 |
--------------------------------------------------------------------------------
/man/get_sig_genes_DEhclust.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{get_sig_genes_DEhclust}
4 | \alias{get_sig_genes_DEhclust}
5 | \title{Run PCApermtest and get significant genes from DEhclust}
6 | \usage{
7 | get_sig_genes_DEhclust(
8 | obj = NULL,
9 | k = 1,
10 | var_prop = NULL,
11 | center = T,
12 | scale = T,
13 | num_iter = 200,
14 | row_filtering_pval = 0.05,
15 | var_prop_total = NULL,
16 | perm_pval_thres = 0.05,
17 | ori_pval_thres = 1.666667e-06,
18 | cor_threshold = 0.2,
19 | collapse = T,
20 | ...
21 | )
22 | }
23 | \arguments{
24 | \item{obj}{The results object produced by DEhclust() function.}
25 |
26 | \item{k}{the number of top PCs to extract. If set to NULL, the
27 | function will determine the number of PCs to extract based on
28 | the var_prop, which is the cut-off on the proportion of variance
29 | explained for each PC. Only PCs with prop_var greater than this
30 | value will be removed extracted.}
31 |
32 | \item{var_prop}{if k is not set, then use this value as a cutoff for
33 | %var explained to select the PCs. Only the PCs with %var larger than this
34 | value will be selected (the top k PCs).}
35 |
36 | \item{center}{a boolen value to indicate whether the column will be centered.}
37 |
38 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.}
39 |
40 | \item{num_iter}{the number of iteration for the permutation test.}
41 |
42 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var
43 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the
44 | accumulated sum larger than this value to be selected.}
45 |
46 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows
47 | (genes) with permutation p-value lower then permtest_pval_thres or greater than
48 | (1 - permtest_pval_thres) will be selected. Default is 0.05.}
49 |
50 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original
51 | Z-score matrix we store all the Z-score from the DE test. When selecting significant
52 | rows (genes), we might wish them to be significant in the original DE test as well.
53 | By setting this value, we force the rows (genes) to be both significant in the
54 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is
55 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.}
56 |
57 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC
58 | might not be the same as the orientation of its correlated columns in the original
59 | matrix. We need to do correlation test between each PC and all the columns in the
60 | original matrix, and then use this cor_threshold to define and extract the
61 | correlated columns and then use them to determine the actual orientation of the PC.}
62 |
63 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant
64 | genes from the top k PCs should be return together as one gene list (collapse = T)
65 | or separately for each k (collapse = F). Default is T.}
66 | }
67 | \value{
68 | return a list of vectors, and each vector contains the signature genes identified for each cluster.
69 | }
70 | \description{
71 | This function will use the output from the DEhclust() and get the necessary elements for PCApermtest():
72 | For each cluster of columns being identified, this function will create a truncated sub-matrix given
73 | the original Z-score matrix. The sub-matrix will only contains the selected columns, and they will be input
74 | to the PCApermtest() and get_sig_genes() to get the gene signatures for this cluster. This process will
75 | be repeated for each cluster.
76 | }
77 |
--------------------------------------------------------------------------------
/man/get_sig_genes_DEmultiCCA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decomposition.R
3 | \name{get_sig_genes_DEmultiCCA}
4 | \alias{get_sig_genes_DEmultiCCA}
5 | \title{Run PCApermtest and get significant genes from DEmultiCCA}
6 | \usage{
7 | get_sig_genes_DEmultiCCA(
8 | obj = NULL,
9 | k = 1,
10 | var_prop = NULL,
11 | center = T,
12 | scale = T,
13 | num_iter = 200,
14 | row_filtering_pval = 0.05,
15 | var_prop_total = NULL,
16 | perm_pval_thres = 0.05,
17 | ori_pval_thres = 1.666667e-06,
18 | cor_threshold = 0.2,
19 | collapse = T,
20 | ...
21 | )
22 | }
23 | \arguments{
24 | \item{obj}{The results object produced by DEmultiCCA() function.}
25 |
26 | \item{k}{the number of top PCs to extract. If set to NULL, the
27 | function will determine the number of PCs to extract based on
28 | the var_prop, which is the cut-off on the proportion of variance
29 | explained for each PC. Only PCs with prop_var greater than this
30 | value will be removed extracted.}
31 |
32 | \item{var_prop}{if k is not set, then use this value as a cutoff for
33 | %var explained to select the PCs. Only the PCs with %var larger than this
34 | value will be selected (the top k PCs).}
35 |
36 | \item{center}{a boolen value to indicate whether the column will be centered.}
37 |
38 | \item{scale}{a boolen value to indicate whether the column will be scaled to 1.}
39 |
40 | \item{num_iter}{the number of iteration for the permutation test.}
41 |
42 | \item{var_prop_total}{similar to var_prop. The accumulated sum of the %var
43 | of the top i = 1, 2, ... PCs will be calculated, and the top k PCs with the
44 | accumulated sum larger than this value to be selected.}
45 |
46 | \item{perm_pval_thres}{the p-value threshold for the permutation test. Rows
47 | (genes) with permutation p-value lower then permtest_pval_thres or greater than
48 | (1 - permtest_pval_thres) will be selected. Default is 0.05.}
49 |
50 | \item{ori_pval_thres}{the p-value threshold for the original DE test. In the original
51 | Z-score matrix we store all the Z-score from the DE test. When selecting significant
52 | rows (genes), we might wish them to be significant in the original DE test as well.
53 | By setting this value, we force the rows (genes) to be both significant in the
54 | permutation test and the DE test. Default is 0.05/30000 = 1.666667e-06 (which is
55 | gene-wide significant after Bonferroni correction). Set it to 1 to avoid such filtering.}
56 |
57 | \item{cor_threshold}{After the PCApermtest, the actual orientation of the PC
58 | might not be the same as the orientation of its correlated columns in the original
59 | matrix. We need to do correlation test between each PC and all the columns in the
60 | original matrix, and then use this cor_threshold to define and extract the
61 | correlated columns and then use them to determine the actual orientation of the PC.}
62 |
63 | \item{collapse}{a boolen value to indicate when k >= 2, whether the significant
64 | genes from the top k PCs should be return together as one gene list (collapse = T)
65 | or separately for each k (collapse = F). Default is T.}
66 | }
67 | \value{
68 | return a list of vectors, and each vector contains the signature genes identified for each MultiCCA program.
69 | }
70 | \description{
71 | This function will use the output from the DEmultiCCA() and get the neccessary elements for PCApermtest and
72 | get the gene signatures for each perturbation program that DEmultiCCA() identifies. It works in a similar way
73 | as get_sig_genes_DEhclust() .
74 | }
75 |
--------------------------------------------------------------------------------
/man/glm_gp_disp_only.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/glm_gp_disp_only.R
3 | \name{glm_gp_disp_only}
4 | \alias{glm_gp_disp_only}
5 | \title{Internal Function to Fit a Gamma-Poisson GLM}
6 | \usage{
7 | glm_gp_disp_only(
8 | data,
9 | design = ~1,
10 | col_data = NULL,
11 | reference_level = NULL,
12 | offset = 0,
13 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"),
14 | overdispersion = TRUE,
15 | overdispersion_shrinkage = TRUE,
16 | ridge_penalty = 0,
17 | do_cox_reid_adjustment = TRUE,
18 | subsample = FALSE,
19 | on_disk = NULL,
20 | use_assay = NULL,
21 | verbose = FALSE
22 | )
23 | }
24 | \arguments{
25 | \item{data}{any matrix-like object (e.g. \link{matrix}, \link{DelayedArray}, \link{HDF5Matrix}) or
26 | anything that can be cast to a \link{SummarizedExperiment} (e.g. \code{MSnSet}, \code{eSet} etc.) with
27 | one column per sample and row per gene.}
28 |
29 | \item{design}{a specification of the experimental design used to fit the Gamma-Poisson GLM.
30 | It can be a \code{\link[=model.matrix]{model.matrix()}} with one row for each sample and one column for each
31 | coefficient. \cr
32 | Alternatively, \code{design} can be a \code{formula}. The entries in the
33 | formula can refer to global objects, columns in the \code{col_data} parameter, or the \code{colData(data)}
34 | of \code{data} if it is a \code{SummarizedExperiment}. \cr
35 | The third option is that \code{design} is a vector where each element specifies to which
36 | condition a sample belongs. \cr
37 | Default: \code{design = ~ 1}, which means that all samples are treated as if they belong to the
38 | same condition. Note that this is the fasted option.}
39 |
40 | \item{col_data}{a dataframe with one row for each sample in \code{data}. Default: \code{NULL}.}
41 |
42 | \item{reference_level}{a single string that specifies which level is used as reference
43 | when the model matrix is created. The reference level becomes the intercept and all
44 | other coefficients are calculated with respect to the \code{reference_level}.
45 | Default: \code{NULL}.}
46 |
47 | \item{offset}{Constant offset in the model in addition to \code{log(size_factors)}. It can
48 | either be a single number, a vector of length \code{ncol(data)} or a matrix with the
49 | same dimensions as \code{dim(data)}. Note that if data is a \link{DelayedArray} or \link{HDF5Matrix},
50 | \code{offset} must be as well. Default: \code{0}.}
51 |
52 | \item{size_factors}{in large scale experiments, each sample is typically of different size
53 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to
54 | correct for this effect.\cr
55 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data
56 | that specifies the size factors that are used.
57 | Or it can be a string that species the method that is used to estimate the size factors
58 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}).
59 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly
60 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use
61 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need
62 | to separately install the \code{scran} package from Bioconductor for this method to work.
63 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given,
64 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.}
65 |
66 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model
67 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
68 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
69 | \code{overdispersion} can either be
70 | \itemize{
71 | \item a single boolean that indicates if an overdispersion is estimated for each gene.
72 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values.
73 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes.
74 | }
75 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce
76 | the Gamma-Poisson to the classical Poisson model. Default: \code{TRUE}.}
77 |
78 | \item{overdispersion_shrinkage}{the overdispersion can be difficult to estimate with few replicates. To
79 | improve the overdispersion estimates, we can share information across genes and shrink each individual
80 | overdispersion estimate towards a global overdispersion estimate. Empirical studies show however that
81 | the overdispersion varies based on the mean expression level (lower expression level => higher
82 | dispersion). If \code{overdispersion_shrinkage = TRUE}, a median trend of dispersion and expression level is
83 | fit and used to estimate the variances of a quasi Gamma Poisson model (Lund et al. 2012). Default: \code{TRUE}.}
84 |
85 | \item{ridge_penalty}{to avoid overfitting, we can penalize fits with large coefficient estimates. Instead
86 | of directly minimizing the deviance per gene (\eqn{Sum dev(y_i, X_i b)}), we will minimize
87 | \eqn{Sum dev(y_i, X_i b) + N * Sum (penalty_p * b_p)^2}.\cr
88 | \code{ridge_penalty} can be
89 | \itemize{
90 | \item a scalar in which case all parameters except the intercept are penalized.
91 | \item a vector which has to have the same length as columns in the model matrix
92 | \item a matrix with the same number of columns as columns in the model matrix. This gives
93 | maximum flexibility for expert users and allows for full Tikhonov regularization.
94 | }
95 | Default: \code{ridge_penalty = 0}, which is internally replaced with a small positive number for numerical stability.}
96 |
97 | \item{do_cox_reid_adjustment}{the classical maximum likelihood estimator of the \code{overdisperion} is biased
98 | towards small values. McCarthy \emph{et al.} (2012) showed that it is preferable to optimize the Cox-Reid
99 | adjusted profile likelihood.\cr
100 | \code{do_cox_reid_adjustment} can be either be \code{TRUE} or \code{FALSE} to indicate if the adjustment is
101 | added during the optimization of the \code{overdispersion} parameter. Default: \code{TRUE}.}
102 |
103 | \item{subsample}{the estimation of the overdispersion is the slowest step when fitting
104 | a Gamma-Poisson GLM. For datasets with many samples, the estimation can be considerably sped up
105 | without loosing much precision by fitting the overdispersion only on a random subset of the samples.
106 | Default: \code{FALSE} which means that the data is not subsampled. If set to \code{TRUE}, at most 1,000 samples
107 | are considered. Otherwise the parameter just specifies the number of samples that are considered
108 | for each gene to estimate the overdispersion.}
109 |
110 | \item{on_disk}{a boolean that indicates if the dataset is loaded into memory or if it is kept on disk
111 | to reduce the memory usage. Processing in memory can be significantly faster than on disk.
112 | Default: \code{NULL} which means that the data is only processed in memory if \code{data} is an in-memory
113 | data structure.}
114 |
115 | \item{verbose}{a boolean that indicates if information about the individual steps are printed
116 | while fitting the GLM. Default: \code{FALSE}.}
117 |
118 | \item{Y}{any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with
119 | one column per sample and row per gene.}
120 | }
121 | \value{
122 | a list with four elements
123 | * `Beta` the coefficient matrix
124 | * `overdispersion` the vector with the estimated overdispersions
125 | * `Mu` a matrix with the corresponding means for each gene
126 | and sample
127 | * `size_factors` a vector with the size factor for each
128 | sample
129 | * `ridge_penalty` a vector with the ridge penalty
130 | }
131 | \description{
132 | Internal Function to Fit a Gamma-Poisson GLM
133 | }
134 | \seealso{
135 | [glm_gp()] and [overdispersion_mle()]
136 | }
137 | \keyword{internal}
138 |
--------------------------------------------------------------------------------
/man/glm_gp_disp_only_impl.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/glm_gp_disp_only.R
3 | \name{glm_gp_disp_only_impl}
4 | \alias{glm_gp_disp_only_impl}
5 | \title{Internal Function to Fit a Gamma-Poisson GLM}
6 | \usage{
7 | glm_gp_disp_only_impl(
8 | Y,
9 | model_matrix,
10 | offset = 0,
11 | size_factors = c("normed_sum", "deconvolution", "poscounts", "ratio"),
12 | overdispersion = TRUE,
13 | overdispersion_shrinkage = TRUE,
14 | ridge_penalty = 0,
15 | do_cox_reid_adjustment = TRUE,
16 | subsample = FALSE,
17 | verbose = FALSE
18 | )
19 | }
20 | \arguments{
21 | \item{Y}{any matrix-like object (e.g. `matrix()`, `DelayedArray()`, `HDF5Matrix()`) with
22 | one column per sample and row per gene.}
23 |
24 | \item{model_matrix}{a numeric matrix that specifies the experimental
25 | design. It can be produced using \code{stats::model.matrix()}.
26 | Default: \code{NULL}}
27 |
28 | \item{offset}{Constant offset in the model in addition to \code{log(size_factors)}. It can
29 | either be a single number, a vector of length \code{ncol(data)} or a matrix with the
30 | same dimensions as \code{dim(data)}. Note that if data is a \link{DelayedArray} or \link{HDF5Matrix},
31 | \code{offset} must be as well. Default: \code{0}.}
32 |
33 | \item{size_factors}{in large scale experiments, each sample is typically of different size
34 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to
35 | correct for this effect.\cr
36 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data
37 | that specifies the size factors that are used.
38 | Or it can be a string that species the method that is used to estimate the size factors
39 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}).
40 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly
41 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use
42 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need
43 | to separately install the \code{scran} package from Bioconductor for this method to work.
44 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given,
45 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.}
46 |
47 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model
48 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
49 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
50 | \code{overdispersion} can either be
51 | \itemize{
52 | \item a single boolean that indicates if an overdispersion is estimated for each gene.
53 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values.
54 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes.
55 | }
56 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce
57 | the Gamma-Poisson to the classical Poisson model. Default: \code{TRUE}.}
58 |
59 | \item{overdispersion_shrinkage}{the overdispersion can be difficult to estimate with few replicates. To
60 | improve the overdispersion estimates, we can share information across genes and shrink each individual
61 | overdispersion estimate towards a global overdispersion estimate. Empirical studies show however that
62 | the overdispersion varies based on the mean expression level (lower expression level => higher
63 | dispersion). If \code{overdispersion_shrinkage = TRUE}, a median trend of dispersion and expression level is
64 | fit and used to estimate the variances of a quasi Gamma Poisson model (Lund et al. 2012). Default: \code{TRUE}.}
65 |
66 | \item{ridge_penalty}{to avoid overfitting, we can penalize fits with large coefficient estimates. Instead
67 | of directly minimizing the deviance per gene (\eqn{Sum dev(y_i, X_i b)}), we will minimize
68 | \eqn{Sum dev(y_i, X_i b) + N * Sum (penalty_p * b_p)^2}.\cr
69 | \code{ridge_penalty} can be
70 | \itemize{
71 | \item a scalar in which case all parameters except the intercept are penalized.
72 | \item a vector which has to have the same length as columns in the model matrix
73 | \item a matrix with the same number of columns as columns in the model matrix. This gives
74 | maximum flexibility for expert users and allows for full Tikhonov regularization.
75 | }
76 | Default: \code{ridge_penalty = 0}, which is internally replaced with a small positive number for numerical stability.}
77 |
78 | \item{do_cox_reid_adjustment}{the classical maximum likelihood estimator of the \code{overdisperion} is biased
79 | towards small values. McCarthy \emph{et al.} (2012) showed that it is preferable to optimize the Cox-Reid
80 | adjusted profile likelihood.\cr
81 | \code{do_cox_reid_adjustment} can be either be \code{TRUE} or \code{FALSE} to indicate if the adjustment is
82 | added during the optimization of the \code{overdispersion} parameter. Default: \code{TRUE}.}
83 |
84 | \item{subsample}{the estimation of the overdispersion is the slowest step when fitting
85 | a Gamma-Poisson GLM. For datasets with many samples, the estimation can be considerably sped up
86 | without loosing much precision by fitting the overdispersion only on a random subset of the samples.
87 | Default: \code{FALSE} which means that the data is not subsampled. If set to \code{TRUE}, at most 1,000 samples
88 | are considered. Otherwise the parameter just specifies the number of samples that are considered
89 | for each gene to estimate the overdispersion.}
90 |
91 | \item{verbose}{a boolean that indicates if information about the individual steps are printed
92 | while fitting the GLM. Default: \code{FALSE}.}
93 | }
94 | \value{
95 | a list with four elements
96 | * `Beta` the coefficient matrix
97 | * `overdispersion` the vector with the estimated overdispersions
98 | * `Mu` a matrix with the corresponding means for each gene
99 | and sample
100 | * `size_factors` a vector with the size factor for each
101 | sample
102 | * `ridge_penalty` a vector with the ridge penalty
103 | }
104 | \description{
105 | Internal Function to Fit a Gamma-Poisson GLM
106 | }
107 | \seealso{
108 | [glm_gp()] and [overdispersion_mle()]
109 | }
110 | \keyword{internal}
111 |
--------------------------------------------------------------------------------
/man/prune_DE_mat.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/scoring_de.R
3 | \name{prune_DE_mat}
4 | \alias{prune_DE_mat}
5 | \title{QC for the list of DE Z-score matrices}
6 | \usage{
7 | prune_DE_mat(
8 | DEG_mat = NULL,
9 | zscore_cap = 37,
10 | mask_target = FALSE,
11 | p_threshold = 0.05/30000,
12 | min_sig_DEG = 0,
13 | center = FALSE,
14 | scale = FALSE,
15 | ...
16 | )
17 | }
18 | \arguments{
19 | \item{DEG_mat}{the list of DE Z-score matrices that produced by get_DE_mat()}
20 |
21 | \item{zscore_cap}{the cap value for the Z-scores. Any absolute(Z-score) larger than this value will be
22 | set to this value to avoid extreme values affecting the downstream analyses. Default is 37
23 | which is the machine precision limit for a Z-score to produce a non-zero P-value (= ~1e-300).}
24 |
25 | \item{mask_target}{a boolen value to indicate if the Z-score of the perturbation target (labelled
26 | by the column names) should be masked as 0. Default is FALSE so no masking will happen.}
27 |
28 | \item{p_threshold}{the P-value threshold to define the 'significant' DE genes. Default is 0.05/30000, which
29 | is approximately the Bonferroni correction threshold for genome-wide DE tests (assuming 30,000 genes in total).}
30 |
31 | \item{min_sig_DEG}{the minimal number of significant DE genes that each column must contain. Any
32 | column with sig DE genes less than this value will be removed from the matrix. Default is 0 so no
33 | column will be removed.}
34 |
35 | \item{center}{a boolen value to indicate whether we should center each column to 0. Default is FALSE.}
36 |
37 | \item{scale}{a boolen value to indicate whether we should scale each column to have variance = 1. Default is FALSE.}
38 | }
39 | \value{
40 | The function will return a list of QCed DE Z-score matrices. This list can directly be the input for
41 | DEmultiCCA().
42 | }
43 | \description{
44 | A function to perform QC/filtering for the DE Z-score matrices that produced by get_DE_mat().
45 | }
46 |
--------------------------------------------------------------------------------
/man/rbo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/enrichment_test.R
3 | \name{rbo}
4 | \alias{rbo}
5 | \title{Rank biased overlap}
6 | \usage{
7 | rbo(
8 | list1,
9 | list2,
10 | p,
11 | k = floor(max(length(list1), length(list2))/2),
12 | side = c("top", "bottom"),
13 | mid = NULL,
14 | uneven.lengths = TRUE
15 | )
16 | }
17 | \arguments{
18 | \item{list1}{List 1}
19 |
20 | \item{list2}{List 2}
21 |
22 | \item{p}{Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements}
23 |
24 | \item{k}{Evaluation depth for extrapolation}
25 |
26 | \item{side}{Evaluate similarity between the top or the bottom of the ranked lists}
27 |
28 | \item{mid}{Set the mid point to for example only consider positive or negative scores}
29 |
30 | \item{uneven.lengths}{Indicator if lists have uneven lengths}
31 | }
32 | \value{
33 | a scaler value measuring the rank biased overlap (rbo)
34 | }
35 | \description{
36 | A function for a new gene-set enrichment test based on the
37 | RBO (rank biased overlap) calculation with extropolation (Webber et al., 2010).
38 | The core functions of rbo() calculation was modified from the "gespeR" package (original author: Fabian Schmich).
39 | We modified it to accomodate our package and data type. We also developed a permutation scheme for
40 | RBO to allow for p-value calculations.
41 | }
42 | \author{
43 | Fabian Schmich ("gespeR" package)
44 | }
45 |
--------------------------------------------------------------------------------
/man/rbo_enrich_test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/enrichment_test.R
3 | \name{rbo_enrich_test}
4 | \alias{rbo_enrich_test}
5 | \title{Rank biased overlap (RBO) based enrichment test}
6 | \usage{
7 | rbo_enrich_test(
8 | input_list,
9 | go_term_db,
10 | p,
11 | n_iter = 500,
12 | k = 300,
13 | side = c("top", "bottom"),
14 | mid = NULL,
15 | uneven.lengths = TRUE,
16 | empirical_test = FALSE,
17 | seed = 131415926
18 | )
19 | }
20 | \arguments{
21 | \item{input_list}{input gene list from user (a named vector)}
22 |
23 | \item{go_term_db}{a list object of multiple gene-ontology (GO) terms to run enrichment test against}
24 |
25 | \item{p}{Weighting parameter in [0, 1]. High p implies strong emphasis on top ranked elements}
26 |
27 | \item{n_iter}{the number of iteration to perform the permutation to obtain the P-values of the enrichment test}
28 |
29 | \item{k}{Evaluation depth for extrapolation}
30 |
31 | \item{side}{Evaluate similarity between the top or the bottom of the ranked lists}
32 |
33 | \item{mid}{Set the mid point to for example only consider positive or negative scores}
34 |
35 | \item{uneven.lengths}{Indicator if lists have uneven lengths}
36 |
37 | \item{empirical_test}{a boolen value to tell the function is an empirical test should be performed. If TRUE,
38 | the exact empirical proportion of the permutated elements that are greater than the true RBO
39 | is returned as the p-value (high accuracy usually requires a large n_iter, e.g., 1000). If FALSE, then a standard
40 | Z-score test is applied to the RBO based on the mean and standard deviation of all the permuated elements (less accurate
41 | but more efficient. A small n_iter is usually enough (e.g., 100 or 200) to get good approximation compared to
42 | the true empirical test).}
43 | }
44 | \value{
45 | a data.frame consists of rbo measurement between the inptu gene list and all the GO terms,
46 | as well as the P-values based on permutation. Please note that the P-values indicate whether the rank of the input gene
47 | list and the GO-term gene set are consistent or not. It does NOT indicate if RBO is significantly different from 0.
48 | }
49 | \description{
50 | To perform enrichment test based on rank biased overlap and permutation.
51 | }
52 |
--------------------------------------------------------------------------------