├── .Rbuildignore
├── .Rhistory
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
    ├── data_normalized.R
    ├── pathway_score.R
    ├── predict_cell.R
    └── sysdata.rda
├── README.md
├── TCfinder.Rproj
├── data
    ├── KEGG_Gene.rda
    └── TCfinder_Pathway.rda
├── inst
    ├── analysis
    │   ├── GSE673_analysis.R
    │   ├── MLcode.R
    │   ├── bluk_Anti.R
    │   ├── bulk_pathway.R
    │   ├── confusion_matrix.R
    │   ├── figure1.R
    │   ├── gene_analysis.R
    │   ├── model_border_gene.R
    │   ├── model_build.py
    │   ├── model_train.pbs
    │   ├── model_train.py
    │   ├── other_data_pathscore.R
    │   ├── pathway_importance.R
    │   ├── pathway_select.R
    │   ├── result_analysis.R
    │   ├── simulation.R
    │   ├── simulation_gene.R
    │   └── umap.R
    ├── extdata
    │   ├── GOSH_pathway_score.csv
    │   ├── TCfinder.hdf5
    │   └── predict_py.py
    └── image
    │   └── workflow.png
├── man
    ├── data_normalized.Rd
    ├── pathway_score.Rd
    └── predict_cell.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-data_normalized.R
    │   ├── test-pathway_score.R
    │   └── test-predict_cell.R
└── vignettes
    ├── .gitignore
    └── interpretation.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^TCfinder\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | 


--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
  1 | getwd()
  2 | usethis::use_agpl3_license("chenxuwu")
  3 | usethis::use_mit_license("chenxuwu")
  4 | usethis::use_roxygen_md()
  5 | usethis::use_testthat()
  6 | load("./data/GSE673_diff_path213.rds")
  7 | Diff_path <- readRDS("./data/Diff_path.rds")
  8 | pathway_score <- function(normalized_matrix){
  9 | Diff_path <- readRDS("data/Diff_path.rds")
 10 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds")
 11 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa)
 12 | gene_id <- rownames(normalized_matrix)
 13 | normalized_matrix <- as.data.frame(t(normalized_matrix))
 14 | colnames(normalized_matrix) <- gene_id
 15 | myFun1 <- function(number){
 16 | sum(number)/length(number)
 17 | }
 18 | all_pathway_score <- NA
 19 | for (i in 1:213) {
 20 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
 21 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)])
 22 | path_score <- as.data.frame(apply(data1, 1, myFun1))
 23 | colnames(path_score) <- names(table(score_gene$hsa))[i]
 24 | all_pathway_score <- cbind(all_pathway_score,path_score)
 25 | }
 26 | pathway_score <- all_pathway_score[,-1]
 27 | return(pathway_score)
 28 | }
 29 | file.edit("DESCRIPTION")
 30 | devtools::document()
 31 | devtools::document()
 32 | pwd
 33 | getwd()
 34 | ?pathway_score
 35 | usethis::use_vignette("interpretation")
 36 | devtools::check()
 37 | devtools::check()
 38 | usethis::use_testthat()
 39 | usethis::use_test()
 40 | pwd
 41 | getwd()
 42 | library(testthat)
 43 | library(TCfinder)
 44 | evtools::test()
 45 | devtools::test()
 46 | load_all()
 47 | devtools::load_all()
 48 | devtools::check()
 49 | devtools::check()
 50 | devtools::test()
 51 | devtools::test()
 52 | devtools::load_all()
 53 | devtools::check()
 54 | pathway_score <- function(normalized_matrix){
 55 | Diff_path <- readRDS("data/Diff_path.rds")
 56 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds")
 57 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa)
 58 | gene_id <- rownames(normalized_matrix)
 59 | normalized_matrix <- as.data.frame(t(normalized_matrix))
 60 | colnames(normalized_matrix) <- gene_id
 61 | myFun1 <- function(number){
 62 | sum(number)/length(number)
 63 | }
 64 | all_pathway_score <- NA
 65 | for (i in 1:213) {
 66 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
 67 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)])
 68 | path_score <- as.data.frame(apply(data1, 1, myFun1))
 69 | colnames(path_score) <- names(table(score_gene$hsa))[i]
 70 | all_pathway_score <- cbind(all_pathway_score,path_score)
 71 | }
 72 | pathway_score <- all_pathway_score[,-1]
 73 | return(pathway_score)
 74 | }
 75 | data_normalized <- function(expr_data){
 76 | gene_id <- rownames(expr_data)
 77 | data1 <- expr_data %>% apply(2,function(x){x/sum(x) * 10000}) %>% as.data.frame()
 78 | data2 <- data1 %>% dplyr::mutate_all(funs(log2(.+1)))
 79 | rownames(data2) <- gene_id
 80 | data2 <- round(data2,3)
 81 | return(data2)
 82 | }
 83 | devtools::check()
 84 | devtools::check()
 85 | View(pathway_score)
 86 | ("data/Diff_path.rds")
 87 | ("data/Diff_path.rds")
 88 | Diff_path <- readRDS("data/Diff_path.rds")
 89 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds")
 90 | pathway_score <- function(normalized_matrix){
 91 | Diff_path <- readRDS("data/Diff_path.rds")
 92 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds")
 93 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa)
 94 | gene_id <- rownames(normalized_matrix)
 95 | normalized_matrix <- as.data.frame(t(normalized_matrix))
 96 | colnames(normalized_matrix) <- gene_id
 97 | myFun1 <- function(number){
 98 | sum(number)/length(number)
 99 | }
100 | all_pathway_score <- NA
101 | for (i in 1:213) {
102 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
103 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)])
104 | path_score <- as.data.frame(apply(data1, 1, myFun1))
105 | colnames(path_score) <- names(table(score_gene$hsa))[i]
106 | all_pathway_score <- cbind(all_pathway_score,path_score)
107 | }
108 | pathway_score <- all_pathway_score[,-1]
109 | return(pathway_score)
110 | }
111 | usethis::use_test()
112 | usethis::use_test()
113 | devtools::check()
114 | library(TCfinder)
115 | devtools::check()
116 | devtools::check()
117 | devtools::check()
118 | Diff_path <- read.csv("data/TCfinder_Pathway.csv")
119 | View(Diff_path)
120 | KEGG_gene <- read.csv("data/KEGG_Gene.csv")
121 | View(KEGG_gene)
122 | save(Diff_path,"./data/Diff_path.Rdata")
123 | save(Diff_path,"./data/")
124 | save(Diff_path,"./data/Diff_path.Rda")
125 | save(Diff_path,"./data/Diff_path.rda")
126 | load(file = "./data/KEGG_Gene.rda")
127 | load(file = "./data/KEGG_Gene.rda")
128 | load(file = "./data/TCfinder_Pathway.rda")
129 | load(file = "./data/TCfinder_Pathway.rda")
130 | load(file = "./data/TCfinder_Pathway.rda")
131 | View(TCfinder_Pathway)
132 | remove(list = ls())
133 | load(file = "./data/KEGG_Gene.rda")
134 | load(file = "./data/TCfinder_Pathway.rda")
135 | score_gene <- KEGG_gene %>% filter(hsa %in% TCfinder_pathway$hsa)
136 | library(dplyr)
137 | score_gene <- KEGG_gene %>% filter(hsa %in% TCfinder_pathway$hsa)
138 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_pathway$hsa)
139 | View(TCfinder_Pathway)
140 | View(TCfinder_Pathway)
141 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_Pathway$hsa)
142 | utils::data()
143 | utils::data()
144 | devtools::check()
145 | usethis::use_data()
146 | usethis::use_data(KEGG_Gene.rda)
147 | usethis::use_data(TCfinder_Pathway.rda)
148 | KEGG_Gene <- KEGG_Gene
149 | TCfinder_Pathway <- TCfinder_Pathway
150 | usethis::use_data(TCfinder_Pathway)
151 | usethis::use_data(KEGG_Gene)
152 | devtools::check()
153 | devtools::check()
154 | devtools::document()
155 | devtools::document()
156 | devtools::check()
157 | devtools::document()
158 | devtools::check()
159 | devtools::document()
160 | devtools::check()
161 | devtools::check()
162 | devtools::document()
163 | devtools::check()
164 | devtools::check()
165 | devtools::document()
166 | devtools::check()
167 | devtools::check()
168 | remove(list = ls())
169 | KEGG_Gene <- TCfinder::KEGG_Gene.rds
170 | KEGG_Gene <- TCfinder::KEGG_Gene
171 | TCfinder_Pathway <- TCfinder::TCfinder_Pathway
172 | devtools::document()
173 | devtools::check()
174 | library(TCfinder)
175 | devtools::load_all(".")
176 | devtools::document()
177 | data_test <- fread("inst/extdata/tests_score.csv")
178 | library(data.table)
179 | data_test <- fread("inst/extdata/tests_score.csv")
180 | library(reticulate)
181 | predict_cell <- function(path_score){
182 | reticulate::source_python('inst/extdata/predict_py.py')
183 | predict <- predict_py(path_score)
184 | predict_result <- as.data.frame(predict)
185 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
186 | V1 <= 0.5 ~ "tumor"))
187 | colnames(result) <- c("value","cell_type")
188 | return(result)
189 | }
190 | data_test <- fread("inst/extdata/tests_score.csv")
191 | result <- predict_cell(path_score = data_test)
192 | reticulate::py_config()
193 | python
194 | reticulate::repl_python()
195 | reticulate::py_config()
196 | result <- predict_cell(path_score = data_test)
197 | library(reticulate)
198 | reticulate::py_config()
199 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe")
200 | library(reticulate)
201 | reticulate::py_config()
202 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe")
203 | library(reticulate)
204 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe")
205 | reticulate::py_config()
206 | predict_cell <- function(path_score){
207 | reticulate::source_python('inst/extdata/predict_py.py')
208 | predict <- predict_py(path_score)
209 | predict_result <- as.data.frame(predict)
210 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
211 | V1 <= 0.5 ~ "tumor"))
212 | colnames(result) <- c("value","cell_type")
213 | return(result)
214 | }
215 | data_test <- fread("inst/extdata/tests_score.csv")
216 | library(data.table)
217 | data_test <- fread("inst/extdata/tests_score.csv")
218 | predict_cell <- function(path_score){
219 | reticulate::source_python('inst/extdata/predict_py.py')
220 | predict <- predict_py(path_score)
221 | predict_result <- as.data.frame(predict)
222 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
223 | V1 <= 0.5 ~ "tumor"))
224 | colnames(result) <- c("value","cell_type")
225 | return(result)
226 | }
227 | result <- predict_cell(path_score = data_test)
228 | predict_cell <- function(path_score){
229 | reticulate::source_python('inst/extdata/predict_py.py')
230 | predict <- predict_py(path_score)
231 | predict_result <- as.data.frame(predict)
232 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
233 | V1 <= 0.5 ~ "tumor"))
234 | colnames(result) <- c("value","cell_type")
235 | return(result)
236 | }
237 | result <- predict_cell(path_score = data_test)
238 | result <- predict_cell(path_score = data_test)
239 | library(data.table)
240 | data_test <- fread("inst/extdata/tests_score.csv")
241 | library(reticulate)
242 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe")
243 | reticulate::py_config()
244 | predict_cell <- function(path_score){
245 | reticulate::source_python('inst/extdata/predict_py.py')
246 | predict <- predict_py(path_score)
247 | predict_result <- as.data.frame(predict)
248 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
249 | V1 <= 0.5 ~ "tumor"))
250 | colnames(result) <- c("value","cell_type")
251 | return(result)
252 | }
253 | result <- predict_cell(path_score = data_test)
254 | result <- predict_cell(path_score = data_test)
255 | predict_cell <- function(path_score){
256 | reticulate::source_python('inst/extdata/predict_py.py')
257 | predict <- predict_py(path_score)
258 | predict_result <- as.data.frame(predict)
259 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
260 | V1 <= 0.5 ~ "tumor"))
261 | colnames(result) <- c("value","cell_type")
262 | return(result)
263 | }
264 | result <- predict_cell(path_score = data_test)
265 | reticulate::source_python('inst/extdata/predict_py.py')
266 | predict <- predict_py(data_test)
267 | reticulate::repl_python()
268 | from tensorflow.keras.models import load_model
269 | model = load_model("./inst/extdata/TCfinder.hdf5")
270 | data2 = r.data.test
271 | data2 = r.data_test
272 | predict = model.predict(data2)
273 | data_test <- fread("inst/extdata/tests_score.csv")
274 | quit
275 | data2[1:5,1:5]
276 | data_test[1:5,1:5]
277 | class(data_test)
278 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F)
279 | class(data_test)
280 | reticulate::repl_python()
281 | data2 = r.data_test
282 | data2
283 | library(data.table)
284 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F)
285 | library(reticulate)
286 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe")
287 | reticulate::py_config()
288 | reticulate::repl_python()
289 | import pandas as pd
290 | import numpy as np
291 | import pandas as pd
292 | data2 = r.data_test
293 | quit
294 | predict_cell <- function(path_score){
295 | reticulate::source_python('inst/extdata/predict_py.py')
296 | predict <- predict_py(path_score)
297 | predict_result <- as.data.frame(predict)
298 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
299 | V1 <= 0.5 ~ "tumor"))
300 | colnames(result) <- c("value","cell_type")
301 | return(result)
302 | }
303 | result <- predict_cell(path_score = data_test)
304 | library(dplyr)
305 | result <- predict_cell(path_score = data_test)
306 | View(result)
307 | table(result$cell_type)
308 | devtools::load_all()
309 | sethis::use_testthat(3)
310 | usethis::use_testthat(3)
311 | usethis::use_test()
312 | use_test("predict_cell")
313 | usethis::use_test("predict_cell")
314 | devtools::document()
315 | devtools::check()
316 | library(fs)
317 | fs::path_package("extdata",package = "TCfinder")
318 | fs::path_package("predict_py.py",package = "TCfinder")
319 | fs::path_package("predict_py.py",package = "TCfinder")
320 | fs::path_package("extdata",package = "TCfinder")
321 | Path <- fs::path_package("extdata",package = "TCfinder")
322 | paste0(Path,"predict_py.py")
323 | reticulate::repl_python()
324 | Path = r.Path
325 | Path+"/TCfinder.hdf5"
326 | quit
327 | predict_cell <- function(path_score){
328 | Path <- fs::path_package("extdata",package = "TCfinder")
329 | reticulate::source_python(paste0(Path,"predict_py.py"))
330 | predict <- predict_py(path_score,Path)
331 | predict_result <- as.data.frame(predict)
332 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
333 | V1 <= 0.5 ~ "tumor"))
334 | colnames(result) <- c("value","cell_type")
335 | return(result)
336 | }
337 | result1 <- predict_cell(data_test)
338 | Path <- fs::path_package("extdata",package = "TCfinder")
339 | Path
340 | paste0(Path,"predict_py.py")
341 | predict_cell <- function(path_score){
342 | Path <- fs::path_package("extdata",package = "TCfinder")
343 | reticulate::source_python(paste0(Path,"/predict_py.py"))
344 | predict <- predict_py(path_score,Path)
345 | predict_result <- as.data.frame(predict)
346 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
347 | V1 <= 0.5 ~ "tumor"))
348 | colnames(result) <- c("value","cell_type")
349 | return(result)
350 | }
351 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F)
352 | result1 <- predict_cell(data_test)
353 | devtools::check()
354 | devtools::check()
355 | devtools::document()
356 | rm(list = c("predict_cell"))
357 | devtools::load_all()
358 | devtools::document()
359 | roxygen2::roxygenise()
360 | devtools::document()
361 | devtools::check()
362 | library(data.table)
363 | a <- fread("./inst/extdata/GOSH_pathway_score.csv")
364 | a[1:5,1:5]
365 | pathway_score <- function(normalized_matrix){
366 | KEGG_Gene <- TCfinder::KEGG_Gene
367 | TCfinder_Pathway <- TCfinder::TCfinder_Pathway
368 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_Pathway$hsa)
369 | gene_id <- rownames(normalized_matrix)
370 | barcode <- colnames(normalized_matrix)
371 | normalized_matrix <- as.data.frame(t(normalized_matrix))
372 | colnames(normalized_matrix) <- gene_id
373 | myFun1 <- function(number){
374 | sum(number)/length(number)
375 | }
376 | all_pathway_score <- NA
377 | for (i in 1:213) {
378 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
379 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)])
380 | path_score <- as.data.frame(apply(data1, 1, myFun1))
381 | colnames(path_score) <- names(table(score_gene$hsa))[i]
382 | all_pathway_score <- cbind(all_pathway_score,path_score)
383 | }
384 | pathway_score <- all_pathway_score[,-1]
385 | pathway_score <- pathway_score %>% dplyr::select(TCfinder_Pathway$hsa)
386 | rownames(pathway_score) <- barcode
387 | return(pathway_score)
388 | }
389 | predict_cell <- function(path_score){
390 | barcode <- rownames(path_score)
391 | Path <- fs::path_package("extdata",package = "TCfinder")
392 | reticulate::source_python(paste0(Path,"/predict_py.py"))
393 | predict <- predict_py(path_score,Path)
394 | predict_result <- as.data.frame(predict)
395 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal",
396 | V1 <= 0.5 ~ "tumor"))
397 | colnames(result) <- c("value","cell_type")
398 | result$barcode <- barcode
399 | return(result)
400 | }
401 | library(TCfinder)
402 | library(TCfinder)
403 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | inst/doc
3 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: TCfinder
 2 | Title: Tumor cell identification in single-cell datasets
 3 | Version: 1.2.0.1
 4 | Authors@R: c(
 5 |     person("Chenxu", "Wu", , "wuchx@shanghaitech.edu.cn", role = c("aut", "cre"),
 6 |            comment = c(ORCID = "0009-0005-7257-4470")),
 7 |     person("Tao", "Wu", , "wutao2@shanghaitech.edu.cn", role = "aut",
 8 |            comment = c(ORCID = "0000-0002-8999-9628")),
 9 |     person("Xue-Song", "Liu", role = c("aut", "ctb"),
10 |            comment = c(ORCID = "0000-0002-7736-0077"))
11 |   )
12 | Description: Perform normalization and pathway score calculations on single-cell data, and distinguish tumor cells from normal cells in single-cell datasets.
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | Roxygen: list(markdown = TRUE)
16 | RoxygenNote: 7.2.0
17 | Suggests: 
18 |     knitr,
19 |     rmarkdown,
20 |     testthat (>= 3.0.0)
21 | Config/testthat/edition: 3
22 | VignetteBuilder: knitr
23 | Imports:
24 |     dplyr,
25 |     reticulate,
26 |     fs,
27 |     Matrix,
28 |     methods
29 | Depends: 
30 |     R (>= 3.5.0)
31 | LazyData: true
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2022
2 | COPYRIGHT HOLDER: chenxuwu
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2022 chenxuwu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(data_normalized)
 4 | export(pathway_score)
 5 | export(predict_cell)
 6 | importFrom(Matrix,Diagonal)
 7 | importFrom(Matrix,Matrix)
 8 | importFrom(Matrix,colSums)
 9 | importFrom(Matrix,t)
10 | importFrom(dplyr,"%>%")
11 | importFrom(dplyr,case_when)
12 | importFrom(dplyr,filter)
13 | importFrom(dplyr,mutate)
14 | importFrom(dplyr,select)
15 | importFrom(methods,is)
16 | importFrom(reticulate,source_python)
17 | 


--------------------------------------------------------------------------------
/R/data_normalized.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @title data normalized
 3 | #' @description Normalize single-cell raw counts matrix.
 4 | #' @details Input a sparse matrix, matrix, or data.frame where the rows are the gene names
 5 | #' and the columns are the sample names. With the default method every cell is scaled to
 6 | #' 10,000 total counts and transformed with log2(x + 1). With method = "smart-seq2" the
 7 | #' counts are converted to TPM-style values using a gene-length table (genes missing from
 8 | #' the table are dropped), rounded to three decimals, and transformed with log2(x + 1).
 9 | #' @param expr_data A single-cell counts expression matrix.
10 | #' @param method If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required.
11 | #' For other single-cell sequencing methods, this parameter does not need to be filled in.
12 | #' @param genome Reference genome, when method = "smart-seq2",
13 | #' this parameter needs to be filled in, you can choose hg19 and hg38
14 | #' @return A normalized single-cell expression matrix, stored as a sparse matrix.
15 | #' @export
16 | #' @importFrom Matrix Matrix Diagonal colSums
17 | #' @importFrom methods is
18 | data_normalized <- function(expr_data, method = "method", genome = "hg38") {
19 | 
20 |   # Validate the arguments before doing any work, so that a bad `method` or
21 |   # `genome` fails with a clear message instead of a late "object not found".
22 |   if (length(method) != 1L || !method %in% c("method", "smart-seq2")) {
23 |     stop("Method parameter error ")
24 |   }
25 |   if (method == "smart-seq2" &&
26 |       (length(genome) != 1L || !genome %in% c("hg19", "hg38"))) {
27 |     stop("Genome parameter error: genome must be \"hg19\" or \"hg38\"")
28 |   }
29 | 
30 |   if (!methods::is(expr_data, "CsparseMatrix")) {
31 |     expr_data <- Matrix::Matrix(as.matrix(expr_data), sparse = TRUE)
32 |   }
33 | 
34 |   if (method == "method") {
35 |     totals <- Matrix::colSums(expr_data)
36 |     # A cell with no counts stays all-zero whatever it is multiplied by;
37 |     # using 1 here only avoids building a 1/0 = Inf scaling factor.
38 |     totals[which(totals == 0)] <- 1
39 |     scaled <- expr_data %*% Matrix::Diagonal(x = 1 / totals) * 10000
40 |     # The diagonal factor carries no names, so restore the gene and cell names
41 |     # explicitly instead of relying on how the product propagates dimnames.
42 |     dimnames(scaled) <- dimnames(expr_data)
43 |     return(log1p(scaled) / log(2))
44 |   }
45 | 
46 |   # method == "smart-seq2" from here on.
47 |   # hg19 / hg38 are not defined in this file; presumably they are the
48 |   # gene-length tables shipped as internal package data (R/sysdata.rda).
49 |   gene_length <- if (genome == "hg19") hg19 else hg38
50 |   colnames(gene_length) <- c("gene_name", "Length")
51 | 
52 |   # Keep only the genes that have a known length and occur in the input.
53 |   use_gene_length <- gene_length[gene_length$gene_name %in% rownames(expr_data), ]
54 |   if (nrow(use_gene_length) == 0L) {
55 |     stop("None of the row names of expr_data were found in the ", genome,
56 |          " gene length table")
57 |   }
58 |   # as.character() guards against the name column being stored as a factor;
59 |   # drop = FALSE keeps a matrix even when a single gene matches.
60 |   gene_names <- as.character(use_gene_length$gene_name)
61 |   counts <- as.matrix(expr_data[gene_names, , drop = FALSE])
62 |   gene_len <- use_gene_length$Length
63 | 
64 |   # Per cell: length-normalize the counts, then scale to one million.
65 |   tpm <- apply(counts, 2, function(x) {
66 |     round((x * 1000 * 1000000) / (gene_len * sum(x * 1000 / gene_len)), 3)
67 |   })
68 |   # apply() returns a plain vector when only one gene is present; rebuild the
69 |   # gene-by-cell shape and names explicitly.
70 |   tpm <- matrix(tpm, nrow = nrow(counts), dimnames = dimnames(counts))
71 | 
72 |   tpm_sparse <- Matrix::Matrix(tpm, sparse = TRUE)
73 |   log1p(tpm_sparse) / log(2)
74 | }


--------------------------------------------------------------------------------
/R/pathway_score.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @title pathway score
 3 | #' @description Obtain a pathway score matrix for predicting tumor cells.
 4 | #' @details Input a sparse matrix, matrix, or data frame where the rows are the gene names and the columns are the sample names.
 5 | #' The output of data_normalized() can be used directly as input. The score of a pathway in a cell is the mean
 6 | #' normalized expression of the pathway's genes that are present in the input.
 7 | #' @param expr_data Single-cell expression matrix after normalization of the original counts matrix.
 8 | #' @param normalized If the matrix is not normalized, you need to set normalized = FALSE
 9 | #' @param method This parameter is required when normalized = FALSE. If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required.
10 | #' For other single-cell sequencing methods, this parameter does not need to be filled in.
11 | #' @param genome This parameter is required when normalized = FALSE. Reference genome, when method = "smart-seq2",
12 | #' this parameter needs to be filled in, you can choose hg19 and hg38
13 | #' @return A data.frame with one row per cell (row names are the column names of the input) and one
14 | #' column per pathway listed in TCfinder_Pathway (213 pathways in the bundled data).
15 | #' @export
16 | #' @importFrom dplyr %>% filter select
17 | #' @importFrom Matrix Matrix t
18 | #' @importFrom methods is
19 | pathway_score <- function(expr_data, normalized = TRUE, method = "method", genome = "hg38") {
20 | 
21 |   # NOTE: the dplyr imports declared above are kept on purpose; other files in
22 |   # the package use `%>%` without importing it themselves.
23 | 
24 |   if (length(normalized) != 1L || !normalized %in% c(TRUE, FALSE)) {
25 |     stop("The normalized parameter is required")
26 |   }
27 | 
28 |   if (!methods::is(expr_data, "CsparseMatrix")) {
29 |     expr_data <- Matrix::Matrix(as.matrix(expr_data), sparse = TRUE)
30 |   }
31 | 
32 |   if (normalized == FALSE) {
33 |     # Call the sibling function directly rather than through `TCfinder::`,
34 |     # so the code in this source tree is the code that runs.
35 |     normalized_matrix <- data_normalized(expr_data, method = method, genome = genome)
36 |   } else {
37 |     normalized_matrix <- expr_data
38 |   }
39 | 
40 |   gene_id <- rownames(normalized_matrix)
41 |   barcode <- colnames(normalized_matrix)
42 |   if (is.null(gene_id)) {
43 |     stop("expr_data must have gene names as row names")
44 |   }
45 | 
46 |   KEGG_Gene <- TCfinder::KEGG_Gene
47 |   TCfinder_Pathway <- TCfinder::TCfinder_Pathway
48 | 
49 |   # Pathways to score, in the column order of the returned table, and the
50 |   # gene set of every KEGG pathway keyed by pathway id.
51 |   pathway_id <- unique(as.character(TCfinder_Pathway$hsa))
52 |   pathway_genes <- split(as.character(KEGG_Gene$gene_id), as.character(KEGG_Gene$hsa))
53 | 
54 |   # Cells in rows, genes in columns: selecting a pathway's genes is then a
55 |   # column subset of the sparse matrix.
56 |   cell_by_gene <- Matrix::t(normalized_matrix)
57 | 
58 |   score_list <- lapply(pathway_id, function(id) {
59 |     # Select by position so that every matching column is used exactly once,
60 |     # and keep a matrix (drop = FALSE) even when a single gene matches.
61 |     in_pathway <- which(gene_id %in% pathway_genes[[id]])
62 |     unname(Matrix::rowMeans(cell_by_gene[, in_pathway, drop = FALSE]))
63 |   })
64 | 
65 |   score_matrix <- do.call(cbind, score_list)
66 |   colnames(score_matrix) <- pathway_id
67 | 
68 |   pathway_score <- as.data.frame(score_matrix)
69 |   rownames(pathway_score) <- barcode
70 |   pathway_score
71 | }


--------------------------------------------------------------------------------
/R/predict_cell.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @title Cell types prediction.
 3 | #' @description Classify tumor cells from normal cells.
 4 | #' @details Input the pathway score matrix calculated by the pathway_score function.
 5 | #' The Python helper script and the trained model are read from the package's extdata directory,
 6 | #' so a Python environment usable by reticulate must be configured before calling this function.
 7 | #' @param path_score The pathway score matrix calculated by the pathway_score function.
 8 | #' @return A data.frame containing the predicted value (`value`), the cell type (`cell_type`:
 9 | #' "normal" when value > 0.5, otherwise "tumor") and, when `path_score` has row names, the `barcode`.
10 | #' @export
11 | #' @importFrom reticulate source_python
12 | #' @importFrom dplyr mutate case_when
13 | predict_cell <- function(path_score) {
14 | 
15 |   barcode <- rownames(path_score)
16 | 
17 |   # Hand Python a plain string rather than an fs path object.
18 |   extdata_dir <- as.character(fs::path_package("extdata", package = "TCfinder"))
19 | 
20 |   # Load the Python helper into a private environment instead of relying on
21 |   # source_python() injecting `predict_py` into this function's frame.
22 |   py_env <- new.env()
23 |   reticulate::source_python(file.path(extdata_dir, "predict_py.py"), envir = py_env)
24 | 
25 |   prediction <- as.data.frame(py_env$predict_py(path_score, extdata_dir))
26 |   # Take the first column by position, whatever name the conversion gave it.
27 |   value <- prediction[[1]]
28 | 
29 |   result <- data.frame(
30 |     value = value,
31 |     cell_type = ifelse(value > 0.5, "normal", "tumor"),
32 |     stringsAsFactors = FALSE
33 |   )
34 |   result$barcode <- barcode
35 |   result
36 | }


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/R/sysdata.rda


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TCfinder
  2 | 
  3 | TCfinder is a tool that distinguishes tumor cells from normal cells in single-cell data by quantifying gene pathway expression. Because a pathway usually contains multiple genes, TCfinder is more widely applicable: it overcomes the data-sparsity problem that traditional single-cell methods face. The successful construction of TCfinder also suggests that gene pathway expression quantification can be applied to the annotation of other cell types in scRNA-seq data.
  4 | 
  5 | ## Workflow
  6 | 
  7 | ![Image text](inst/image/workflow.png)
  8 | 
  9 | ## Installation and use of TCfinder package.
 10 | 
 11 | TCfinder is an R package that can be installed from GitHub. It depends on the following:
 12 | 
 13 | ***R (>= 3.5.0);***
 14 | 
 15 | ***dplyr (>= 1.1.0);***
 16 | 
 17 | ***reticulate (>= 1.2.6);***
 18 | 
 19 | ***Matrix;***
 20 | 
 21 | ***fs;***
 22 | 
 23 | ### Install
 24 | 
 25 | ```R
 26 | devtools::install_github("XSLiuLab/TCfinder")
 27 | ```
 28 | 
 29 | TCfinder contains three functions, which respectively normalize the raw single-cell counts, compute pathway scores, and predict whether each cell is a tumor cell or a normal cell.
 30 | 
 31 | ### Data normalization
 32 | 
 33 | The input must be a sparse matrix or data.frame whose row names are gene names and whose column names are sample names.
 34 | 
 35 | If the single-cell sequencing method is smart-seq2, set method = "smart-seq2" and choose genome = "hg19" or "hg38". For other single-cell sequencing methods, these parameters do not need to be set.
 36 | 
 37 | ```R
 38 | library(TCfinder)
 39 | result1 <- data_normalized(expr_data = expr_data,method = "method",genome = "hg38")
 40 | ```
 41 | 
 42 | #### Example:
 43 | 
 44 | The row name is gene symbol, and the column name is barcode of the sample.
 45 | 
 46 | |         | AAACCTGCACATCCGG | ...  | AAACGGGGTTGAACTC | AAACGGGGTTGTCGCG |
 47 | | :-----: | :--------------: | ---- | :--------------: | :--------------: |
 48 | | FAM138A |        0         | ...  |        0         |        1         |
 49 | |  OR4F5  |        8         | ...  |        20        |        15        |
 50 | |   ...   |       ...        | ...  |       ...        |       ...        |
 51 | | FAM87B  |        1         | ...  |        0         |        1         |
 52 | 
 53 | ### Pathway score
 54 | 
 55 | The pathway score is calculated for the 213 built-in pathways according to the formula shown in the workflow.
 56 | 
 57 | The output of data_normalized() can be used directly as the input of pathway_score(). If the matrix has not been normalized, set "normalized = FALSE".
 58 | 
 59 | ```R
 60 | result2 <- pathway_score(expr_data = result1, normalized = T)
 61 | ```
 62 | 
 63 | #### result2: pathway score
 64 | 
 65 | |                  | hsa00010  | hsa00190  | ...  | hsa00270  |
 66 | | :--------------: | :-------: | :-------: | ---- | :-------: |
 67 | | AAACCTGCACATCCGG | 0.3401667 | 0.9679245 | ...  | 0.2091803 |
 68 | | AAACGGGGTTGAACTC | 0.5657879 | 1.6702925 | ...  | 0.4492787 |
 69 | |       ...        |    ...    |    ...    | ...  |    ...    |
 70 | | AAACGGGGTTGTCGCG | 0.3202879 | 1.4834434 | ...  | 0.4590984 |
 71 | 
 72 | ### Prediction of cell type (tumor cell or normal cell)
 73 | 
 74 | The prediction model is developed based on deep learning in python, so some python environments and module installations need to be configured before running the prediction.
 75 | 
 76 | #### Python environment and module installation
 77 | 
 78 | ```bash
 79 | # Create a new environment
 80 | conda create -n new_env python=3.8
 81 | # Activate the new environment
 82 | conda activate new_env
 83 | # Install required modules
 84 | conda install tensorflow==2.3.0
 85 | conda install pandas==1.0.5
 86 | conda install numpy==1.18.5
 87 | # View conda environment information
 88 | conda env list # Copy the address of the new conda environment, which will be used later
 89 | ```
 90 | 
 91 | #### Predict cell
 92 | 
 93 | The prediction step calls a Python script, so the R package 'reticulate' is required. The input data is the pathway score result obtained by running the pathway_score() function.
 94 | 
 95 | ```R
 96 | install.packages("reticulate")
 97 | library(reticulate)
 98 | # Use the use_python() function to specify the version, here we use the python just created and configured above
 99 | reticulate::use_python("XXX/XXX/XXX/anaconda3/envs/new_env/bin/python")
100 | # View specified environment information
101 | reticulate::py_config()
102 | # Predict
103 | predict_result <- predict_cell(path_score = result2)
104 | ```
105 | 
106 | #### predict_result
107 | 
108 | |      |    value     | cell_type |     barcode      |
109 | | :--: | :----------: | :-------: | :--------------: |
110 | |  1   |  0.9996183   |  normal   | AAACCTGCACATCCGG |
111 | |  2   |  0.9989167   |  normal   | AAACGGGGTTGAACTC |
112 | |  3   | 0.0001887589 |   tumor   | AAACGGGGTTGTCGCG |
113 | | ...  |     ...      |    ...    |       ...        |
114 | 
115 | ## Citation
116 | 
117 | Chenxu Wu, Wei Ning, Tao Wu, Jing Chen, Huizi Yao, Ziyu Tao, Xiangyu Zhao, Kaixuan Diao, Jinyu Wang, Weiliang Wang, Xinxing Li, Qianqian Song, Xue-Song Liu. 2024. TCfinder: Robust tumor cell discrimination in scRNA-seq based on gene pathway activity. iMetaOmics 1: e22. https://doi.org/10.1002/imo2.22
118 | 
119 | ## Contributors
120 | 
121 | TCfinder was developed by Chenxu Wu. Please contact Chenxu Wu: wuchx@shanghaitech.edu.cn for any questions or suggestions. Thank you for your use and feedback.
122 | 
123 | ------
124 | 
125 | **Cancer Biology Group @ShanghaiTech**
126 | 
127 | **Research group led by Xue-Song Liu in ShanghaiTech University**
128 | 


--------------------------------------------------------------------------------
/TCfinder.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | LineEndingConversion: Posix
18 | 
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/data/KEGG_Gene.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/data/KEGG_Gene.rda


--------------------------------------------------------------------------------
/data/TCfinder_Pathway.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/data/TCfinder_Pathway.rda


--------------------------------------------------------------------------------
/inst/analysis/GSE673_analysis.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | setwd("~/project/mcIdentify/data/")
 4 | remove(list = ls())
 5 | library(data.table)
 6 | library(dplyr)
 7 | 
 8 | GSE673_eff <- fread("./model_built/pathway_score_213/model_result/GSE673_inter_eff.csv")
 9 | 
10 | # Rows 1-4 of the metrics table: the cancer-type subsets listed in V1 below.
11 | tumor_type <- GSE673_eff[1:4, ]
12 | # Long format (V1, variable, value): one bar per metric and subset.
13 | tumor_type1 <- melt(tumor_type)
14 | V1 <- c("GSE673_ATC", "GSE673_IDC", "GSE673_TNBC", "GSE673_DCIS")
15 | tumor_type1$V1 <- factor(tumor_type1$V1, levels = V1)
16 | library(ggplot2)
17 | library(ggprism)
18 | ggplot(tumor_type1, aes(x = V1, y = value, fill = variable)) +
19 |   geom_bar(position = "dodge", stat = "identity") +
20 |   labs(x = "Cancer type", y = "") +
21 |   theme_prism() +
22 |   geom_hline(aes(yintercept = 0.95), linetype = 5, col = "black") + # dashed reference line at 0.95
23 |   scale_y_continuous(breaks = c(0, .25, 0.5, 0.75, 0.95, 1)) +
24 |   scale_x_discrete(breaks = V1, labels = c("ATC", "IDC", "TNBC", "DCIS")) +
25 |   scale_fill_manual(values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
26 |                     breaks = c("f1", "accuracy", "recall", "precisoon"), # NOTE(review): must equal the CSV column names - confirm "precisoon"
27 |                     labels = c("F1 score", "Accuracy", "Recall", "Precision"))
28 | 
29 | 
30 | 
31 | 
32 | ### gene number
33 | gene_number <- GSE673_eff[5:10, ] # rows 5-10: the gene-number bins listed in V1 below
34 | gene_number1 <- melt(gene_number) # long format: V1, variable, value
35 | V1 <- c("GSE673_500", "GSE673_500_1000", "GSE673_1000_1500", "GSE673_1500_2000", "GSE673_2000_2500", "GSE673_2500_00")
36 | gene_number1$V1 <- factor(gene_number1$V1, levels = V1)
37 | library(ggplot2)
38 | library(ggprism)
39 | ggplot(gene_number1, aes(x = V1, y = value, fill = variable)) +
40 |   geom_bar(position = "dodge", stat = "identity") +
41 |   labs(x = "Gene number", y = "") +
42 |   theme_prism() +
43 |   geom_hline(aes(yintercept = 0.95), linetype = 5, col = "black") + # dashed reference line at 0.95
44 |   scale_y_continuous(breaks = c(0, .25, 0.5, 0.75, 0.95, 1)) +
45 |   scale_x_discrete(breaks = V1, labels = c("<500", "500~1000", "1000~1500", "1500~2000", "2000~2500", ">2500")) +
46 |   scale_fill_manual(values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
47 |                     breaks = c("f1", "accuracy", "recall", "precisoon"), # NOTE(review): must equal the CSV column names - confirm "precisoon"
48 |                     labels = c("F1 score", "Accuracy", "Recall", "Precision"))
49 | 
50 | 
51 | 
52 | # simulation gene
53 | gene_number <- GSE673_eff[11:15, ] # rows 11-15: the simulated gene-number sets listed in V1 below
54 | gene_number1 <- melt(gene_number) # long format: V1, variable, value
55 | V1 <- c("simulation_500", "simulation_1000", "simulation_1500", "simulation_2000", "simulation_2500")
56 | gene_number1$V1 <- factor(gene_number1$V1, levels = V1)
57 | library(ggplot2)
58 | library(ggprism)
59 | ggplot(gene_number1, aes(x = V1, y = value, fill = variable)) +
60 |   geom_bar(position = "dodge", stat = "identity") +
61 |   labs(x = "Simulate gene number", y = "") +
62 |   theme_prism() +
63 |   geom_hline(aes(yintercept = 0.95), linetype = 5, col = "black") + # dashed reference line at 0.95
64 |   scale_y_continuous(breaks = c(0, .25, 0.5, 0.75, 0.95, 1)) +
65 |   scale_x_discrete(breaks = V1, labels = c("500", "1000", "1500", "2000", "2500")) +
66 |   scale_fill_manual(values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
67 |                     breaks = c("f1", "accuracy", "recall", "precisoon"), # NOTE(review): must equal the CSV column names - confirm "precisoon"
68 |                     labels = c("F1 score", "Accuracy", "Recall", "Precision"))
69 | 
70 | 
71 | 
72 | 
73 | # setwd("~/project/mcIdentify/data/")
74 | # remove(list = ls())
75 | # 
76 | # library(data.table)
77 | # library(dplyr)
78 | # GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
79 | # data1 <- fread("./model_built/pathway_score_213/tumor_type_data/GSE673_tumor_type_data.csv")
80 | # 
81 | # data2 <- data1 %>% mutate(cancer_type = case_when(cell_type == "ATC1." ~ "ATC",
82 | #                                                   cell_type == "ATC2." ~ "ATC",
83 | #                                                   cell_type == "ATC3." ~ "ATC",
84 | #                                                   cell_type == "ATC4." ~ "ATC",
85 | #                                                   cell_type == "ATC5." ~ "ATC",
86 | #                                                   cell_type == "DCIS1" ~ "DCIS",
87 | #                                                   cell_type == "IDC1." ~ "IDC",
88 | #                                                   cell_type == "IDC2." ~ "IDC",
89 | #                                                   cell_type == "TNBC1" ~ "TNBC",
90 | #                                                   cell_type == "TNBC2" ~ "TNBC",
91 | #                                                   cell_type == "TNBC3" ~ "TNBC",
92 | #                                                   cell_type == "TNBC4" ~ "TNBC",
93 | #                                                   cell_type == "TNBC5" ~ "TNBC"))
94 | # 
95 | # data3 <- data2 %>% select(type,cancer_type,GSE673_diff_path213$hsa)
96 | # single_cancer <- data3 %>% filter(cancer_type == "TNBC") %>% select(type,GSE673_diff_path213$hsa)
97 | # fwrite(single_cancer,"./model_built/pathway_score_213/tumor_type_data/GSE673_TNBC.csv")
98 | 
99 | 


--------------------------------------------------------------------------------
/inst/analysis/MLcode.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ### RF
  3 | library(dplyr)
  4 | library(data.table)
  5 | library(randomForest)
  6 | library(caTools) # sample.split(); was used below before being attached
  7 | library(caret) # trainControl(), train(), resamples(), confusionMatrix()
  8 | remove(list = ls())
  9 | setwd("~/project/mcIdentify/Revise_1/")
 10 | data <- fread("./new_data/GSE131928_new_data_score.csv", data.table = FALSE)
 11 | set.seed(123) # fixed seed so the split is reproducible
 12 | split <- sample.split(data$type, SplitRatio = 0.8) # logical mask, TRUE = training row
 13 | train_data <- subset(data, split == TRUE)
 14 | test_data <- subset(data, split == FALSE)
 15 | X_train <- train_data[, -1] # every column except the first is a predictor
 16 | y_train <- as.factor(train_data[, 1]) # first column is used as the class label
 17 | 
 18 | ctrl <- trainControl(method = "cv", number = 5)
 19 | grid <- expand.grid(mtry = c(2, 4, 6)) 
 20 | rf_model <- train(x = X_train, y = y_train,
 21 |                   method = "rf",
 22 |                   trControl = ctrl,
 23 |                   tuneGrid = grid)
 24 | 
 25 | print(rf_model)
 26 | 
 27 | grid <- expand.grid(mtry = c(6)) 
 28 | modellist <- list()
 29 | for (ntree in c(100,200, 300)) {
 30 |   set.seed(123)
 31 |   fit <- train(x = X_train, y = y_train, method="rf", 
 32 |                metric="Accuracy", tuneGrid=grid, 
 33 |                trControl=ctrl, ntree=ntree)
 34 |   key <- toString(ntree)
 35 |   modellist[[key]] <- fit
 36 | }
 37 | results <- resamples(modellist)
 38 | summary(results)
 39 | 
 40 | model <- randomForest(x = X_train, y = y_train,mtry = 6,ntree = 200)
 41 | print(model)
 42 | 
 43 | x_test_data <- test_data[, -1]
 44 | y_test_data <- as.factor(test_data[, 1])
 45 | test_predictions <- predict(model, newdata = x_test_data)
 46 | 
 47 | confusion_matrix <- confusionMatrix(test_predictions, y_test_data)
 48 | accuracy <- confusion_matrix$overall["Accuracy"]
 49 | precision <- confusion_matrix$byClass["Pos Pred Value"]
 50 | recall <- confusion_matrix$byClass["Sensitivity"]
 51 | f1_score <- confusion_matrix$byClass["F1"]
 52 | 
 53 | print(confusion_matrix)
 54 | print(paste("Accuracy:", accuracy))
 55 | print(paste("Precision:", precision))
 56 | print(paste("Recall:", recall))
 57 | print(paste("F1 Score:", f1_score))
 58 | 
 59 | 
 60 | 
 61 | 
 62 | 
 63 | 
 64 | ## SVM
 65 | library(e1071)
 66 | library(ggplot2)
 67 | library(caret)
 68 | remove(list = ls())
 69 | setwd("~/project/mcIdentify/Revise_1/")
 70 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F) 
 71 | 
 72 | set.seed(123)  
 73 | split <- sample.split(data$type, SplitRatio = 0.8)  
 74 | train_data <- subset(data, split == TRUE)  
 75 | test_data <- subset(data, split == FALSE) 
 76 | X_train <- train_data[, -1]
 77 | y_train <- as.factor(train_data[, 1])
 78 | 
 79 | 
 80 | 
 81 | param_grid <- expand.grid(
 82 |   sigma = c(0.1, 1, 10),
 83 |   C = c(0.1, 1, 10))
 84 | ctrl <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
 85 | 
 86 | tuned_model <- train(
 87 |   x = X_train,
 88 |   y = y_train,
 89 |   method = "svmRadial",
 90 |   tuneGrid = param_grid,
 91 |   trControl = ctrl)
 92 | print(tuned_model)
 93 | # e1071::svm() has no `sigma` / `C` arguments (they were swallowed by `...` and
 94 | # ignored); its RBF kernel width is `gamma` and its penalty is `cost`.
 95 | svm_model <- svm(x = X_train, y = y_train, gamma = 0.1, cost = 10)
 96 | train_predictions <- predict(svm_model, newdata = X_train)
 97 | table(y_train, train_predictions)
 98 | 
 99 | X_new_data <- test_data[, -1]
100 | y_new_data <- as.factor(test_data[, 1])
101 | test_predictions <- predict(svm_model, newdata = X_new_data)
102 | 
103 | table(test_predictions,y_new_data)
104 | accuracy <- mean(test_predictions == y_new_data)
105 | precision <- sum(test_predictions == "normal" & y_new_data == "normal") / sum(test_predictions == "normal")
106 | recall <- sum(test_predictions == "normal" & y_new_data == "normal") / sum(y_new_data == "normal")
107 | f1_score <- 2 * precision * recall / (precision + recall)
108 | print(paste("accuracy:", accuracy))
109 | print(paste("precision:", precision))
110 | print(paste("recall:", recall))
111 | print(paste("F1 score:", f1_score))
112 | 
113 | 
114 | 
115 | 
116 | ### xgboost
117 | library(xgboost)
118 | library(Matrix)
119 | remove(list = ls())
120 | setwd("~/project/mcIdentify/Revise_1/")
121 | new_data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F)
122 | new_data1 <- new_data %>% mutate(type = ifelse(type == "normal",0,1))
123 | 
124 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F)  
125 | data <- data %>% mutate(type = ifelse(type == "normal",0,1))
126 | set.seed(123) 
127 | split <- sample.split(data$type, SplitRatio = 0.8) 
128 | train_data <- subset(data, split == TRUE) 
129 | test_data <- subset(data, split == FALSE)
130 | 
131 | X_train <- train_data[, -1]
132 | y_train <- train_data[, 1]
133 | 
134 | ctrl <- trainControl(
135 |   method = "cv", 
136 |   number = 5,    
137 |   verboseIter = FALSE)
138 | 
139 | param_grid <- expand.grid(
140 |   nrounds = c(100, 200), 
141 |   max_depth = c(3, 6), 
142 |   eta = c(0.1), 
143 |   gamma = c(0, 0.1),
144 |   colsample_bytree = c(0.8),
145 |   min_child_weight = c(1, 3),
146 |   subsample = c(0.8))
147 | 
148 | xgb_model <- train(
149 |   x = X_train,
150 |   y = y_train,
151 |   method = "xgbTree",
152 |   trControl = ctrl,
153 |   tuneGrid = param_grid)
154 | print(xgb_model$bestTune)
155 | 
156 | 
157 | dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
158 | params <- list(objective = "binary:logistic", eval_metric = "logloss", eta = 0.1, max_depth = 3)
159 | nrounds <- 100
160 | xgb_model <- xgboost(params = params, data = dtrain, nrounds = nrounds)
161 | 
162 | train_predictions <- predict(xgb_model, newdata = dtrain)
163 | train_predictions <- ifelse(train_predictions > 0.5,1,0)
164 | 
165 | confusion_matrix <- table(train_predictions, y_train) # rows = predicted, columns = actual
166 | accuracy <- mean(train_predictions == y_train)
167 | # Positive class is 1 (any type other than "normal"), i.e. index 2 of each dimension.
168 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]) # TP / predicted positives
169 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) # TP / actual positives
170 | f1_score <- 2 * precision * recall / (precision + recall)
171 | 
172 | print(paste("accuracy:", accuracy))
173 | print(paste("precision:", precision))
174 | print(paste("recall:", recall))
175 | print(paste("F1 score:", f1_score))
176 | # NOTE(review): new_data1 is the full input file, so these "test" metrics include the training rows - confirm intent.
177 | X_new_data1 <- new_data1[, -1]
178 | y_new_data1 <- as.factor(new_data1[, 1])
179 | dtest <- xgb.DMatrix(data = as.matrix(X_new_data1))
180 | test_predictions <- predict(xgb_model, newdata = dtest)
181 | test_predictions <- ifelse(test_predictions > 0.5, 1, 0)
182 | confusion_matrix <- table(test_predictions, y_new_data1) # rows = predicted, columns = actual
183 | accuracy <- mean(test_predictions == y_new_data1)
184 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]) # TP / predicted positives
185 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) # TP / actual positives
186 | f1_score <- 2 * precision * recall / (precision + recall)
187 | 
188 | print(paste("accuracy:", accuracy))
189 | print(paste("precision:", precision))
190 | print(paste("recall:", recall))
191 | print(paste("F1 score:", f1_score))
192 | 
193 | 
194 | 
195 | ###LR
196 | library(ggplot2)
197 | library(dplyr)
198 | library(caTools)
199 | library(pROC)
200 | library(caret)
201 | remove(list = ls())
202 | setwd("~/project/mcIdentify/Revise_1/")
203 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F) 
204 | data <- data %>% mutate(type = ifelse(type=="normal",1,0))
205 | set.seed(123) 
206 | split <- sample.split(data$type, SplitRatio = 0.7) 
207 | train_data <- subset(data, split == TRUE)  
208 | test_data <- subset(data, split == FALSE)  
209 | # `type` is a 0/1 label, so fit a logistic model (binomial family); the gaussian
210 | # family fitted ordinary least squares instead. Predictions are probabilities.
211 | model <- glm(type ~ ., data = train_data, family = binomial)
212 | summary(model)
213 | predictions <- predict(model, newdata = test_data, type = "response")
214 | threshold <- 0.5
215 | predicted_classes <- ifelse(predictions >= threshold, 1, 0)
216 | 
217 | confusion_matrix <- table(test_data$type, predicted_classes)
218 | accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
219 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
220 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
221 | f1_score <- 2 * precision * recall / (precision + recall)
222 | 
223 | cat("Accuracy: ", accuracy, "\n")
224 | cat("Precision: ", precision, "\n")
225 | cat("Recall: ", recall, "\n")
226 | cat("F1 Score: ", f1_score, "\n")
227 | 
228 | 
229 | 


--------------------------------------------------------------------------------
/inst/analysis/bluk_Anti.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | remove(list = ls())
  4 | setwd("~/project/mcIdentify/data/")
  5 | 
  6 | tcga_counts <- readRDS("~/project/common_data/tcga/tcga_clean_counts.rds")
  7 | 
  8 | library(NeoEnrichment)
  9 | library(dplyr)
 10 | library(data.table)
 11 | # One row per TCGA sample: its barcode plus the cancer type looked up from it.
 12 | infor <- data.frame(barcode = colnames(tcga_counts))
 13 | infor$type <- get_cancer_type(infor$barcode)
 14 | 
 15 | # Barcodes ending in 01 are labelled "tumor", those ending in 11 "normal"; any
 16 | # other suffix gets NA. (The old pattern "*01$" began with a dangling `*`.)
 17 | BRCA_infor <- infor %>%
 18 |   filter(type == "BRCA") %>%
 19 |   dplyr::mutate(tissue = case_when(grepl("01$", barcode) ~ "tumor",
 20 |                                    grepl("11$", barcode) ~ "normal"))
 21 | tumor_infor <- BRCA_infor %>% filter(tissue == "tumor")
 22 | normal_infor <- BRCA_infor %>% filter(tissue == "normal")
 23 | 
 24 | tumor_sample <- tcga_counts %>% select(tumor_infor$barcode) %>% as.data.frame()
 25 | rownames(tumor_sample) <- rownames(tcga_counts)
 26 | normal_sample <- tcga_counts %>% select(normal_infor$barcode) %>% as.data.frame()
 27 | rownames(normal_sample) <- rownames(tcga_counts)
 28 | 
 29 | # Differential expression between two groups of samples with DESeq2.
 30 | #   ecDNA_sample, NonecDNA_sample: count tables (genes in rows, samples in
 31 | #     columns) whose rows are the same genes in the same order.
 32 | # Returns a tibble of the "ecDNA" vs "NonecDNA" results ordered by padj, with a
 33 | #   gene_id column; rows containing NA are dropped.
 34 | DESeq2_DEG <- function(ecDNA_sample, NonecDNA_sample) {
 35 |   library("DESeq2")
 36 |   # Counts are rounded to whole numbers after joining the two groups.
 37 |   DEG_data <- round(as.data.frame(cbind(ecDNA_sample, NonecDNA_sample)), digits = 0)
 38 |   # Gene names come from the input, not from the global `tcga_counts`.
 39 |   rownames(DEG_data) <- rownames(ecDNA_sample)
 40 |   # One group label per sample column; ncol() is also right for matrix input.
 41 |   group <- factor(rep(c("ecDNA", "NonecDNA"), c(ncol(ecDNA_sample), ncol(NonecDNA_sample))))
 42 |   colGroup <- data.frame(row.names = colnames(DEG_data), group_list = group)
 43 |   dds <- DESeqDataSetFromMatrix(countData = DEG_data, colData = colGroup, design = ~ group_list)
 44 |   dds <- dds[rowSums(counts(dds)) > 10, ] # keep genes with more than 10 reads in total
 45 |   dds2 <- DESeq(dds)
 46 |   res <- results(dds2, contrast = c("group_list", "ecDNA", "NonecDNA"))
 47 |   resOrdered <- res[order(res$padj), ]
 48 |   resOrdered$gene_id <- rownames(resOrdered)
 49 |   na.omit(as_tibble(resOrdered))
 50 | }
 51 | # Row-wise z-score of as.matrix(data1): (row - mean) / sd; a constant row gives NaN.
 52 | Z_score <- function(data1) {
 53 |   data1 <- as.matrix(data1) # convert once, not on every iteration
 54 |   for (i in seq_len(nrow(data1))) { # nrow(): also right for unnamed or 0-row input
 55 |     data1[i, ] <- (data1[i, ] - mean(data1[i, ])) / sd(data1[i, ])
 56 |   }
 57 |   data1
 58 | }
 59 | DE_Cluster <- DESeq2_DEG(tumor_sample,normal_sample)
 60 | 
 61 | fwrite(DE_Cluster,"./model_built/pathway_score_213/time_analysis/bluk_brca_DEG.txt")
 62 | 
 63 | 
 64 | ###
 65 | genelist_four <- KEGG_pathway_gene %>% filter(hsa %in% c("hsa00190","hsa04612","hsa04940","hsa05416"))
 66 | genelist_four1 <- genelist_four[,-1]
 67 | ###
 68 | 
 69 | ##GSEA
 70 | library(GSEABase)
 71 | library(clusterProfiler)
 72 | HallmarkGeneSet <- read.gmt("./model_built/pathway_score_213/time_analysis/single_gene_list.gmt") 
 73 | 
 74 | Gsea_DEG <- DE_Cluster %>% 
 75 |   dplyr::mutate(state = (-log10(padj)) * sign(log2FoldChange)) %>% dplyr::arrange(-state) %>%
 76 |   dplyr::filter(padj != 0)
 77 | 
 78 | geneList <- Gsea_DEG$state 
 79 | names(geneList) <- Gsea_DEG$gene_id 
 80 | geneList <- sort(geneList, decreasing = T) 
 81 | GSEA_result <- GSEA(geneList, TERM2GENE = HallmarkGeneSet, pvalueCutoff = 1,eps = 0) 
 82 | 
 83 | library(enrichplot)
 84 | gseaplot2(GSEA_result, GSEA_result@result$Description[1:4], title = "", color = "red", base_size = 12,
 85 |           rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = T,
 86 |           ES_geom = "line")
 87 | 
 88 | 
 89 | 
 90 | ##GSVA
 91 | library(GSVA)
 92 | library(GSEABase)
 93 | HallmarkGeneSet <- getGmt("./model_built/pathway_score_213/time_analysis/single_gene_list.gmt") 
 94 | gsva_result <- gsva(as.matrix(GSE530_sample), HallmarkGeneSet,
 95 |                     min.sz=1, max.sz=1000, verbose=TRUE,kcdf="Poisson",parallel.sz=5L)
 96 | # NOTE(review): GSE530_sample is only assigned further down, in the "gene expression"
 97 | # section, so this call relies on that section having been run first - confirm the order.
 98 | gsva_result <- as.data.frame(t(gsva_result))
 99 | # The transposed scores are bound to BRCA_infor by row position (cbind does no key matching).
100 | gsva_data1 <- cbind(BRCA_infor,gsva_result)
101 | 
102 | 
103 | library(ggpubr)
104 | library(ggprism)
105 | library(ggplot2)
106 | library(cowplot)
107 | 
108 | gsva_data2 <- melt(gsva_data1)
109 | gsva_data3 <- na.omit(gsva_data2)
110 | gsva_data3$tissue <- factor(gsva_data3$tissue,levels = c("tumor","normal"))
111 | 
112 | 
113 | ggplot(data=gsva_data3,aes(x=variable,y=value,fill=factor(tissue)))+
114 |   geom_boxplot()+
115 |   stat_compare_means(aes(label = ..p.signif..))+
116 |   theme_prism()+
117 |   labs(y="GSVA Score",title = "BRCA")+
118 |   theme(axis.title.x = element_blank())
119 | 
120 | 
121 | 
122 | 
123 | ## gene expression
124 | 
125 | tcga_tpm <- readRDS("~/project/common_data/tcga/tpm_clean_data.rds")
126 | library(NeoEnrichment)
127 | library(dplyr)
128 | library(data.table)
129 | # One row per TCGA sample in the TPM table: barcode plus looked-up cancer type.
130 | infor <- data.frame(barcode = colnames(tcga_tpm))
131 | infor$type <- get_cancer_type(infor$barcode)
132 | 
133 | # Barcode suffix 01 -> "tumor", 11 -> "normal", anything else NA.
134 | # (The old pattern "*01$" began with a dangling `*` quantifier.)
135 | BRCA_infor <- infor %>%
136 |   filter(type == "BRCA") %>%
137 |   dplyr::mutate(tissue = case_when(grepl("01$", barcode) ~ "tumor", grepl("11$", barcode) ~ "normal"))
138 | GSE530_sample <- tcga_tpm %>% select(BRCA_infor$barcode) %>% as.data.frame()
139 | rownames(GSE530_sample) <- rownames(tcga_tpm)
139 | 
140 | data1 <- GSE530_sample[as.character(gene2$Var1),]
141 | data1 <- na.omit(data1)
142 | data2 <- as.data.frame(t(data1))
143 | data3 <- cbind(BRCA_infor, data2)
144 | 
145 | data4 <- melt(data3)
146 | 
147 | data4$variable <- factor(data4$variable,levels = c("HLA-A","HLA-B","HLA-C","HLA-E","HLA-F","HLA-G","HLA-DRA",
148 |                                                    "HLA-DRB1","HLA-DRB5","HLA-DQA1","HLA-DQA2",
149 |                                                    "HLA-DQB1","HLA-DOB","HLA-DMA","HLA-DMB","HLA-DOA","HLA-DPA1","HLA-DPB1"))
150 | 
151 | 
152 | library(ggpubr)
153 | library(ggprism)
154 | library(ggplot2)
155 | library(cowplot)
156 | 
157 | data4$tissue <- factor(data4$tissue,levels = c("tumor","normal"))
158 | data4 <- na.omit(data4)
159 | 
160 | ggplot(data=data4,aes(x=variable,y=value,fill=factor(tissue)))+
161 |   geom_boxplot()+
162 |   stat_compare_means(aes(label = ..p.signif..))+
163 |   theme_prism()+
164 |   theme(axis.text.x = element_text(angle = 30,vjust = 1, hjust = 1) )+
165 |   labs(y="Gene expression",title = "BRCA")+
166 |   theme(axis.title.x = element_blank())
167 | 
168 | 
169 | 
170 | fwrite(gene_list,"./model_built/pathway_score_213/time_analysis/genelist.csv")
171 | 


--------------------------------------------------------------------------------
/inst/analysis/bulk_pathway.R:
--------------------------------------------------------------------------------
  1 | 
  2 | setwd("~/project/mcIdentify/data/")
  3 | remove(list = ls())
  4 | 
  5 | library(data.table)
  6 | library(dplyr)
  7 | 
  8 | 
  9 | tcga_counts <- readRDS("~/project/common_data/tcga/tcga_clean_counts.rds")
 10 | 
 11 | # Scale every column of the count table to a total of 10,000.
 12 | data1 <- tcga_counts %>% apply(2, function(x) { x / sum(x) * 10000 })
 13 | data1 <- as.data.frame(data1)
 14 | # log2(x + 1) over the whole table at once (replaces the deprecated funs()).
 15 | data2 <- round(log2(data1 + 1), 3)
 16 | 
 17 | data3 <- as.data.frame(t(data2)) # transposed: genes become the columns
 18 | colnames(data3) <- rownames(tcga_counts)
 19 | 
 20 | 
 21 | # pathway socre
 22 | 
 23 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
 24 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 25 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
 26 | 
 27 | 
 28 | # Pathway ids actually present, in the sorted order produced by table().
 29 | # Iterating over them instead of a fixed 1:213 keeps the loop correct when
 30 | # the selection holds a different number of pathways.
 31 | pathway_ids <- names(table(score_gene$hsa))
 32 | 
 33 | # A pathway's score for a sample is the mean of the values in data3 over the
 34 | # pathway's genes that exist as columns of data3.
 35 | score_list <- lapply(pathway_ids, function(id) {
 36 |   genes <- score_gene$gene_id[which(score_gene$hsa == id)]
 37 |   genes <- intersect(genes, colnames(data3))
 38 |   rowMeans(data3[, genes, drop = FALSE])
 39 | })
 40 | names(score_list) <- pathway_ids
 41 | 
 42 | # Bind the per-pathway vectors column-wise: one row per row of data3, one
 43 | # column per pathway (replaces growing a data frame from an NA placeholder).
 44 | pathway_score <- as.data.frame(do.call(cbind, score_list))
 45 | 
 46 | # row.names = TRUE also writes the row names of pathway_score to the file.
 47 | fwrite(pathway_score,"./model_built/pathway_score_213/bulk_pathway_score/bulk_pathway_score.txt",row.names = TRUE)
 52 | 
 53 | 
 54 | 
 55 | 
 56 | 
 57 | 
 58 | ###figure
 59 | setwd("~/project/mcIdentify/data/")
 60 | remove(list = ls())
 61 | 
 62 | library(data.table)
 63 | library(dplyr)
 64 | tcga_score <- fread("./model_built/pathway_score_213/bulk_pathway_score/bulk_pathway_score.txt")
 65 | 
 66 | data1 <- tcga_score %>% select(V1,hsa00190,hsa04612,hsa04940,hsa05416,hsa04110)
 67 | colnames(data1)[1] <- "tcga_id"
 68 | 
 69 | library(NeoEnrichment)
 70 | data1$cancer_type <- get_cancer_type(data1$tcga_id)
 71 | 
 72 | # Suffix 01 -> "tumor", 11 -> "normal"; other suffixes give NA, and na.omit() drops every row holding an NA.
 73 | data2 <- data1 %>%
 74 |   dplyr::mutate(tissue = case_when(grepl("01$", tcga_id) ~ "tumor",
 75 |                                    grepl("11$", tcga_id) ~ "normal")) %>% na.omit()
 76 | 
 77 | normal_data <- data2 %>% filter(tissue == "normal")
 78 | normal_name <- as.data.frame(sort(table(normal_data$cancer_type)))
 79 | # `!=` against a length-2 vector recycles element-wise; %in% excludes both cancer types.
 80 | normal_name <- normal_name %>% filter(Freq > 20) %>% arrange(-Freq) %>% filter(!Var1 %in% c("PRAD", "KICH"))
 81 | rownames(normal_name) <- normal_name$Var1
 82 | data3 <- data2 %>% filter(cancer_type %in% normal_name$Var1)
 83 | 
 84 | data4 <- melt(data3)
 85 | 
 86 | pathway1 <- data4 %>% filter(variable == "hsa05416")
 87 | 
 88 | all_sample <- pathway1
 89 | all_sample$cancer_type <- "Pan-cancer"
 90 | pathway2 <- rbind(pathway1,all_sample)
 91 | pathway2$tissue <- factor(pathway2$tissue,levels = c("tumor","normal"))
 92 | pathway2$cancer_type <- factor(pathway2$cancer_type,levels = c("Pan-cancer",rownames(normal_name)))
 93 | 
 94 | 
 95 | 
 96 | library(ggpubr)
 97 | library(ggprism)
 98 | library(ggplot2)
 99 | library(cowplot)
100 | ggplot(data=pathway2,aes(x=cancer_type,y=value,fill=factor(tissue)))+
101 |   geom_boxplot()+
102 |   stat_compare_means(aes(label = ..p.signif..))+
103 |   #ylim(0.3,1.6)+
104 |   theme_prism()+
105 |   labs(y="Pathway score",title = "hsa05416")+
106 |   theme(axis.title.x = element_blank())+
107 |   theme(axis.text.x = element_text(angle = 15))
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | path_score2 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
115 | infor1 <- fread("./model_built/datasets/GSE148673_anno.txt")
116 | 
117 | data1 <- cbind(infor1,path_score2)
118 | 
119 | data2 <- data1 %>% select(type,cell_type,hsa00190,hsa04612,hsa04940,hsa05416,hsa04110)
120 | 
121 | # Map each per-sample label to its cancer type; labels not listed become NA.
122 | # The per-sample label column is dropped afterwards.
123 | data2 <- data2 %>%
124 |   mutate(cancer_type = case_when(
125 |     cell_type %in% c("ATC1.", "ATC2.", "ATC3.", "ATC4.", "ATC5.") ~ "ATC",
126 |     cell_type %in% c("DCIS1") ~ "DCIS",
127 |     cell_type %in% c("IDC1.", "IDC2.") ~ "IDC",
128 |     cell_type %in% c("TNBC1", "TNBC2", "TNBC3", "TNBC4", "TNBC5") ~ "TNBC"
129 |   )) %>%
130 |   select(-cell_type)
135 | 
136 | 
137 | data3 <- melt(data2)
138 | 
139 | data4 <- data3 %>% filter(variable == "hsa04110")
140 | 
141 | library(ggpubr)
142 | library(ggprism)
143 | library(ggplot2)
144 | library(cowplot)
145 | ggplot(data=data4,aes(x=cancer_type,y=value,fill=factor(type)))+
146 |   geom_boxplot()+
147 |   stat_compare_means(aes(label = ..p.signif..))+
148 |   #ylim(0.3,1.6)+
149 |   theme_prism()+
150 |   labs(y="Pathway score",title = "hsa04110")+
151 |   theme(axis.title.x = element_blank())+
152 |   theme(axis.text.x = element_text(angle = 15))
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/inst/analysis/confusion_matrix.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | setwd("~/project/mcIdentify/data/")
 4 | remove(list = ls())
 5 | 
 6 | library(data.table)
 7 | library(dplyr)
 8 | library(ggplot2)
 9 | library(ggprism) # theme_prism(); was called below without being attached
10 | data1 <- fread("./model_built/pathway_score_213/model_result/confusion/confusion_GOSH.csv")
11 | data2 <- as.data.frame(data1[, -1])
12 | rownames(data2) <- data1$V1
13 | 
14 | # Turn counts into row-wise proportions (each true class sums to ~1), 2 decimals.
15 | data3 <- round(data2 / rowSums(data2), 2)
16 | data3$real <- rownames(data3)
17 | a <- melt(data3)
18 | a$real <- factor(a$real, levels = c("normal", "malignant"))
19 | a$variable <- factor(a$variable, levels = c("malignant", "normal"))
20 | 
21 | ggplot(a, aes(real, variable, fill = value)) +
22 |   geom_tile() +
23 |   geom_text(aes(label = scales::percent(value))) +
24 |   scale_fill_gradient(low = "#F0F0F0", high = "#3575b5") +
25 |   labs(x = "True", y = "Guess", title = "GOSH") +
26 |   theme_prism(border = TRUE) +
27 |   theme(panel.border = element_blank(),
28 |         axis.ticks.y = element_blank(),
29 |         axis.ticks.x = element_blank(),
30 |         legend.position = "none")
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/inst/analysis/figure1.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # Figure 1: boxplots of two KEGG pathway scores, split by cell type.
  3 | setwd("~/project/mcIdentify/data/")
  4 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
  5 | library(data.table)
  6 | library(dplyr)
  7 | library(ggplot2)
  8 | library(ggpubr)
  9 | library(ggprism)
 10 | library(cowplot) # plot_grid() below is a cowplot function
 11 | 
 12 | pathdata <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
 13 | path_score1 <- pathdata
 14 | 
 15 | # hsa04514 score per cell type, annotated with a Wilcoxon test.
 16 | p1 <- ggplot(data = path_score1, aes(x = type, y = hsa04514, fill = factor(type))) +
 17 |   geom_boxplot(size = 1) +
 18 |   theme_prism() +
 19 |   stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
 20 |   theme(legend.position = "none", axis.title.x = element_blank()) +
 21 |   labs(y = "Pathway Score", title = "Cell adhesion molecules", x = "")
 22 | p1
 23 | 
 24 | # hsa04110 score per cell type, same layout.
 25 | p2 <- ggplot(data = path_score1, aes(x = type, y = hsa04110, fill = factor(type))) +
 26 |   geom_boxplot(size = 1) +
 27 |   theme_prism() +
 28 |   stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
 29 |   theme(legend.position = "none", axis.title.x = element_blank()) +
 30 |   labs(y = "Pathway Score", title = "Cell cycle", x = "")
 31 | p2
 32 | 
 33 | prow <- plot_grid(p1, p2, align = "vh", labels = NULL, hjust = -1, nrow = 1)
 34 | prow
 35 | 
 36 | 
 37 | ## Gene distribution
 38 | # Number of expressed genes per cell (Freq), compared between cell types.
 39 | 
 40 | data1 <- fread("~/project/mcIdentify/data/model_built/GSE673_gene_distribution.csv")
 41 | # Fix the level order first so both plots draw "normal" before "malignant".
 42 | data1$type <- factor(data1$type, levels = c("normal", "malignant"))
 43 | library(ggpubr)
 44 | library(ggprism)
 45 | library(ggplot2)
 46 | ggplot(data = data1, aes(x = type, y = Freq)) +
 47 |   geom_boxplot(size = 1) +
 48 |   stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
 49 |   theme_prism() +
 50 |   labs(y = "Number of gene with expression") +
 51 |   # Named labels are matched to levels by name, so they cannot be swapped.
 52 |   scale_x_discrete(labels = c(normal = "Normal Cell", malignant = "Tumor Cell")) +
 53 |   theme(axis.title.x = element_blank())
 54 | 
 55 | # Density of the same counts. data1 carries the Freq and type columns used here.
 56 | ggplot() +
 57 |   geom_density(data = data1, alpha = 0.8, adjust = 1.5, aes(x = Freq, fill = type)) +
 58 |   theme_prism() +
 59 |   labs(x = "Number of gene with expression", y = "Density") +
 60 |   scale_fill_manual(values = c("#F8766D", "#00BFC4"))
 61 | 
 62 | # Kept disabled: Heatmap() without a matrix argument raises an error and would
 63 | # stop the script when it is sourced.
 64 | # ComplexHeatmap::Heatmap()
 65 | 
 66 | a <- as.data.frame(rnorm(20, mean = 2, sd = 1))
 67 | a$b <- c(1:20)
 68 | colnames(a) <- c("v1", "v2")
 69 | ggplot(data = a, aes(x = v2, y = v1)) +
 70 |   geom_point(size = 3, color = "red") +
 71 |   labs(y = "", x = "") +
 72 |   theme_prism()
 73 | 
 74 | 
 75 | 
 76 | ### heatmap: z-scored pathway scores for 500 malignant and 500 normal cells
 77 | setwd("~/project/mcIdentify/data/")
 78 | library(data.table)
 79 | library(dplyr)
 80 | library(scales)
 81 | library(ComplexHeatmap)
 82 | library(circlize)
 83 | library(ggprism)
 84 | 
 85 | data1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
 86 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
 87 | path_ids <- as.character(GSE673_diff_path213$hsa) # explicit names for all_of()
 88 | 
 89 | # Pathway columns only; first 500 cells of each type.
 90 | tumor <- data1 %>% filter(type == "malignant") %>% select(all_of(path_ids))
 91 | tumor1 <- tumor[1:500, ]
 92 | normal <- data1 %>% filter(type == "normal") %>% select(all_of(path_ids))
 93 | normal1 <- normal[1:500, ]
 94 | 
 95 | # scale() standardises each pathway column; t() puts pathways in rows.
 96 | data2 <- rbind(tumor1, normal1)
 97 | data3 <- scale(data2)
 98 | data4 <- t(data3)
 99 | 
100 | sample_group <- as.data.frame(c(rep("malignant", 500), rep("normal", 500)))
101 | colnames(sample_group) <- "cluster"
102 | col_fun <- colorRamp2(c(-2, 0, 2), c("#00FF00", "#3B3B3B", "#EE0000"))
103 | # border is an argument of HeatmapAnnotation(), not an entry of the `col` list.
104 | # NOTE(review): top_anno is not passed to Heatmap() below - confirm intent.
105 | top_anno <- HeatmapAnnotation(Cluster = sample_group$cluster, border = TRUE,
106 |                               col = list(Cluster = c("malignant" = "#F8766D", "normal" = "#00BFC4")))
107 | column_split <- sample_group$cluster
108 | ComplexHeatmap::Heatmap(data4, cluster_rows = TRUE, cluster_columns = FALSE, name = " ",
109 |                         show_column_names = FALSE, show_row_names = FALSE, show_heatmap_legend = TRUE,
110 |                         col = col_fun, column_split = column_split, row_title = "Pathway")
111 | 
112 | 
113 | 
114 | 
115 | # Accuracy per method across gene-number bins, then a stacked bar chart per bin.
116 | rm(list = ls())
117 | setwd("~/project/mcIdentify/data/")
118 | library(data.table)
119 | library(dplyr)
120 | 
121 | # Tick labels for the six gene-number bins, shared by both x axes.
122 | bin_labels <- c("<500", "500~1000", "1000~1500", "1500~2000", "2000~2500", ">2500")
123 | 
124 | data1 <- fread("./model_built/pathway_score_213/model_result/GSE673_method_gene.csv")
125 | data1$gene <- rep(1:6, times = 4)
126 | method_levels <- c("mcIdentify", "ikraus", "SCINA", "scMRMA")
127 | data1$method <- factor(data1$method, levels = method_levels)
128 | ggplot(data1, aes(x = gene, y = accuracy, color = method, shape = method)) +
129 |   geom_point(size = 3) +
130 |   geom_smooth(size = 1.8) +
131 |   labs(x = " ", y = "Accuracy") +
132 |   ylim(0, 1) +
133 |   theme_prism() +
134 |   scale_x_continuous(name = "Gene number", breaks = 1:6, labels = bin_labels, limits = c(1, 6))
135 | 
136 | library(data.table)
137 | library(dplyr)
138 | data1 <- fread("./model_built/pathway_score_213/model_result/gene_sample_statistic.csv",
139 |                header = TRUE, data.table = FALSE)
140 | data1$type <- factor(data1$type, levels = c("normal", "malignant"))
141 | data1$gene <- rep(1:6, times = 2)
142 | library(ggplot2)
143 | # Bars are weighted by `value` and stacked by type; `value1` is printed on them.
144 | ggplot(data1, aes(x = gene, weight = value, fill = type)) +
145 |   geom_bar(position = "stack") +
146 |   scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
147 |   theme_prism() +
148 |   scale_x_continuous(name = "Gene number", breaks = 1:6, labels = bin_labels) +
149 |   geom_text(aes(label = value1, y = value), position = position_stack(vjust = 0.5), size = 5)
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 
159 | # Toy heatmap: 100 x 100 standard-normal noise with up to 50 random rows set to 0.
160 | col_fun <- colorRamp2(breaks = c(-2, 0, 2), colors = c("red", "black", "blue"))
161 | a <- matrix(rnorm(10000, mean = 0, sd = 1), nrow = 100, ncol = 100)
162 | zero_rows <- round(runif(50, min = 1, max = 100), 0)
163 | a[zero_rows, ] <- 0
164 | 
165 | ComplexHeatmap::Heatmap(a, col = col_fun, name = " ", cluster_rows = FALSE, cluster_columns = FALSE,
166 |                         show_column_names = FALSE, show_row_names = FALSE, show_heatmap_legend = FALSE)
167 | 
168 | ?ComplexHeatmap::Heatmap()
169 | 
170 | 
171 | 
172 | # Small heatmap: first 50 pathways, first 100 cells of each type, no clustering.
173 | setwd("~/project/mcIdentify/data/")
174 | library(data.table)
175 | library(dplyr)
176 | library(scales)
177 | library(ComplexHeatmap)
178 | library(circlize)
179 | library(ggprism)
180 | 
181 | data1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
182 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
183 | path_ids <- as.character(GSE673_diff_path213$hsa[1:50]) # explicit names for all_of()
184 | 
185 | tumor <- data1 %>% filter(type == "malignant") %>% select(all_of(path_ids))
186 | tumor1 <- tumor[1:100, ]
187 | normal <- data1 %>% filter(type == "normal") %>% select(all_of(path_ids))
188 | normal1 <- normal[1:100, ]
189 | 
190 | # scale() standardises each pathway column; t() puts pathways in rows.
191 | data2 <- rbind(tumor1, normal1)
192 | data3 <- scale(data2)
193 | data4 <- t(data3)
194 | 
195 | sample_group <- as.data.frame(c(rep("malignant", 100), rep("normal", 100)))
196 | colnames(sample_group) <- "cluster"
197 | # border is an argument of HeatmapAnnotation(), not an entry of the `col` list.
198 | # NOTE(review): top_anno is not passed to Heatmap() below - confirm intent.
199 | top_anno <- HeatmapAnnotation(Cluster = sample_group$cluster, border = TRUE,
200 |                               col = list(Cluster = c("malignant" = "#F8766D", "normal" = "#00BFC4")))
201 | ComplexHeatmap::Heatmap(data4, cluster_rows = FALSE, cluster_columns = FALSE, name = " ",
202 |                         show_column_names = FALSE, show_row_names = FALSE, show_heatmap_legend = FALSE)
203 | 
204 | 
205 | 


--------------------------------------------------------------------------------
/inst/analysis/gene_analysis.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # Select the pathways whose V2 value in the pathway-importance table exceeds
  4 | # 0.058, collect their KEGG genes and draw a Venn diagram for four of them.
  5 | setwd("~/project/mcIdentify/data/")
  6 | library(data.table)
  7 | library(dplyr)
  8 | library(VennDiagram) # venn.diagram()
  9 | library(grid) # grid.draw()
 10 | 
 11 | path_impor1 <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE673_model15.csv")
 12 | path_score1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
 13 | # Both lookup tables are used below, so they are loaded here instead of relying
 14 | # on objects left over in the workspace.
 15 | GSE673_diff_path213 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
 16 | KEGG_pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 17 | 
 18 | # Label each importance row with the matching score-table column name, then
 19 | # drop the first row and the first column.
 20 | path_impor1$hsa <- colnames(path_score1)
 21 | path_impor1 <- path_impor1[-1, -1]
 22 | path_impor1$number <- seq_len(nrow(path_impor1))
 23 | 
 24 | plot(path_impor1$V2, ylim = c(0.048, 0.11), col = "blue", pch = 19, cex = 1)
 25 | 
 26 | # Group "A" = V2 above 0.058; everything else is "B".
 27 | path_impor1 <- path_impor1 %>% dplyr::mutate(fac = case_when(V2 > 0.058 ~ "A", TRUE ~ "B"))
 28 | path_impor1$fac <- as.factor(path_impor1$fac)
 29 | # No `by` given: the join uses every column the two tables share.
 30 | path_impor1 <- left_join(path_impor1, GSE673_diff_path213)
 31 | 
 32 | gene_list <- KEGG_pathway_gene %>% filter(hsa %in% path_impor1[path_impor1$fac == "A", ]$hsa)
 33 | fwrite(gene_list, "./model_built/pathway_score_213/pathway_importance/gene_list.csv")
 34 | 
 35 | x <- list(
 36 |   "Oxidative phosphorylation" = gene_list[gene_list$hsa == "hsa00190", ]$gene_id,
 37 |   "Viral myocarditis" = gene_list[gene_list$hsa == "hsa05416", ]$gene_id,
 38 |   "Type I diabetes mellitus" = gene_list[gene_list$hsa == "hsa04940", ]$gene_id,
 39 |   "Antigen processing and presentation" = gene_list[gene_list$hsa == "hsa04612", ]$gene_id
 40 | )
 41 | 
 42 | venn.plot <- venn.diagram(
 43 |   x,
 44 |   filename = NULL,
 45 |   lty = 1,
 46 |   lwd = 1,
 47 |   col = "black",
 48 |   # Hex colours need the leading "#"; "4E9595" alone is not a valid colour.
 49 |   fill = c("#6E9ECE", "#EFDBB9", "#E6928F", "#4E9595"),
 50 |   alpha = 0.60,
 51 |   cat.col = "black",
 52 |   cat.cex = 0.8,
 53 |   cat.fontface = "bold",
 54 |   margin = 0.07,
 55 |   cex = 0.8
 56 | )
 57 | 
 58 | pdf("venn.pdf", width = 12, height = 12, pointsize = 20.5)
 59 | grid.draw(venn.plot)
 60 | dev.off()
 61 | 
 62 | 
 63 | 
 64 | 
 65 | 
 66 | # Expression of the genes that appear more than twice in gene_list, compared
 67 | # between the malignant and normal cells of GSE148673.
 68 | gene1 <- as.data.frame(sort(table(gene_list$gene_id)))
 69 | gene2 <- subset(gene1, Freq > 2)
 70 | data1 <- fread("./model_built/datasets/GSE148673_tpm.txt")
 71 | data2 <- filter(data1, V1 %in% gene2$Var1)
 72 | 
 73 | # Transpose to cells x genes and keep the cell identifier as a column.
 74 | data3 <- data2[, -1]
 75 | data4 <- as.data.frame(t(data3))
 76 | colnames(data4) <- data2$V1
 77 | data4$barcode <- rownames(data4)
 78 | 
 79 | infor <- fread("./model_built/datasets/GSE148673_anno.txt")
 80 | infor <- mutate(infor, type = if_else(cluster.pred == "T", "malignant", "normal"))
 81 | infor1 <- select(infor, barcode, type)
 82 | 
 83 | # Attach the cell type to every cell, then reshape to long format for plotting.
 84 | data5 <- left_join(infor1, data4) %>% select(-barcode)
 85 | data6 <- melt(data5)
 86 | 
 87 | hla_levels <- c(
 88 |   "HLA-A", "HLA-B", "HLA-C", "HLA-E", "HLA-F", "HLA-G",
 89 |   "HLA-DRA", "HLA-DRB1", "HLA-DRB5", "HLA-DQA1", "HLA-DQA2",
 90 |   "HLA-DQB1", "HLA-DOB", "HLA-DMA", "HLA-DMB", "HLA-DOA", "HLA-DPA1", "HLA-DPB1"
 91 | )
 92 | data6$variable <- factor(data6$variable, levels = hla_levels)
 93 | 
 94 | library(ggpubr)
 95 | library(ggprism)
 96 | library(ggplot2)
 97 | library(cowplot)
 98 | 
 99 | ggplot(data = data6, aes(x = variable, y = value, fill = factor(type))) +
100 |   geom_violin() +
101 |   stat_compare_means(aes(label = ..p.signif..)) +
102 |   theme_prism() +
103 |   theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)) +
104 |   labs(y = "Gene expression", title = "") +
105 |   theme(axis.title.x = element_blank())
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/inst/analysis/model_border_gene.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # Load the GSE151530 TPM table, drop its first column and use V1 as row names.
  4 | setwd("~/project/mcIdentify/data/")
  5 | rm(list = ls())
  6 | library(data.table)
  7 | library(dplyr)
  8 | data1 <- fread("./processed_data/GSE151530_tpm.txt")
  9 | expr_data <- as.data.frame(data1)[-1]
 10 | row.names(expr_data) <- data1[["V1"]]
 11 | 
 12 | 
 13 | 
 14 | ## pathway score
 15 | # Score the differential pathways on the down-sampled GSE151530 matrices (500
 16 | # and 700 genes per cell) and write one CSV per matrix.
 17 | setwd("~/project/mcIdentify/data/")
 18 | library(data.table)
 19 | library(dplyr)
 20 | 
 21 | # Inputs that do not depend on the matrix being scored.
 22 | infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
 23 | infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
 24 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 25 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
 26 | diff_ids <- as.character(GSE256_diff_path3$hsa)
 27 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
 28 | # Iterate over the pathways that are present rather than a fixed 1:213.
 29 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
 30 | border_dir <- "./model_built/pathway_score_213/border_data_gene/"
 31 | 
 32 | # Each size is read, scored and written inside its own iteration, so the 500
 33 | # output is computed from the 500 matrix and the 700 output from the 700 matrix.
 34 | for (number in c(500, 700)) {
 35 |   data <- fread(paste0(border_dir, "GSE151530_tpm_", number, ".txt"))
 36 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
 37 | 
 38 |   # Cells in rows, genes in columns; gene names are taken from V1.
 39 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
 40 |   colnames(data1) <- expr_data1$V1
 41 | 
 42 |   # Pathway score = mean expression over the pathway genes found in the matrix.
 43 |   all_pathway_score <- lapply(pathway_ids, function(id) {
 44 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
 45 |     rowMeans(data1[, genes, drop = FALSE])
 46 |   })
 47 |   names(all_pathway_score) <- pathway_ids
 48 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
 49 | 
 50 |   # Attach the annotation by cell name, keep the label and pathway columns,
 51 |   # and drop rows with missing values.
 52 |   diff_path <- pathway_score
 53 |   diff_path$Cell <- rownames(diff_path)
 54 |   diff_path <- left_join(diff_path, infor_data2, by = "Cell")
 55 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
 56 | 
 57 |   fwrite(diff_path, paste0(border_dir, "pathway_score/GSE530_", number, ".csv"))
 58 | }
 77 | 
 78 | 
 79 | 
 80 | 
 81 | 
 82 | # Score the differential pathways on the down-sampled GSE151530 matrices with
 83 | # 900, 1100, 1300 and 1500 genes per cell; one CSV is written per matrix.
 84 | 
 85 | # These inputs are the same for every matrix, so they are read once.
 86 | infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
 87 | infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
 88 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 89 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
 90 | diff_ids <- as.character(GSE256_diff_path3$hsa)
 91 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
 92 | # Iterate over the pathways that are present rather than a fixed 1:213.
 93 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
 94 | border_dir <- "./model_built/pathway_score_213/border_data_gene/"
 95 | 
 96 | for (number in c(900, 1100, 1300, 1500)) {
 97 |   read_filename <- paste0(border_dir, "GSE151530_tpm_", number, ".txt")
 98 |   data <- fread(read_filename)
 99 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
100 | 
101 |   # Cells in rows, genes in columns; gene names are taken from V1.
102 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
103 |   colnames(data1) <- expr_data1$V1
104 | 
105 |   ## pathway score: mean expression over the pathway genes found in the matrix
106 |   all_pathway_score <- lapply(pathway_ids, function(id) {
107 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
108 |     rowMeans(data1[, genes, drop = FALSE])
109 |   })
110 |   names(all_pathway_score) <- pathway_ids
111 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
112 | 
113 |   # Attach the annotation by cell name, keep the label and pathway columns,
114 |   # and drop rows with missing values.
115 |   diff_path <- pathway_score
116 |   diff_path$Cell <- rownames(diff_path)
117 |   diff_path <- left_join(diff_path, infor_data2, by = "Cell")
118 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
119 | 
120 |   write_filename <- paste0(border_dir, "pathway_score/GSE530_", number, ".csv")
121 |   fwrite(diff_path, write_filename)
122 | }
139 | 
140 | 
141 | 
142 | 
143 | ### gene+pathway
144 | # Down-sample GSE148673 so each cell keeps at most `number` expressed genes,
145 | # save every down-sampled matrix and compute its pathway scores.
146 | setwd("~/project/mcIdentify/data/")
147 | library(data.table)
148 | library(dplyr)
149 | 
150 | all_data <- fread("./processed_data/GSE148673_tpm.txt")
151 | border_data <- as.data.frame(all_data[, -1])
152 | rownames(border_data) <- all_data$V1
153 | 
154 | # These inputs are the same for every `number`, so they are read once.
155 | infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
156 | infor_data1 <- infor_data1 %>% mutate(type = case_when(cluster.pred == "T" ~ "malignant",
157 |                                                        cluster.pred == "N" ~ "normal"))
158 | infor_data2 <- infor_data1
159 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
160 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
161 | diff_ids <- as.character(GSE256_diff_path3$hsa)
162 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
163 | # Iterate over the pathways that are present rather than a fixed 1:213.
164 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
165 | border_dir <- "./model_built/pathway_score_213/border_data_gene/"
166 | 
167 | for (number in c(500, 1000, 1500, 2000, 2500)) {
168 |   ## border gene select
169 |   low_number <- integer(0) # columns with fewer than `number` expressed genes
170 |   testdata <- border_data
171 |   for (i in seq_len(ncol(border_data))) {
172 |     expressed <- which(testdata[, i] > 0)
173 |     judge <- length(expressed) - number
174 |     if (judge >= 0) {
175 |       # Set `judge` randomly chosen expressed genes to 0 so that `number` remain.
176 |       testdata[expressed[sample.int(length(expressed), judge)], i] <- 0
177 |     } else {
178 |       low_number <- c(low_number, i)
179 |     }
180 |   }
181 | 
182 |   # The 500 run keeps every cell; the other runs drop the low cells. An empty
183 |   # negative index would select zero columns, hence the length check.
184 |   if (number == 500 || length(low_number) == 0) {
185 |     testdata1 <- testdata
186 |   } else {
187 |     testdata1 <- testdata[, -low_number, drop = FALSE]
188 |   }
189 | 
190 |   write_filename_border <- paste0(border_dir, "GSE148673_tpm_", number, ".txt")
191 |   fwrite(testdata1, write_filename_border, row.names = TRUE)
192 | 
193 |   #### pathway score
194 |   # Read the file back; the code below expects the gene names in column V1.
195 |   data <- fread(write_filename_border)
196 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
197 | 
198 |   # Cells in rows, genes in columns; gene names are taken from V1.
199 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
200 |   colnames(data1) <- expr_data1$V1
201 | 
202 |   # Pathway score = mean expression over the pathway genes found in the matrix.
203 |   all_pathway_score <- lapply(pathway_ids, function(id) {
204 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
205 |     rowMeans(data1[, genes, drop = FALSE])
206 |   })
207 |   names(all_pathway_score) <- pathway_ids
208 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
209 | 
210 |   diff_path <- pathway_score
211 |   diff_path$Cell <- rownames(diff_path)
212 |   # NOTE(review): other code that reads GSE148673_anno.txt joins on `barcode`;
213 |   # confirm the annotation really has a `Cell` column.
214 |   diff_path <- left_join(diff_path, infor_data2)
215 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
216 | 
217 |   write_filename <- paste0(border_dir, "pathway_score/GSE673/GSE673_", number, ".csv")
218 |   fwrite(diff_path, write_filename)
219 | }
246 | 
247 | 
248 | 
249 | 
250 | 
251 | ## GSE530
252 | # Split the cells in border_data into intervals by number of expressed genes,
253 | # write one matrix per interval and compute pathway scores for each interval.
254 | # NOTE(review): border_data is whatever the preceding code left in the
255 | # workspace, while the annotation read below is GSE151530 - confirm they match.
256 | 
257 | # Expressed genes per cell. colSums() stays correct when a cell has every gene
258 | # (or no gene) expressed, where counting via table() yields a single entry.
259 | b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))
260 | 
261 | border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
262 | border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
263 | border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
264 | border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
265 | border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]
266 | 
267 | interval_dir <- "./model_built/pathway_score_213/border_interval/"
268 | fwrite(border_500_1000, paste0(interval_dir, "border_500_1000.txt"), row.names = TRUE)
269 | fwrite(border_1000_1500, paste0(interval_dir, "border_1000_1500.txt"), row.names = TRUE)
270 | fwrite(border_1500_2000, paste0(interval_dir, "border_1500_2000.txt"), row.names = TRUE)
271 | fwrite(border_2000_2500, paste0(interval_dir, "border_2000_2500.txt"), row.names = TRUE)
272 | fwrite(border_2500_00, paste0(interval_dir, "border_2500_00.txt"), row.names = TRUE)
273 | 
274 | # These inputs are the same for every interval file, so they are read once.
275 | infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
276 | infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
277 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
278 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
279 | diff_ids <- as.character(GSE256_diff_path3$hsa)
280 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
281 | # Iterate over the pathways that are present rather than a fixed 1:213.
282 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
283 | 
284 | # Pick the interval files by name; taking the first five directory entries
285 | # depends on what else the directory happens to contain.
286 | file_name <- list.files(interval_dir, pattern = "^border_.*\\.txt$")
287 | for (number in file_name) {
288 |   read_filename <- paste0(interval_dir, number)
289 |   data <- fread(read_filename)
290 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
291 | 
292 |   # Cells in rows, genes in columns; gene names are taken from V1.
293 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
294 |   colnames(data1) <- expr_data1$V1
295 | 
296 |   ## pathway score: mean expression over the pathway genes found in the matrix
297 |   all_pathway_score <- lapply(pathway_ids, function(id) {
298 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
299 |     rowMeans(data1[, genes, drop = FALSE])
300 |   })
301 |   names(all_pathway_score) <- pathway_ids
302 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
303 | 
304 |   diff_path <- pathway_score
305 |   diff_path$Cell <- rownames(diff_path)
306 |   diff_path <- left_join(diff_path, infor_data2, by = "Cell")
307 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
308 | 
309 |   # The output name is the input file name with ".csv" appended.
310 |   write_filename <- paste0(interval_dir, "pathway_score_interval/", number, ".csv")
311 |   fwrite(diff_path, write_filename)
312 | }
338 | 
339 | 
340 | 
341 | 
342 | 
343 | # Down-sample the cells stored in border_2500_00.txt to 500-2500 expressed genes
344 | # per cell, save each matrix as GSE151530_tpm_<number>.txt and score its pathways.
345 | setwd("~/project/mcIdentify/data/")
346 | library(data.table)
347 | library(dplyr)
348 | 
349 | all_data <- fread("./model_built/pathway_score_213/border_interval/border_2500_00.txt")
350 | border_data <- as.data.frame(all_data[, -1])
351 | rownames(border_data) <- all_data$V1
352 | 
353 | # These inputs are the same for every `number`, so they are read once.
354 | infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
355 | infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
356 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
357 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
358 | diff_ids <- as.character(GSE256_diff_path3$hsa)
359 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
360 | # Iterate over the pathways that are present rather than a fixed 1:213.
361 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
362 | border_dir <- "./model_built/pathway_score_213/border_data_gene/"
363 | 
364 | for (number in c(500, 1000, 1500, 2000, 2500)) {
365 |   ## border gene select
366 |   testdata <- border_data
367 |   for (i in seq_len(ncol(border_data))) {
368 |     expressed <- which(testdata[, i] > 0)
369 |     judge <- length(expressed) - number
370 |     if (judge >= 0) {
371 |       # Set `judge` randomly chosen expressed genes to 0 so that `number` remain.
372 |       testdata[expressed[sample.int(length(expressed), judge)], i] <- 0
373 |     }
374 |   }
375 |   testdata1 <- testdata
376 | 
377 |   write_filename_border <- paste0(border_dir, "GSE151530_tpm_", number, ".txt")
378 |   fwrite(testdata1, write_filename_border, row.names = TRUE)
379 | 
380 |   #### pathway score
381 |   # Read the file back; the code below expects the gene names in column V1.
382 |   data <- fread(write_filename_border)
383 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
384 | 
385 |   # Cells in rows, genes in columns; gene names are taken from V1.
386 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
387 |   colnames(data1) <- expr_data1$V1
388 | 
389 |   # Pathway score = mean expression over the pathway genes found in the matrix.
390 |   all_pathway_score <- lapply(pathway_ids, function(id) {
391 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
392 |     rowMeans(data1[, genes, drop = FALSE])
393 |   })
394 |   names(all_pathway_score) <- pathway_ids
395 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
396 | 
397 |   # Attach the annotation by cell name, keep the label and pathway columns,
398 |   # and drop rows with missing values.
399 |   diff_path <- pathway_score
400 |   diff_path$Cell <- rownames(diff_path)
401 |   diff_path <- left_join(diff_path, infor_data2, by = "Cell")
402 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
403 | 
404 |   write_filename <- paste0(border_dir, "pathway_score/GSE530_", number, ".csv")
405 |   fwrite(diff_path, write_filename)
406 | }
435 |  
436 | 
437 | 
438 | 
439 | #### other sample GOSH
440 | # Split the GOSH cells into intervals by number of expressed genes, write one
441 | # matrix per interval and compute pathway scores for each interval.
442 | setwd("~/project/mcIdentify/data/")
443 | library(data.table)
444 | library(dplyr)
445 | 
446 | all_data <- fread("./model_built/datasets/GOSH_tpm.txt")
447 | border_data <- as.data.frame(all_data[, -1])
448 | rownames(border_data) <- all_data$V1
449 | 
450 | # Expressed genes per cell. colSums() stays correct when a cell has every gene
451 | # (or no gene) expressed, where counting via table() yields a single entry.
452 | b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))
453 | 
454 | border_500 <- border_data[, which(b$gene_number < 500), drop = FALSE]
455 | border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
456 | border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
457 | border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
458 | border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
459 | border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]
460 | 
461 | gosh_dir <- "./model_built/pathway_score_213/border_interval/GOSH/"
462 | # NOTE(review): the file names are spelled "G0SH" with a zero; they are kept
463 | # unchanged because the output names below are derived from them.
464 | fwrite(border_500, paste0(gosh_dir, "G0SH_border_500.txt"), row.names = TRUE)
465 | fwrite(border_500_1000, paste0(gosh_dir, "G0SH_border_500_1000.txt"), row.names = TRUE)
466 | fwrite(border_1000_1500, paste0(gosh_dir, "G0SH_border_1000_1500.txt"), row.names = TRUE)
467 | fwrite(border_1500_2000, paste0(gosh_dir, "G0SH_border_1500_2000.txt"), row.names = TRUE)
468 | fwrite(border_2000_2500, paste0(gosh_dir, "G0SH_border_2000_2500.txt"), row.names = TRUE)
469 | fwrite(border_2500_00, paste0(gosh_dir, "G0SH_border_2500_00.txt"), row.names = TRUE)
470 | 
471 | # These inputs are the same for every interval file, so they are read once.
472 | infor_data1 <- fread("./model_built/datasets/GOSH_anno.txt")
473 | infor_data2 <- infor_data1 %>% filter(type != "unclassified")
474 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
475 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
476 | diff_ids <- as.character(GSE256_diff_path3$hsa)
477 | score_gene <- pathway_gene %>% filter(hsa %in% diff_ids)
478 | # Iterate over the pathways that are present rather than a fixed 1:213.
479 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
480 | 
481 | file_name <- list.files(gosh_dir)
482 | for (number in file_name) {
483 |   read_filename <- paste0(gosh_dir, number)
484 |   data <- fread(read_filename)
485 |   expr_data1 <- data %>% filter(V1 %in% pathway_gene$gene_id)
486 | 
487 |   # Cells in rows, genes in columns; gene names are taken from V1.
488 |   data1 <- as.data.frame(t(as.matrix(expr_data1[, -1])))
489 |   colnames(data1) <- expr_data1$V1
490 | 
491 |   ## pathway score: mean expression over the pathway genes found in the matrix
492 |   all_pathway_score <- lapply(pathway_ids, function(id) {
493 |     genes <- intersect(score_gene$gene_id[score_gene$hsa %in% id], colnames(data1))
494 |     rowMeans(data1[, genes, drop = FALSE])
495 |   })
496 |   names(all_pathway_score) <- pathway_ids
497 |   pathway_score <- as.data.frame(do.call(cbind, all_pathway_score))
498 | 
499 |   # The GOSH annotation is joined on V1, which holds the cell name here.
500 |   diff_path <- pathway_score
501 |   diff_path$V1 <- rownames(diff_path)
502 |   diff_path <- left_join(diff_path, infor_data2, by = "V1")
503 |   diff_path <- diff_path %>% select(type, all_of(diff_ids)) %>% na.omit()
504 | 
505 |   # The output name is the input file name with ".csv" appended.
506 |   write_filename <- paste0("./model_built/pathway_score_213/border_interval/pathway_score_interval/", number, ".csv")
507 |   fwrite(diff_path, write_filename)
508 | }
538 | 
539 | 
540 | 
541 | 
542 | 
543 | 
544 | 
545 | 
546 | #### other sample GSE148673
547 | # Split the GSE148673 cells into intervals by number of expressed genes and
548 | # write one matrix per interval.
549 | setwd("~/project/mcIdentify/data/")
550 | library(data.table)
551 | library(dplyr)
552 | 
553 | all_data <- fread("./model_built/datasets/GSE148673_tpm.txt")
554 | border_data <- as.data.frame(all_data[, -1])
555 | rownames(border_data) <- all_data$V1
556 | 
557 | # Expressed genes per cell. colSums() stays correct when a cell has every gene
558 | # (or no gene) expressed, where counting via table() yields a single entry.
559 | b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))
560 | 
561 | border_500 <- border_data[, which(b$gene_number < 500), drop = FALSE]
562 | border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
563 | border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
564 | border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
565 | border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
566 | border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]
567 | 
568 | gse673_dir <- "./model_built/pathway_score_213/border_interval/GSE673/"
569 | fwrite(border_500, paste0(gse673_dir, "GSE673_border_500.txt"), row.names = TRUE)
570 | fwrite(border_500_1000, paste0(gse673_dir, "GSE673_border_500_1000.txt"), row.names = TRUE)
571 | fwrite(border_1000_1500, paste0(gse673_dir, "GSE673_border_1000_1500.txt"), row.names = TRUE)
572 | fwrite(border_1500_2000, paste0(gse673_dir, "GSE673_border_1500_2000.txt"), row.names = TRUE)
573 | fwrite(border_2000_2500, paste0(gse673_dir, "GSE673_border_2000_2500.txt"), row.names = TRUE)
574 | fwrite(border_2500_00, paste0(gse673_dir, "GSE673_border_2500_00.txt"), row.names = TRUE)
583 | 
584 | 
# Pathway scores for every expression-interval file in the GSE673 directory.

interval_dir <- "./model_built/pathway_score_213/border_interval/GSE673/"
score_dir <-
  "./model_built/pathway_score_213/border_interval/pathway_score_interval/"

# Inputs that do not depend on the interval file are read once, not per file.
infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <-
  readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")

model_pathways <- as.character(GSE256_diff_path3$hsa)
score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
known_genes <- unique(as.character(pathway_gene$gene_id))
infor_data2 <- infor_data1 %>% filter(type != "unclassified")

# Mean expression of each pathway's member genes, per sample.
#
# expr:       table whose first column `V1` holds gene ids and whose other
#             columns are samples.
# score_gene: table with columns `hsa` (pathway id) and `gene_id`.
# Returns a data.frame with one row per sample (row names = sample names) and
# one column per pathway, pathways in sorted order.
pathway_scores <- function(expr, score_gene) {
  expr <- as.data.frame(expr)
  # Gene ids are attached as column names of the transposed matrix directly,
  # rather than via row names (data.table does not keep row names).
  sample_by_gene <- t(as.matrix(expr[, -1, drop = FALSE]))
  colnames(sample_by_gene) <- as.character(expr$V1)

  pathways <- sort(unique(as.character(score_gene$hsa)))
  scores <- lapply(pathways, function(pathway) {
    members <- as.character(
      score_gene$gene_id[which(score_gene$hsa == pathway)]
    )
    present <- intersect(members, colnames(sample_by_gene))
    member_expr <- sample_by_gene[, present, drop = FALSE]
    # Same quantity as sum(x) / length(x) per row.
    rowSums(member_expr) / ncol(member_expr)
  })
  names(scores) <- pathways
  as.data.frame(do.call(cbind, scores))
}

for (number in list.files(interval_dir)) {
  data <- fread(paste0(interval_dir, number))

  # Only genes that belong to some KEGG pathway are needed for scoring.
  expr_data1 <- data[data$V1 %in% known_genes, ]
  pathway_score <- pathway_scores(expr_data1, score_gene)

  # Attach the annotation and keep the label plus the model pathways; samples
  # without a label are dropped by na.omit().
  diff_path <- pathway_score
  diff_path$barcode <- rownames(diff_path)
  diff_path <- left_join(diff_path, infor_data2, by = "barcode")
  diff_path <- diff_path %>%
    select(type, all_of(model_pathways)) %>%
    na.omit()

  # The output name keeps the input file name, extension included.
  fwrite(diff_path, paste0(score_dir, number, ".csv"))
}
643 | 
644 | 
645 | 
646 | 
647 | 


--------------------------------------------------------------------------------
/inst/analysis/model_build.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from tensorflow import keras as K
 3 | import tensorflow as tf
 4 | from tensorflow.keras import regularizers
 5 | import pandas as pd
 6 | from sklearn.model_selection import train_test_split
 7 | from sklearn.preprocessing import LabelBinarizer
 8 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
 9 | from sklearn.model_selection import cross_val_score
10 | from sklearn.model_selection import KFold
11 | import numpy as np
12 | from sklearn.model_selection import StratifiedShuffleSplit
13 | import matplotlib.pyplot as plt
14 | from tensorflow.keras.backend import clear_session
15 | from sklearn.model_selection import PredefinedSplit
16 | import math
17 | from tensorflow.keras.models import Sequential
18 | from tensorflow.keras.layers import Dense
19 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
20 | from sklearn.preprocessing import MinMaxScaler
21 | from sklearn.model_selection import train_test_split
22 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
23 | 
# Pathway-score table: a `type` label column plus the feature columns.
data = pd.read_csv("./model_data213/GSE673_pathway_score.csv")


print(data["type"].value_counts())
# Encode the label: malignant -> 0, normal -> 1.
data["type"] = data["type"].astype(str).map({'malignant': 0, 'normal': 1})
if data["type"].isna().any():
    # map() turns any other label into NaN, which would silently poison the loss.
    raise ValueError("unexpected label in 'type'; expected 'malignant' or 'normal'")
x = data.drop('type', axis=1)
y = data['type']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=10)
feature = list(x.columns)


# build model
clear_session()
model = K.models.Sequential()
# The input width follows the feature table instead of a hard-coded 213.
model.add(K.layers.Dense(units=300, input_dim=x_train.shape[1], activation='sigmoid'))
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units=200, activation='sigmoid'))
model.add(K.layers.Dropout(0.2))
model.add(K.layers.Dense(units=100, activation='sigmoid'))
model.add(K.layers.Dropout(0.1))
model.add(K.layers.Dense(units=10, activation='sigmoid'))
model.add(K.layers.Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

b_size = 50
max_epochs = 100
h = model.fit(x_train, y_train, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)

# evaluate() returns [loss, accuracy] for the metrics compiled above.
# The results are not stored under the name `eval`, which would shadow the builtin.
train_eval = model.evaluate(x_train, y_train, verbose=0, batch_size=b_size)
print("Evaluation on train data: loss = %0.6f accuracy = %0.2f%% \n" % (train_eval[0], train_eval[1] * 100))

test_eval = model.evaluate(x_test, y_test, verbose=0, batch_size=b_size)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" % (test_eval[0], test_eval[1] * 100))
56 | 
57 | 


--------------------------------------------------------------------------------
/inst/analysis/model_train.pbs:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #PBS -N tpm_train
 3 | #PBS -k oe
 4 | #PBS -l walltime=1000:00:00,nodes=1:ppn=1
 5 | #PBS -q pub_gpu
 6 | 
 7 | 
 8 |   if [ -f "/public/home/liuxs/anaconda3/etc/profile.d/conda.sh" ]; then
 9 |       . "/public/home/liuxs/anaconda3/etc/profile.d/conda.sh"
10 |   else
11 |         export PATH="/public/home/liuxs/anaconda3/bin:$PATH"
12 |   fi
13 | 
14 | 
15 | # conda activate  /public/slst/home/wuchx/anaconda3/envs/python3
16 | cd /public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model
17 | 
18 | 
19 | python /public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_para.py
20 | 


--------------------------------------------------------------------------------
/inst/analysis/model_train.py:
--------------------------------------------------------------------------------
  1 | #########################################################  Load modules
  2 | import sys
  3 | from tensorflow import keras as K
  4 | import tensorflow as tf
  5 | from tensorflow.keras import regularizers
  6 | import pandas as pd
  7 | from sklearn.model_selection import train_test_split
  8 | from sklearn.preprocessing import LabelBinarizer
  9 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
 10 | from sklearn.model_selection import cross_val_score
 11 | from sklearn.model_selection import KFold
 12 | import numpy as np
 13 | from sklearn.model_selection import StratifiedShuffleSplit
 14 | import matplotlib.pyplot as plt
 15 | from tensorflow.keras.backend import clear_session
 16 | from sklearn.model_selection import PredefinedSplit
 17 | import math
 18 | import pandas as pd
 19 | from tensorflow.keras.models import Sequential
 20 | from tensorflow.keras.layers import Dense
 21 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
 22 | from sklearn.preprocessing import MinMaxScaler
 23 | from sklearn.model_selection import train_test_split
 24 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 25 | ########################################################################### Build partition function
 26 | 
 27 | 
 28 | def FindLayerNodesLinear(n_layers, first_layer_nodes, last_layer_nodes):
 29 |     layers = []
 30 |     nodes_increment = (last_layer_nodes - first_layer_nodes)/ (n_layers-1)
 31 |     nodes = first_layer_nodes
 32 |     for i in range(1, n_layers+1):
 33 |         layers.append(math.ceil(nodes))
 34 |         nodes = nodes + nodes_increment
 35 |     return layers
 36 | 
 37 | def FinddropoutLinear(n_layers, dropout):
 38 |     layers = []
 39 |     nodes_increment = round(dropout/(n_layers-1),2)
 40 |     nodes = dropout
 41 |     for i in range(1, n_layers+1):
 42 |         layers.append(nodes)
 43 |         nodes = round(nodes - nodes_increment,2)
 44 |         if(nodes <= 0):
 45 |             nodes = 0
 46 |     
 47 |     return layers
 48 | 
 49 | 
 50 | def createmodel(n_layers, first_layer_nodes, last_layer_nodes, activation_func, loss_func,dropout):
 51 |     model = Sequential()
 52 |     n_nodes = FindLayerNodesLinear(n_layers, first_layer_nodes, last_layer_nodes)
 53 |     n_dropout = FinddropoutLinear(n_layers,dropout)
 54 |     for i in range(1, n_layers):
 55 |         if i==1:
 56 |             model.add(Dense(first_layer_nodes, input_dim=train_x.shape[1], activation=activation_func))
 57 |             model.add(K.layers.Dropout(rate=n_dropout[0]))
 58 |         else:
 59 |             model.add(Dense(n_nodes[i-1], activation=activation_func))
 60 |             model.add(K.layers.Dropout(rate=n_dropout[i-1]))
 61 |     model.add(Dense(train_y.shape[1], activation='softmax'))
 62 |     model.compile(optimizer='adam', loss=loss_func, metrics = ["accuracy"]) #note: metrics could also be 'mse'
 63 |     
 64 |     return model
 65 | 
 66 | 
 67 | train_x = pd.read_csv("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_x.csv")
 68 | train_y = pd.read_csv("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_x.csv")
 69 | train_x1, value_x, train_y1, value_y = train_test_split(train_x, train_y,train_size=0.8, test_size=0.2, random_state=1)
 70 | 
 71 | 
 72 | ############################################################################ build function
 73 | train_val_features = np.concatenate((train_x,value_x),axis=0)
 74 | train_val_labels = np.concatenate((train_y,value_y),axis=0)
 75 | test_fold = np.zeros(train_val_features.shape[0]) 
 76 | test_fold[:train_x1.shape[0]] = -1  
 77 | ps = PredefinedSplit(test_fold=test_fold)
 78 | #####################################################################Set parameter range
 79 | model =  KerasClassifier(build_fn=createmodel, verbose = False)  
 80 | 
 81 | activation_funcs = ['sigmoid', 'relu'] 
 82 | #activation_funcs = ['relu'] 
 83 | loss_funcs = ['binary_crossentropy']
 84 | 
 85 | param_grid = dict(n_layers=[3,4,5,6], first_layer_nodes = [200,250,300,350,400,450,500], last_layer_nodes = [10,20,30], dropout=[0.1,0.3,0.5], activation_func = activation_funcs, loss_func = loss_funcs, batch_size = [100,80,50], epochs = [50,100])
 86 | 
 87 | grid = GridSearchCV(estimator = model, param_grid = param_grid,cv=ps,n_jobs=1)
 88 | #grid = RandomizedSearchCV (estimator = model, param_grid = param_grid,cv=3,n_jobs=5)
 89 | ################################################################ trainning
 90 | grid.fit(train_val_features, train_val_labels)
 91 | ############################################################### Output the highest accuracy and corresponding parameters
 92 | print(grid.best_score_)
 93 | print(grid.best_params_)
 94 | f = open("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/model_result.txt") 
 95 | f.write(str(grid.best_score_))
 96 | f.write("\n")
 97 | f.write(str(grid.best_params_)) 
 98 | f.write("\n") 
 99 | f.close()
100 | 
101 | 


--------------------------------------------------------------------------------
/inst/analysis/other_data_pathscore.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##GSE151530
  3 | remove(list = ls())
  4 | setwd("~/project/mcIdentify/data/")
  5 | library(data.table)
  6 | library(dplyr)
  7 | infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
  8 | expr_data1 <- fread("./model_built/datasets/GSE151530_tpm2.txt")
  9 | 
 10 | expr_data <- expr_data1
 11 | rownames(expr_data) <- expr_data$V1
 12 | 
 13 | expr_matrix <- expr_data[,-1]
 14 | expr_matrix <- as.data.frame(t(expr_matrix))
 15 | colnames(expr_matrix) <- rownames(expr_data)
 16 | 
 17 | data1 <- expr_matrix
 18 | 
 19 | GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
 20 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 21 | 
 22 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
 23 | 
 24 | ##pathway score
 25 | 
 26 | myFun1 <- function(a){
 27 |   
 28 |   sum(a)/length(a)
 29 |   
 30 | }
 31 | 
 32 | all_pathway_score <- NA
 33 | for (i in 1:213) {
 34 |   
 35 |   gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
 36 |   
 37 |   a <- data1 %>% select(gene$gene_id[which(gene$gene_id %in% colnames(data1)==TRUE)])
 38 |   
 39 |   path_score <- as.data.frame(apply(a, 1, myFun1))
 40 |   colnames(path_score) <- names(table(score_gene$hsa))[i]
 41 |   
 42 |   all_pathway_score <- cbind(all_pathway_score,path_score)
 43 |   
 44 | }
 45 | 
 46 | pathway_score <- all_pathway_score
 47 | pathway_score <- pathway_score[,-1]
 48 | 
 49 | ##test
 50 | diff_path <- pathway_score
 51 | diff_path$Cell <- rownames(diff_path)
 52 | infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
 53 | diff_path <- left_join(infor_data2,diff_path)
 54 | diff_path <- diff_path %>% select(type,GSE256_diff_path3$hsa)
 55 | 
 56 | fwrite(diff_path,"./model_built/pathway_score/GSE530_pathway_score.csv")
 57 | 
 58 | 
 59 | 
 60 | 
 61 | 
 62 | 
 63 | 
 64 | 
 65 | 
 66 | ##GSE146771
 67 | remove(list = ls())
 68 | setwd("~/project/mcIdentify/data/")
 69 | library(data.table)
 70 | library(dplyr)
 71 | infor_data1 <- fread("./model_built/datasets/GSE146771_anno.txt")
 72 | expr_data1 <- fread("./model_built/datasets/GSE146771_tpm.txt")
 73 | 
 74 | expr_data <- expr_data1
 75 | rownames(expr_data) <- expr_data$V1
 76 | 
 77 | expr_matrix <- expr_data[,-1]
 78 | expr_matrix <- as.data.frame(t(expr_matrix))
 79 | colnames(expr_matrix) <- rownames(expr_data)
 80 | 
 81 | data1 <- expr_matrix
 82 | 
 83 | GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
 84 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 85 | 
 86 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
 87 | ##pathway score
 88 | myFun1 <- function(a){
 89 |   
 90 |   sum(a)/length(a)
 91 |   
 92 | }
 93 | 
 94 | all_pathway_score <- NA
 95 | for (i in 1:213) {
 96 |   
 97 |   gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
 98 |   
 99 |   a <- data1 %>% select(gene$gene_id[which(gene$gene_id %in% colnames(data1)==TRUE)])
100 |   
101 |   path_score <- as.data.frame(apply(a, 1, myFun1))
102 |   colnames(path_score) <- names(table(score_gene$hsa))[i]
103 |   
104 |   all_pathway_score <- cbind(all_pathway_score,path_score)
105 |   
106 | }
107 | 
108 | pathway_score <- all_pathway_score
109 | pathway_score <- pathway_score[,-1]
110 | 
111 | 
112 | ##test
113 | diff_path <- pathway_score
114 | diff_path$CellName <- rownames(diff_path)
115 | diff_path <- left_join(infor_data1,diff_path)
116 | diff_path <- diff_path %>% filter(type == "malignant" | type == "normal") %>%
117 |   select(type,GSE256_diff_path3$hsa)
118 | 
119 | fwrite(diff_path,"./model_built/pathway_score/GSE771_pathway_score.csv")
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | remove(list = ls())
129 | ##GOSH
130 | setwd("~/project/mcIdentify/data/")
131 | library(data.table)
132 | library(dplyr)
133 | infor_data1 <- fread("./model_built/datasets/GOSH_anno.txt")
134 | expr_data1 <- fread("./model_built/datasets/GOSH_tpm.txt")
135 | 
136 | ##PMC
137 | infor_data1 <- fread("./model_built/datasets/PMC_anno.txt")
138 | expr_data1 <- fread("./model_built/datasets/PMC_tpm.txt")
139 | 
140 | 
141 | GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
142 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
143 | 
144 | 
145 | expr_data <- expr_data1
146 | rownames(expr_data) <- expr_data1$V1
147 | expr_matrix <- expr_data[,-1]
148 | expr_matrix <- as.data.frame(t(expr_matrix))
149 | colnames(expr_matrix) <- rownames(expr_data) 
150 | 
151 | data1 <- expr_matrix
152 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
153 | 
154 | 
155 | ##pathway score
156 | 
157 | myFun1 <- function(a){
158 |   
159 |   sum(a)/length(a)
160 |   
161 | }
162 | 
163 | all_pathway_score <- NA
164 | for (i in 1:213) {
165 |   
166 |   gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
167 |   
168 |   a <- data1 %>% select(gene$gene_id[which(gene$gene_id %in% colnames(data1)==TRUE)])
169 |   
170 |   path_score <- as.data.frame(apply(a, 1, myFun1))
171 |   colnames(path_score) <- names(table(score_gene$hsa))[i]
172 |   
173 |   all_pathway_score <- cbind(all_pathway_score,path_score)
174 |   
175 | }
176 | 
177 | pathway_score <- all_pathway_score
178 | pathway_score <- pathway_score[,-1]
179 | 
180 | ##test
181 | diff_path <- pathway_score
182 | diff_path$V1 <- rownames(diff_path)
183 | infor_data2 <- infor_data1 %>% filter(type != "unclassified")
184 | diff_path <- left_join(infor_data2,diff_path)
185 | diff_path <- diff_path %>% select(type,GSE256_diff_path3$hsa)
186 | fwrite(diff_path,"./model_built/pathway_score/GOSH_pathway_score.csv")
187 | 
188 | fwrite(diff_path,"./model_built/pathway_score/PMC_pathway_score.csv")
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | ## GSE131309 Seq
196 | remove(list = ls())
197 | setwd("~/project/mcIdentify/data/")
198 | library(data.table)
199 | library(dplyr)
200 | infor_data1 <- fread("./model_built/datasets/GSE131309_Seq_anno.txt")
201 | expr_data1 <- fread("./model_built/datasets/GSE131309_Seq_tpm.txt")
202 | 
203 | 
204 | GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
205 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
206 | 
207 | expr_data <- expr_data1
208 | rownames(expr_data) <- expr_data1$V1
209 | 
210 | expr_matrix <- expr_data[,-1]
211 | expr_matrix <- as.data.frame(t(expr_matrix))
212 | colnames(expr_matrix) <- rownames(expr_data) 
213 | data1 <- expr_matrix
214 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)
215 | 
216 | ##pathway score
217 | myFun1 <- function(a){
218 |   
219 |   sum(a)/length(a)
220 |   
221 | }
222 | 
223 | all_pathway_score <- NA
224 | for (i in 1:213) {
225 |   
226 |   gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i])
227 |   
228 |   a <- data1 %>% select(gene$gene_id[which(gene$gene_id %in% colnames(data1)==TRUE)])
229 |   
230 |   path_score <- as.data.frame(apply(a, 1, myFun1))
231 |   colnames(path_score) <- names(table(score_gene$hsa))[i]
232 |   
233 |   all_pathway_score <- cbind(all_pathway_score,path_score)
234 |   
235 | }
236 | 
237 | pathway_score <- all_pathway_score
238 | pathway_score <- pathway_score[,-1]
239 | 
240 | ##test
241 | diff_path <- pathway_score
242 | diff_path$barcode <- rownames(diff_path)
243 | infor_data2 <- infor_data1 %>% filter(type != "unclassified")
244 | diff_path <- left_join(infor_data2,diff_path)
245 | diff_path <- diff_path %>% select(type,GSE256_diff_path3$hsa)
246 | 
247 | fwrite(diff_path,"./model_built/pathway_score/GSE309_Seq_pathway_score.csv")
248 | fwrite(diff_path,"./model_built/pathway_score/GSE309_10X_pathway_score.csv")
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 


--------------------------------------------------------------------------------
/inst/analysis/pathway_importance.R:
--------------------------------------------------------------------------------
  1 | 
  2 | setwd("~/project/mcIdentify/data/")
  3 | 
  4 | library(data.table)
  5 | library(dplyr)
  6 | 
  7 | 
  8 | 
  9 | path_impor1 <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE673_model15.csv")
 10 | path_score1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
 11 | path_impor1$hsa <- colnames(path_score1)
 12 | 
 13 | path_impor1 <- path_impor1[-1,-1]
 14 | path_impor1$number <- c(1:213)
 15 | 
 16 | # top10 <- path_impor1 %>% arrange(-V2) %>% filter(V2 > 0.059)
 17 | 
 18 | plot(path_impor1$V2,ylim = c(0.945,0.985),col = "blue", pch = 19, cex = 1)
 19 | 
 20 | 
 21 | path_impor1 <- path_impor1 %>% dplyr::mutate(fac = case_when(V2 < 0.979 ~ "A", TRUE ~ "B"))
 22 | path_impor1$fac <- as.factor(path_impor1$fac)
 23 | 
 24 | path_impor1 <- left_join(path_impor1,GSE673_diff_path213)
 25 | 
 26 | 
 27 | library(ggplot2)
 28 | library(ggpubr)
 29 | library(ggprism)
 30 | library(ggrepel)
 31 | ggplot(path_impor1, aes(x=number, y=V2, color=factor(fac))) + 
 32 |   geom_point(size = 3,)+
 33 |   theme_prism(border = T)+
 34 |   labs(y="Accuracy of the model", x  = "Pathway")+
 35 |   ylim(0.955,0.984)+
 36 |   xlim(0,214)+
 37 |   scale_color_manual(values = c("#DC0000FF",'#0072B5FF'))+
 38 |   theme(legend.position = 'none')+
 39 |   geom_text_repel(
 40 |     data = subset(path_impor1, path_impor1$V2 < 0.979),
 41 |     aes(label = pathway_id),
 42 |     size = 5,
 43 |     box.padding = unit(1, "lines"),
 44 |     point.padding = unit(1, "lines"), segment.color = "black", show.legend = FALSE )+
 45 |   geom_hline(aes(yintercept=0.98),linetype=5,col="black")
 46 |   
 47 |   
 48 | 
 49 | 
 50 | library(data.table)
 51 | library(dplyr)
 52 | 
 53 | path_impor2 <- fread("./model_built/pathway_score_213/pathway_importance/loss_result_GSE673_model15.csv")
 54 | path_score2 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
 55 | path_impor2$hsa <- colnames(path_score2)
 56 | 
 57 | path_impor2 <- path_impor2[-1,-1]
 58 | path_impor2$number <- c(1:213)
 59 | 
 60 | path_impor2 <- left_join(GSE673_diff_path213,path_impor2)
 61 | 
 62 | draw_data <- path_impor2 %>% filter(hsa %in% top10$hsa) %>% select(-pvalue) %>% select(-number)
 63 | 
 64 | 
 65 | data1 <- melt(draw_data)
 66 | data1$hsa <- factor(data1$hsa,levels = top10$hsa)
 67 | 
 68 | 
 69 | library(ggpubr)
 70 | library(ggprism)
 71 | library(ggplot2)
 72 | library(cowplot)
 73 | ggplot(data=data1,aes(x=hsa,y=value,fill = pathway_id))+
 74 |   geom_boxplot(size=1, draw_quantiles = c(0.5))+
 75 |   theme_prism(border = T)+theme(legend.position = 'none')+
 76 |   labs(y="Loss of the model",title = " ")+
 77 |   theme(axis.title.x = element_blank())+
 78 |   # geom_hline(aes(yintercept=0.9),linetype=5,col="red")+
 79 |   # geom_hline(aes(yintercept=0.8),linetype=5,col="red")+
 80 |   # scale_y_continuous(breaks=c(0,0.5,0.6,0.7,0.8,0.9,1))+
 81 |   scale_fill_manual(values = c("#F27970", "#BB9727","#54B345","#32B897",
 82 |                                "#05B9E2", "#8983BF","#C76DA2","#F27970",
 83 |                                "#BB9727","#54B345"))
 84 | 
 85 | 
 86 | 
 87 | 
 88 | 
 89 | library(ggpubr)
 90 | library(ggprism)
 91 | library(ggplot2)
 92 | library(cowplot)
 93 | ggplot(data=path_score1,aes(x=type,y=hsa05416,fill=factor(type)))+
 94 |   geom_boxplot(size=1,)+
 95 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+
 96 |   theme_prism()+theme(legend.position = 'none')+
 97 |   labs(y="Pathway Score",title = "Viral myocarditis")+
 98 |   theme(axis.title.x = element_blank())
 99 | 
100 | 
101 | 
102 | 
103 | p1 <- ggplot(data=path_score1,aes(x=type,y=hsa00190,fill=factor(type)))+geom_boxplot(size=1,)+theme_prism()+
104 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+
105 |   labs(y="Pathway Score",title = "Oxidative phosphorylation")+
106 |   theme(axis.title.x = element_blank())
107 | 
108 | p2 <- ggplot(data=path_score1,aes(x=type,y=hsa04612,fill=factor(type)))+geom_boxplot(size=1,)+theme_prism()+
109 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+
110 |   labs(y="Pathway Score",title = "Antigen processing and presentation")+
111 |   theme(axis.title.x = element_blank())
112 | 
113 | p3 <- ggplot(data=path_score1,aes(x=type,y=hsa04940,fill=factor(type)))+geom_boxplot(size=1,)+theme_prism()+
114 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+
115 |   labs(y="Pathway Score",title = "Type I diabetes mellitus")+
116 |   theme(axis.title.x = element_blank())
117 | 
118 | p4 <- ggplot(data=path_score1,aes(x=type,y=hsa05416,fill=factor(type)))+geom_boxplot(size=1,)+theme_prism()+
119 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+
120 |   labs(y="Pathway Score",title = "Viral myocarditis")+
121 |   theme(axis.title.x = element_blank())
122 | 
123 | 
124 | 
125 | ggdraw() +     
126 |   draw_plot(p3, 0, 0, 0.5, 0.5) +  
127 |   draw_plot(p4, 0.5, 0, 0.5, 0.5) +  
128 |   draw_plot(p1, 0, 0.5, 0.5, 0.5) +
129 |   draw_plot(p2, 0.5, 0.5, 0.5, 0.5)
130 | 
131 | 
132 | 
133 | library(ggpubr)
134 | library(ggprism)
135 | library(ggplot2)
136 | ggplot(data=path_score1,aes(x=type,y=hsa04940))+
137 |   geom_boxplot(size=1,)+
138 |   stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+
139 |   theme_prism()+
140 |   labs(y="Pathway Score")+
141 |   theme(axis.title.x = element_blank())
142 | 
143 | 
144 | 
145 | 
###### pathway importance (GSE530, model 15)
path_impor <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE530_model15.csv")
path_score <- fread("./model_built/pathway_score_213/GSE530_pathway_score213.csv")
GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")

# Label each row with the matching column name of the score table, drop the
# first row and first column, and number the remaining rows.
path_impor$hsa <- colnames(path_score)
path_impor <- path_impor[-1, -1]
path_impor$number <- 1:213

# plot(path_impor$V2,ylim = c(0.105,0.175),col = "blue", pch = 19, cex = 1)

# Group "A": pathways with V2 above 0.11; these are coloured and labelled.
path_impor <- path_impor %>%
  dplyr::mutate(fac = as.factor(case_when(V2 > 0.11 ~ "A", TRUE ~ "B")))

path_impor <- left_join(path_impor, GSE673_diff_path213)

library(ggplot2)
library(ggpubr)
library(ggprism)
library(ggrepel)

labelled_pathways <- subset(path_impor, path_impor$V2 > 0.11)

ggplot(path_impor, aes(x = number, y = V2, color = factor(fac))) +
  geom_point(size = 3) +
  theme_prism(border = T) +
  labs(y = "Loss of the model", x = "Pathway") +
  ylim(0.089, 0.136) +
  scale_color_manual(values = c("red", "blue")) +
  theme(legend.position = "none") +
  geom_text_repel(
    data = labelled_pathways,
    aes(label = pathway_id),
    size = 4,
    box.padding = unit(1.2, "lines"),
    point.padding = unit(1, "lines"),
    segment.color = "black",
    show.legend = FALSE
  )

library(cowplot)

# Score of hsa04380 by type, with a Wilcoxon test label.
osteoclast_plot <- ggplot(
  data = path_score,
  aes(x = type, y = hsa04380, fill = factor(type))
)
osteoclast_plot +
  geom_boxplot(size = 1) +
  stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
  theme_prism() +
  theme(legend.position = "none") +
  labs(y = "Pathway Score", title = "Osteoclast differentiation") +
  theme(axis.title.x = element_blank())
194 | 
195 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
196 | 
197 | 
198 | # Box plot of one pathway's score by cell type with a Wilcoxon test label.
199 | # `pathway_id` names a column of path_score; `title` is the plot title.
200 | # Returns the ggplot object.
201 | pathway_boxplot <- function(pathway_id, title) {
202 |   ggplot(
203 |     data = path_score,
204 |     aes(x = type, y = .data[[pathway_id]], fill = factor(type))
205 |   ) +
206 |     geom_boxplot(size = 1) +
207 |     theme_prism() +
208 |     stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
209 |     theme(legend.position = "none") +
210 |     labs(y = "Pathway Score", title = title) +
211 |     theme(axis.title.x = element_blank())
212 | }
213 | 
214 | p1 <- pathway_boxplot("hsa04380", "Osteoclast differentiation")
215 | p2 <- pathway_boxplot("hsa04940", "Type I diabetes mellitus")
216 | p3 <- pathway_boxplot("hsa04650", "Natural killer cell mediated cytotoxicity")
217 | p4 <- pathway_boxplot("hsa04978", "Mineral absorption")
218 | p5 <- pathway_boxplot("hsa05322", "Systemic lupus erythematosus")
219 | p6 <- pathway_boxplot(
220 |   "hsa05208",
221 |   "Chemical carcinogenesis - reactive oxygen species"
222 | )
223 | 
224 | 
225 | 
226 | # Place the six panels on a 3 x 2 grid: draw_plot(plot, x, y, width, height).
227 | # Row at y = 0: p3, p2, p1; row at y = 0.5: p6, p4, p5.
228 | 
229 | ggdraw() +
230 |   draw_plot(p3, 0, 0, 0.33, 0.5) +
231 |   draw_plot(p2, 0.33, 0, 0.33, 0.5) +
232 |   draw_plot(p1, 0.66, 0, 0.33, 0.5) +
233 |   draw_plot(p6, 0, 0.5, 0.33, 0.5) +
234 |   draw_plot(p4, 0.33, 0.5, 0.33, 0.5) +
235 |   draw_plot(p5, 0.66, 0.5, 0.33, 0.5)
236 | 
237 | 
238 | 
239 | 
240 | # Genes of the pathways whose V2 value in path_impor1 exceeds 0.058
241 | # (V2 is presumably the importance score - confirm), then gene_id counts, ascending.
242 | gene_ <- filter(KEGG_pathway_gene, hsa %in% path_impor1[path_impor1$V2 > 0.058, ]$hsa)
243 | sort(table(gene_$gene_id))
244 | 
245 | gene_list <- filter(KEGG_pathway_gene, hsa %in% path_impor[path_impor$V2 > 0.135, ]$hsa)
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | ### heatmap of selected pathways
253 | pathway_data <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
254 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
255 | 
256 | heatpathay <- GSE673_diff_path213 %>% filter(hsa %in% c("hsa00190","hsa04612","hsa04940","hsa05416"))
257 | 
258 | # all_of() makes the selection by an external character vector explicit.
259 | heatmap_data <- pathway_data %>% select(type, all_of(heatpathay$hsa)) %>% arrange(type)
260 | heatmap_data1 <- heatmap_data %>% select(-type)
261 | 
262 | # NOTE(review): heatmap_data2 and heatmap_data3 are not used by the heatmap below.
263 | heatmap_data2 <- t(scale(heatmap_data1))
264 | heatmap_data3 <- heatmap_data2[, c(1:2000, 33001:35000)]
265 | 
266 | # First 2000 malignant rows by descending hsa00190 and first 2000 normal rows
267 | # by descending hsa04612.
268 | tumor_sample <- heatmap_data %>% filter(type == "malignant") %>% arrange(desc(hsa00190))
269 | tumor_sample1 <- tumor_sample[1:2000, ]
270 | normal_sample <- heatmap_data %>% filter(type == "normal") %>% arrange(desc(hsa04612))
271 | normal_sample1 <- normal_sample[1:2000, ]
272 | 
273 | # Drop the first column (type), scale each pathway column, transpose so pathways are rows.
274 | data1 <- rbind(tumor_sample1, normal_sample1)
275 | data2 <- data1[, -1]
276 | data3 <- scale(data2)
277 | # data3 <- data2
278 | data4 <- t(data3)
279 | 
280 | library(ComplexHeatmap)
281 | library(circlize)
282 | sample_group <- data.frame(cluster = rep(c("malignant", "normal"), each = 2000))
283 | col_fun <- colorRamp2(c(-2.5, 0, 2.5), c("#00FF00", "#3B3B3B", "#EE0000"))
284 | # `border` is an argument of HeatmapAnnotation() itself, so it stays outside the `col` list.
285 | # NOTE(review): top_anno is built but not passed to Heatmap() below.
286 | top_anno <- HeatmapAnnotation(Cluster = sample_group$cluster,
287 |                               col = list(Cluster = c("malignant" = "#F8766D", "normal" = "#00BFC4")),
288 |                               border = TRUE)
289 | column_split <- sample_group$cluster
290 | 
291 | ComplexHeatmap::Heatmap(data4, cluster_rows = FALSE, cluster_columns = FALSE, name = " ",
292 |                         show_column_names = FALSE, show_row_names = TRUE, show_heatmap_legend = FALSE,
293 |                         col = col_fun, column_split = column_split, row_title = "Pathway")
294 | 
295 | 
296 | 
297 | 
298 | 
299 | 
300 | 
301 | # Bar chart of the values in rows 5-8 of cell_statistics.csv, one bar per sample, filled by variable.
302 | a <- fread("./model_built/pathway_score_213/model_result/cell_statistics.csv")
303 | b <- a[5:8, ]
304 | # melt() is called without explicit id/measure columns, so it chooses them itself.
305 | data1 <- melt(b)
306 | data1$sample <- factor(data1$sample, levels = c("ATC", "TNBC", "IDC", "DCIS"))
307 | ggplot(data1, aes(x = sample, y = value, fill = variable)) +
308 |   geom_bar(stat = "identity") +
309 |   scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
310 |   theme_prism() +
311 |   labs(y = "Number of cells") +
312 |   theme(axis.title.x = element_blank())
313 | 
314 | 
315 | 
316 | 
317 | 


--------------------------------------------------------------------------------
/inst/analysis/pathway_select.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | remove(list = ls())
  6 | setwd("~/project/mcIdentify/data/")
  7 | # model
  8 | ## GSE148673
  9 | 
 10 | library(data.table)
 11 | library(dplyr)
 12 | infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
 13 | expr_data1 <- fread("./model_built/datasets/GSE148673_tpm.txt")
 14 | 
 15 | # Transpose the expression table: the values of column V1 become the column
 16 | # names of expr_matrix and the remaining columns become its rows.
 17 | expr_data <- expr_data1
 18 | rownames(expr_data) <- expr_data1$V1
 19 | expr_matrix <- expr_data[, -1]
 20 | expr_matrix <- as.data.frame(t(expr_matrix))
 21 | colnames(expr_matrix) <- rownames(expr_data)
 22 | 
 23 | data1 <- expr_matrix
 24 | 
 25 | ## pathway score
 26 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 27 | 
 28 | # Relabel "T" as "malignant" and any other non-missing value as "normal" (plain mutate() instead of deprecated mutate_at()/funs()).
 29 | infor_data1 <- infor_data1 %>% mutate(cluster.pred = ifelse(cluster.pred == "T", "malignant", "normal"))
 30 | 
 31 | 
 32 | 
 33 | # Arithmetic mean of `a`: the sum of its elements divided by their count.
 34 | myFun1 <- function(a) {
 35 |   total <- sum(a)
 36 |   total / length(a)
 37 | }
 38 | 
 39 | 
 40 | # Score every pathway as the per-row mean (myFun1) of its genes that are
 41 | # columns of data1. The pathway list comes from the annotation instead of a
 42 | # hard-coded count, and the columns are bound once rather than grown by cbind().
 43 | pathway_ids <- names(table(pathway_gene$hsa))
 44 | 
 45 | score_list <- lapply(pathway_ids, function(id) {
 46 |   gene <- pathway_gene %>% filter(hsa == id)
 47 |   a <- data1 %>% select(all_of(gene$gene_id[gene$gene_id %in% colnames(data1)]))
 48 |   apply(a, 1, myFun1)
 49 | })
 50 | names(score_list) <- pathway_ids
 51 | 
 52 | # One column per pathway; row names are carried over from data1 by apply().
 53 | pathway_score <- as.data.frame(score_list, check.names = FALSE)
 54 | 
 55 | 
 56 | 
 57 | 
 58 | 
 59 | 
 60 | ##test
 61 | 
 62 | # Join the pathway scores onto the annotation table, keep rows labelled malignant or
 63 | # normal, and keep the label (renamed "type") plus one column per pathway.
 64 | diff_path <- pathway_score
 65 | diff_path$barcode <- rownames(diff_path)
 66 | diff_path <- infor_data1 %>%
 67 |   left_join(diff_path) %>%
 68 |   filter(cluster.pred %in% c("malignant", "normal")) %>%
 69 |   select(type = cluster.pred, names(table(pathway_gene$hsa)))
 70 | saveRDS(diff_path,"./model_built/GSE673_all_pathway_score.rds")
 71 | 
 72 | 
 73 | ###boxplot
 74 | library(ggpubr)
 75 | library(ggprism)
 76 | library(ggplot2)
 77 | # diff_path has no PredictionRefined column; its group label column is `type`.
 78 | plot1 <- ggplot(data = diff_path, aes(x = type, y = hsa00010)) +
 79 |   geom_boxplot(size = 1) +
 80 |   stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
 81 |   theme_prism() +
 82 |   labs(y = "Pathway Score") +
 83 |   theme(axis.title.x = element_blank())
 84 | plot1
 85 | 
 86 | 
 87 | ##testing
 88 | tumor_sample <- diff_path %>% filter(type == "malignant")
 89 | normal_sample <- diff_path %>% filter(type == "normal")
 90 | 
 91 | tumor_infor <- infor_data1 %>% filter(cluster.pred == "malignant")
 92 | 
 93 | diff_path2 <- diff_path %>% select(-type)
 94 | 
 95 | # Only test pathways whose score is exactly 0 in fewer than 1% of the rows.
 96 | zero_limit <- nrow(diff_path2) * 0.01
 97 | few_zeros <- vapply(diff_path2, function(score) sum(score == 0) < zero_limit, logical(1))
 98 | tested <- names(few_zeros)[few_zeros]
 99 | 
100 | # Wilcoxon rank-sum test (unpaired, no continuity correction) of malignant
101 | # versus normal scores; one p-value per tested pathway.
102 | pvalues <- vapply(tested, function(name) {
103 |   wilcox.test(tumor_sample[[name]], normal_sample[[name]],
104 |               paired = FALSE, correct = FALSE)$p.value
105 | }, numeric(1))
106 | 
107 | # Build the result in one step (no NA seed row, no rbind() inside a loop) and
108 | # keep the pathway id as a real column so it does not depend on row names.
109 | test_value <- data.frame(pvalue = unname(pvalues), hsa = tested,
110 |                          row.names = tested, stringsAsFactors = FALSE)
111 | 
112 | ## select pathway
113 | test_value2 <- test_value %>%
114 |   filter(pvalue < 0.05) %>%
115 |   arrange(pvalue)
116 | 
117 | 
118 | 
119 | 
120 | ##pathway
121 | # One row per pathway id with its pathway_id annotation (first occurrence kept).
122 | pathway <- pathway_gene %>% distinct(hsa, .keep_all = TRUE) %>% select(hsa, pathway_id)
123 | 
124 | # Keep only the pathways whose p-value is exactly 0, then save that list.
125 | path_test <- test_value2 %>% left_join(pathway) %>% filter(pvalue == 0)
126 | saveRDS(path_test,"./model_built/GSE673_diff_path.rds")
127 | 
128 | testdata <- diff_path %>% select(type,path_test$hsa)
129 | fwrite(testdata,"./model_built/GSE673_pathway_score.csv")
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/inst/analysis/result_analysis.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | remove(list = ls())
  4 | setwd("~/project/mcIdentify/data/")
  5 | 
  6 | library(data.table)
  7 | library(dplyr)
  8 | library(ggplot2)
  9 | library(ggpubr)
 10 | library(ggprism)
 11 | library(cowplot)
 12 | 
 13 | # Box plot of one performance metric per method, with every row also drawn as
 14 | # a jittered point filled by `datasets`.
 15 | #   long_data      melted results with columns method, datasets, variable, value
 16 | #   metric_name    value of `variable` to plot (e.g. "f1")
 17 | #   method_levels  order of the methods on the x axis
 18 | #   y_label, title y-axis label and plot title
 19 | # Returns the ggplot object.
 20 | metric_boxplot <- function(long_data, metric_name, method_levels, y_label, title) {
 21 |   metric_data <- long_data %>% filter(variable == metric_name)
 22 |   metric_data$method <- factor(metric_data$method, levels = method_levels)
 23 |   ggplot(metric_data, aes(x = method, y = value)) +
 24 |     stat_boxplot(geom = "errorbar", width = 0.15) +
 25 |     geom_boxplot(size = 0.5, fill = "#E8E8E8", outlier.fill = "white", outlier.color = "white") +
 26 |     geom_jitter(aes(fill = datasets), width = 0.05, shape = 21, size = 3) +
 27 |     scale_fill_manual(values = c("#E69F00", "#0072B2", "#F0E442", "red", "blue")) +
 28 |     scale_color_manual(values = c("black")) +
 29 |     ylim(0, 1.01) +
 30 |     theme_prism(border = TRUE) +
 31 |     labs(y = y_label, x = "", title = title) +
 32 |     theme(legend.text = element_text(size = 13, family = "sans"))
 33 | }
 34 | 
 35 | ## method
 36 | model_result <- fread("./model_built/pathway_score_213/model_result/method.csv")
 37 | data1 <- melt(model_result)
 38 | method_levels <- c("mcIdentify", "ikarus", "SCINA", "scMRMA")
 39 | 
 40 | p1 <- metric_boxplot(data1, "f1", method_levels,
 41 |                      "F1 score", "F1 score of different methods")
 42 | p1
 43 | p2 <- metric_boxplot(data1, "accuracy", method_levels,
 44 |                      "Accuracy", "Accuracy of different methods")
 45 | p2
 46 | p3 <- metric_boxplot(data1, "recall", method_levels,
 47 |                      "Recall", "Recall of different methods")
 48 | p3
 49 | # "precisoon" is kept because it has to match the metric name in the input
 50 | # table. NOTE(review): confirm the spelling used in method.csv.
 51 | p4 <- metric_boxplot(data1, "precisoon", method_levels,
 52 |                      "Precision", "Precision of different methods")
 53 | p4
 54 | 
 55 | prow <- plot_grid(
 56 |   p1 + theme(legend.position = "none"),
 57 |   p2 + theme(legend.position = "none"),
 58 |   p3 + theme(legend.position = "none"),
 59 |   p4 + theme(legend.position = "none"),
 60 |   align = "vh",
 61 |   labels = c("A", "B", "C", "D"),
 62 |   hjust = -1,
 63 |   nrow = 2
 64 | )
 65 | prow
 66 | legend <- get_legend(
 67 |   p1 + theme(legend.box.margin = margin(0, 0, 0, 12))
 68 | )
 69 | 
 70 | plot_grid(prow, legend, rel_widths = c(3, .4))
 71 | 
 72 | 
 73 | ## model framework
 74 | model_result <- fread("./model_built/pathway_score_213/model_result/model.csv")
 75 | data1 <- melt(model_result)
 76 | # The second column of the melted table is renamed so metric_boxplot() can use it.
 77 | colnames(data1)[2] <- "method"
 78 | method_levels <- c("DNN", "FR", "LR", "SVM", "XGBOOST")
 79 | 
 80 | p1 <- metric_boxplot(data1, "f1", method_levels,
 81 |                      "F1 score", "F1 score of different model frameworks")
 82 | p1
 83 | p2 <- metric_boxplot(data1, "accuracy", method_levels,
 84 |                      "Accuracy", "Accuracy of different model frameworks")
 85 | p2
 86 | p3 <- metric_boxplot(data1, "recall", method_levels,
 87 |                      "Recall", "Recall of different model frameworks")
 88 | p3
 89 | # NOTE(review): "precisoon" must match the metric name in model.csv - confirm.
 90 | p4 <- metric_boxplot(data1, "precisoon", method_levels,
 91 |                      "Precision", "Precision of different model frameworks")
 92 | p4
 93 | 
 94 | prow <- plot_grid(
 95 |   p1 + theme(legend.position = "none"),
 96 |   p2 + theme(legend.position = "none"),
 97 |   p3 + theme(legend.position = "none"),
 98 |   p4 + theme(legend.position = "none"),
 99 |   align = "vh",
100 |   labels = c("A", "B", "C", "D"),
101 |   hjust = -1,
102 |   nrow = 2
103 | )
104 | prow
105 | legend <- get_legend(
106 |   p1 + theme(legend.box.margin = margin(0, 0, 0, 12))
107 | )
108 | 
109 | plot_grid(prow, legend, rel_widths = c(3, .4))


--------------------------------------------------------------------------------
/inst/analysis/simulation.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ## pathway_simulation
  3 | # Box plot of the "precision:" rows at each simulation level (columns 0%-90%).
  4 | # One result table per level is read from <N>_pathway.txt and stored as the
  5 | # column labelled N/2 percent.
  6 | setwd("~/project/mcIdentify/data/")
  7 | remove(list = ls())
  8 | library(data.table)
  9 | library(dplyr)
 10 | library(ggpubr)
 11 | library(ggprism)
 12 | library(ggplot2)
 13 | library(cowplot)
 14 | 
 15 | result_dir <- "./model_built/pathway_score_213/model_result/mcIdentify_simulation_pathway/"
 16 | data <- fread(paste0(result_dir, "10_pathway.txt"))
 17 | colnames(data) <- c("simulation", "measure", "5%")
 18 | for (number in c(20, 40, 60, 80, 100, 120, 140, 160, 180)) {
 19 |   data1 <- fread(paste0(result_dir, number, "_pathway.txt"))
 20 |   cname <- paste0(number / 2, "%")
 21 |   data[, cname] <- data1$V3
 22 | }
 23 | 
 24 | # Hard-coded "0%" baseline: four values recycled 100 times (400 rows).
 25 | # NOTE(review): presumably one value per measure row of each simulation - confirm.
 26 | data[, "0%"] <- rep(c(0.98, 0.98, 0.99, 0.98), 100)
 27 | # id.vars are named explicitly so melt() does not have to guess them.
 28 | data2 <- melt(data, id.vars = c("simulation", "measure"))
 29 | data2$variable <- factor(data2$variable, levels = c("0%", "5%", "10%", "20%", "30%", "40%",
 30 |                                                     "50%", "60%", "70%", "80%", "90%"))
 31 | 
 32 | draw_data <- data2 %>% filter(measure == "precision:")
 33 | 
 34 | ggplot(data = draw_data, aes(x = variable, y = value, fill = variable)) +
 35 |   geom_boxplot(size = 1) +  # draw_quantiles is a geom_violin() argument and was dropped
 36 |   theme_prism(border = TRUE) +
 37 |   theme(legend.position = "none", axis.title.x = element_blank()) +
 38 |   labs(y = "Precision", title = " ") +
 39 |   geom_hline(aes(yintercept = 0.9), linetype = 5, col = "red") +
 40 |   geom_hline(aes(yintercept = 0.8), linetype = 5, col = "red") +
 41 |   # Breaks and limits share one scale: a trailing ylim() would replace this
 42 |   # scale and silently discard the custom breaks.
 43 |   scale_y_continuous(breaks = c(0.5, 0.6, 0.7, 0.8, 0.9, 1), limits = c(0.5, 1)) +
 44 |   scale_fill_manual(values = c("#F27970", "#BB9727", "#54B345", "#32B897",
 45 |                                "#05B9E2", "#8983BF", "#C76DA2", "#F27970",
 46 |                                "#BB9727", "#54B345", "#32B897"))
 47 | 
 48 | ## simulation gene
 49 | # Grouped bar chart of the mean recall of mcIdentify, scMRMA, SCINA and ikarus
 50 | # for each gene-count label (500-2500).
 51 | setwd("~/project/mcIdentify/data/")
 52 | remove(list = ls())
 53 | library(data.table)
 54 | library(dplyr)
 55 | library(ggpubr)
 56 | library(ggprism)
 57 | library(ggplot2)
 58 | library(cowplot)
 59 | 
 60 | mcIdentify <- fread("./model_built/pathway_score_213/model_result/simulation_gene/mcIdentify.txt")
 61 | SCINA <- fread("./model_built/pathway_score_213/model_result/simulation_gene/SCINA.txt")
 62 | scMRMA <- fread("./model_built/pathway_score_213/model_result/simulation_gene/scMRMA.txt")
 63 | ikarus <- fread("./model_built/pathway_score_213/model_result/simulation_gene/ikarus_pred.txt")
 64 | colnames(SCINA) <- c("ID", "measure", "SCINA")
 65 | colnames(mcIdentify) <- c("ID", "measure", "mcIdentify")
 66 | colnames(ikarus) <- c("ID", "measure", "ikarus")
 67 | colnames(scMRMA) <- c("ID", "measure", "scMRMA")
 68 | 
 69 | # ID and measure are the only columns the four tables share, so they are the join keys.
 70 | join_keys <- c("ID", "measure")
 71 | data <- left_join(mcIdentify, ikarus, by = join_keys) %>%
 72 |   left_join(SCINA, by = join_keys) %>%
 73 |   left_join(scMRMA, by = join_keys) %>%
 74 |   arrange(ID)
 75 | # NOTE(review): labels are assigned by position after arrange(ID) and assume
 76 | # 200 rows in blocks of 40 - confirm against the ID values.
 77 | data$infor <- c(rep("1000gene", 40), rep("1500gene", 40), rep("2000gene", 40), rep("2500gene", 40), rep("500gene", 40))
 78 | data1 <- melt(data, id.vars = c("ID", "measure", "infor"))
 79 | data1$variable <- factor(data1$variable, levels = c("mcIdentify", "scMRMA", "SCINA", "ikarus"))
 80 | data1$infor <- factor(data1$infor, levels = c("500gene", "1000gene", "1500gene", "2000gene", "2500gene"))
 81 | 
 82 | # Only recall is plotted; the former "F1:" filter was overwritten before it was used.
 83 | draw_data <- data1 %>% filter(measure == "recall:")
 84 | a <- draw_data %>%
 85 |   group_by(variable, infor) %>%
 86 |   summarise(value = mean(value), .groups = "drop") %>%
 87 |   rename(method = variable, gene = infor)
 88 | V1 <- c("500gene", "1000gene", "1500gene", "2000gene", "2500gene")
 89 | ggplot(a, aes(x = gene, y = value, fill = method)) +
 90 |   geom_bar(position = "dodge", stat = "identity") +
 91 |   labs(x = "Number of random genes", y = "Recall") +
 92 |   theme_prism(border = FALSE) +
 93 |   geom_hline(aes(yintercept = 0.9), linetype = 5, col = "red") +
 94 |   scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.9)) +
 95 |   scale_x_discrete(breaks = V1, labels = c("500", "1000", "1500", "2000", "2500")) +
 96 |   scale_fill_manual(values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
 97 |                     breaks = c("mcIdentify", "scMRMA", "SCINA", "ikarus"),
 98 |                     labels = c("mcIdentify", "scMRMA", "SCINA", "ikarus"))
 99 | ## 4pathway simulation
100 | # Compares the "F1:" rows of the with4 and without4 pathway result files with
101 | # a box plot and a Wilcoxon test.
102 | setwd("~/project/mcIdentify/data/")
103 | remove(list = ls())
104 | library(data.table)
105 | library(dplyr)
106 | library(ggplot2)  # plotting packages attached here so this section runs on its own
107 | library(ggprism)
108 | library(ggpubr)
109 | 
110 | data1 <- fread("./model_built/pathway_score_213/model_result/simulation_pathway2/36_without4_pathway.txt")
111 | data1$pathway <- "withoutpathway"
112 | data2 <- fread("./model_built/pathway_score_213/model_result/simulation_pathway2/40_with4_pathway.txt")
113 | data2$pathway <- "withpathway"
114 | 
115 | data3 <- rbind(data2, data1)
116 | data3$pathway <- factor(data3$pathway, levels = c("withpathway", "withoutpathway"))
117 | 
118 | draw_data <- data3 %>% filter(V2 == "F1:")
119 | 
120 | ggplot(data = draw_data, aes(x = pathway, y = V3, fill = pathway)) +
121 |   geom_boxplot(size = 1) +  # draw_quantiles is a geom_violin() argument and was dropped
122 |   theme_prism(border = FALSE) +
123 |   theme(legend.position = "none", axis.title.x = element_blank()) +
124 |   labs(y = "F1 score", title = " ") +
125 |   stat_compare_means(method = "wilcox.test") +
126 |   # Limits go into the same scale as the breaks; a trailing ylim() would
127 |   # replace this scale and discard the breaks.
128 |   scale_y_continuous(breaks = c(0.5, 0.6, 0.7, 0.8, 0.9, 1), limits = c(0.5, 1)) +
129 |   scale_fill_manual(values = c("#91D1C2FF", "#FDAF91FF"))
130 | 
131 | 
132 | 
133 | ##simulation gene 2
134 | # Grouped bar chart of the "f1" rows of simulation_gene.csv: one bar per
135 | # method within each gene-count label (100-500).
136 | library(data.table)
137 | library(dplyr)
138 | library(ggpubr)
139 | library(ggprism)
140 | library(ggplot2)
141 | library(cowplot)
142 | 
143 | V1 <- c("100gene", "200gene", "300gene", "400gene", "500gene")
144 | 
145 | data1 <- fread("./model_built/pathway_score_213/model_result/simulation_gene.csv")
146 | data1$infor <- rep(V1, 4)  # the five labels, repeated four times in row order
147 | 
148 | data2 <- melt(data1)
149 | # "ikraus" is kept as spelled here; the legend relabels it "ikarus" below.
150 | data2$method <- factor(data2$method, levels = c("TCfinder", "scMRMA", "SCINA", "ikraus"))
151 | data2$infor <- factor(data2$infor, levels = V1)
152 | 
153 | a <- data2 %>% dplyr::filter(variable == "f1")
154 | 
155 | ggplot(a, aes(x = infor, y = value, fill = method)) +
156 |   geom_bar(position = "dodge", stat = "identity") +
157 |   geom_hline(aes(yintercept = 0.95), linetype = 5, col = "red") +
158 |   labs(x = "Number of randomly inactivate genes", y = "F1 Score") +
159 |   theme_prism(border = FALSE) +
160 |   scale_x_discrete(breaks = V1, labels = c("100", "200", "300", "400", "500")) +
161 |   scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.95, 1)) +
162 |   scale_fill_manual(
163 |     values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
164 |     breaks = c("TCfinder", "scMRMA", "SCINA", "ikraus"),
165 |     labels = c("TCfinder", "scMRMA", "SCINA", "ikarus")
166 |   )
167 | 
168 | 


--------------------------------------------------------------------------------
/inst/analysis/simulation_gene.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # Gene-dropout simulation on GSE148673.
  4 | # For every cell (column), `number` randomly chosen expressed genes (value > 0)
  5 | # are set to 0. The perturbed matrix is written to disk and read back, each
  6 | # cell is scored with the mean expression of every selected pathway, the
  7 | # scores are joined with the cell annotation and the labelled table is saved.
  8 | setwd("~/project/mcIdentify/data/")
  9 | remove(list = ls())
 10 | library(data.table)
 11 | library(dplyr)
 12 | 
 13 | all_data <- fread("./processed_data/GSE148673_tpm.txt")
 14 | border_data <- as.data.frame(all_data[, -1])
 15 | rownames(border_data) <- all_data$V1
 16 | 
 17 | # Inputs that do not depend on `number` are read once, before the loop.
 18 | infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
 19 | infor_data1 <- infor_data1 %>%
 20 |   mutate(type = case_when(cluster.pred == "T" ~ "malignant",
 21 |                           cluster.pred == "N" ~ "normal"))
 22 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
 23 | GSE673_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
 24 | score_gene <- pathway_gene %>% filter(hsa %in% GSE673_diff_path3$hsa)
 25 | # Every pathway present in score_gene is scored (previously a hard-coded 1:213).
 26 | pathway_ids <- sort(unique(as.character(score_gene$hsa)))
 27 | 
 28 | for (number in c(20)) {
 29 |   ## border gene select
 30 |   testdata <- border_data
 31 |   # Loop over the actual columns instead of a hard-coded 1:35727.
 32 |   for (i in seq_len(ncol(border_data))) {
 33 |     expressed <- which(border_data[[i]] > 0)
 34 |     # Draw positions with sample.int(): sample(x, n) samples from 1:x when x has
 35 |     # length one, and fails when fewer than n genes are expressed.
 36 |     n_drop <- min(number, length(expressed))
 37 |     dropped <- expressed[sample.int(length(expressed), n_drop)]
 38 |     testdata[dropped, i] <- 0
 39 |   }
 40 | 
 41 |   write_filename_border <- paste0("./model_built/pathway_score_213/simulation_gene/GSE148673_tpm_", number, ".txt")
 42 |   fwrite(testdata, write_filename_border, row.names = TRUE)
 43 | 
 44 |   #### pathway score
 45 |   # The file is read back; its first column (the row names written above) is
 46 |   # referred to as V1 below.
 47 |   data <- fread(write_filename_border)
 48 |   expr_data <- data %>% filter(V1 %in% as.character(pathway_gene$gene_id))
 49 | 
 50 |   # Transpose to cells x genes. Gene names come from V1 directly rather than
 51 |   # from rownames(), which a data.table does not keep.
 52 |   data1 <- as.data.frame(t(as.matrix(expr_data[, -1])))
 53 |   colnames(data1) <- expr_data$V1
 54 | 
 55 |   ## pathway score: per-cell mean over the pathway genes found in the matrix
 56 |   score_list <- lapply(pathway_ids, function(pathway_id) {
 57 |     gene <- score_gene %>% filter(hsa == pathway_id)
 58 |     present <- intersect(as.character(gene$gene_id), colnames(data1))
 59 |     rowMeans(data1[, present, drop = FALSE])
 60 |   })
 61 |   names(score_list) <- pathway_ids
 62 |   diff_path <- as.data.frame(score_list, check.names = FALSE)
 63 |   diff_path$barcode <- rownames(data1)
 64 | 
 65 |   # infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
 66 |   # NOTE(review): natural join on the columns shared with the annotation,
 67 |   # presumably `barcode` - confirm.
 68 |   diff_path <- left_join(diff_path, infor_data1)
 69 |   # The original selected GSE256_diff_path3$hsa, an object this script never creates.
 70 |   diff_path <- diff_path %>% select(type, GSE673_diff_path3$hsa) %>% na.omit()
 71 | 
 72 |   write_filename <- paste0("./model_built/pathway_score_213/simulation_gene/pathway_score/GSE673_", number, ".csv")
 73 |   fwrite(diff_path, write_filename)
 74 | }


--------------------------------------------------------------------------------
/inst/analysis/umap.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # UMAP of the pathway-score columns of GSE309_predict.csv, coloured by the
  3 | # true label (p1) and by the predicted label (p2), shown side by side.
  4 | setwd("~/project/mcIdentify/data/model_built/pathway_score_213/predict_result/")
  5 | remove(list = ls())
  6 | library(data.table)  # fread()
  7 | library(dplyr)       # %>%, mutate(), case_when(), select()
  8 | library(ggplot2)
  9 | library(umap)
 10 | library(ggprism)
 11 | library(cowplot)
 12 | 
 13 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
 14 | 
 15 | data <- fread("./GSE309_predict.csv", data.table = FALSE)
 16 | colnames(data)[2] <- "predict"
 17 | data1 <- data %>%
 18 |   mutate(true = case_when(type == 0 ~ "malignant", type == 1 ~ "normal")) %>%
 19 |   mutate(predict = case_when(predict == 0 ~ "malignant", predict == 1 ~ "normal")) %>%
 20 |   select(true, predict, GSE673_diff_path213$hsa)
 21 | 
 22 | # Embed every column after `true` and `predict`; the former 3:215 only worked
 23 | # for exactly 213 pathway columns.
 24 | umap1 <- umap::umap(data1[, -(1:2), drop = FALSE])
 25 | umap2 <- umap1$layout
 26 | 
 27 | df1 <- data.frame(umap2, data1$true)
 28 | df1$data1.true <- as.factor(df1$data1.true)
 29 | p1 <- ggplot(data = df1, aes(x = X1, y = X2, color = data1.true)) +
 30 |   geom_point(size = 0.5) + labs(x = "UMAP1", y = "UMAP2", color = "") +
 31 |   guides(fill = "none") + theme_classic() + scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
 32 |   scale_colour_manual(values = c("#F8766D", "#00BFC4")) + theme_prism(border = TRUE) + ggtitle("GSE309 True") +
 33 |   theme(axis.text = element_blank(), axis.ticks = element_blank())
 34 | p1
 35 | 
 36 | df2 <- data.frame(umap2, data1$predict)
 37 | df2$data1.predict <- as.factor(df2$data1.predict)
 38 | p2 <- ggplot(data = df2, aes(x = X1, y = X2, color = data1.predict)) +
 39 |   geom_point(size = 0.5) + labs(x = "UMAP1", y = "UMAP2", color = "") +
 40 |   guides(fill = "none") + theme_classic() + scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
 41 |   scale_colour_manual(values = c("#F8766D", "#00BFC4")) + theme_prism(border = TRUE) + ggtitle("GSE309 Predict") +
 42 |   theme(axis.text = element_blank(), axis.ticks = element_blank())
 43 | p2
 44 | 
 45 | # Both panels in one row without their legends; the legend taken from p1 goes to the right.
 46 | prow <- plot_grid(
 47 |   p1 + theme(legend.position = "none"),
 48 |   p2 + theme(legend.position = "none"),
 49 |   align = "vh",
 50 |   labels = c(),
 51 |   hjust = -1,
 52 |   nrow = 1
 53 | )
 54 | prow
 55 | legend <- get_legend(
 56 |   p1 + theme(legend.box.margin = margin(0, 5, 0, 5), legend.text = element_text(size = 15, family = "sans"))
 57 | )
 58 | plot_grid(prow, legend, rel_widths = c(4, .7))
 59 | # UMAP of the pathway-score columns of GOSH_predict.csv, coloured by the labels
 60 | # read from XGBoost_309_predict.csv and shown under four framework titles.
 61 | setwd("~/project/mcIdentify/data/model_built/pathway_score_213/predict_result/")
 62 | remove(list = ls())
 63 | library(data.table)  # fread()
 64 | library(dplyr)       # %>%, mutate(), case_when(), select()
 65 | library(ggplot2)
 66 | library(umap)
 67 | library(ggprism)
 68 | library(cowplot)
 69 | 
 70 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")
 71 | 
 72 | data <- fread("./GOSH_predict.csv", data.table = FALSE)
 73 | colnames(data)[2] <- "predict"
 74 | data1 <- data %>%
 75 |   mutate(true = case_when(type == 0 ~ "malignant", type == 1 ~ "normal")) %>%
 76 |   mutate(predict = case_when(predict == 0 ~ "malignant", predict == 1 ~ "normal")) %>%
 77 |   select(true, predict, GSE673_diff_path213$hsa)
 78 | 
 79 | # Embed every column after `true` and `predict`; the former 3:215 only worked
 80 | # for exactly 213 pathway columns.
 81 | umap1 <- umap::umap(data1[, -(1:2), drop = FALSE])
 82 | umap2 <- umap1$layout
 83 | 
 84 | # NOTE(review): the embedding is built from GOSH_predict.csv while the labels
 85 | # and the plot titles refer to GSE309; data.frame() below expects matching
 86 | # row counts - confirm the intended input file.
 87 | predict_data <- fread("~/project/mcIdentify/data/model_built/pathway_score_213/framwork_umap/XGBoost_309_predict.csv")
 88 | predict_data <- predict_data[-1, ]  # the first row is discarded
 89 | colnames(predict_data) <- "predict"
 90 | 
 91 | df2 <- data.frame(umap2, predict_data$predict)
 92 | df2$predict_data.predict <- as.factor(df2$predict_data.predict)
 93 | 
 94 | # Builds one UMAP scatter panel of `df`, coloured by predict_data.predict,
 95 | # under the given title.
 96 | framework_panel <- function(df, title) {
 97 |   ggplot(data = df, aes(x = X1, y = X2, color = predict_data.predict)) +
 98 |     geom_point(size = 0.5) + labs(x = "UMAP1", y = "UMAP2", color = "") +
 99 |     guides(fill = "none") + theme_classic() + scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
100 |     scale_colour_manual(values = c("#F8766D", "#00BFC4")) + theme_prism(border = TRUE) + ggtitle(title) +
101 |     theme(axis.text = element_blank(), axis.ticks = element_blank())
102 | }
103 | 
104 | # NOTE(review): all four panels draw the same df2 (the XGBoost labels); only
105 | # the title changes. The LR, RF and SVM panels presumably need their own
106 | # prediction files - confirm.
107 | p3 <- framework_panel(df2, "GSE309 LR Predict")
108 | p3
109 | p4 <- framework_panel(df2, "GSE309 RF Predict")
110 | p4
111 | p5 <- framework_panel(df2, "GSE309 SVM Predict")
112 | p5
113 | p6 <- framework_panel(df2, "GSE309 XGBoost Predict")
114 | p6
115 | 
116 | # 2 x 2 grid of the panels without their legends; the legend taken from p3 goes to the right.
117 | prow <- plot_grid(
118 |   p3 + theme(legend.position = "none"),
119 |   p4 + theme(legend.position = "none"),
120 |   p5 + theme(legend.position = "none"),
121 |   p6 + theme(legend.position = "none"),
122 |   align = "vh",
123 |   labels = c(),
124 |   hjust = -1,
125 |   nrow = 2
126 | )
127 | prow
128 | legend <- get_legend(
129 |   p3 + theme(legend.box.margin = margin(0, 5, 0, 5), legend.text = element_text(size = 15, family = "sans"))
130 | )
131 | 
132 | plot_grid(prow, legend, rel_widths = c(4, .7))
133 | 
134 | 


--------------------------------------------------------------------------------
/inst/extdata/TCfinder.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/inst/extdata/TCfinder.hdf5


--------------------------------------------------------------------------------
/inst/extdata/predict_py.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from tensorflow.keras.models import load_model
 3 | import pandas as pd
 4 | import numpy as np
 5 | 
 6 | def predict_py(path_score,Path):
 7 |   data2 = path_score
 8 |   model = load_model(Path+"/TCfinder.hdf5")
 9 |   predict = model.predict(data2)
10 |   return predict
11 | 


--------------------------------------------------------------------------------
/inst/image/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/inst/image/workflow.png


--------------------------------------------------------------------------------
/man/data_normalized.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data_normalized.R
 3 | \name{data_normalized}
 4 | \alias{data_normalized}
 5 | \title{data normalized}
 6 | \usage{
 7 | data_normalized(expr_data, method = "method", genome = "hg38")
 8 | }
 9 | \arguments{
10 | \item{expr_data}{A single-cell counts expression matrix.}
11 | 
12 | \item{method}{If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required.
13 | For other single-cell sequencing methods, this parameter does not need to be filled in.}
14 | 
15 | \item{genome}{Reference genome, when method = "smart-seq2",
16 | this parameter needs to be filled in, you can choose hg19 and hg38}
17 | }
18 | \value{
19 | A normalized single-cell expression matrix.
20 | }
21 | \description{
22 | Normalize single-cell raw counts matrix.
23 | }
24 | \details{
25 | Input a data.frame where the rows are the gene names and the columns are the sample names.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/pathway_score.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pathway_score.R
 3 | \name{pathway_score}
 4 | \alias{pathway_score}
 5 | \title{pathway score}
 6 | \usage{
 7 | pathway_score(expr_data, normalized = TRUE, method = "method", genome = "hg38")
 8 | }
 9 | \arguments{
10 | \item{expr_data}{Single-cell expression matrix after normalization of the original counts matrix.}
11 | 
12 | \item{normalized}{If the matrix is not normalized, you need to set normalized = FALSE}
13 | 
14 | \item{method}{This parameter is required when normalized = FALSE. If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required.
15 | For other single-cell sequencing methods, this parameter does not need to be filled in.}
16 | 
17 | \item{genome}{This parameter is required when normalized = FALSE. Reference genome, when method = "smart-seq2",
18 | this parameter needs to be filled in, you can choose hg19 and hg38}
19 | }
20 | \value{
21 | A matrix containing 213 pathway scores.
22 | }
23 | \description{
24 | Obtain a pathway score matrix for predicting tumor cells.
25 | }
26 | \details{
27 | Input a sparse matrix, matrix, or data frame where the rows are the gene names and the columns are the sample names. Matrix that can be generated directly using the data_normalized.R function.
28 | }
29 | 


--------------------------------------------------------------------------------
/man/predict_cell.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/predict_cell.R
 3 | \name{predict_cell}
 4 | \alias{predict_cell}
 5 | \title{Cell types prediction.}
 6 | \usage{
 7 | predict_cell(path_score)
 8 | }
 9 | \arguments{
10 | \item{path_score}{The pathway score matrix calculated by the pathway_score function.}
11 | }
12 | \value{
13 | A data.frame containing cell types and predicted values.
14 | }
15 | \description{
16 | Classify tumor cells from normal cells.
17 | }
18 | \details{
19 | Input the pathway score matrix calculated by the pathway_score function.
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(TCfinder)
11 | 
12 | test_check("TCfinder")
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test-data_normalized.R:
--------------------------------------------------------------------------------
1 | test_that("multiplication works", {  # placeholder: only checks 2 * 2, data_normalized() is not exercised
2 |   expect_equal(2 * 2, 4)
3 | })
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test-pathway_score.R:
--------------------------------------------------------------------------------
1 | test_that("multiplication works", {  # placeholder: only checks 2 * 2, pathway_score() is not exercised
2 |   expect_equal(2 * 2, 4)
3 | })
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test-predict_cell.R:
--------------------------------------------------------------------------------
1 | test_that("multiplication works", {  # placeholder: only checks 2 * 2, predict_cell() is not exercised
2 |   expect_equal(2 * 2, 4)
3 | })
4 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/interpretation.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "interpretation"
 3 | output: rmarkdown::html_vignette
 4 | vignette: >
 5 |   %\VignetteIndexEntry{interpretation}
 6 |   %\VignetteEngine{knitr::rmarkdown}
 7 |   %\VignetteEncoding{UTF-8}
 8 | ---
 9 | 
10 | ```{r, include = FALSE}
11 | knitr::opts_chunk$set(
12 |   collapse = TRUE,
13 |   comment = "#>"
14 | )
15 | ```
16 | 
17 | ```{r setup}
18 | library(TCfinder)
19 | ```
20 | 


--------------------------------------------------------------------------------