├── .Rbuildignore ├── .Rhistory ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── data_normalized.R ├── pathway_score.R ├── predict_cell.R └── sysdata.rda ├── README.md ├── TCfinder.Rproj ├── data ├── KEGG_Gene.rda └── TCfinder_Pathway.rda ├── inst ├── analysis │ ├── GSE673_analysis.R │ ├── MLcode.R │ ├── bluk_Anti.R │ ├── bulk_pathway.R │ ├── confusion_matrix.R │ ├── figure1.R │ ├── gene_analysis.R │ ├── model_border_gene.R │ ├── model_build.py │ ├── model_train.pbs │ ├── model_train.py │ ├── other_data_pathscore.R │ ├── pathway_importance.R │ ├── pathway_select.R │ ├── result_analysis.R │ ├── simulation.R │ ├── simulation_gene.R │ └── umap.R ├── extdata │ ├── GOSH_pathway_score.csv │ ├── TCfinder.hdf5 │ └── predict_py.py └── image │ └── workflow.png ├── man ├── data_normalized.Rd ├── pathway_score.Rd └── predict_cell.Rd ├── tests ├── testthat.R └── testthat │ ├── test-data_normalized.R │ ├── test-pathway_score.R │ └── test-predict_cell.R └── vignettes ├── .gitignore └── interpretation.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^TCfinder\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | getwd() 2 | usethis::use_agpl3_license("chenxuwu") 3 | usethis::use_mit_license("chenxuwu") 4 | usethis::use_roxygen_md() 5 | usethis::use_testthat() 6 | load("./data/GSE673_diff_path213.rds") 7 | Diff_path <- readRDS("./data/Diff_path.rds") 8 | pathway_score <- function(normalized_matrix){ 9 | Diff_path <- readRDS("data/Diff_path.rds") 10 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds") 11 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa) 12 | gene_id <- rownames(normalized_matrix) 13 | normalized_matrix <- as.data.frame(t(normalized_matrix)) 14 | colnames(normalized_matrix) <- gene_id 15 | 
myFun1 <- function(number){ 16 | sum(number)/length(number) 17 | } 18 | all_pathway_score <- NA 19 | for (i in 1:213) { 20 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i]) 21 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)]) 22 | path_score <- as.data.frame(apply(data1, 1, myFun1)) 23 | colnames(path_score) <- names(table(score_gene$hsa))[i] 24 | all_pathway_score <- cbind(all_pathway_score,path_score) 25 | } 26 | pathway_score <- all_pathway_score[,-1] 27 | return(pathway_score) 28 | } 29 | file.edit("DESCRIPTION") 30 | devtools::document() 31 | devtools::document() 32 | pwd 33 | getwd() 34 | ?pathway_score 35 | usethis::use_vignette("interpretation") 36 | devtools::check() 37 | devtools::check() 38 | usethis::use_testthat() 39 | usethis::use_test() 40 | pwd 41 | getwd() 42 | library(testthat) 43 | library(TCfinder) 44 | evtools::test() 45 | devtools::test() 46 | load_all() 47 | devtools::load_all() 48 | devtools::check() 49 | devtools::check() 50 | devtools::test() 51 | devtools::test() 52 | devtools::load_all() 53 | devtools::check() 54 | pathway_score <- function(normalized_matrix){ 55 | Diff_path <- readRDS("data/Diff_path.rds") 56 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds") 57 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa) 58 | gene_id <- rownames(normalized_matrix) 59 | normalized_matrix <- as.data.frame(t(normalized_matrix)) 60 | colnames(normalized_matrix) <- gene_id 61 | myFun1 <- function(number){ 62 | sum(number)/length(number) 63 | } 64 | all_pathway_score <- NA 65 | for (i in 1:213) { 66 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i]) 67 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)]) 68 | path_score <- as.data.frame(apply(data1, 1, myFun1)) 69 | colnames(path_score) <- names(table(score_gene$hsa))[i] 70 | all_pathway_score <- 
cbind(all_pathway_score,path_score) 71 | } 72 | pathway_score <- all_pathway_score[,-1] 73 | return(pathway_score) 74 | } 75 | data_normalized <- function(expr_data){ 76 | gene_id <- rownames(expr_data) 77 | data1 <- expr_data %>% apply(2,function(x){x/sum(x) * 10000}) %>% as.data.frame() 78 | data2 <- data1 %>% dplyr::mutate_all(funs(log2(.+1))) 79 | rownames(data2) <- gene_id 80 | data2 <- round(data2,3) 81 | return(data2) 82 | } 83 | devtools::check() 84 | devtools::check() 85 | View(pathway_score) 86 | ("data/Diff_path.rds") 87 | ("data/Diff_path.rds") 88 | Diff_path <- readRDS("data/Diff_path.rds") 89 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds") 90 | pathway_score <- function(normalized_matrix){ 91 | Diff_path <- readRDS("data/Diff_path.rds") 92 | KEGG_gene <- readRDS("data/KEGG_pathway_gene.rds") 93 | score_gene <- KEGG_gene %>% filter(hsa %in% Diff_pathway$hsa) 94 | gene_id <- rownames(normalized_matrix) 95 | normalized_matrix <- as.data.frame(t(normalized_matrix)) 96 | colnames(normalized_matrix) <- gene_id 97 | myFun1 <- function(number){ 98 | sum(number)/length(number) 99 | } 100 | all_pathway_score <- NA 101 | for (i in 1:213) { 102 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i]) 103 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)]) 104 | path_score <- as.data.frame(apply(data1, 1, myFun1)) 105 | colnames(path_score) <- names(table(score_gene$hsa))[i] 106 | all_pathway_score <- cbind(all_pathway_score,path_score) 107 | } 108 | pathway_score <- all_pathway_score[,-1] 109 | return(pathway_score) 110 | } 111 | usethis::use_test() 112 | usethis::use_test() 113 | devtools::check() 114 | library(TCfinder) 115 | devtools::check() 116 | devtools::check() 117 | devtools::check() 118 | Diff_path <- read.csv("data/TCfinder_Pathway.csv") 119 | View(Diff_path) 120 | KEGG_gene <- read.csv("data/KEGG_Gene.csv") 121 | View(KEGG_gene) 122 | 
save(Diff_path,"./data/Diff_path.Rdata") 123 | save(Diff_path,"./data/") 124 | save(Diff_path,"./data/Diff_path.Rda") 125 | save(Diff_path,"./data/Diff_path.rda") 126 | load(file = "./data/KEGG_Gene.rda") 127 | load(file = "./data/KEGG_Gene.rda") 128 | load(file = "./data/TCfinder_Pathway.rda") 129 | load(file = "./data/TCfinder_Pathway.rda") 130 | load(file = "./data/TCfinder_Pathway.rda") 131 | View(TCfinder_Pathway) 132 | remove(list = ls()) 133 | load(file = "./data/KEGG_Gene.rda") 134 | load(file = "./data/TCfinder_Pathway.rda") 135 | score_gene <- KEGG_gene %>% filter(hsa %in% TCfinder_pathway$hsa) 136 | library(dplyr) 137 | score_gene <- KEGG_gene %>% filter(hsa %in% TCfinder_pathway$hsa) 138 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_pathway$hsa) 139 | View(TCfinder_Pathway) 140 | View(TCfinder_Pathway) 141 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_Pathway$hsa) 142 | utils::data() 143 | utils::data() 144 | devtools::check() 145 | usethis::use_data() 146 | usethis::use_data(KEGG_Gene.rda) 147 | usethis::use_data(TCfinder_Pathway.rda) 148 | KEGG_Gene <- KEGG_Gene 149 | TCfinder_Pathway <- TCfinder_Pathway 150 | usethis::use_data(TCfinder_Pathway) 151 | usethis::use_data(KEGG_Gene) 152 | devtools::check() 153 | devtools::check() 154 | devtools::document() 155 | devtools::document() 156 | devtools::check() 157 | devtools::document() 158 | devtools::check() 159 | devtools::document() 160 | devtools::check() 161 | devtools::check() 162 | devtools::document() 163 | devtools::check() 164 | devtools::check() 165 | devtools::document() 166 | devtools::check() 167 | devtools::check() 168 | remove(list = ls()) 169 | KEGG_Gene <- TCfinder::KEGG_Gene.rds 170 | KEGG_Gene <- TCfinder::KEGG_Gene 171 | TCfinder_Pathway <- TCfinder::TCfinder_Pathway 172 | devtools::document() 173 | devtools::check() 174 | library(TCfinder) 175 | devtools::load_all(".") 176 | devtools::document() 177 | data_test <- fread("inst/extdata/tests_score.csv") 178 | 
library(data.table) 179 | data_test <- fread("inst/extdata/tests_score.csv") 180 | library(reticulate) 181 | predict_cell <- function(path_score){ 182 | reticulate::source_python('inst/extdata/predict_py.py') 183 | predict <- predict_py(path_score) 184 | predict_result <- as.data.frame(predict) 185 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 186 | V1 <= 0.5 ~ "tumor")) 187 | colnames(result) <- c("value","cell_type") 188 | return(result) 189 | } 190 | data_test <- fread("inst/extdata/tests_score.csv") 191 | result <- predict_cell(path_score = data_test) 192 | reticulate::py_config() 193 | python 194 | reticulate::repl_python() 195 | reticulate::py_config() 196 | result <- predict_cell(path_score = data_test) 197 | library(reticulate) 198 | reticulate::py_config() 199 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe") 200 | library(reticulate) 201 | reticulate::py_config() 202 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe") 203 | library(reticulate) 204 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe") 205 | reticulate::py_config() 206 | predict_cell <- function(path_score){ 207 | reticulate::source_python('inst/extdata/predict_py.py') 208 | predict <- predict_py(path_score) 209 | predict_result <- as.data.frame(predict) 210 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 211 | V1 <= 0.5 ~ "tumor")) 212 | colnames(result) <- c("value","cell_type") 213 | return(result) 214 | } 215 | data_test <- fread("inst/extdata/tests_score.csv") 216 | library(data.table) 217 | data_test <- fread("inst/extdata/tests_score.csv") 218 | predict_cell <- function(path_score){ 219 | reticulate::source_python('inst/extdata/predict_py.py') 220 | predict <- predict_py(path_score) 221 | predict_result <- as.data.frame(predict) 222 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 223 | V1 <= 0.5 ~ "tumor")) 224 | colnames(result) <- 
c("value","cell_type") 225 | return(result) 226 | } 227 | result <- predict_cell(path_score = data_test) 228 | predict_cell <- function(path_score){ 229 | reticulate::source_python('inst/extdata/predict_py.py') 230 | predict <- predict_py(path_score) 231 | predict_result <- as.data.frame(predict) 232 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 233 | V1 <= 0.5 ~ "tumor")) 234 | colnames(result) <- c("value","cell_type") 235 | return(result) 236 | } 237 | result <- predict_cell(path_score = data_test) 238 | result <- predict_cell(path_score = data_test) 239 | library(data.table) 240 | data_test <- fread("inst/extdata/tests_score.csv") 241 | library(reticulate) 242 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe") 243 | reticulate::py_config() 244 | predict_cell <- function(path_score){ 245 | reticulate::source_python('inst/extdata/predict_py.py') 246 | predict <- predict_py(path_score) 247 | predict_result <- as.data.frame(predict) 248 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 249 | V1 <= 0.5 ~ "tumor")) 250 | colnames(result) <- c("value","cell_type") 251 | return(result) 252 | } 253 | result <- predict_cell(path_score = data_test) 254 | result <- predict_cell(path_score = data_test) 255 | predict_cell <- function(path_score){ 256 | reticulate::source_python('inst/extdata/predict_py.py') 257 | predict <- predict_py(path_score) 258 | predict_result <- as.data.frame(predict) 259 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 260 | V1 <= 0.5 ~ "tumor")) 261 | colnames(result) <- c("value","cell_type") 262 | return(result) 263 | } 264 | result <- predict_cell(path_score = data_test) 265 | reticulate::source_python('inst/extdata/predict_py.py') 266 | predict <- predict_py(data_test) 267 | reticulate::repl_python() 268 | from tensorflow.keras.models import load_model 269 | model = load_model("./inst/extdata/TCfinder.hdf5") 270 | data2 = 
r.data.test 271 | data2 = r.data_test 272 | predict = model.predict(data2) 273 | data_test <- fread("inst/extdata/tests_score.csv") 274 | quit 275 | data2[1:5,1:5] 276 | data_test[1:5,1:5] 277 | class(data_test) 278 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F) 279 | class(data_test) 280 | reticulate::repl_python() 281 | data2 = r.data_test 282 | data2 283 | library(data.table) 284 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F) 285 | library(reticulate) 286 | use_python("D:/Users/wuchx/anaconda3/envs/tensorflow/python.exe") 287 | reticulate::py_config() 288 | reticulate::repl_python() 289 | import pandas as pd 290 | import numpy as np 291 | import pandas as pd 292 | data2 = r.data_test 293 | quit 294 | predict_cell <- function(path_score){ 295 | reticulate::source_python('inst/extdata/predict_py.py') 296 | predict <- predict_py(path_score) 297 | predict_result <- as.data.frame(predict) 298 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 299 | V1 <= 0.5 ~ "tumor")) 300 | colnames(result) <- c("value","cell_type") 301 | return(result) 302 | } 303 | result <- predict_cell(path_score = data_test) 304 | library(dplyr) 305 | result <- predict_cell(path_score = data_test) 306 | View(result) 307 | table(result$cell_type) 308 | devtools::load_all() 309 | sethis::use_testthat(3) 310 | usethis::use_testthat(3) 311 | usethis::use_test() 312 | use_test("predict_cell") 313 | usethis::use_test("predict_cell") 314 | devtools::document() 315 | devtools::check() 316 | library(fs) 317 | fs::path_package("extdata",package = "TCfinder") 318 | fs::path_package("predict_py.py",package = "TCfinder") 319 | fs::path_package("predict_py.py",package = "TCfinder") 320 | fs::path_package("extdata",package = "TCfinder") 321 | Path <- fs::path_package("extdata",package = "TCfinder") 322 | paste0(Path,"predict_py.py") 323 | reticulate::repl_python() 324 | Path = r.Path 325 | Path+"/TCfinder.hdf5" 326 | quit 327 | 
predict_cell <- function(path_score){ 328 | Path <- fs::path_package("extdata",package = "TCfinder") 329 | reticulate::source_python(paste0(Path,"predict_py.py")) 330 | predict <- predict_py(path_score,Path) 331 | predict_result <- as.data.frame(predict) 332 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 333 | V1 <= 0.5 ~ "tumor")) 334 | colnames(result) <- c("value","cell_type") 335 | return(result) 336 | } 337 | result1 <- predict_cell(data_test) 338 | Path <- fs::path_package("extdata",package = "TCfinder") 339 | Path 340 | paste0(Path,"predict_py.py") 341 | predict_cell <- function(path_score){ 342 | Path <- fs::path_package("extdata",package = "TCfinder") 343 | reticulate::source_python(paste0(Path,"/predict_py.py")) 344 | predict <- predict_py(path_score,Path) 345 | predict_result <- as.data.frame(predict) 346 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 347 | V1 <= 0.5 ~ "tumor")) 348 | colnames(result) <- c("value","cell_type") 349 | return(result) 350 | } 351 | data_test <- fread("inst/extdata/tests_score.csv",data.table = F) 352 | result1 <- predict_cell(data_test) 353 | devtools::check() 354 | devtools::check() 355 | devtools::document() 356 | rm(list = c("predict_cell")) 357 | devtools::load_all() 358 | devtools::document() 359 | roxygen2::roxygenise() 360 | devtools::document() 361 | devtools::check() 362 | library(data.table) 363 | a <- fread("./inst/extdata/GOSH_pathway_score.csv") 364 | a[1:5,1:5] 365 | pathway_score <- function(normalized_matrix){ 366 | KEGG_Gene <- TCfinder::KEGG_Gene 367 | TCfinder_Pathway <- TCfinder::TCfinder_Pathway 368 | score_gene <- KEGG_Gene %>% filter(hsa %in% TCfinder_Pathway$hsa) 369 | gene_id <- rownames(normalized_matrix) 370 | barcode <- colnames(normalized_matrix) 371 | normalized_matrix <- as.data.frame(t(normalized_matrix)) 372 | colnames(normalized_matrix) <- gene_id 373 | myFun1 <- function(number){ 374 | sum(number)/length(number) 375 | } 376 
| all_pathway_score <- NA 377 | for (i in 1:213) { 378 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i]) 379 | data1 <- normalized_matrix %>% select(gene$gene_id[which(gene$gene_id %in% colnames(normalized_matrix)==TRUE)]) 380 | path_score <- as.data.frame(apply(data1, 1, myFun1)) 381 | colnames(path_score) <- names(table(score_gene$hsa))[i] 382 | all_pathway_score <- cbind(all_pathway_score,path_score) 383 | } 384 | pathway_score <- all_pathway_score[,-1] 385 | pathway_score <- pathway_score %>% dplyr::select(TCfinder_Pathway$hsa) 386 | rownames(pathway_score) <- barcode 387 | return(pathway_score) 388 | } 389 | predict_cell <- function(path_score){ 390 | barcode <- rownames(path_score) 391 | Path <- fs::path_package("extdata",package = "TCfinder") 392 | reticulate::source_python(paste0(Path,"/predict_py.py")) 393 | predict <- predict_py(path_score,Path) 394 | predict_result <- as.data.frame(predict) 395 | result <- predict_result %>% mutate(cell_type = case_when(V1 > 0.5 ~ "normal", 396 | V1 <= 0.5 ~ "tumor")) 397 | colnames(result) <- c("value","cell_type") 398 | result$barcode <- barcode 399 | return(result) 400 | } 401 | library(TCfinder) 402 | library(TCfinder) 403 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | inst/doc 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: TCfinder 2 | Title: Tumor cell identification in single-cell datasets 3 | Version: 1.2.0.1 4 | Authors@R: c( 5 | person("Chenxu", "Wu", , "wuchx@shanghaitech.edu.cn", role = c("aut", "cre"), 6 | comment = c(ORCID = "0009-0005-7257-4470")), 7 | person("Tao", "Wu", , "wutao2@shanghaitech.edu.cn", role = "aut", 8 | comment = c(ORCID = "0000-0002-8999-9628")), 9 | person("Xue-Song", "Liu", 
role = c("aut", "ctb"), 10 | comment = c(ORCID = "0000-0002-7736-0077")) 11 | ) 12 | Description: Perform normalization and pathway score calculations on single-cell data, and distinguish tumor cells from normal cells in single-cell datasets. 13 | License: MIT + file LICENSE 14 | Encoding: UTF-8 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.2.0 17 | Suggests: 18 | knitr, 19 | rmarkdown, 20 | testthat (>= 3.0.0) 21 | Config/testthat/edition: 3 22 | VignetteBuilder: knitr 23 | Imports: 24 | dplyr, 25 | reticulate, 26 | fs, 27 | Matrix, 28 | methods 29 | Depends: 30 | R (>= 3.5.0) 31 | LazyData: true 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: chenxuwu 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 chenxuwu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(data_normalized) 4 | export(pathway_score) 5 | export(predict_cell) 6 | importFrom(Matrix,Diagonal) 7 | importFrom(Matrix,Matrix) 8 | importFrom(Matrix,colSums) 9 | importFrom(Matrix,t) 10 | importFrom(dplyr,"%>%") 11 | importFrom(dplyr,case_when) 12 | importFrom(dplyr,filter) 13 | importFrom(dplyr,mutate) 14 | importFrom(dplyr,select) 15 | importFrom(methods,is) 16 | importFrom(reticulate,source_python) 17 | -------------------------------------------------------------------------------- /R/data_normalized.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title data normalized 3 | #' @description Normalize single-cell raw counts matrix. 4 | #' @details Input a data.frame where the rows are the gene names and the columns are the sample names. 5 | #' @param expr_data A single-cell counts expression matrix. 6 | #' @param method If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required. 7 | #' For other single-cell sequencing methods, this parameter does not need to be filled in. 8 | #' @param genome Reference genome, when method = "smart-seq2", 9 | #' this parameter needs to be filled in, you can choose hg19 and hg38 10 | #' @return A normalized single-cell expression matrix. 
#' @export
#' @importFrom Matrix Matrix Diagonal colSums
#' @importFrom methods is


data_normalized <- function(expr_data, method = "method", genome = "hg38") {

  # Validate the arguments first so that a typo fails fast, before any
  # (potentially expensive) matrix conversion is attempted.
  if (length(method) != 1L || !method %in% c("method", "smart-seq2")) {
    stop("Method parameter error ")
  }
  if (method == "smart-seq2" &&
      (length(genome) != 1L || !genome %in% c("hg19", "hg38"))) {
    stop("Genome parameter error: genome must be \"hg19\" or \"hg38\"")
  }

  # Work on a column-compressed sparse matrix throughout.
  if (!methods::is(expr_data, "CsparseMatrix")) {
    expr_data <- Matrix::Matrix(as.matrix(expr_data), sparse = TRUE)
  }

  if (method == "method") {
    # Library-size normalisation: scale every cell (column) to 10,000 counts.
    lib_size <- Matrix::colSums(expr_data)
    inv_lib_size <- 1 / lib_size
    # A cell with no counts would give 1 / 0 = Inf; keep such cells at zero.
    inv_lib_size[which(lib_size == 0)] <- 0

    scaled <- expr_data %*% Matrix::Diagonal(x = inv_lib_size) * 10000
    # The diagonal factor carries no dimnames, so restore gene names and
    # barcodes explicitly instead of relying on how `%*%` propagates them.
    dimnames(scaled) <- dimnames(expr_data)

    # log2(x + 1), written with log1p so the matrix stays sparse.
    return(log1p(scaled) / log(2))
  }

  # ---- method == "smart-seq2": TPM-style normalisation by gene length ----
  # `hg19` / `hg38` are not defined in this function; presumably they are the
  # internal gene-length tables shipped in R/sysdata.rda -- TODO confirm.
  gene_length <- if (genome == "hg19") hg19 else hg38
  colnames(gene_length) <- c("gene_name", "Length")

  use_gene_length <- gene_length[gene_length$gene_name %in% rownames(expr_data), ]
  # as.character() guards against a factor column being used as integer codes
  # when indexing rows by name.
  gene_names <- as.character(use_gene_length$gene_name)
  len <- use_gene_length[, 2]

  # drop = FALSE keeps a matrix even for a single cell or a single gene.
  selected_rows <- as.matrix(expr_data[gene_names, , drop = FALSE])

  # Same arithmetic as the per-column formula
  #   x * 1000 * 1e6 / (len * sum(x * 1000 / len))
  # evaluated for all cells at once.
  rate_sum <- colSums(selected_rows * 1000 / len)
  tpm <- round((selected_rows * 1000 * 1000000) / outer(len, rate_sum), 3)
  dimnames(tpm) <- dimnames(selected_rows)

  sparse_tpm <- Matrix::Matrix(tpm, sparse = TRUE)
  log1p(sparse_tpm) / log(2)
}



--------------------------------------------------------------------------------
/R/pathway_score.R:
--------------------------------------------------------------------------------

#' @title pathway score
#' @description Obtain a pathway score matrix for predicting tumor cells.
#' @details Input a sparse matrix, matrix, or data frame where the rows are the gene names and the columns are the sample names. Matrix that can be generated directly using the data_normalized.R function.
#' @param expr_data Single-cell expression matrix after normalization of the original counts matrix.
#' @param normalized If the matrix is not normalized, you need to set normalized = FALSE
#' @param method This parameter is required when normalized = FALSE. If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required.
#' For other single-cell sequencing methods, this parameter does not need to be filled in.
#' @param genome This parameter is required when normalized = FALSE. Reference genome, when method = "smart-seq2",
#' this parameter needs to be filled in, you can choose hg19 and hg38
#' @return A matrix containing 213 pathway scores.
#' @export
#' @importFrom dplyr %>% filter select
#' @importFrom Matrix Matrix t
#' @importFrom methods is

pathway_score <- function(expr_data, normalized = TRUE, method = "method", genome = "hg38") {

  # `normalized` must be a single TRUE/FALSE-like value.
  if (length(normalized) != 1L || is.na(normalized) ||
      !normalized %in% c(TRUE, FALSE)) {
    stop("The normalized parameter is required")
  }
  normalized <- as.logical(normalized)

  if (!methods::is(expr_data, "CsparseMatrix")) {
    expr_data <- Matrix::Matrix(as.matrix(expr_data), sparse = TRUE)
  }

  normalized_matrix <- if (normalized) {
    expr_data
  } else {
    TCfinder::data_normalized(expr_data, method = method, genome = genome)
  }

  KEGG_Gene <- TCfinder::KEGG_Gene
  TCfinder_Pathway <- TCfinder::TCfinder_Pathway

  # The pathways to score, in the column order of the returned table. Taken
  # from the pathway table itself rather than from a hard-coded count.
  pathway_ids <- unique(as.character(TCfinder_Pathway$hsa))
  kegg_hsa <- as.character(KEGG_Gene$hsa)

  missing_ids <- setdiff(pathway_ids, kegg_hsa)
  if (length(missing_ids) > 0) {
    stop("No KEGG genes found for pathway(s): ",
         paste(missing_ids, collapse = ", "))
  }

  # Gene symbols of every pathway, computed once instead of once per iteration.
  keep <- kegg_hsa %in% pathway_ids
  pathway_genes <- split(as.character(KEGG_Gene$gene_id[keep]), kegg_hsa[keep])

  barcode <- colnames(normalized_matrix)
  if (is.null(barcode)) {
    # Normalisation may have dropped the column names; it never reorders cells.
    barcode <- colnames(expr_data)
  }

  # Cells in rows, genes in columns: column subsetting is cheap for a
  # column-compressed sparse matrix.
  cell_by_gene <- Matrix::t(normalized_matrix)
  gene_id <- colnames(cell_by_gene)

  score_list <- lapply(pathway_ids, function(id) {
    in_pathway <- gene_id %in% pathway_genes[[id]]
    # drop = FALSE keeps a matrix when exactly one pathway gene is present.
    # rowMeans() is the mean over the pathway genes found in the data, zeros
    # included (sum / length), computed without densifying the matrix.
    as.numeric(Matrix::rowMeans(cell_by_gene[, in_pathway, drop = FALSE]))
  })
  names(score_list) <- pathway_ids

  scores <- data.frame(score_list, check.names = FALSE, stringsAsFactors = FALSE)
  if (!is.null(barcode)) {
    rownames(scores) <- barcode
  }
  scores
}



--------------------------------------------------------------------------------
/R/predict_cell.R:
--------------------------------------------------------------------------------

#' @title Cell types prediction.
#' @description Classify tumor cells from normal cells.
#' @details Input the pathway score matrix calculated by the pathway_score function.
#' The bundled Python script `predict_py.py` is sourced through reticulate and
#' its prediction is thresholded at 0.5.
#' @param path_score The pathway score matrix calculated by the pathway_score function.
#' @return A data.frame with the predicted value (`value`), the assigned label
#' (`cell_type`: "normal" when the value is above 0.5, otherwise "tumor") and the
#' row names of `path_score` (`barcode`).
#' @export
#' @importFrom reticulate source_python
#' @importFrom dplyr mutate case_when



predict_cell <- function(path_score) {

  # Remember the cell identifiers; they are attached to the result at the end.
  cell_ids <- rownames(path_score)

  # Directory inside the installed package that holds the Python script; it is
  # also handed to the Python function as its second argument.
  ext_dir <- fs::path_package("extdata", package = "TCfinder")
  script <- paste0(ext_dir, "/predict_py.py")

  # Defines predict_py() in this function's environment.
  reticulate::source_python(script)
  raw_prediction <- predict_py(path_score, ext_dir)

  labelled <- dplyr::mutate(
    as.data.frame(raw_prediction),
    cell_type = dplyr::case_when(V1 > 0.5 ~ "normal",
                                 V1 <= 0.5 ~ "tumor")
  )
  colnames(labelled) <- c("value", "cell_type")
  labelled[["barcode"]] <- cell_ids

  labelled
}

--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/R/sysdata.rda
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TCfinder

TCfinder is the tool to distinguish tumor cells from normal cells in single-cell data from the perspective of gene pathway expression quantification. A pathway usually contains multiple genes, which makes TCfinder more applicable because it overcomes the single-cell data sparsity problem faced by traditional methods.
The successful construction of TCfinder also suggests the applicability of gene pathway expression quantification in the annotation of other cell types in scRNA-seq. 4 | 5 | ## Workflow 6 | 7 | ![Image text](inst/image/workflow.png) 8 | 9 | ## Installation and use of TCfinder package. 10 | 11 | TCfinder, as an R package, can be downloaded and used via Github. TCfinder relies on several R packages, and these dependencies include: 12 | 13 | ***R (>= 3.5.0);*** 14 | 15 | ***dplyr (>= 1.1.0);*** 16 | 17 | ***reticulate (>= 1.2.6);*** 18 | 19 | ***Matrix;*** 20 | 21 | ***fs;*** 22 | 23 | ### Install 24 | 25 | ```R 26 | devtools::install_github("XSLiuLab/TCfinder") 27 | ``` 28 | 29 | TCfinder contains three functions, which respectively standardize the raw counts of single cells, score pathways, and predict tumor cells and normal cells. 30 | 31 | ### Data normalization 32 | 33 | The input data needs to be a sparse matrix or data.frame data whose row name is gene name and column name is sample name. 34 | 35 | If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required, and needed to select genome = "hg19" or "hg38". For other single-cell sequencing methods, this parameter does not need to be filled in. 36 | 37 | ```R 38 | library(TCfinder) 39 | result1 <- data_normalized(expr_data = expr_data,method = "method",genome = "hg38") 40 | ``` 41 | 42 | #### Example: 43 | 44 | The row name is gene symbol, and the column name is barcode of the sample. 45 | 46 | | | AAACCTGCACATCCGG | ... | AAACGGGGTTGAACTC | AAACGGGGTTGTCGCG | 47 | | :-----: | :--------------: | ---- | :--------------: | :--------------: | 48 | | FAM138A | 0 | ... | 0 | 1 | 49 | | OR4F5 | 8 | ... | 20 | 15 | 50 | | ... | ... | ... | ... | ... | 51 | | FAM87B | 1 | ... | 0 | 1 | 52 | 53 | ### Pathway score 54 | 55 | The path score is calculated using the built-in 213 pathways according to the formula in workflow. 
56 | 57 | The output of data_normalized() can be directly used as the input of pathway_score(). If the matrix is not normalized, set "normalized = FALSE". 58 | 59 | ```R 60 | result2 <- pathway_score(expr_data = result1, normalized = T) 61 | ``` 62 | 63 | #### result2: pathway score 64 | 65 | | | hsa00010 | hsa00190 | ... | hsa00270 | 66 | | :--------------: | :-------: | :-------: | ---- | :-------: | 67 | | AAACCTGCACATCCGG | 0.3401667 | 0.9679245 | ... | 0.2091803 | 68 | | AAACGGGGTTGAACTC | 0.5657879 | 1.6702925 | ... | 0.4492787 | 69 | | ... | ... | ... | ... | ... | 70 | | AAACGGGGTTGTCGCG | 0.3202879 | 1.4834434 | ... | 0.4590984 | 71 | 72 | ### Prediction of cell type (tumor cell or normal cell) 73 | 74 | The prediction model is developed based on deep learning in Python, so a Python environment and some modules need to be installed and configured before running the prediction. 75 | 76 | #### Python environment and module installation 77 | 78 | ```bash 79 | # Create a new environment 80 | conda create -n new_env python=3.8 81 | # Activate the new environment 82 | conda activate new_env 83 | # Install required modules 84 | conda install tensorflow==2.3.0 85 | conda install pandas==1.0.5 86 | conda install numpy==1.18.5 87 | # View conda environment information 88 | conda env list # Copy the address of the new conda environment, which will be used later 89 | ``` 90 | 91 | #### Predict cell 92 | 93 | The prediction process needs to call a Python script, so the R package 'reticulate' is required. 
The input data is the pathway score result obtained by running the pathway_score() function. 94 | 95 | ```R 96 | install.packages("reticulate") 97 | library(reticulate) 98 | # Use the use_python() function to specify the interpreter; here we use the Python environment created and configured above 99 | reticulate::use_python("XXX/XXX/XXX/anaconda3/envs/new_env/bin/python") 100 | # View specified environment information 101 | reticulate::py_config() 102 | # Predict 103 | predict_result <- predict_cell(path_score = result2) 104 | ``` 105 | 106 | #### predict_result 107 | 108 | | | value | cell_type | barcode | 109 | | :--: | :----------: | :-------: | :--------------: | 110 | | 1 | 0.9996183 | normal | AAACCTGCACATCCGG | 111 | | 2 | 0.9989167 | normal | AAACGGGGTTGAACTC | 112 | | 3 | 0.0001887589 | tumor | AAACGGGGTTGTCGCG | 113 | | ... | ... | ... | ... | 114 | 115 | ## Citation 116 | 117 | Chenxu Wu, Wei Ning, Tao Wu, Jing Chen, Huizi Yao, Ziyu Tao, Xiangyu Zhao, Kaixuan Diao, Jinyu Wang, Weiliang Wang, Xinxing Li, Qianqian Song, Xue-Song Liu. 2024. TCfinder: Robust tumor cell discrimination in scRNA-seq based on gene pathway activity. iMetaOmics 1: e22. https://doi.org/10.1002/imo2.22 118 | 119 | ## Contributors 120 | 121 | TCfinder was developed by Chenxu Wu. Please contact Chenxu Wu (wuchx@shanghaitech.edu.cn) with any questions or suggestions. Thank you for your use and feedback.
122 | 123 | ------ 124 | 125 | **Cancer Biology Group @ShanghaiTech** 126 | 127 | **Research group led by Xue-Song Liu in ShanghaiTech University** 128 | -------------------------------------------------------------------------------- /TCfinder.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /data/KEGG_Gene.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/data/KEGG_Gene.rda -------------------------------------------------------------------------------- /data/TCfinder_Pathway.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/data/TCfinder_Pathway.rda -------------------------------------------------------------------------------- /inst/analysis/GSE673_analysis.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | setwd("~/project/mcIdentify/data/") 4 | remove(list = ls()) 5 | library(data.table) 6 | library(dplyr) 7 | 8 | GSE673_eff <- fread("./model_built/pathway_score_213/model_result/GSE673_inter_eff.csv") 9 | 10 | tumor_type <- GSE673_eff[1:4,] 11 | 12 | tumor_type1 <- melt(tumor_type) 13 | 14 | V1 <- 
c("GSE673_ATC","GSE673_IDC","GSE673_TNBC","GSE673_DCIS") 15 | tumor_type1$V1 <- factor(tumor_type1$V1,levels = c("GSE673_ATC","GSE673_IDC","GSE673_TNBC","GSE673_DCIS")) 16 | library(ggplot2) 17 | library(ggprism) 18 | ggplot(tumor_type1,aes(x=V1,y=value,fill=variable))+ 19 | geom_bar(position="dodge",stat="identity")+ 20 | labs(x="Cancer type",y="")+ 21 | theme_prism()+ 22 | geom_hline(aes(yintercept=0.95),linetype=5,col="black")+ 23 | scale_y_continuous(breaks=c(0,.25,0.5,0.75,0.95,1))+ 24 | scale_x_discrete(breaks=V1, labels=c("ATC","IDC","TNBC","DCIS"))+ 25 | scale_fill_manual(values = c("#6E9ECE", "#CCCCCC","#E6928F","#8FBC8F"), 26 | breaks=c("f1", "accuracy", "recall", "precisoon"), 27 | labels=c("F1 score", "Accuracy", "Recall", "Precison")) 28 | 29 | 30 | 31 | 32 | ###gene number 33 | gene_number <- GSE673_eff[5:10,] 34 | gene_number1 <- melt(gene_number) 35 | V1 <- c("GSE673_500","GSE673_500_1000","GSE673_1000_1500","GSE673_1500_2000","GSE673_2000_2500","GSE673_2500_00") 36 | gene_number1$V1 <- factor(gene_number1$V1,levels = V1) 37 | library(ggplot2) 38 | library(ggprism) 39 | ggplot(gene_number1,aes(x=V1,y=value,fill=variable))+ 40 | geom_bar(position="dodge",stat="identity")+ 41 | labs(x="Gene number",y="")+ 42 | theme_prism()+ 43 | geom_hline(aes(yintercept=0.95),linetype=5,col="black")+ 44 | scale_y_continuous(breaks=c(0,.25,0.5,0.75,0.95,1))+ 45 | scale_x_discrete(breaks=V1, labels=c("<500","500~1000","1000~1500","1500~2000","2000~2500",">2500"))+ 46 | scale_fill_manual(values = c("#6E9ECE", "#CCCCCC","#E6928F","#8FBC8F"), 47 | breaks=c("f1", "accuracy", "recall", "precisoon"), 48 | labels=c("F1 score", "Accuracy", "Recall", "Precison")) 49 | 50 | 51 | 52 | # simulation gene 53 | gene_number <- GSE673_eff[11:15,] 54 | gene_number1 <- melt(gene_number) 55 | V1 <- c("simulation_500","simulation_1000","simulation_1500","simulation_2000","simulation_2500") 56 | gene_number1$V1 <- factor(gene_number1$V1,levels = V1) 57 | library(ggplot2) 58 | 
library(ggprism) 59 | ggplot(gene_number1,aes(x=V1,y=value,fill=variable))+ 60 | geom_bar(position="dodge",stat="identity")+ 61 | labs(x="Simulate gene number",y="")+ 62 | theme_prism()+ 63 | geom_hline(aes(yintercept=0.95),linetype=5,col="black")+ 64 | scale_y_continuous(breaks=c(0,.25,0.5,0.75,0.95,1))+ 65 | scale_x_discrete(breaks=V1, labels=c("500","1000","1500","2000","2500"))+ 66 | scale_fill_manual(values = c("#6E9ECE", "#CCCCCC","#E6928F","#8FBC8F"), 67 | breaks=c("f1", "accuracy", "recall", "precisoon"), 68 | labels=c("F1 score", "Accuracy", "Recall", "Precison")) 69 | 70 | 71 | 72 | 73 | # setwd("~/project/mcIdentify/data/") 74 | # remove(list = ls()) 75 | # 76 | # library(data.table) 77 | # library(dplyr) 78 | # GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds") 79 | # data1 <- fread("./model_built/pathway_score_213/tumor_type_data/GSE673_tumor_type_data.csv") 80 | # 81 | # data2 <- data1 %>% mutate(cancer_type = case_when(cell_type == "ATC1." ~ "ATC", 82 | # cell_type == "ATC2." ~ "ATC", 83 | # cell_type == "ATC3." ~ "ATC", 84 | # cell_type == "ATC4." ~ "ATC", 85 | # cell_type == "ATC5." ~ "ATC", 86 | # cell_type == "DCIS1" ~ "DCIS", 87 | # cell_type == "IDC1." ~ "IDC", 88 | # cell_type == "IDC2." 
~ "IDC", 89 | # cell_type == "TNBC1" ~ "TNBC", 90 | # cell_type == "TNBC2" ~ "TNBC", 91 | # cell_type == "TNBC3" ~ "TNBC", 92 | # cell_type == "TNBC4" ~ "TNBC", 93 | # cell_type == "TNBC5" ~ "TNBC")) 94 | # 95 | # data3 <- data2 %>% select(type,cancer_type,GSE673_diff_path213$hsa) 96 | # single_cancer <- data3 %>% filter(cancer_type == "TNBC") %>% select(type,GSE673_diff_path213$hsa) 97 | # fwrite(single_cancer,"./model_built/pathway_score_213/tumor_type_data/GSE673_TNBC.csv") 98 | 99 | -------------------------------------------------------------------------------- /inst/analysis/MLcode.R: -------------------------------------------------------------------------------- 1 | 2 | ### RF: random forest on the pathway-score table (first column = label, remaining columns = features) 3 | library(dplyr) 4 | library(data.table) 5 | library(randomForest) 6 | remove(list = ls()) 7 | setwd("~/project/mcIdentify/Revise_1/") 8 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = FALSE) 9 | set.seed(123) 10 | split <- caTools::sample.split(data$type, SplitRatio = 0.8) # caTools is not attached in this section, so the call is namespace-qualified 11 | train_data <- subset(data, split == TRUE) 12 | test_data <- subset(data, split == FALSE) 13 | 14 | X_train <- train_data[, -1] 15 | y_train <- as.factor(train_data[, 1]) 16 | 17 | 18 | ctrl <- caret::trainControl(method = "cv", number = 5) # caret is not attached in this section either 19 | grid <- expand.grid(mtry = c(2, 4, 6)) 20 | rf_model <- caret::train(x = X_train, y = y_train, 21 | method = "rf", 22 | trControl = ctrl, 23 | tuneGrid = grid) 24 | 25 | print(rf_model) 26 | 27 | grid <- expand.grid(mtry = c(6)) 28 | modellist <- list() 29 | for (ntree in c(100,200, 300)) { # compare forest sizes at the fixed mtry, same seed for every fit 30 | set.seed(123) 31 | fit <- caret::train(x = X_train, y = y_train, method="rf", 32 | metric="Accuracy", tuneGrid=grid, 33 | trControl=ctrl, ntree=ntree) 34 | key <- toString(ntree) 35 | modellist[[key]] <- fit 36 | } 37 | results <- caret::resamples(modellist) 38 | summary(results) 39 | 40 | model <- randomForest(x = X_train, y = y_train,mtry = 6,ntree = 200) 41 | print(model) 42 | 43 | x_test_data <- test_data[, -1] 44 | y_test_data <- as.factor(test_data[, 1]) 45 | test_predictions <-
predict(model, newdata = x_test_data) 46 | 47 | confusion_matrix <- caret::confusionMatrix(test_predictions, y_test_data) # caret is not attached in the RF section 48 | accuracy <- confusion_matrix$overall["Accuracy"] 49 | precision <- confusion_matrix$byClass["Pos Pred Value"] 50 | recall <- confusion_matrix$byClass["Sensitivity"] 51 | f1_score <- confusion_matrix$byClass["F1"] 52 | 53 | print(confusion_matrix) 54 | print(paste("Accuracy:", accuracy)) 55 | print(paste("Precision:", precision)) 56 | print(paste("Recall:", recall)) 57 | print(paste("F1 Score:", f1_score)) 58 | 59 | 60 | 61 | 62 | 63 | 64 | ## SVM: radial-kernel SVM on the same pathway-score table 65 | library(e1071) 66 | library(ggplot2) 67 | library(caret) 68 | remove(list = ls()) 69 | setwd("~/project/mcIdentify/Revise_1/") 70 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = FALSE) 71 | 72 | set.seed(123) 73 | split <- caTools::sample.split(data$type, SplitRatio = 0.8) # caTools is only attached later, in the LR section 74 | train_data <- subset(data, split == TRUE) 75 | test_data <- subset(data, split == FALSE) 76 | X_train <- train_data[, -1] 77 | y_train <- as.factor(train_data[, 1]) 78 | 79 | 80 | 81 | param_grid <- expand.grid( 82 | sigma = c(0.1, 1, 10), 83 | C = c(0.1, 1, 10)) 84 | ctrl <- trainControl(method = "cv", number = 5, verboseIter = FALSE) 85 | 86 | tuned_model <- train( 87 | x = X_train, 88 | y = y_train, 89 | method = "svmRadial", 90 | tuneGrid = param_grid, 91 | trControl = ctrl) 92 | print(tuned_model) 93 | 94 | 95 | svm_model <- svm(x = X_train, y = y_train,gamma = 0.1,cost = 10) # e1071::svm names these gamma/cost; the former sigma/C arguments fell into ... and were ignored, leaving the defaults in effect 96 | train_predictions <- predict(svm_model, newdata = X_train) 97 | table(y_train, train_predictions) 98 | 99 | X_new_data <- test_data[, -1] 100 | y_new_data <- as.factor(test_data[, 1]) 101 | test_predictions <- predict(svm_model, newdata = X_new_data) 102 | 103 | table(test_predictions,y_new_data) 104 | accuracy <- mean(test_predictions == y_new_data) 105 | precision <- sum(test_predictions == "normal" & y_new_data == "normal") / sum(test_predictions == "normal") # "normal" is treated as the positive class here 106 | recall <- sum(test_predictions == "normal" & y_new_data == "normal") /
sum(y_new_data == "normal") 107 | f1_score <- 2 * precision * recall / (precision + recall) 108 | print(paste("accuracy:", accuracy)) 109 | print(paste("precision:", precision)) 110 | print(paste("recall:", recall)) 111 | print(paste("F1 score:", f1_score)) 112 | 113 | 114 | 115 | 116 | ### xgboost 117 | library(xgboost) 118 | library(Matrix) 119 | remove(list = ls()) 120 | setwd("~/project/mcIdentify/Revise_1/") 121 | new_data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F) 122 | new_data1 <- new_data %>% mutate(type = ifelse(type == "normal",0,1)) 123 | 124 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = F) 125 | data <- data %>% mutate(type = ifelse(type == "normal",0,1)) 126 | set.seed(123) 127 | split <- sample.split(data$type, SplitRatio = 0.8) 128 | train_data <- subset(data, split == TRUE) 129 | test_data <- subset(data, split == FALSE) 130 | 131 | X_train <- train_data[, -1] 132 | y_train <- train_data[, 1] 133 | 134 | ctrl <- trainControl( 135 | method = "cv", 136 | number = 5, 137 | verboseIter = FALSE) 138 | 139 | param_grid <- expand.grid( 140 | nrounds = c(100, 200), 141 | max_depth = c(3, 6), 142 | eta = c(0.1), 143 | gamma = c(0, 0.1), 144 | colsample_bytree = c(0.8), 145 | min_child_weight = c(1, 3), 146 | subsample = c(0.8)) 147 | 148 | xgb_model <- train( 149 | x = X_train, 150 | y = y_train, 151 | method = "xgbTree", 152 | trControl = ctrl, 153 | tuneGrid = param_grid) 154 | print(xgb_model$bestTune) 155 | 156 | 157 | dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train) 158 | params <- list(objective = "binary:logistic", eval_metric = "logloss", eta = 0.1, max_depth = 3) 159 | nrounds <- 100 160 | xgb_model <- xgboost(params = params, data = dtrain, nrounds = nrounds) 161 | 162 | train_predictions <- predict(xgb_model, newdata = dtrain) 163 | train_predictions <- ifelse(train_predictions > 0.5,1,0) 164 | 165 | confusion_matrix <- table(train_predictions,y_train) 166 | accuracy <- 
mean(train_predictions == y_train) 167 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]) # table(pred, truth): row 2 = predicted positive, so this is TP / predicted positives 168 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) # column 2 = actual positive, so this is TP / actual positives 169 | f1_score <- 2 * precision * recall / (precision + recall) 170 | 171 | print(paste("accuracy:", accuracy)) 172 | print(paste("precision:", precision)) 173 | print(paste("recall:", recall)) 174 | print(paste("F1 score:", f1_score)) 175 | 176 | X_new_data1 <- new_data1[, -1] # NOTE(review): new_data1 is read from the same CSV as the training split, so this evaluation set includes the training rows - confirm that is intended 177 | y_new_data1 <- as.factor(new_data1[, 1]) 178 | dtest <- xgb.DMatrix(data = as.matrix(X_new_data1)) 179 | test_predictions <- predict(xgb_model, newdata = dtest) 180 | test_predictions <- ifelse(test_predictions > 0.5,1,0) 181 | 182 | confusion_matrix <- table(test_predictions,y_new_data1) 183 | accuracy <- mean(test_predictions == y_new_data1) 184 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]) # rows are predictions: TP / predicted positives 185 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) # columns are truth: TP / actual positives 186 | f1_score <- 2 * precision * recall / (precision + recall) 187 | 188 | print(paste("accuracy:", accuracy)) 189 | print(paste("precision:", precision)) 190 | print(paste("recall:", recall)) 191 | print(paste("F1 score:", f1_score)) 192 | 193 | 194 | 195 | ###LR: logistic regression on the 0/1-encoded label (normal = 1) 196 | library(ggplot2) 197 | library(dplyr) 198 | library(caTools) 199 | library(pROC) 200 | library(caret) 201 | remove(list = ls()) 202 | setwd("~/project/mcIdentify/Revise_1/") 203 | data <- fread("./new_data/GSE131928_new_data_score.csv",data.table = FALSE) 204 | data <- data %>% mutate(type = ifelse(type=="normal",1,0)) 205 | set.seed(123) 206 | split <- sample.split(data$type, SplitRatio = 0.7) 207 | train_data <- subset(data, split == TRUE) 208 | test_data <- subset(data, split == FALSE) 209 | model <- glm(type ~ ., data = train_data, family = binomial) # binomial gives logistic regression for the 0/1 label; gaussian fitted an ordinary linear model 210 | summary(model) 211 | 212 | 213 | predictions <- predict(model, newdata = test_data, type = "response") # predicted probability of class 1 214 | threshold <- 0.5 215 | predicted_classes <- ifelse(predictions >= threshold, 1, 0) 216 | 217 | confusion_matrix
<- table(test_data$type, predicted_classes) 218 | accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix) 219 | precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) 220 | recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]) 221 | f1_score <- 2 * precision * recall / (precision + recall) 222 | 223 | cat("Accuracy: ", accuracy, "\n") 224 | cat("Precision: ", precision, "\n") 225 | cat("Recall: ", recall, "\n") 226 | cat("F1 Score: ", f1_score, "\n") 227 | 228 | 229 | -------------------------------------------------------------------------------- /inst/analysis/bluk_Anti.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | remove(list = ls()) 4 | setwd("~/project/mcIdentify/data/") 5 | 6 | tcga_counts <- readRDS("~/project/common_data/tcga/tcga_clean_counts.rds") 7 | 8 | library(NeoEnrichment) 9 | library(dplyr) 10 | library(data.table) 11 | infor <- as.data.frame(colnames(tcga_counts)) 12 | colnames(infor) <- "barcode" 13 | infor$type <- get_cancer_type(infor$barcode) 14 | 15 | BRCA_infor <- infor %>% filter(type == "BRCA") 16 | BRCA_infor <- BRCA_infor %>% dplyr::mutate(tissue = case_when(grepl("*01$",BRCA_infor$barcode) ~ "tumor", 17 | grepl("*11$",BRCA_infor$barcode) ~ "normal")) 18 | 19 | tumor_infor <- BRCA_infor %>% filter(tissue == "tumor") 20 | normal_infor <- BRCA_infor %>% filter(tissue == "normal") 21 | 22 | 23 | tumor_sample <- tcga_counts %>% select(tumor_infor$barcode) %>% as.data.frame() 24 | rownames(tumor_sample) <- rownames(tcga_counts) 25 | 26 | normal_sample <- tcga_counts %>% select(normal_infor$barcode) %>% as.data.frame() 27 | rownames(normal_sample) <- rownames(tcga_counts) 28 | 29 | 30 | ##DESeq2 31 | DESeq2_DEG <- function(ecDNA_sample,NonecDNA_sample){ 32 | ##DESeq2 33 | library("DESeq2") 34 | DEG_data <- as.data.frame(cbind(ecDNA_sample,NonecDNA_sample)) %>% round(.,digits = 0) 35 | rownames(DEG_data) <- rownames(tcga_counts) 36 | group <- 
as.factor(c(rep("ecDNA",length(ecDNA_sample)), rep("NonecDNA",length(NonecDNA_sample)))) # build the group factor: one label per sample column 37 | colGroup <- data.frame(row.names = colnames(DEG_data), 38 | group_list = group) 39 | dds <- DESeqDataSetFromMatrix(countData = DEG_data, 40 | colData = colGroup, 41 | design = ~ group_list) 42 | dds <- dds[rowSums(counts(dds)) > 10, ] 43 | dds2 <- DESeq(dds) 44 | res <- results(dds2, contrast=c("group_list","ecDNA","NonecDNA")) 45 | resOrdered <- res[order(res$padj),] 46 | resOrdered$gene_id <- rownames(resOrdered) 47 | DE_result <- resOrdered %>% as.data.frame(.) %>% dplyr::as_tibble(.) %>% na.omit(.) # as_data_frame() is deprecated; convert explicitly and keep the tibble result 48 | 49 | return(DE_result) 50 | } 51 | Z_score <- function(data1){ # row-wise z-score: each row is centred on its mean and divided by its sd; returns a matrix 52 | data1 <- as.matrix(data1) # convert once instead of on every iteration 53 | for (i in seq_len(nrow(data1))) { # seq_len() is safe when there are no rows or no row names 54 | data1[i,] <- (data1[i,]-mean(data1[i,]))/sd(data1[i,]) 55 | } 56 | return(data1) 57 | } 58 | 59 | DE_Cluster <- DESeq2_DEG(tumor_sample,normal_sample) 60 | 61 | fwrite(DE_Cluster,"./model_built/pathway_score_213/time_analysis/bluk_brca_DEG.txt") 62 | 63 | 64 | ### 65 | genelist_four <- KEGG_pathway_gene %>% filter(hsa %in% c("hsa00190","hsa04612","hsa04940","hsa05416")) 66 | genelist_four1 <- genelist_four[,-1] 67 | ### 68 | 69 | ##GSEA 70 | library(GSEABase) 71 | library(clusterProfiler) 72 | HallmarkGeneSet <- read.gmt("./model_built/pathway_score_213/time_analysis/single_gene_list.gmt") 73 | 74 | Gsea_DEG <- DE_Cluster %>% 75 | dplyr::mutate(state = (-log10(padj)) * sign(log2FoldChange)) %>% dplyr::arrange(-state) %>% 76 | dplyr::filter(padj != 0) 77 | 78 | geneList <- Gsea_DEG$state 79 | names(geneList) <- Gsea_DEG$gene_id 80 | geneList <- sort(geneList, decreasing = TRUE) 81 | GSEA_result <- GSEA(geneList, TERM2GENE = HallmarkGeneSet, pvalueCutoff = 1,eps = 0) 82 | 83 | library(enrichplot) 84 | gseaplot2(GSEA_result, GSEA_result@result$Description[1:4], title = "", color = "red", base_size = 12, 85 | rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = TRUE, 86 | ES_geom = "line") 87 | 88 | 89 | 90 | ##GSVA 91 | library(GSVA) 92 |
library(GSEABase) 93 | HallmarkGeneSet <- getGmt("./model_built/pathway_score_213/time_analysis/single_gene_list.gmt") 94 | gsva_result <- gsva(as.matrix(GSE530_sample), HallmarkGeneSet, 95 | min.sz=1, max.sz=1000, verbose=TRUE,kcdf="Poisson",parallel.sz=5L) 96 | 97 | 98 | gsva_result <- as.data.frame(t(gsva_result)) 99 | 100 | gsva_data1 <- cbind(BRCA_infor,gsva_result) 101 | 102 | 103 | library(ggpubr) 104 | library(ggprism) 105 | library(ggplot2) 106 | library(cowplot) 107 | 108 | gsva_data2 <- melt(gsva_data1) 109 | gsva_data3 <- na.omit(gsva_data2) 110 | gsva_data3$tissue <- factor(gsva_data3$tissue,levels = c("tumor","normal")) 111 | 112 | 113 | ggplot(data=gsva_data3,aes(x=variable,y=value,fill=factor(tissue)))+ 114 | geom_boxplot()+ 115 | stat_compare_means(aes(label = ..p.signif..))+ 116 | theme_prism()+ 117 | labs(y="GSVA Score",title = "BRCA")+ 118 | theme(axis.title.x = element_blank()) 119 | 120 | 121 | 122 | 123 | ## gene expression 124 | 125 | tcga_tpm <- readRDS("~/project/common_data/tcga/tpm_clean_data.rds") 126 | library(NeoEnrichment) 127 | library(dplyr) 128 | library(data.table) 129 | infor <- as.data.frame(colnames(tcga_tpm)) 130 | colnames(infor) <- "barcode" 131 | infor$type <- get_cancer_type(infor$barcode) 132 | 133 | BRCA_infor <- infor %>% filter(type == "BRCA") 134 | BRCA_infor <- BRCA_infor %>% dplyr::mutate(tissue = case_when(grepl("*01$",BRCA_infor$barcode) ~ "tumor", 135 | grepl("*11$",BRCA_infor$barcode) ~ "normal")) 136 | 137 | GSE530_sample <- tcga_tpm %>% select(BRCA_infor$barcode) %>% as.data.frame() 138 | rownames(GSE530_sample) <- rownames(tcga_tpm) 139 | 140 | data1 <- GSE530_sample[as.character(gene2$Var1),] 141 | data1 <- na.omit(data1) 142 | data2 <- as.data.frame(t(data1)) 143 | data3 <- cbind(BRCA_infor, data2) 144 | 145 | data4 <- melt(data3) 146 | 147 | data4$variable <- factor(data4$variable,levels = c("HLA-A","HLA-B","HLA-C","HLA-E","HLA-F","HLA-G","HLA-DRA", 148 | "HLA-DRB1","HLA-DRB5","HLA-DQA1","HLA-DQA2", 149 | 
"HLA-DQB1","HLA-DOB","HLA-DMA","HLA-DMB","HLA-DOA","HLA-DPA1","HLA-DPB1")) 150 | 151 | 152 | library(ggpubr) 153 | library(ggprism) 154 | library(ggplot2) 155 | library(cowplot) 156 | 157 | data4$tissue <- factor(data4$tissue,levels = c("tumor","normal")) 158 | data4 <- na.omit(data4) 159 | 160 | ggplot(data=data4,aes(x=variable,y=value,fill=factor(tissue)))+ 161 | geom_boxplot()+ 162 | stat_compare_means(aes(label = ..p.signif..))+ 163 | theme_prism()+ 164 | theme(axis.text.x = element_text(angle = 30,vjust = 1, hjust = 1) )+ 165 | labs(y="Gene expression",title = "BRCA")+ 166 | theme(axis.title.x = element_blank()) 167 | 168 | 169 | 170 | fwrite(gene_list,"./model_built/pathway_score_213/time_analysis/genelist.csv") 171 | -------------------------------------------------------------------------------- /inst/analysis/bulk_pathway.R: -------------------------------------------------------------------------------- 1 | 2 | setwd("~/project/mcIdentify/data/") 3 | remove(list = ls()) 4 | 5 | library(data.table) 6 | library(dplyr) 7 | 8 | 9 | tcga_counts <- readRDS("~/project/common_data/tcga/tcga_clean_counts.rds") 10 | 11 | data1 <- tcga_counts %>% apply(2,function(x){x/sum(x) * 10000}) 12 | 13 | data1 <- as.data.frame(data1) 14 | data2 <- data1 %>% mutate_all(funs(log2(.+1))) 15 | data2 <- round(data2,3) 16 | 17 | data3 <- as.data.frame(t(data2)) 18 | colnames(data3) <- rownames(tcga_counts) 19 | 20 | 21 | # pathway socre 22 | 23 | GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds") 24 | pathway_gene <- readRDS("./KEGG_pathway_gene.rds") 25 | score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa) 26 | 27 | 28 | myFun1 <- function(a){ 29 | 30 | sum(a)/length(a) 31 | 32 | } 33 | 34 | all_pathway_score <- NA 35 | for (i in 1:213) { 36 | 37 | gene <- score_gene %>% filter(hsa == names(table(score_gene$hsa))[i]) 38 | 39 | a <- data3 %>% select(gene$gene_id[which(gene$gene_id %in% colnames(data3)==TRUE)]) 40 | 41 | 
path_score <- as.data.frame(apply(a, 1, myFun1)) 42 | colnames(path_score) <- names(table(score_gene$hsa))[i] 43 | 44 | all_pathway_score <- cbind(all_pathway_score,path_score) 45 | 46 | } 47 | 48 | pathway_score <- all_pathway_score 49 | pathway_score <- pathway_score[,-1] # drop the NA placeholder column the accumulator was seeded with 50 | 51 | fwrite(pathway_score,"./model_built/pathway_score_213/bulk_pathway_score/bulk_pathway_score.txt",row.names = TRUE) 52 | 53 | 54 | 55 | 56 | 57 | 58 | ###figure: tumor vs normal pathway score per TCGA cancer type 59 | setwd("~/project/mcIdentify/data/") 60 | remove(list = ls()) 61 | 62 | library(data.table) 63 | library(dplyr) 64 | tcga_score <- fread("./model_built/pathway_score_213/bulk_pathway_score/bulk_pathway_score.txt") 65 | 66 | data1 <- tcga_score %>% select(V1,hsa00190,hsa04612,hsa04940,hsa05416,hsa04110) 67 | colnames(data1)[1] <- "tcga_id" 68 | 69 | library(NeoEnrichment) 70 | data1$cancer_type <- get_cancer_type(data1$tcga_id) 71 | 72 | data2 <- data1 %>% dplyr::mutate(tissue = case_when(grepl("*01$",tcga_id) ~ "tumor", 73 | grepl("*11$",tcga_id) ~ "normal")) %>% na.omit() 74 | 75 | 76 | 77 | normal_data <- data2 %>% filter(tissue == "normal") 78 | normal_name <- as.data.frame(sort(table(normal_data$cancer_type))) 79 | normal_name <- normal_name %>% filter(Freq > 20) %>% arrange(-Freq) %>% filter(!Var1 %in% c("PRAD","KICH")) # %in% excludes both types; != against a length-2 vector recycled element-wise and only dropped rows that happened to line up 80 | rownames(normal_name) <- normal_name$Var1 81 | 82 | data3 <- data2 %>% filter(cancer_type %in% normal_name$Var1) 83 | 84 | data4 <- melt(data3) 85 | 86 | pathway1 <- data4 %>% filter(variable == "hsa05416") 87 | 88 | all_sample <- pathway1 89 | all_sample$cancer_type <- "Pan-cancer" 90 | pathway2 <- rbind(pathway1,all_sample) 91 | pathway2$tissue <- factor(pathway2$tissue,levels = c("tumor","normal")) 92 | pathway2$cancer_type <- factor(pathway2$cancer_type,levels = c("Pan-cancer",rownames(normal_name))) 93 | 94 | 95 | 96 | library(ggpubr) 97 | library(ggprism) 98 | library(ggplot2) 99 | library(cowplot) 100 | ggplot(data=pathway2,aes(x=cancer_type,y=value,fill=factor(tissue)))+ 101 | geom_boxplot()+ 102
| stat_compare_means(aes(label = ..p.signif..))+ 103 | #ylim(0.3,1.6)+ 104 | theme_prism()+ 105 | labs(y="Pathway score",title = "hsa05416")+ 106 | theme(axis.title.x = element_blank())+ 107 | theme(axis.text.x = element_text(angle = 15)) 108 | 109 | 110 | 111 | 112 | 113 | 114 | path_score2 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv") 115 | infor1 <- fread("./model_built/datasets/GSE148673_anno.txt") 116 | 117 | data1 <- cbind(infor1,path_score2) 118 | 119 | data2 <- data1 %>% select(type,cell_type,hsa00190,hsa04612,hsa04940,hsa05416,hsa04110) 120 | 121 | data2 <- data2 %>% mutate(cancer_type = case_when(cell_type == "ATC1." ~ "ATC", 122 | cell_type == "ATC2." ~ "ATC", 123 | cell_type == "ATC3." ~ "ATC", 124 | cell_type == "ATC4." ~ "ATC", 125 | cell_type == "ATC5." ~ "ATC", 126 | cell_type == "DCIS1" ~ "DCIS", 127 | cell_type == "IDC1." ~ "IDC", 128 | cell_type == "IDC2." ~ "IDC", 129 | cell_type == "TNBC1" ~ "TNBC", 130 | cell_type == "TNBC2" ~ "TNBC", 131 | cell_type == "TNBC3" ~ "TNBC", 132 | cell_type == "TNBC4" ~ "TNBC", 133 | cell_type == "TNBC5" ~ "TNBC")) 134 | data2 <- data2 %>% select(-cell_type) 135 | 136 | 137 | data3 <- melt(data2) 138 | 139 | data4 <- data3 %>% filter(variable == "hsa04110") 140 | 141 | library(ggpubr) 142 | library(ggprism) 143 | library(ggplot2) 144 | library(cowplot) 145 | ggplot(data=data4,aes(x=cancer_type,y=value,fill=factor(type)))+ 146 | geom_boxplot()+ 147 | stat_compare_means(aes(label = ..p.signif..))+ 148 | #ylim(0.3,1.6)+ 149 | theme_prism()+ 150 | labs(y="Pathway score",title = "hsa04110")+ 151 | theme(axis.title.x = element_blank())+ 152 | theme(axis.text.x = element_text(angle = 15)) 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /inst/analysis/confusion_matrix.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | setwd("~/project/mcIdentify/data/") 4 | remove(list = ls()) 5 | 6 | 
library(data.table) 7 | library(dplyr) 8 | data1 <- fread("./model_built/pathway_score_213/model_result/confusion/confusion_GOSH.csv") 9 | data2 <- as.data.frame(data1[,-1]) 10 | rownames(data2) <- data1$V1 11 | 12 | 13 | 14 | data3 <- round(data2 / rowSums(data2),2) # row-normalise: each true class sums to 1 15 | data3$real <- rownames(data3) 16 | a <- melt(data3) 17 | a$real <- factor(a$real, levels = c("normal","malignant")) 18 | a$variable <- factor(a$variable,levels = c("malignant","normal")) 19 | 20 | library(ggplot2) 21 | ggplot(a, aes(real,variable, fill = value)) + 22 | geom_tile() + 23 | geom_text(aes(label = scales::percent(value))) + 24 | scale_fill_gradient(low = "#F0F0F0", high = "#3575b5") + 25 | labs(x = "True", y = "Guess", title = "GOSH") + 26 | ggprism::theme_prism(border = TRUE)+ # ggprism is never attached in this script, so the call is namespace-qualified 27 | theme(panel.border = element_blank(), 28 | axis.ticks.y = element_blank(), 29 | axis.ticks.x = element_blank(), 30 | legend.position="none") 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /inst/analysis/figure1.R: -------------------------------------------------------------------------------- 1 | 2 | setwd("~/project/mcIdentify/data/") 3 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds") 4 | library(data.table) 5 | library(dplyr) 6 | 7 | 8 | pathdata <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv") 9 | path_score1 <- pathdata 10 | 11 | library(ggplot2) 12 | library(ggpubr) 13 | library(ggprism) 14 | p1 <- ggplot(data=path_score1,aes(x=type,y=hsa04514,fill=factor(type)))+geom_boxplot(size=1)+theme_prism()+ 15 | stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+ 16 | labs(y="Pathway Score",title = "Cell adhesion molecules",x="")+ 17 | theme(axis.title.x = element_blank()) 18 | p1 19 | 20 | p2 <- ggplot(data=path_score1,aes(x=type,y=hsa04110,fill=factor(type)))+geom_boxplot(size=1)+theme_prism()+ 21 |
stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+theme(legend.position = 'none')+ 22 | labs(y="Pathway Score",title = "Cell cycle",x="")+ 23 | theme(axis.title.x = element_blank()) 24 | p2 25 | 26 | prow <- cowplot::plot_grid( # cowplot is never attached in this script 27 | p1 , 28 | p2, 29 | align = 'vh', 30 | labels = c(), 31 | hjust = -1, 32 | nrow = 1 33 | ) 34 | prow 35 | 36 | 37 | ##Gene distribution 38 | 39 | 40 | data1 <- fread("~/project/mcIdentify/data/model_built/GSE673_gene_distribution.csv") 41 | 42 | 43 | library(ggpubr) 44 | library(ggprism) 45 | library(ggplot2) 46 | ggplot(data=data1,aes(x=type,y=Freq))+ 47 | geom_boxplot(size=1)+ 48 | stat_compare_means(label.x=1.2,size=5,method = "wilcox.test")+ 49 | theme_prism()+ 50 | labs(y="Number of gene with expression")+ 51 | scale_x_discrete(labels=c("normal"="Normal Cell","malignant"="Tumor Cell"))+ # named labels follow the value; positional labels were applied in alphabetical order (malignant first) 52 | theme(axis.title.x = element_blank()) 53 | 54 | 55 | data1$type <- factor(data1$type,levels = c("normal","malignant")) 56 | ggplot()+ 57 | geom_density(data= data1, alpha=0.8,adjust=1.5,aes(x=Freq,fill=type))+ # GSE256_sample is not defined anywhere in this script; data1 holds the Freq/type columns re-levelled just above 58 | theme_prism()+ 59 | labs(x="Number of gene with expression",y="Density")+ 60 | scale_fill_manual(values = c("#F8766D","#00BFC4")) 61 | 62 | 63 | # ComplexHeatmap::Heatmap() # stray call without a matrix argument; it stops the script, so it is disabled 64 | 65 | 66 | a <- as.data.frame(rnorm(20,mean = 2,sd = 1)) 67 | a$b <- c(1:20) 68 | colnames(a) <- c("v1","v2") 69 | ggplot(data = a,aes(x = v2,y=v1))+ 70 | geom_point(size=3,color = "red")+ 71 | labs(y="",x="")+ 72 | theme_prism() 73 | 74 | 75 | 76 | ###heatmap 77 | remove(list = ls()) 78 | setwd("~/project/mcIdentify/data/") 79 | library(data.table) 80 | library(dplyr) 81 | data1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv") 82 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds") 83 | 84 | tumor <- data1 %>% filter(type=="malignant") %>% select(GSE673_diff_path213$hsa) 85 | tumor1 <- tumor[1:500,] 86 | 87 | normal <- data1 %>% filter(type=="normal") %>% select(GSE673_diff_path213$hsa)
88 | normal1 <- normal[1:500,] 89 | 90 | data2 <- rbind(tumor1,normal1) 91 | library(scales) 92 | 93 | data3 <- scale(data2) # column-wise z-score of each pathway 94 | data4 <- t(data3) # pathways as rows, cells as columns 95 | 96 | 97 | sample_group <- as.data.frame(c(rep("malignant",500),rep("normal",500))) 98 | colnames(sample_group) <- "cluster" 99 | library(ComplexHeatmap) 100 | library(circlize) 101 | col_fun <- colorRamp2(c(-2, 0, 2), c("#00FF00", "#3B3B3B", "#EE0000")) 102 | top_anno <- HeatmapAnnotation(Cluster = sample_group$cluster, 103 | col = list(Cluster = c("malignant"= "#F8766D","normal"= "#00BFC4")),border = TRUE) # border is an argument of HeatmapAnnotation; it was misplaced inside the col list 104 | column_split <- sample_group$cluster 105 | 106 | 107 | library(ggprism) 108 | ComplexHeatmap::Heatmap(data4,cluster_rows = TRUE,cluster_columns = FALSE,name = " ", 109 | show_column_names = FALSE,show_row_names = FALSE,show_heatmap_legend = TRUE, 110 | col = col_fun,column_split = column_split,row_title = "Pathway") 111 | 112 | 113 | 114 | 115 | remove(list = ls()) 116 | setwd("~/project/mcIdentify/data/") 117 | library(data.table) 118 | library(dplyr) 119 | data1 <- fread("./model_built/pathway_score_213/model_result/GSE673_method_gene.csv") 120 | data1$gene <- rep(1:6,4) 121 | data1$method <- factor(data1$method,levels = c("mcIdentify","ikraus","SCINA","scMRMA")) 122 | ggplot(data = data1, aes(x = gene, y = accuracy, color = method, shape = method)) + 123 | geom_point(size = 3) + 124 | geom_smooth(size = 1.8) + 125 | labs(x = " ", y = "Accuracy") + 126 | ylim(0,1)+ 127 | theme_prism()+ 128 | scale_x_continuous(name = "Gene number", breaks = seq(1, 6, by = 1), 129 | labels = c("<500", "500~1000", "1000~1500", "1500~2000", "2000~2500",">2500"), limits = c(1, 6)) 130 | 131 | 132 | 133 | 134 | library(data.table) 135 | library(dplyr) 136 | data1 <- fread("./model_built/pathway_score_213/model_result/gene_sample_statistic.csv",header = TRUE,data.table = FALSE) 137 | 138 | data1$type <- factor(data1$type,levels = c("normal","malignant")) 139 | data1$gene <- rep(1:6,2) 140 | library(ggplot2) 141 | 142 | ggplot(data1, aes(x = gene, weight
= value, fill = type))+ 143 | geom_bar(position = "stack")+ 144 | scale_fill_manual(values = c("#00BFC4","#F8766D"))+ 145 | theme_prism()+ 146 | scale_x_continuous(name = "Gene number", breaks = seq(1, 6, by = 1), 147 | labels = c("<500", "500~1000", "1000~1500", "1500~2000", "2000~2500",">2500"))+ 148 | geom_text(aes(label = value1,y=value), 149 | position = position_stack(vjust = 0.5), size = 5) 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | col_fun = colorRamp2(c(-2, 0, 2), c("red", "black", "blue")) 160 | a <- matrix(data = rnorm(10000,mean = 0,sd = 1),nrow = 100,ncol = 100) 161 | 162 | a[c(round(runif(50,min=1,max=100),0)),] = 0 163 | 164 | ComplexHeatmap::Heatmap(a,cluster_rows = F,cluster_columns = F,name = " ", 165 | show_column_names = F,show_row_names = F,show_heatmap_legend = F,col = col_fun) 166 | 167 | 168 | ?ComplexHeatmap::Heatmap() 169 | 170 | 171 | 172 | remove(list = ls()) 173 | setwd("~/project/mcIdentify/data/") 174 | library(data.table) 175 | library(dplyr) 176 | data1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv") 177 | GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds") 178 | 179 | tumor <- data1 %>% filter(type=="malignant") %>% select(GSE673_diff_path213$hsa[1:50]) 180 | tumor1 <- tumor[1:100,] 181 | 182 | normal <- data1 %>% filter(type=="normal") %>% select(GSE673_diff_path213$hsa[1:50]) 183 | normal1 <- normal[1:100,] 184 | 185 | data2 <- rbind(tumor1,normal1) 186 | library(scales) 187 | 188 | data3 <- scale(data2) 189 | data4 <- t(data3) 190 | 191 | 192 | sample_group <- as.data.frame(c(rep("malignant",100),rep("normal",100))) 193 | colnames(sample_group) <- "cluster" 194 | library(ComplexHeatmap) 195 | library(circlize) 196 | top_anno <- HeatmapAnnotation(Cluster = sample_group$cluster, 197 | col = list(Cluster = c("malignant"= "#F8766D","normal"= "#00BFC4"),border = TRUE)) 198 | 199 | 200 | library(ggprism) 201 | 
# Unclustered heatmap of `data4` (built by the preceding lines of figure1.R).
ComplexHeatmap::Heatmap(
  data4,
  cluster_rows = FALSE, cluster_columns = FALSE, name = " ",
  show_column_names = FALSE, show_row_names = FALSE, show_heatmap_legend = FALSE
)


--------------------------------------------------------------------------------
/inst/analysis/gene_analysis.R:
--------------------------------------------------------------------------------


setwd("~/project/mcIdentify/data/")


library(data.table)
library(dplyr)

## Pathway importance values, labelled with the column names of the score table.
path_impor1 <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE673_model15.csv")
path_score1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
path_impor1$hsa <- colnames(path_score1)

# Drop the first row and the first column, then number the remaining rows.
path_impor1 <- path_impor1[-1, -1]
path_impor1$number <- seq_len(nrow(path_impor1))

plot(path_impor1$V2, ylim = c(0.048, 0.11), col = "blue", pch = 19, cex = 1)

# Group "A" = importance above 0.058, everything else "B".
path_impor1 <- path_impor1 %>% dplyr::mutate(fac = case_when(V2 > 0.058 ~ "A", TRUE ~ "B"))
path_impor1$fac <- as.factor(path_impor1$fac)

# NOTE(review): GSE673_diff_path213 and KEGG_pathway_gene are not created in
# this script; they must already exist in the session.
path_impor1 <- left_join(path_impor1, GSE673_diff_path213)

gene_list <- KEGG_pathway_gene %>% filter(hsa %in% path_impor1[path_impor1$fac == "A", ]$hsa)


fwrite(gene_list, "./model_built/pathway_score_213/pathway_importance/gene_list.csv")


## Gene sets of four pathways, compared in a Venn diagram.
x <- list(
  "Oxidative phosphorylation" = gene_list[gene_list$hsa == "hsa00190", ]$gene_id,
  "Viral myocarditis" = gene_list[gene_list$hsa == "hsa05416", ]$gene_id,
  "Type I diabetes mellitus" = gene_list[gene_list$hsa == "hsa04940", ]$gene_id,
  "Antigen processing and presentation" = gene_list[gene_list$hsa == "hsa04612", ]$gene_id
)

# NOTE(review): venn.diagram() is presumably VennDiagram::venn.diagram; the
# package is never attached in this script - confirm it is loaded beforehand.
venn.plot <- venn.diagram(
  x,
  filename = NULL,
  lty = 1,
  lwd = 1,
  col = "black",
  # The fourth colour was missing its leading "#", which is not a valid hex colour.
  fill = c("#6E9ECE", "#EFDBB9", "#E6928F", "#4E9595"),
  alpha = 0.60,
  cat.col = "black",
  cat.cex = 0.8,
  cat.fontface = "bold",
  margin = 0.07,
  cex = 0.8
)
## Render the Venn diagram object `venn.plot` into venn.pdf.
pdf("venn.pdf", width = 12, height = 12, pointsize = 20.5)
grid.draw(venn.plot)
dev.off()


## Genes that occur more than twice in `gene_list`.
gene_freq <- as.data.frame(sort(table(gene_list$gene_id)))
shared_genes <- gene_freq[gene_freq$Freq > 2, ]


## Expression of those genes, reshaped to one row per cell.
tpm <- fread("./model_built/datasets/GSE148673_tpm.txt")
tpm_shared <- tpm %>% filter(V1 %in% shared_genes$Var1)

cell_expr <- as.data.frame(t(tpm_shared[, -1]))
colnames(cell_expr) <- tpm_shared$V1
cell_expr$barcode <- rownames(cell_expr)

## Cell labels: cluster.pred "T" means malignant, anything else normal.
infor <- fread("./model_built/datasets/GSE148673_anno.txt")
infor <- infor %>% mutate(type = if_else(cluster.pred == "T", "malignant", "normal"))

infor1 <- infor %>% select(barcode, type)

# Join labels to expression, drop the barcode and melt to long format.
expr_long <- left_join(infor1, cell_expr) %>%
  select(-barcode) %>%
  melt()

hla_levels <- c(
  "HLA-A", "HLA-B", "HLA-C", "HLA-E", "HLA-F", "HLA-G",
  "HLA-DRA", "HLA-DRB1", "HLA-DRB5", "HLA-DQA1", "HLA-DQA2",
  "HLA-DQB1", "HLA-DOB", "HLA-DMA", "HLA-DMB", "HLA-DOA", "HLA-DPA1", "HLA-DPB1"
)
expr_long$variable <- factor(expr_long$variable, levels = hla_levels)

library(ggpubr)
library(ggprism)
library(ggplot2)
library(cowplot)


## Violin plot per gene, split by cell type, with significance marks.
ggplot(data = expr_long, aes(x = variable, y = value, fill = factor(type))) +
  geom_violin() +
  stat_compare_means(aes(label = ..p.signif..)) +
  theme_prism() +
  theme(
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
    axis.title.x = element_blank()
  ) +
  labs(y = "Gene expression", title = "")


--------------------------------------------------------------------------------
/inst/analysis/model_border_gene.R:
--------------------------------------------------------------------------------


setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)

# Expression table with the first column moved into the row names.
data1 <- fread("./processed_data/GSE151530_tpm.txt")
expr_data <- as.data.frame(data1[, -1])
rownames(expr_data) <- data1$V1



## pathway
## pathway score
setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)

# Mean expression of each pathway's genes per cell.
# `expr` holds gene ids in column V1 followed by one column per cell.
# `pathway_gene` needs columns `hsa` and `gene_id`.
# Returns a data frame with one row per cell (row names taken from the column
# names of `expr`) and one column per pathway id found in `pathway_ids`, sorted.
score_pathways <- function(expr, pathway_gene, pathway_ids) {
  expr <- as.data.frame(expr)
  keep <- expr$V1 %in% pathway_gene$gene_id
  cell_by_gene <- t(as.matrix(expr[keep, -1, drop = FALSE]))
  colnames(cell_by_gene) <- expr$V1[keep]
  score_gene <- pathway_gene[pathway_gene$hsa %in% pathway_ids, ]
  ids <- sort(unique(as.character(score_gene$hsa)))
  scores <- lapply(ids, function(id) {
    genes <- intersect(score_gene$gene_id[score_gene$hsa == id], colnames(cell_by_gene))
    rowMeans(cell_by_gene[, genes, drop = FALSE])
  })
  scores <- as.data.frame(do.call(cbind, scores))
  colnames(scores) <- ids
  rownames(scores) <- rownames(cell_by_gene)
  scores
}

infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(Type != "unclassified")

# Each capped matrix is scored and written on its own. Previously the 500-gene
# table was overwritten by the 700-gene table before it was used, so
# GSE530_500.csv received the 700-gene scores.
for (number in c(500, 700)) {
  data <- fread(paste0("./model_built/pathway_score_213/border_data_gene/GSE151530_tpm_", number, ".txt"))

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)

  # Attach the annotation by cell id and keep `type` plus the pathway columns.
  diff_path <- pathway_score
  diff_path$Cell <- rownames(diff_path)
  diff_path <- left_join(diff_path, infor_data2, by = "Cell")
  diff_path <- diff_path %>% select(type, all_of(GSE256_diff_path3$hsa)) %>% na.omit()

  fwrite(diff_path, paste0("./model_built/pathway_score_213/border_data_gene/pathway_score/GSE530_", number, ".csv"))
}
# `diff_path` now holds the 700-gene result, which the next statement of the
# script writes to GSE530_700.csv.
fwrite(diff_path, "./model_built/pathway_score_213/border_data_gene/pathway_score/GSE530_700.csv")


# ---- helpers -----------------------------------------------------------------

# Mean expression of each pathway's genes per cell.
# `expr` holds gene ids in column V1 followed by one column per cell.
# Returns one row per cell and one column per pathway id (sorted).
score_pathways <- function(expr, pathway_gene, pathway_ids) {
  expr <- as.data.frame(expr)
  keep <- expr$V1 %in% pathway_gene$gene_id
  cell_by_gene <- t(as.matrix(expr[keep, -1, drop = FALSE]))
  colnames(cell_by_gene) <- expr$V1[keep]
  score_gene <- pathway_gene[pathway_gene$hsa %in% pathway_ids, ]
  ids <- sort(unique(as.character(score_gene$hsa)))
  scores <- lapply(ids, function(id) {
    genes <- intersect(score_gene$gene_id[score_gene$hsa == id], colnames(cell_by_gene))
    rowMeans(cell_by_gene[, genes, drop = FALSE])
  })
  scores <- as.data.frame(do.call(cbind, scores))
  colnames(scores) <- ids
  rownames(scores) <- rownames(cell_by_gene)
  scores
}

# Join the annotation to the scores through column `key` (filled from the row
# names) and keep `type` plus the pathway columns; rows with NA are dropped.
label_scores <- function(scores, anno, key, pathway_ids) {
  scores[[key]] <- rownames(scores)
  left_join(scores, anno, by = key) %>%
    select(type, all_of(pathway_ids)) %>%
    na.omit()
}

# For every column, randomly set positive entries to 0 until at most
# `max_genes` remain. `low` lists the columns that had fewer than `max_genes`
# positive entries to begin with.
cap_expressed_genes <- function(expr, max_genes) {
  low <- integer(0)
  for (j in seq_len(ncol(expr))) {
    expressed <- which(expr[, j] > 0)
    excess <- length(expressed) - max_genes
    if (excess >= 0) {
      expr[expressed[sample(seq_along(expressed), excess, replace = FALSE)], j] <- 0
    } else {
      low <- c(low, j)
    }
  }
  list(expr = expr, low = low)
}


# ---- GSE151530: pathway scores for the remaining gene-number caps -------------
infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(Type != "unclassified")

for (number in c(900, 1100, 1300, 1500)) {
  read_filename <- paste0("./model_built/pathway_score_213/border_data_gene/GSE151530_tpm_", number, ".txt")
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)
  diff_path <- label_scores(pathway_score, infor_data2, "Cell", GSE256_diff_path3$hsa)

  write_filename <- paste0("./model_built/pathway_score_213/border_data_gene/pathway_score/GSE530_", number, ".csv")
  fwrite(diff_path, write_filename)
}




###gene+pathway
setwd("~/project/mcIdentify/data/")
# Clear data objects but keep the helper functions defined above.
remove(list = setdiff(ls(), lsf.str()))
library(data.table)
library(dplyr)

all_data <- fread("./processed_data/GSE148673_tpm.txt")
border_data <- as.data.frame(all_data[, -1])
rownames(border_data) <- all_data$V1

infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
infor_data1 <- infor_data1 %>% mutate(type = case_when(cluster.pred == "T" ~ "malignant",
                                                       cluster.pred == "N" ~ "normal"))
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")

for (number in c(500, 1000, 1500, 2000, 2500)) {

  ##border gene select
  capped <- cap_expressed_genes(border_data, number)

  # For the 500 cap every cell is kept; for larger caps, cells that never
  # reached the cap are removed. When no cell is below the cap nothing is
  # removed (a negative index of length zero would select zero columns).
  if (number == 500 || length(capped$low) == 0) {
    testdata1 <- capped$expr
  } else {
    testdata1 <- capped$expr[, -capped$low, drop = FALSE]
  }

  write_filename_border <- paste0("./model_built/pathway_score_213/border_data_gene/GSE148673_tpm_", number, ".txt")
  fwrite(testdata1, write_filename_border, row.names = TRUE)

  ####pathway score
  # Read the file back so gene ids arrive as column V1.
  read_filename <- paste0("./model_built/pathway_score_213/border_data_gene/GSE148673_tpm_", number, ".txt")
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)

  # NOTE(review): the join key is `Cell` here, while other GSE148673 code in
  # this repository joins on `barcode` - confirm which column the annotation has.
  infor_data2 <- infor_data1
  diff_path <- label_scores(pathway_score, infor_data2, "Cell", GSE256_diff_path3$hsa)

  write_filename <- paste0("./model_built/pathway_score_213/border_data_gene/pathway_score/GSE673/GSE673_", number, ".csv")
  fwrite(diff_path, write_filename)
}





##GSE530
# NOTE(review): `border_data` still holds the GSE148673 matrix loaded above,
# although this section is labelled GSE530 - confirm the intended input.
# Number of positive entries per cell. colSums() stays aligned with the columns
# even when a cell is all zero or all positive, which the former
# table()-and-take-every-second-value approach did not.
b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))

border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
# ---- helpers -----------------------------------------------------------------

# Mean expression of each pathway's genes per cell.
# `expr` holds gene ids in column V1 followed by one column per cell.
# Returns one row per cell and one column per pathway id (sorted).
score_pathways <- function(expr, pathway_gene, pathway_ids) {
  expr <- as.data.frame(expr)
  keep <- expr$V1 %in% pathway_gene$gene_id
  cell_by_gene <- t(as.matrix(expr[keep, -1, drop = FALSE]))
  colnames(cell_by_gene) <- expr$V1[keep]
  score_gene <- pathway_gene[pathway_gene$hsa %in% pathway_ids, ]
  ids <- sort(unique(as.character(score_gene$hsa)))
  scores <- lapply(ids, function(id) {
    genes <- intersect(score_gene$gene_id[score_gene$hsa == id], colnames(cell_by_gene))
    rowMeans(cell_by_gene[, genes, drop = FALSE])
  })
  scores <- as.data.frame(do.call(cbind, scores))
  colnames(scores) <- ids
  rownames(scores) <- rownames(cell_by_gene)
  scores
}

# Join the annotation to the scores through column `key` (filled from the row
# names) and keep `type` plus the pathway columns; rows with NA are dropped.
label_scores <- function(scores, anno, key, pathway_ids) {
  scores[[key]] <- rownames(scores)
  left_join(scores, anno, by = key) %>%
    select(type, all_of(pathway_ids)) %>%
    na.omit()
}

# For every column, randomly set positive entries to 0 until at most
# `max_genes` remain; columns with fewer positive entries are left unchanged.
cap_expressed_genes <- function(expr, max_genes) {
  for (j in seq_len(ncol(expr))) {
    expressed <- which(expr[, j] > 0)
    excess <- length(expressed) - max_genes
    if (excess >= 0) {
      expr[expressed[sample(seq_along(expressed), excess, replace = FALSE)], j] <- 0
    }
  }
  expr
}


# ---- remaining gene-number intervals (uses `b` and `border_data` from above) --
border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]


fwrite(border_500_1000, "./model_built/pathway_score_213/border_interval/border_500_1000.txt", row.names = TRUE)
fwrite(border_1000_1500, "./model_built/pathway_score_213/border_interval/border_1000_1500.txt", row.names = TRUE)
fwrite(border_1500_2000, "./model_built/pathway_score_213/border_interval/border_1500_2000.txt", row.names = TRUE)
fwrite(border_2000_2500, "./model_built/pathway_score_213/border_interval/border_2000_2500.txt", row.names = TRUE)
fwrite(border_2500_00, "./model_built/pathway_score_213/border_interval/border_2500_00.txt", row.names = TRUE)


# Score exactly the five files written above. Taking the first five directory
# entries instead would depend on whatever else lives in that directory.
file_name <- c("border_500_1000.txt", "border_1000_1500.txt", "border_1500_2000.txt",
               "border_2000_2500.txt", "border_2500_00.txt")

infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(Type != "unclassified")

for (number in file_name) {

  read_filename <- paste0("./model_built/pathway_score_213/border_interval/", number)
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)
  diff_path <- label_scores(pathway_score, infor_data2, "Cell", GSE256_diff_path3$hsa)

  # `number` is the input file name, so outputs are named "<input>.txt.csv".
  write_filename <- paste0("./model_built/pathway_score_213/border_interval/pathway_score_interval/", number, ".csv")
  fwrite(diff_path, write_filename)
}





# ---- GSE151530: cap the >=2500-gene cells at several gene numbers -------------
setwd("~/project/mcIdentify/data/")
# Clear data objects but keep the helper functions defined above.
remove(list = setdiff(ls(), lsf.str()))
library(data.table)
library(dplyr)


all_data <- fread("./model_built/pathway_score_213/border_interval/border_2500_00.txt")
border_data <- as.data.frame(all_data[, -1])
rownames(border_data) <- all_data$V1

infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(Type != "unclassified")

for (number in c(500, 1000, 1500, 2000, 2500)) {

  ##border gene select
  testdata1 <- cap_expressed_genes(border_data, number)

  write_filename_border <- paste0("./model_built/pathway_score_213/border_data_gene/GSE151530_tpm_", number, ".txt")
  fwrite(testdata1, write_filename_border, row.names = TRUE)

  ####pathway score
  # Read the file back so gene ids arrive as column V1.
  read_filename <- paste0("./model_built/pathway_score_213/border_data_gene/GSE151530_tpm_", number, ".txt")
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)
  diff_path <- label_scores(pathway_score, infor_data2, "Cell", GSE256_diff_path3$hsa)

  write_filename <- paste0("./model_built/pathway_score_213/border_data_gene/pathway_score/GSE530_", number, ".csv")
  fwrite(diff_path, write_filename)
}




#### other sample GOSH

setwd("~/project/mcIdentify/data/")
remove(list = setdiff(ls(), lsf.str()))
library(data.table)
library(dplyr)

all_data <- fread("./model_built/datasets/GOSH_tpm.txt")
border_data <- as.data.frame(all_data[, -1])
rownames(border_data) <- all_data$V1

# Number of positive entries per cell; stays aligned with the columns even for
# cells that are all zero or all positive.
b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))

border_500 <- border_data[, which(b$gene_number < 500), drop = FALSE]
border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]


# The "G0SH" spelling (digit zero) in these file names is kept as it is.
fwrite(border_500, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_500.txt", row.names = TRUE)
fwrite(border_500_1000, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_500_1000.txt", row.names = TRUE)
fwrite(border_1000_1500, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_1000_1500.txt", row.names = TRUE)
fwrite(border_1500_2000, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_1500_2000.txt", row.names = TRUE)
fwrite(border_2000_2500, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_2000_2500.txt", row.names = TRUE)
fwrite(border_2500_00, "./model_built/pathway_score_213/border_interval/GOSH/G0SH_border_2500_00.txt", row.names = TRUE)


infor_data1 <- fread("./model_built/datasets/GOSH_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(type != "unclassified")

file_name <- list.files("./model_built/pathway_score_213/border_interval/GOSH/")
for (number in file_name) {

  read_filename <- paste0("./model_built/pathway_score_213/border_interval/GOSH/", number)
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)
  # The GOSH annotation is joined on column `V1`.
  diff_path <- label_scores(pathway_score, infor_data2, "V1", GSE256_diff_path3$hsa)

  write_filename <- paste0("./model_built/pathway_score_213/border_interval/pathway_score_interval/", number, ".csv")
  fwrite(diff_path, write_filename)
}








#### other sample GSE148673

setwd("~/project/mcIdentify/data/")
remove(list = setdiff(ls(), lsf.str()))
library(data.table)
library(dplyr)

all_data <- fread("./model_built/datasets/GSE148673_tpm.txt")
border_data <- as.data.frame(all_data[, -1])
rownames(border_data) <- all_data$V1

# ---- helpers -----------------------------------------------------------------

# Mean expression of each pathway's genes per cell.
# `expr` holds gene ids in column V1 followed by one column per cell.
# Returns one row per cell and one column per pathway id (sorted).
score_pathways <- function(expr, pathway_gene, pathway_ids) {
  expr <- as.data.frame(expr)
  keep <- expr$V1 %in% pathway_gene$gene_id
  cell_by_gene <- t(as.matrix(expr[keep, -1, drop = FALSE]))
  colnames(cell_by_gene) <- expr$V1[keep]
  score_gene <- pathway_gene[pathway_gene$hsa %in% pathway_ids, ]
  ids <- sort(unique(as.character(score_gene$hsa)))
  scores <- lapply(ids, function(id) {
    genes <- intersect(score_gene$gene_id[score_gene$hsa == id], colnames(cell_by_gene))
    rowMeans(cell_by_gene[, genes, drop = FALSE])
  })
  scores <- as.data.frame(do.call(cbind, scores))
  colnames(scores) <- ids
  rownames(scores) <- rownames(cell_by_gene)
  scores
}

# Join the annotation to the scores through column `key` (filled from the row
# names) and keep `type` plus the pathway columns; rows with NA are dropped.
label_scores <- function(scores, anno, key, pathway_ids) {
  scores[[key]] <- rownames(scores)
  left_join(scores, anno, by = key) %>%
    select(type, all_of(pathway_ids)) %>%
    na.omit()
}


# ---- split the cells by number of positive entries -----------------------------
# colSums() stays aligned with the columns even when a cell is all zero or all
# positive, which the former table()-and-take-every-second-value approach did not.
b <- data.frame(gene_number = colSums(border_data > 0, na.rm = TRUE))

border_500 <- border_data[, which(b$gene_number < 500), drop = FALSE]
border_500_1000 <- border_data[, which(b$gene_number >= 500 & b$gene_number < 1000), drop = FALSE]
border_1000_1500 <- border_data[, which(b$gene_number >= 1000 & b$gene_number < 1500), drop = FALSE]
border_1500_2000 <- border_data[, which(b$gene_number >= 1500 & b$gene_number < 2000), drop = FALSE]
border_2000_2500 <- border_data[, which(b$gene_number >= 2000 & b$gene_number < 2500), drop = FALSE]
border_2500_00 <- border_data[, which(b$gene_number >= 2500), drop = FALSE]


fwrite(border_500, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_500.txt", row.names = TRUE)
fwrite(border_500_1000, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_500_1000.txt", row.names = TRUE)
fwrite(border_1000_1500, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_1000_1500.txt", row.names = TRUE)
fwrite(border_1500_2000, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_1500_2000.txt", row.names = TRUE)
fwrite(border_2000_2500, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_2000_2500.txt", row.names = TRUE)
fwrite(border_2500_00, "./model_built/pathway_score_213/border_interval/GSE673/GSE673_border_2500_00.txt", row.names = TRUE)


# ---- pathway scores for every interval file ------------------------------------
infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE256_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
infor_data2 <- infor_data1 %>% filter(type != "unclassified")

file_name <- list.files("./model_built/pathway_score_213/border_interval/GSE673/")
for (number in file_name) {

  read_filename <- paste0("./model_built/pathway_score_213/border_interval/GSE673/", number)
  data <- fread(read_filename)

  pathway_score <- score_pathways(data, pathway_gene, GSE256_diff_path3$hsa)
  # The GSE148673 annotation is joined on column `barcode`.
  diff_path <- label_scores(pathway_score, infor_data2, "barcode", GSE256_diff_path3$hsa)

  write_filename <- paste0("./model_built/pathway_score_213/border_interval/pathway_score_interval/", number, ".csv")
  fwrite(diff_path, write_filename)
}




--------------------------------------------------------------------------------
/inst/analysis/model_build.py:
--------------------------------------------------------------------------------
import sys
from tensorflow import keras as K
import tensorflow as tf
from tensorflow.keras import regularizers
import pandas as pd
from sklearn.model_selection import \
train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import PredefinedSplit
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Pathway-score table: one row per cell, a "type" label column plus one column
# per pathway.
data = pd.read_csv("./model_data213/GSE673_pathway_score.csv")


print(data["type"].value_counts())
# Encode the label: malignant -> 0, normal -> 1 (any other value becomes NaN).
data["type"] = data["type"].astype(str).map({'malignant': 0, 'normal': 1})
x = data.drop('type', axis=1)
y = data['type']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=10)
feature = list(x.columns)


# build model
# Fully connected binary classifier; the input width follows the number of
# feature columns instead of a hard-coded 213.
clear_session()
model = K.models.Sequential()
model.add(K.layers.Dense(units=300, input_dim=x_train.shape[1], activation='sigmoid'))
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units=200, activation='sigmoid'))
model.add(K.layers.Dropout(0.2))
model.add(K.layers.Dense(units=100, activation='sigmoid'))
model.add(K.layers.Dropout(0.1))
model.add(K.layers.Dense(units=10, activation='sigmoid'))
model.add(K.layers.Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

b_size = 50
max_epochs = 100
h = model.fit(x_train, y_train, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
eval = model.evaluate(x_train, y_train, verbose=0, batch_size=b_size) 52 | print("Evaluation on train data: loss = %0.6f accuracy = %0.2f%% \n" % (eval[0], eval[1] * 100)) 53 | 54 | eval = model.evaluate(x_test, y_test, verbose=0, batch_size=b_size) 55 | print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" % (eval[0], eval[1] * 100)) 56 | 57 | -------------------------------------------------------------------------------- /inst/analysis/model_train.pbs: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N tpm_train 3 | #PBS -k oe 4 | #PBS -l walltime=1000:00:00,nodes=1:ppn=1 5 | #PBS -q pub_gpu 6 | 7 | 8 | if [ -f "/public/home/liuxs/anaconda3/etc/profile.d/conda.sh" ]; then 9 | . "/public/home/liuxs/anaconda3/etc/profile.d/conda.sh" 10 | else 11 | export PATH="/public/home/liuxs/anaconda3/bin:$PATH" 12 | fi 13 | 14 | 15 | # conda activate /public/slst/home/wuchx/anaconda3/envs/python3 16 | cd /public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model 17 | 18 | 19 | python /public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_para.py 20 | -------------------------------------------------------------------------------- /inst/analysis/model_train.py: -------------------------------------------------------------------------------- 1 | ######################################################### 加载模块 2 | import sys 3 | from tensorflow import keras as K 4 | import tensorflow as tf 5 | from tensorflow.keras import regularizers 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import LabelBinarizer 9 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier 10 | from sklearn.model_selection import cross_val_score 11 | from sklearn.model_selection import KFold 12 | import numpy as np 13 | from sklearn.model_selection import StratifiedShuffleSplit 14 | import matplotlib.pyplot as plt 15 | from 
tensorflow.keras.backend import clear_session 16 | from sklearn.model_selection import PredefinedSplit 17 | import math 18 | import pandas as pd 19 | from tensorflow.keras.models import Sequential 20 | from tensorflow.keras.layers import Dense 21 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier 22 | from sklearn.preprocessing import MinMaxScaler 23 | from sklearn.model_selection import train_test_split 24 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 25 | ########################################################################### Build partition function 26 | 27 | 28 | def FindLayerNodesLinear(n_layers, first_layer_nodes, last_layer_nodes): 29 | layers = [] 30 | nodes_increment = (last_layer_nodes - first_layer_nodes)/ (n_layers-1) 31 | nodes = first_layer_nodes 32 | for i in range(1, n_layers+1): 33 | layers.append(math.ceil(nodes)) 34 | nodes = nodes + nodes_increment 35 | return layers 36 | 37 | def FinddropoutLinear(n_layers, dropout): 38 | layers = [] 39 | nodes_increment = round(dropout/(n_layers-1),2) 40 | nodes = dropout 41 | for i in range(1, n_layers+1): 42 | layers.append(nodes) 43 | nodes = round(nodes - nodes_increment,2) 44 | if(nodes <= 0): 45 | nodes = 0 46 | 47 | return layers 48 | 49 | 50 | def createmodel(n_layers, first_layer_nodes, last_layer_nodes, activation_func, loss_func,dropout): 51 | model = Sequential() 52 | n_nodes = FindLayerNodesLinear(n_layers, first_layer_nodes, last_layer_nodes) 53 | n_dropout = FinddropoutLinear(n_layers,dropout) 54 | for i in range(1, n_layers): 55 | if i==1: 56 | model.add(Dense(first_layer_nodes, input_dim=train_x.shape[1], activation=activation_func)) 57 | model.add(K.layers.Dropout(rate=n_dropout[0])) 58 | else: 59 | model.add(Dense(n_nodes[i-1], activation=activation_func)) 60 | model.add(K.layers.Dropout(rate=n_dropout[i-1])) 61 | model.add(Dense(train_y.shape[1], activation='softmax')) 62 | model.compile(optimizer='adam', loss=loss_func, metrics = ["accuracy"]) 
#note: metrics could also be 'mse' 63 | 64 | return model 65 | 66 | 67 | train_x = pd.read_csv("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_x.csv") 68 | train_y = pd.read_csv("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/train_x.csv") 69 | train_x1, value_x, train_y1, value_y = train_test_split(train_x, train_y,train_size=0.8, test_size=0.2, random_state=1) 70 | 71 | 72 | ############################################################################ build function 73 | train_val_features = np.concatenate((train_x,value_x),axis=0) 74 | train_val_labels = np.concatenate((train_y,value_y),axis=0) 75 | test_fold = np.zeros(train_val_features.shape[0]) 76 | test_fold[:train_x1.shape[0]] = -1 77 | ps = PredefinedSplit(test_fold=test_fold) 78 | #####################################################################Set parameter range 79 | model = KerasClassifier(build_fn=createmodel, verbose = False) 80 | 81 | activation_funcs = ['sigmoid', 'relu'] 82 | #activation_funcs = ['relu'] 83 | loss_funcs = ['binary_crossentropy'] 84 | 85 | param_grid = dict(n_layers=[3,4,5,6], first_layer_nodes = [200,250,300,350,400,450,500], last_layer_nodes = [10,20,30], dropout=[0.1,0.3,0.5], activation_func = activation_funcs, loss_func = loss_funcs, batch_size = [100,80,50], epochs = [50,100]) 86 | 87 | grid = GridSearchCV(estimator = model, param_grid = param_grid,cv=ps,n_jobs=1) 88 | #grid = RandomizedSearchCV (estimator = model, param_grid = param_grid,cv=3,n_jobs=5) 89 | ################################################################ trainning 90 | grid.fit(train_val_features, train_val_labels) 91 | ############################################################### Output the highest accuracy and corresponding parameters 92 | print(grid.best_score_) 93 | print(grid.best_params_) 94 | f = open("/public/slst/home/wuchx/project/mcIdentify/mcIdentify/code/train673_model/model_result.txt") 95 | f.write(str(grid.best_score_)) 96 | 
f.write("\n")
f.write(str(grid.best_params_))
f.write("\n")
f.close()


--------------------------------------------------------------------------------
/inst/analysis/other_data_pathscore.R:
--------------------------------------------------------------------------------

## GSE151530
## Scores the selected KEGG pathways for every cell of GSE151530 and writes
## the labelled score table used to test the model.
remove(list = ls())
setwd("~/project/mcIdentify/data/")
library(data.table)
library(dplyr)
infor_data1 <- fread("./model_built/datasets/GSE151530_anno.txt")
expr_data1 <- fread("./model_built/datasets/GSE151530_tpm2.txt")

# Transpose to one row per cell. The gene identifiers come straight from
# column V1: a data.table is not designed to carry row names, so they are no
# longer routed through rownames().
expr_data <- expr_data1
expr_matrix <- as.data.frame(t(expr_data[, -1]))
colnames(expr_matrix) <- expr_data1$V1

data1 <- expr_matrix

GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")

score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)

## pathway score

# Arithmetic mean of `a`.
myFun1 <- function(a) {
  sum(a) / length(a)
}

# One score column per pathway found in `score_gene`. The pathway list is taken
# from the data instead of assuming exactly 213 entries, and the columns are
# collected in a list and bound once rather than grown inside the loop.
pathway_ids <- names(table(score_gene$hsa))
score_list <- lapply(pathway_ids, function(id) {
  genes <- unique(as.character(score_gene$gene_id[which(score_gene$hsa == id)]))
  genes <- genes[genes %in% colnames(data1)]
  apply(data1[, genes, drop = FALSE], 1, myFun1)
})
names(score_list) <- pathway_ids
pathway_score <- as.data.frame(do.call(cbind, score_list))

## test
diff_path <- pathway_score
diff_path$Cell <- rownames(diff_path)
# NOTE(review): the filter reads column `Type` while the select below reads
# `type`; unless the annotation file really has both, one spelling is wrong.
infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
diff_path <- left_join(infor_data2, diff_path, by = "Cell")
diff_path <- diff_path %>% select(type, all_of(GSE256_diff_path3$hsa))

fwrite(diff_path, "./model_built/pathway_score/GSE530_pathway_score.csv")


## GSE146771
## Scores the selected KEGG pathways for every cell of GSE146771 and writes
## the labelled score table for the malignant and normal cells.
remove(list = ls())
setwd("~/project/mcIdentify/data/")
library(data.table)
library(dplyr)
infor_data1 <- fread("./model_built/datasets/GSE146771_anno.txt")
expr_data1 <- fread("./model_built/datasets/GSE146771_tpm.txt")

# Transpose to one row per cell. The gene identifiers come straight from
# column V1: a data.table is not designed to carry row names, so they are no
# longer routed through rownames().
expr_data <- expr_data1
expr_matrix <- as.data.frame(t(expr_data[, -1]))
colnames(expr_matrix) <- expr_data1$V1

data1 <- expr_matrix

GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")

score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)

## pathway score

# Arithmetic mean of `a`.
myFun1 <- function(a) {
  sum(a) / length(a)
}

# One score column per pathway found in `score_gene`. The pathway list is taken
# from the data instead of assuming exactly 213 entries, and the columns are
# collected in a list and bound once rather than grown inside the loop.
pathway_ids <- names(table(score_gene$hsa))
score_list <- lapply(pathway_ids, function(id) {
  genes <- unique(as.character(score_gene$gene_id[which(score_gene$hsa == id)]))
  genes <- genes[genes %in% colnames(data1)]
  apply(data1[, genes, drop = FALSE], 1, myFun1)
})
names(score_list) <- pathway_ids
pathway_score <- as.data.frame(do.call(cbind, score_list))


## test
diff_path <- pathway_score
diff_path$CellName <- rownames(diff_path)
diff_path <- left_join(infor_data1, diff_path, by = "CellName")
diff_path <- diff_path %>%
  filter(type == "malignant" | type == "normal") %>%
  select(type, all_of(GSE256_diff_path3$hsa))

fwrite(diff_path, "./model_built/pathway_score/GSE771_pathway_score.csv")




remove(list = ls())
## GOSH
setwd("~/project/mcIdentify/data/")
library(data.table)
library(dplyr)
infor_data1 <- fread("./model_built/datasets/GOSH_anno.txt")
## GOSH and PMC
## The two datasets used to be loaded one after the other into the same
## variables and a single result was written to both output files, so a
## top-to-bottom run stored the PMC scores under the GOSH name as well.
## Each dataset is now read, scored and written on its own.
GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)

# Arithmetic mean of `a`.
myFun1 <- function(a) {
  sum(a) / length(a)
}

pathway_ids <- names(table(score_gene$hsa))

for (dataset in c("GOSH", "PMC")) {
  infor_data1 <- fread(paste0("./model_built/datasets/", dataset, "_anno.txt"))
  expr_data1 <- fread(paste0("./model_built/datasets/", dataset, "_tpm.txt"))

  # One row per cell; gene identifiers are taken directly from column V1
  # because a data.table is not designed to carry row names.
  expr_data <- expr_data1
  expr_matrix <- as.data.frame(t(expr_data[, -1]))
  colnames(expr_matrix) <- expr_data1$V1
  data1 <- expr_matrix

  ## pathway score: one column per pathway, bound together once.
  score_list <- lapply(pathway_ids, function(id) {
    genes <- unique(as.character(score_gene$gene_id[which(score_gene$hsa == id)]))
    genes <- genes[genes %in% colnames(data1)]
    apply(data1[, genes, drop = FALSE], 1, myFun1)
  })
  names(score_list) <- pathway_ids
  pathway_score <- as.data.frame(do.call(cbind, score_list))

  ## test
  diff_path <- pathway_score
  diff_path$V1 <- rownames(diff_path)
  infor_data2 <- infor_data1 %>% filter(type != "unclassified")
  diff_path <- left_join(infor_data2, diff_path, by = "V1")
  diff_path <- diff_path %>% select(type, all_of(GSE256_diff_path3$hsa))
  fwrite(
    diff_path,
    paste0("./model_built/pathway_score/", dataset, "_pathway_score.csv")
  )
}




## GSE131309 Seq
remove(list = ls())
setwd("~/project/mcIdentify/data/")
library(data.table)
library(dplyr)
infor_data1 <-
  fread("./model_built/datasets/GSE131309_Seq_anno.txt")
expr_data1 <- fread("./model_built/datasets/GSE131309_Seq_tpm.txt")


GSE256_diff_path3 <- readRDS("./model_built/GSE673_diff_path.rds")
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")

# One row per cell; gene identifiers are taken directly from column V1.
expr_data <- expr_data1
expr_matrix <- as.data.frame(t(expr_data[, -1]))
colnames(expr_matrix) <- expr_data1$V1
data1 <- expr_matrix
score_gene <- pathway_gene %>% filter(hsa %in% GSE256_diff_path3$hsa)

## pathway score

# Arithmetic mean of `a`.
myFun1 <- function(a) {
  sum(a) / length(a)
}

# One score column per pathway found in `score_gene`; no fixed count of 213
# is assumed and the columns are bound together once.
pathway_ids <- names(table(score_gene$hsa))
score_list <- lapply(pathway_ids, function(id) {
  genes <- unique(as.character(score_gene$gene_id[which(score_gene$hsa == id)]))
  genes <- genes[genes %in% colnames(data1)]
  apply(data1[, genes, drop = FALSE], 1, myFun1)
})
names(score_list) <- pathway_ids
pathway_score <- as.data.frame(do.call(cbind, score_list))

## test
diff_path <- pathway_score
diff_path$barcode <- rownames(diff_path)
infor_data2 <- infor_data1 %>% filter(type != "unclassified")
diff_path <- left_join(infor_data2, diff_path, by = "barcode")
diff_path <- diff_path %>% select(type, all_of(GSE256_diff_path3$hsa))

fwrite(diff_path, "./model_built/pathway_score/GSE309_Seq_pathway_score.csv")
# NOTE(review): only the Seq inputs are read in this section, so this second
# file receives the Seq scores too. Presumably the section was re-run by hand
# with the 10X inputs; confirm before relying on the 10X output.
fwrite(diff_path, "./model_built/pathway_score/GSE309_10X_pathway_score.csv")




--------------------------------------------------------------------------------
/inst/analysis/pathway_importance.R:
--------------------------------------------------------------------------------

setwd("~/project/mcIdentify/data/")

library(data.table)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggprism)
library(ggrepel)
library(cowplot)

# Pathway annotation table. It used to be read only near the end of this part
# although the joins below already need it.
GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")

# Box plot of one pathway score column split by cell type, with a Wilcoxon
# test label.
pathway_boxplot <- function(data, pathway, title) {
  ggplot(data = data, aes(x = type, y = .data[[pathway]], fill = factor(type))) +
    geom_boxplot(size = 1) +
    theme_prism() +
    stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
    theme(legend.position = "none") +
    labs(y = "Pathway Score", title = title) +
    theme(axis.title.x = element_blank())
}


## GSE673: model accuracy per pathway
path_impor1 <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE673_model15.csv")
path_score1 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
path_impor1$hsa <- colnames(path_score1)

# Drop the first row and the first column, then number the remaining rows.
path_impor1 <- path_impor1[-1, -1]
path_impor1$number <- seq_len(nrow(path_impor1))

# top10 <- path_impor1 %>% arrange(-V2) %>% filter(V2 > 0.059)

plot(path_impor1$V2, ylim = c(0.945, 0.985), col = "blue", pch = 19, cex = 1)


path_impor1 <- path_impor1 %>%
  dplyr::mutate(fac = case_when(V2 < 0.979 ~ "A", TRUE ~ "B"))
path_impor1$fac <- as.factor(path_impor1$fac)

path_impor1 <- left_join(path_impor1, GSE673_diff_path213, by = "hsa")


ggplot(path_impor1, aes(x = number, y = V2, color = factor(fac))) +
  geom_point(size = 3) +
  theme_prism(border = TRUE) +
  labs(y = "Accuracy of the model", x = "Pathway") +
  ylim(0.955, 0.984) +
  xlim(0, 214) +
  scale_color_manual(values = c("#DC0000FF", "#0072B5FF")) +
  theme(legend.position = "none") +
  geom_text_repel(
    data = subset(path_impor1, path_impor1$V2 < 0.979),
    aes(label = pathway_id),
    size = 5,
    box.padding = unit(1, "lines"),
    point.padding = unit(1, "lines"),
    segment.color = "black",
    show.legend = FALSE
  ) +
  geom_hline(aes(yintercept = 0.98), linetype = 5, col = "black")


## GSE673: model loss for the selected pathways
path_impor2 <- fread("./model_built/pathway_score_213/pathway_importance/loss_result_GSE673_model15.csv")
path_score2 <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
path_impor2$hsa <- colnames(path_score2)

path_impor2 <- path_impor2[-1, -1]
path_impor2$number <- seq_len(nrow(path_impor2))

path_impor2 <- left_join(GSE673_diff_path213, path_impor2, by = "hsa")

# `top10` (a table with an `hsa` column listing the pathways to draw) is never
# created in this script: its only definition above is commented out. Fail
# with a clear message instead of an "object not found" error mid-pipeline.
if (!exists("top10")) {
  stop("`top10` must be defined before drawing the loss box plot.", call. = FALSE)
}

draw_data <- path_impor2 %>%
  filter(hsa %in% top10$hsa) %>%
  select(-pvalue) %>%
  select(-number)


data1 <- melt(draw_data)
data1$hsa <- factor(data1$hsa, levels = top10$hsa)


# `draw_quantiles` belongs to geom_violin(), not geom_boxplot(), so it was
# dropped; the box plot draws the median by itself.
ggplot(data = data1, aes(x = hsa, y = value, fill = pathway_id)) +
  geom_boxplot(size = 1) +
  theme_prism(border = TRUE) +
  theme(legend.position = "none") +
  labs(y = "Loss of the model", title = " ") +
  theme(axis.title.x = element_blank()) +
  # geom_hline(aes(yintercept=0.9),linetype=5,col="red")+
  # geom_hline(aes(yintercept=0.8),linetype=5,col="red")+
  # scale_y_continuous(breaks=c(0,0.5,0.6,0.7,0.8,0.9,1))+
  scale_fill_manual(values = c("#F27970", "#BB9727", "#54B345", "#32B897",
                               "#05B9E2", "#8983BF", "#C76DA2", "#F27970",
                               "#BB9727", "#54B345"))


## GSE673: score distribution of single pathways
pathway_boxplot(path_score1, "hsa05416", "Viral myocarditis")

p1 <- pathway_boxplot(path_score1, "hsa00190", "Oxidative phosphorylation")
p2 <- pathway_boxplot(path_score1, "hsa04612", "Antigen processing and presentation")
p3 <- pathway_boxplot(path_score1, "hsa04940", "Type I diabetes mellitus")
p4 <- pathway_boxplot(path_score1, "hsa05416", "Viral myocarditis")

ggdraw() +
  draw_plot(p3, 0, 0, 0.5, 0.5) +
  draw_plot(p4, 0.5, 0, 0.5, 0.5) +
  draw_plot(p1, 0, 0.5, 0.5, 0.5) +
  draw_plot(p2, 0.5, 0.5, 0.5, 0.5)


ggplot(data = path_score1, aes(x = type, y = hsa04940)) +
  geom_boxplot(size = 1) +
  stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
  theme_prism() +
  labs(y = "Pathway Score") +
  theme(axis.title.x = element_blank())


###### pathway importance (GSE530)
path_impor <- fread("./model_built/pathway_score_213/pathway_importance/path_impor_GSE530_model15.csv")
path_score <- fread("./model_built/pathway_score_213/GSE530_pathway_score213.csv")
path_impor$hsa <- colnames(path_score)

path_impor <- path_impor[-1, -1]
path_impor$number <- seq_len(nrow(path_impor))

# plot(path_impor$V2,ylim = c(0.105,0.175),col = "blue", pch = 19, cex = 1)

path_impor <- path_impor %>%
  dplyr::mutate(fac = case_when(V2 > 0.11 ~ "A", TRUE ~ "B"))
path_impor$fac <- as.factor(path_impor$fac)

path_impor <- left_join(path_impor, GSE673_diff_path213, by = "hsa")

library(ggplot2)
library(ggpubr)
library(ggprism)
library(ggrepel)
library(cowplot)

# Box plot of one pathway score column split by cell type, with a Wilcoxon
# test label.
pathway_boxplot <- function(data, pathway, title) {
  ggplot(data = data, aes(x = type, y = .data[[pathway]], fill = factor(type))) +
    geom_boxplot(size = 1) +
    theme_prism() +
    stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
    theme(legend.position = "none") +
    labs(y = "Pathway Score", title = title) +
    theme(axis.title.x = element_blank())
}

## GSE530: model loss per pathway
ggplot(path_impor, aes(x = number, y = V2, color = factor(fac))) +
  geom_point(size = 3) +
  theme_prism(border = TRUE) +
  labs(y = "Loss of the model", x = "Pathway") +
  ylim(0.089, 0.136) +
  scale_color_manual(values = c("red", "blue")) +
  theme(legend.position = "none") +
  geom_text_repel(
    data = subset(path_impor, path_impor$V2 > 0.11),
    aes(label = pathway_id),
    size = 4,
    box.padding = unit(1.2, "lines"),
    point.padding = unit(1, "lines"),
    segment.color = "black",
    show.legend = FALSE
  )


## GSE530: score distribution of single pathways
pathway_boxplot(path_score, "hsa04380", "Osteoclast differentiation")

p1 <- pathway_boxplot(path_score, "hsa04380", "Osteoclast differentiation")
p2 <- pathway_boxplot(path_score, "hsa04940", "Type I diabetes mellitus")
p3 <- pathway_boxplot(path_score, "hsa04650", "Natural killer cell mediated cytotoxicity")
p4 <- pathway_boxplot(path_score, "hsa04978", "Mineral absorption")
p5 <- pathway_boxplot(path_score, "hsa05322", "Systemic lupus erythematosus")
p6 <- pathway_boxplot(path_score, "hsa05208", "Chemical carcinogenesis - reactive oxygen species")

ggdraw() +
  draw_plot(p3, 0, 0, 0.33, 0.5) +
  draw_plot(p2, 0.33, 0, 0.33, 0.5) +
  draw_plot(p1, 0.66, 0, 0.33, 0.5) +
  draw_plot(p6, 0, 0.5, 0.33, 0.5) +
  draw_plot(p4, 0.33, 0.5, 0.33, 0.5) +
  draw_plot(p5, 0.66, 0.5, 0.33, 0.5)


## genes behind the most important pathways
# This script never loaded the KEGG gene table; read it from the file the
# other analysis scripts use unless the session already provides it.
if (!exists("KEGG_pathway_gene")) {
  KEGG_pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
}

gene_ <- KEGG_pathway_gene %>%
  filter(hsa %in% path_impor1[path_impor1$V2 > 0.058, ]$hsa)
sort(table(gene_$gene_id))

gene_list <- KEGG_pathway_gene %>%
  filter(hsa %in% path_impor[path_impor$V2 > 0.135, ]$hsa)


### heatmap pathy
pathway_data <- fread("./model_built/pathway_score_213/GSE673_pathway_score213.csv")
GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")

heatpathay <- GSE673_diff_path213 %>%
  filter(hsa %in% c("hsa00190", "hsa04612", "hsa04940", "hsa05416"))

heatmap_data <- pathway_data %>%
  select(type, all_of(heatpathay$hsa)) %>%
  arrange(type)
heatmap_data1 <- heatmap_data %>% select(-type)

heatmap_data2 <- scale(heatmap_data1)
heatmap_data2 <- t(heatmap_data2)

heatmap_data3 <- heatmap_data2[, c(1:2000, 33001:35000)]


# 2000 malignant cells with the highest hsa00190 score and 2000 normal cells
# with the highest hsa04612 score.
tumor_sample <- heatmap_data %>% filter(type == "malignant") %>% arrange(-hsa00190)
tumor_sample1 <- tumor_sample[1:2000, ]

normal_sample <- heatmap_data %>% filter(type == "normal") %>% arrange(-hsa04612)
normal_sample1 <- normal_sample[1:2000, ]

data1 <- rbind(tumor_sample1, normal_sample1)
data2 <- data1[, -1]
data3 <- scale(data2)
# data3 <- data2
data4 <- t(data3)

library(ComplexHeatmap)
library(circlize)
sample_group <- data.frame(cluster = rep(c("malignant", "normal"), each = 2000))
col_fun <- colorRamp2(c(-2.5, 0, 2.5), c("#00FF00", "#3B3B3B", "#EE0000"))
# `border = TRUE` used to sit inside the `col` list because of a misplaced
# parenthesis; it is an argument of HeatmapAnnotation() itself.
top_anno <- HeatmapAnnotation(
  Cluster = sample_group$cluster,
  col = list(Cluster = c("malignant" = "#F8766D", "normal" = "#00BFC4")),
  border = TRUE
)
column_split <- sample_group$cluster


ComplexHeatmap::Heatmap(
  data4,
  cluster_rows = FALSE, cluster_columns = FALSE, name = " ",
  show_column_names = FALSE, show_row_names = TRUE, show_heatmap_legend = FALSE,
  col = col_fun, column_split = column_split, row_title = "Pathway"
)


## number of cells per sample
a <- fread("./model_built/pathway_score_213/model_result/cell_statistics.csv")
b <- a[5:8, ]

data1 <- melt(b)
data1$sample <- factor(data1$sample, levels = c("ATC", "TNBC", "IDC", "DCIS"))
ggplot(data = data1) +
  geom_bar(aes(x = sample, y = value, fill = variable),
           stat = "identity") +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  theme_prism() +
  labs(y = "Number of cells") +
  theme(axis.title.x = element_blank())




--------------------------------------------------------------------------------
/inst/analysis/pathway_select.R:
--------------------------------------------------------------------------------



remove(list = ls())
setwd("~/project/mcIdentify/data/")
# model
## GSE148673

library(data.table)
library(dplyr)
infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt")
expr_data1 <- fread("./model_built/datasets/GSE148673_tpm.txt")


# Transpose to one row per cell. The gene identifiers come straight from
# column V1: a data.table is not designed to carry row names, so they are no
# longer routed through rownames().
expr_data <- expr_data1
expr_matrix <- as.data.frame(t(expr_data[, -1]))
colnames(expr_matrix) <- expr_data1$V1


data1 <- expr_matrix

## pathway score
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")

# Cells predicted as "T" are labelled malignant, every other value normal.
infor_data1 <- infor_data1 %>%
  mutate(cluster.pred = ifelse(cluster.pred == "T", "malignant", "normal"))


# Arithmetic mean of `a`.
myFun1 <- function(a) {
  sum(a) / length(a)
}


# One score column per KEGG pathway. The pathway list is taken from the data
# instead of assuming exactly 335 entries, and the columns are collected in a
# list and bound once rather than grown inside the loop.
pathway_ids <- names(table(pathway_gene$hsa))
score_list <- lapply(pathway_ids, function(id) {
  genes <- unique(as.character(pathway_gene$gene_id[which(pathway_gene$hsa == id)]))
  genes <- genes[genes %in% colnames(data1)]
  apply(data1[, genes, drop = FALSE], 1, myFun1)
})
names(score_list) <- pathway_ids
pathway_score <- as.data.frame(do.call(cbind, score_list))



## test

diff_path <- pathway_score
diff_path$barcode <- rownames(diff_path)
diff_path <- left_join(infor_data1, diff_path, by = "barcode")
diff_path <- diff_path %>%
  filter(cluster.pred == "malignant" | cluster.pred == "normal") %>%
  select(cluster.pred, all_of(pathway_ids))

colnames(diff_path)[1] <- "type"

saveRDS(diff_path, "./model_built/GSE673_all_pathway_score.rds")


### boxplot
library(ggpubr)
library(ggprism)
library(ggplot2)
# The x aesthetic used to point at `PredictionRefined`, a column that no
# longer exists once the label column has been renamed to `type`.
plot1 <- ggplot(data = diff_path, aes(x = type, y = hsa00010)) +
  geom_boxplot(size = 1) +
  stat_compare_means(label.x = 1.2, size = 5, method = "wilcox.test") +
  theme_prism() +
  labs(y = "Pathway Score") +
  theme(axis.title.x = element_blank())

plot1


## testing
tumor_sample <- diff_path %>% filter(type == "malignant")
normal_sample <- diff_path %>% filter(type == "normal")

tumor_infor <- infor_data1 %>% filter(cluster.pred == "malignant")


diff_path2 <- diff_path %>% select(-type)

# Test only pathways whose score is exactly zero in fewer than 1% of the cells.
pathway_cols <- colnames(diff_path2)
n_cells <- nrow(diff_path2)
few_zeros <- vapply(
  pathway_cols,
  function(p) sum(diff_path2[[p]] == 0) < n_cells * 0.01,
  logical(1)
)
tested <- pathway_cols[few_zeros]

# Two-sided Wilcoxon rank-sum test, malignant versus normal, per pathway.
# The p-values are collected in one pass instead of growing a data frame from
# an NA seed row.
p_values <- vapply(
  tested,
  function(p) {
    wilcox.test(tumor_sample[[p]], normal_sample[[p]],
                paired = FALSE, correct = FALSE)$p.value
  },
  numeric(1)
)
test_pathway <- data.frame(pvalue = p_values, row.names = tested)

test_value <- test_pathway
## select pathway
# The pathway id is stored as a column before filtering, so it does not
# depend on row names surviving the dplyr verbs.
test_value2 <- test_value
test_value2$hsa <- rownames(test_value2)
test_value2 <- test_value2 %>%
  filter(pvalue < 0.05) %>%
  arrange(pvalue)



## pathway
path_test <- test_value2
pathway <- pathway_gene %>% filter(!duplicated(hsa)) %>% select(hsa, pathway_id)

# Keeps only the pathways whose p-value is reported as exactly 0.
path_test <- left_join(path_test, pathway, by = "hsa") %>% filter(pvalue == 0)
saveRDS(path_test, "./model_built/GSE673_diff_path.rds")


testdata <- diff_path %>% select(type, all_of(path_test$hsa))
fwrite(testdata, "./model_built/GSE673_pathway_score.csv")


--------------------------------------------------------------------------------
/inst/analysis/result_analysis.R:
--------------------------------------------------------------------------------


remove(list = ls())
setwd("~/project/mcIdentify/data/")

library(data.table)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggprism)
library(cowplot)

# Rows of the melted result table for one metric, with `method` turned into a
# factor so the x axis follows `method_levels`.
metric_subset <- function(scores, metric, method_levels) {
  metric_data <- scores %>% filter(variable == !!metric)
  metric_data$method <- factor(metric_data$method, levels = method_levels)
  metric_data
}

# Box plot of one metric per method with one jittered point per dataset.
method_boxplot <- function(metric_data, y_label, title) {
  ggplot(metric_data, aes(x = method, y = value)) +
    stat_boxplot(geom = "errorbar", width = 0.15) +
    geom_boxplot(size = 0.5, fill = "#E8E8E8",
                 outlier.fill = "white", outlier.color = "white") +
    geom_jitter(aes(fill = datasets), width = 0.05, shape = 21, size = 3) +
    scale_fill_manual(values = c("#E69F00", "#0072B2", "#F0E442", "red", "blue")) +
    scale_color_manual(values = c("black")) +
    ylim(0, 1.01) +
    theme_prism(border = TRUE) +
    labs(y = y_label, x = "", title = title) +
    theme(legend.text = element_text(size = 13, family = "sans"))
}

# 2 x 2 grid of the four metric plots (labelled A-D) without their legends.
four_panel_grid <- function(plot_a, plot_b, plot_c, plot_d) {
  plot_grid(
    plot_a + theme(legend.position = "none"),
    plot_b + theme(legend.position = "none"),
    plot_c + theme(legend.position = "none"),
    plot_d + theme(legend.position = "none"),
    align = "vh",
    labels = c("A", "B", "C", "D"),
    hjust = -1,
    nrow = 2
  )
}


## method
model_result <- fread("./model_built/pathway_score_213/model_result/method.csv")

data1 <- melt(model_result)

method_levels <- c("mcIdentify", "ikarus", "SCINA", "scMRMA")

f1 <- metric_subset(data1, "f1", method_levels)
p1 <- method_boxplot(f1, "F1 score", "F1 score of different methods")
p1

accuracy <- metric_subset(data1, "accuracy", method_levels)
p2 <- method_boxplot(accuracy, "Accuracy", "Accuracy of different methods")
p2

recall <- metric_subset(data1, "recall", method_levels)
p3 <- method_boxplot(recall, "Recall", "Recall of different methods")
p3

# "precisoon" is the metric name as it is spelled in the result file, so it is
# kept as is; only the labels shown on the figure are corrected.
precisoon <- metric_subset(data1, "precisoon", method_levels)
p4 <- method_boxplot(precisoon, "Precision", "Precision of different methods")
p4


prow <- four_panel_grid(p1, p2, p3, p4)
prow
legend <- get_legend(
  p1 + theme(legend.box.margin = margin(0, 0, 0, 12))
)

plot_grid(prow, legend, rel_widths = c(3, .4))




## model framework
model_result <- fread("./model_built/pathway_score_213/model_result/model.csv")

data1 <- melt(model_result)
colnames(data1)[2] <- "method"

framework_levels <- c("DNN", "FR", "LR", "SVM", "XGBOOST")

f1 <- metric_subset(data1, "f1", framework_levels)
p1 <- method_boxplot(f1, "F1 score", "F1 score of different model frameworks")
p1

accuracy <- metric_subset(data1, "accuracy", framework_levels)
p2 <- method_boxplot(accuracy, "Accuracy", "Accuracy of different model frameworks")
p2

recall <- metric_subset(data1, "recall", framework_levels)
p3 <- method_boxplot(recall, "Recall", "Recall of different model frameworks")
p3

precisoon <- metric_subset(data1, "precisoon", framework_levels)
p4 <- method_boxplot(precisoon, "Precision", "Precision of different model frameworks")
p4


prow <- four_panel_grid(p1, p2, p3, p4)
prow
legend <- get_legend(
  p1 + theme(legend.box.margin = margin(0, 0, 0, 12))
)

plot_grid(prow, legend, rel_widths = c(3, .4))


--------------------------------------------------------------------------------
/inst/analysis/simulation.R:
--------------------------------------------------------------------------------

## pathway_simulation
setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)

# Collect the per-level result files into one wide table: the third column
# of every "<number>_pathway.txt" file becomes one column of `data`.
result_dir <- "./model_built/pathway_score_213/model_result/mcIdentify_simulation_pathway/"
data <- fread(paste0(result_dir, "10_pathway.txt"))
colnames(data) <- c("simulation", "measure", "5%")
for (number in c(20, 40, 60, 80, 100, 120, 140, 160, 180)) {
  level_result <- fread(paste0(result_dir, number, "_pathway.txt"))
  # The column is labelled with half of the number in the file name.
  cname <- paste0(number / 2, "%")
  data[, cname] <- level_result$V3
}

# Baseline column: four literal values recycled 100 times.
# NOTE(review): these constants are not read from a result file - confirm
# where they come from.
data[, "0%"] <- rep(c(0.98, 0.98, 0.99, 0.98), 100)
data2 <- melt(data)
data2$variable <- factor(
  data2$variable,
  levels = c("0%", "5%", "10%", "20%", "30%", "40%",
             "50%", "60%", "70%", "80%", "90%")
)

draw_data <- data2 %>% filter(measure == "precision:")

library(ggpubr)
library(ggprism)
library(ggplot2)
library(cowplot)
# The y limits are set inside scale_y_continuous(): a separate ylim() call
# would replace the whole scale and discard the custom breaks.
ggplot(data = draw_data, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(size = 1) +
  theme_prism(border = TRUE) +
  theme(legend.position = "none") +
  labs(y = "Precision", title = " ") +
  theme(axis.title.x = element_blank()) +
  geom_hline(aes(yintercept = 0.9), linetype = 5, col = "red") +
  geom_hline(aes(yintercept = 0.8), linetype = 5, col = "red") +
  scale_y_continuous(
    breaks = c(0, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    limits = c(0.5, 1)
  ) +
  scale_fill_manual(values = c("#F27970", "#BB9727", "#54B345", "#32B897",
                               "#05B9E2", "#8983BF", "#C76DA2", "#F27970",
                               "#BB9727", "#54B345", "#32B897"))


## simulation gene
setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)
gene_result_dir <- "./model_built/pathway_score_213/model_result/simulation_gene/"
mcIdentify <- fread(paste0(gene_result_dir, "mcIdentify.txt"))
SCINA <- fread(paste0(gene_result_dir, "SCINA.txt"))
scMRMA <- fread(paste0(gene_result_dir, "scMRMA.txt"))
ikarus <- fread(paste0(gene_result_dir, "ikarus_pred.txt"))

# Give every table the same two key columns plus one method-specific column.
colnames(SCINA) <- c("ID", "measure", "SCINA")
colnames(mcIdentify) <- c("ID", "measure", "mcIdentify")
colnames(ikarus) <- c("ID", "measure", "ikarus")
colnames(scMRMA) <- c("ID", "measure", "scMRMA")

# "ID" and "measure" are the only columns the tables share, so they are
# spelled out as the join keys.
join_keys <- c("ID", "measure")
data <- mcIdentify %>%
  left_join(ikarus, by = join_keys) %>%
  left_join(SCINA, by = join_keys) %>%
  left_join(scMRMA, by = join_keys) %>%
  arrange(ID)

# NOTE(review): assumes that, after arrange(ID), the rows fall into five
# consecutive groups of 40 in exactly this order - verify against the files.
data$infor <- rep(
  c("1000gene", "1500gene", "2000gene", "2500gene", "500gene"),
  each = 40
)

data1 <- melt(data)

data1$variable <- factor(
  data1$variable,
  levels = c("mcIdentify", "scMRMA", "SCINA", "ikarus")
)
data1$infor <- factor(
  data1$infor,
  levels = c("500gene", "1000gene", "1500gene", "2000gene", "2500gene")
)

# Metric shown in the bar chart. Use "F1:" to plot the F1 score instead
# (and change the y-axis label below accordingly).
measure_to_plot <- "recall:"
draw_data <- data1 %>% filter(measure == measure_to_plot)

# Mean value per method and per number of random genes.
a <- draw_data %>%
  group_by(variable, infor) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  rename(method = variable, gene = infor)
V1 <- c("500gene", "1000gene", "1500gene", "2000gene", "2500gene")
library(ggpubr)
library(ggprism)
library(ggplot2)
library(cowplot)
ggplot(a, aes(x = gene, y = value, fill = method)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(x = "Number of random genes", y = "Recall") +
  theme_prism(border = FALSE) +
  geom_hline(aes(yintercept = 0.9), linetype = 5, col = "red") +
  scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.9)) +
  scale_x_discrete(breaks = V1, labels = c("500", "1000", "1500", "2000", "2500")) +
  scale_fill_manual(
    values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
    breaks = c("mcIdentify", "scMRMA", "SCINA", "ikarus"),
    labels = c("mcIdentify", "scMRMA", "SCINA", "ikarus")
  )


## 4pathway simulation
setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)
# Attached here as well so this section does not depend on packages loaded
# by an earlier section of the script.
library(ggplot2)
library(ggpubr)
library(ggprism)

pathway_result_dir <- "./model_built/pathway_score_213/model_result/simulation_pathway2/"
data1 <- fread(paste0(pathway_result_dir, "36_without4_pathway.txt"))
data1$pathway <- "withoutpathway"

data2 <- fread(paste0(pathway_result_dir, "40_with4_pathway.txt"))
data2$pathway <- "withpathway"

data3 <- rbind(data2, data1)
data3$pathway <- factor(data3$pathway, levels = c("withpathway", "withoutpathway"))

draw_data <- data3 %>% filter(V2 == "F1:")

# The y limits are set inside scale_y_continuous(): a separate ylim() call
# would replace the whole scale and discard the custom breaks.
ggplot(data = draw_data, aes(x = pathway, y = V3, fill = pathway)) +
  geom_boxplot(size = 1) +
  theme_prism(border = FALSE) +
  theme(legend.position = "none") +
  labs(y = "F1 score", title = " ") +
  stat_compare_means(method = "wilcox.test") +
  theme(axis.title.x = element_blank()) +
  scale_y_continuous(
    breaks = c(0, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    limits = c(0.5, 1)
  ) +
  scale_fill_manual(values = c("#91D1C2FF", "#FDAF91FF"))


## simulation gene 2
library(data.table)
library(dplyr)
data1 <- fread("./model_built/pathway_score_213/model_result/simulation_gene.csv")
data1$infor <- rep(c("100gene", "200gene", "300gene", "400gene", "500gene"), 4)

data2 <- melt(data1)

# "ikraus" is the spelling used as a factor level here; it is only relabelled
# to "ikarus" in the legend below.
data2$method <- factor(data2$method, levels = c("TCfinder", "scMRMA", "SCINA", "ikraus"))
data2$infor <- factor(
  data2$infor,
  levels = c("100gene", "200gene", "300gene", "400gene", "500gene")
)

a <- data2 %>% dplyr::filter(variable == "f1")

V1 <- c("100gene", "200gene", "300gene", "400gene", "500gene")
library(ggpubr)
library(ggprism)
library(ggplot2)
library(cowplot)
ggplot(a, aes(x = infor, y = value, fill = method)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(x = "Number of randomly inactivate genes", y = "F1 Score") +
  theme_prism(border = FALSE) +
  geom_hline(aes(yintercept = 0.95), linetype = 5, col = "red") +
  scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.95, 1)) +
  scale_x_discrete(breaks = V1, labels = c("100", "200", "300", "400", "500")) +
  scale_fill_manual(
    values = c("#6E9ECE", "#CCCCCC", "#E6928F", "#8FBC8F"),
    breaks = c("TCfinder", "scMRMA", "SCINA", "ikraus"),
    labels = c("TCfinder", "scMRMA", "SCINA", "ikarus")
  )

--------------------------------------------------------------------------------
/inst/analysis/simulation_gene.R:
--------------------------------------------------------------------------------

# Zero out a fixed number of randomly chosen expressed genes in every column
# of the TPM matrix, then recompute the pathway scores on the perturbed data
# and write them, with the malignant/normal label, to a CSV file.

setwd("~/project/mcIdentify/data/")
remove(list = ls())
library(data.table)
library(dplyr)

all_data <- fread("./processed_data/GSE148673_tpm.txt")
border_data <- as.data.frame(all_data[, -1])
rownames(border_data) <- all_data$V1

# Row indices of the expressed (> 0) genes, one vector per column.
# lapply() always returns a list, whereas apply() collapses to a matrix when
# every column happens to yield the same number of indices.
expressed_rows <- lapply(border_data, function(x) which(x > 0))

# Inputs that do not depend on the number of zeroed genes are read once.
infor_data1 <- fread("./model_built/datasets/GSE148673_anno.txt") %>%
  mutate(type = case_when(cluster.pred == "T" ~ "malignant",
                          cluster.pred == "N" ~ "normal"))
pathway_gene <- readRDS("./KEGG_pathway_gene.rds")
GSE673_diff_path3 <- readRDS("./model_built/pathway_score_213/GSE673_diff_path213.rds")
score_gene <- pathway_gene %>% filter(hsa %in% GSE673_diff_path3$hsa)
pathway_ids <- names(table(score_gene$hsa))

for (number in c(20)) {

  ## border gene select
  testdata <- border_data
  for (i in seq_along(expressed_rows)) {
    candidates <- expressed_rows[[i]]
    # Draw positions with sample.int(): sample(x, n) would sample from 1:x
    # when x is a single number. Still fails loudly if a column has fewer
    # than `number` expressed genes.
    zeroed <- candidates[sample.int(length(candidates), number)]
    testdata[zeroed, i] <- 0
  }

  write_filename_border <- paste0(
    "./model_built/pathway_score_213/simulation_gene/GSE148673_tpm_", number, ".txt"
  )
  fwrite(testdata, write_filename_border, row.names = TRUE)

  #### pathway score
  # Read the perturbed matrix back; the row names written above come back as
  # the first column, V1.
  data <- fread(write_filename_border)
  expr_data <- data %>% filter(V1 %in% names(table(pathway_gene$gene_id)))

  # Transpose so that rows follow the columns of the input file and columns
  # are the gene ids.
  expr_matrix <- t(as.matrix(expr_data[, -1]))
  colnames(expr_matrix) <- expr_data$V1

  # One score per pathway: the mean expression of the pathway's genes that
  # are present in the matrix.
  pathway_scores <- lapply(pathway_ids, function(id) {
    genes <- intersect(
      as.character(score_gene$gene_id[score_gene$hsa == id]),
      colnames(expr_matrix)
    )
    if (length(genes) == 0) {
      stop("No gene of pathway ", id, " is present in the expression matrix.",
           call. = FALSE)
    }
    rowMeans(expr_matrix[, genes, drop = FALSE])
  })
  names(pathway_scores) <- pathway_ids

  diff_path <- data.frame(do.call(cbind, pathway_scores), check.names = FALSE)
  diff_path$barcode <- rownames(diff_path)
  # infor_data2 <- infor_data1 %>% filter(Type != "unclassified")
  infor_data2 <- infor_data1
  diff_path <- left_join(diff_path, infor_data2, by = "barcode") %>%
    select(type, all_of(as.character(GSE673_diff_path3$hsa))) %>%
    na.omit()

  write_filename <- paste0(
    "./model_built/pathway_score_213/simulation_gene/pathway_score/GSE673_", number, ".csv"
  )
  fwrite(diff_path, write_filename)
}

--------------------------------------------------------------------------------
/inst/analysis/umap.R:
--------------------------------------------------------------------------------

setwd("~/project/mcIdentify/data/model_built/pathway_score_213/predict_result/")
remove(list = ls())
# data.table (fread), dplyr (%>%, mutate, select) and ggplot2 are used below,
# so they are attached explicitly.
library(data.table)
library(dplyr)
library(ggplot2)
library(umap)
library(ggprism)
GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")

# UMAP of the pathway scores in GSE309_predict.csv, drawn twice: coloured by
# the true label and by the predicted label.
# Attached here so this section does not rely on the session state.
library(data.table)
library(dplyr)
library(ggplot2)

data <- fread("./GSE309_predict.csv", data.table = FALSE)
colnames(data)[2] <- "predict"
pathway_columns <- as.character(GSE673_diff_path213$hsa)
data1 <- data %>%
  mutate(true = case_when(type == 0 ~ "malignant", type == 1 ~ "normal")) %>%
  mutate(predict = case_when(predict == 0 ~ "malignant", predict == 1 ~ "normal")) %>%
  select(true, predict, all_of(pathway_columns))

# The embedding uses only the pathway-score columns, selected by name instead
# of by a fixed column range.
umap1 <- umap::umap(data1[, pathway_columns])
umap2 <- umap1$layout

# Scatter plot of a two-column UMAP layout, coloured by `labels`.
plot_umap <- function(layout, labels, title) {
  points <- data.frame(X1 = layout[, 1], X2 = layout[, 2], label = as.factor(labels))
  ggplot(data = points, aes(x = X1, y = X2, color = label)) +
    geom_point(size = 0.5) +
    labs(x = "UMAP1", y = "UMAP2", color = "") +
    theme_classic() +
    scale_colour_manual(values = c("#F8766D", "#00BFC4")) +
    theme_prism(border = TRUE) +
    ggtitle(title) +
    theme(axis.text = element_blank(), axis.ticks = element_blank())
}

p1 <- plot_umap(umap2, data1$true, "GSE309 True")
p1

p2 <- plot_umap(umap2, data1$predict, "GSE309 Predict")
p2


library(cowplot)
# Both panels side by side without legends; the shared legend is taken from
# p1 and placed to the right.
prow <- plot_grid(
  p1 + theme(legend.position = "none"),
  p2 + theme(legend.position = "none"),
  align = "vh",
  labels = c(),
  hjust = -1,
  nrow = 1
)
prow
legend <- get_legend(
  p1 + theme(
    legend.box.margin = margin(0, 5, 0, 5),
    legend.text = element_text(size = 15, family = "sans")
  )
)

plot_grid(prow, legend, rel_widths = c(4, .7))


setwd("~/project/mcIdentify/data/model_built/pathway_score_213/predict_result/")
remove(list = ls())
library(umap)
library(ggprism)
# Attached here so this section does not rely on the session state.
library(data.table)
library(dplyr)
GSE673_diff_path213 <- readRDS("~/project/mcIdentify/data/model_built/pathway_score_213/GSE673_diff_path213.rds")

# NOTE(review): the layout is computed from GOSH_predict.csv while the plot
# titles and the prediction file below refer to GSE309 - confirm that this is
# the intended input.
data <- fread("./GOSH_predict.csv", data.table = FALSE)
colnames(data)[2] <- "predict"
pathway_columns <- as.character(GSE673_diff_path213$hsa)
data1 <- data %>%
  mutate(true = case_when(type == 0 ~ "malignant", type == 1 ~ "normal")) %>%
  mutate(predict = case_when(predict == 0 ~ "malignant", predict == 1 ~ "normal")) %>%
  select(true, predict, all_of(pathway_columns))

# The embedding uses only the pathway-score columns, selected by name instead
# of by a fixed column range.
umap1 <- umap::umap(data1[, pathway_columns])
umap2 <- umap1$layout

# Predictions to colour the points with: the first row of the file is
# dropped and the single remaining column is named "predict".
predict_data <- fread("~/project/mcIdentify/data/model_built/pathway_score_213/framwork_umap/XGBoost_309_predict.csv")
predict_data <- predict_data[-1, ]
colnames(predict_data) <- "predict"

df2 <- data.frame(umap2, predict_data$predict)
df2$predict_data.predict <- as.factor(df2$predict_data.predict)


library(ggplot2)
# Scatter plot of the UMAP layout in `points`, coloured by the prediction.
plot_prediction_umap <- function(points, title) {
  ggplot(data = points, aes(x = X1, y = X2, color = predict_data.predict)) +
    geom_point(size = 0.5) +
    labs(x = "UMAP1", y = "UMAP2", color = "") +
    theme_classic() +
    scale_colour_manual(values = c("#F8766D", "#00BFC4")) +
    theme_prism(border = TRUE) +
    ggtitle(title) +
    theme(axis.text = element_blank(), axis.ticks = element_blank())
}

# NOTE(review): all four panels are drawn from the same `df2`, i.e. from the
# single prediction file read above; only the title differs. Presumably each
# framework should use its own prediction file - verify before publishing.
p3 <- plot_prediction_umap(df2, "GSE309 LR Predict")
p3

p4 <- plot_prediction_umap(df2, "GSE309 RF Predict")
p4

p5 <- plot_prediction_umap(df2, "GSE309 SVM Predict")
p5

p6 <- plot_prediction_umap(df2, "GSE309 XGBoost Predict")
p6


library(cowplot)
# 2 x 2 grid without legends; the shared legend is taken from p3 and placed
# to the right.
prow <- plot_grid(
  p3 + theme(legend.position = "none"),
  p4 + theme(legend.position = "none"),
  p5 + theme(legend.position = "none"),
  p6 + theme(legend.position = "none"),
  align = "vh",
  labels = c(),
  hjust = -1,
  nrow = 2
)
prow
legend <- get_legend(
  p3 + theme(
    legend.box.margin = margin(0, 5, 0, 5),
    legend.text = element_text(size = 15, family = "sans")
  )
)

plot_grid(prow, legend, rel_widths = c(4, .7))

--------------------------------------------------------------------------------
/inst/extdata/TCfinder.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/inst/extdata/TCfinder.hdf5

--------------------------------------------------------------------------------
/inst/extdata/predict_py.py:
--------------------------------------------------------------------------------

import os

from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np


def predict_py(path_score, Path):
    """Run the saved TCfinder model on a pathway score matrix.

    Args:
        path_score: Input passed unchanged to ``model.predict``.
        Path: Directory that contains ``TCfinder.hdf5``.

    Returns:
        Whatever ``model.predict`` returns for ``path_score``.
    """
    # os.path.join avoids a doubled separator when Path already ends in "/".
    model = load_model(os.path.join(Path, "TCfinder.hdf5"))
    return model.predict(path_score)
-------------------------------------------------------------------------------- /inst/image/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XSLiuLab/TCfinder/f104ddc566e06c49ede97d499d9df695deee5490/inst/image/workflow.png -------------------------------------------------------------------------------- /man/data_normalized.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_normalized.R 3 | \name{data_normalized} 4 | \alias{data_normalized} 5 | \title{data normalized} 6 | \usage{ 7 | data_normalized(expr_data, method = "method", genome = "hg38") 8 | } 9 | \arguments{ 10 | \item{expr_data}{A single-cell counts expression matrix.} 11 | 12 | \item{method}{If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required. 13 | For other single-cell sequencing methods, this parameter does not need to be filled in.} 14 | 15 | \item{genome}{Reference genome. This parameter needs to be filled in when method = "smart-seq2"; 16 | choose either hg19 or hg38.} 17 | } 18 | \value{ 19 | A normalized single-cell expression matrix. 20 | } 21 | \description{ 22 | Normalize a single-cell raw counts matrix. 23 | } 24 | \details{ 25 | Input a data.frame where the rows are the gene names and the columns are the sample names. 
26 | } 27 | -------------------------------------------------------------------------------- /man/pathway_score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathway_score.R 3 | \name{pathway_score} 4 | \alias{pathway_score} 5 | \title{pathway score} 6 | \usage{ 7 | pathway_score(expr_data, normalized = TRUE, method = "method", genome = "hg38") 8 | } 9 | \arguments{ 10 | \item{expr_data}{Single-cell expression matrix after normalization of the original counts matrix.} 11 | 12 | \item{normalized}{Set normalized = FALSE if the matrix has not been normalized.} 13 | 14 | \item{method}{This parameter is required when normalized = FALSE. If the single-cell sequencing method used is smart-seq2, method = "smart-seq2" is required. 15 | For other single-cell sequencing methods, this parameter does not need to be filled in.} 16 | 17 | \item{genome}{Reference genome. This parameter needs to be filled in when normalized = FALSE and method = "smart-seq2"; 18 | choose either hg19 or hg38.} 19 | } 20 | \value{ 21 | A matrix containing 213 pathway scores. 22 | } 23 | \description{ 24 | Obtain a pathway score matrix for predicting tumor cells. 25 | } 26 | \details{ 27 | Input a sparse matrix, matrix, or data frame where the rows are the gene names and the columns are the sample names. Such a matrix can be generated directly with the data_normalized function. 
28 | } 29 | -------------------------------------------------------------------------------- /man/predict_cell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict_cell.R 3 | \name{predict_cell} 4 | \alias{predict_cell} 5 | \title{Cell types prediction.} 6 | \usage{ 7 | predict_cell(path_score) 8 | } 9 | \arguments{ 10 | \item{path_score}{The pathway score matrix calculated by the pathway_score function.} 11 | } 12 | \value{ 13 | A data.frame containing cell types and predicted values. 14 | } 15 | \description{ 16 | Classify tumor cells from normal cells. 17 | } 18 | \details{ 19 | Input the pathway score matrix calculated by the pathway_score function. 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(TCfinder) 11 | 12 | test_check("TCfinder") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-data_normalized.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_equal(2 * 2, 4) 3 | }) 4 | -------------------------------------------------------------------------------- /tests/testthat/test-pathway_score.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_equal(2 * 2, 4) 3 | }) 4 | -------------------------------------------------------------------------------- /tests/testthat/test-predict_cell.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_equal(2 * 2, 4) 3 | }) 4 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/interpretation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "interpretation" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{interpretation} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | library(TCfinder) 19 | ``` 20 | --------------------------------------------------------------------------------