# Repository layout
# ├── R
# │   ├── TCIDEA_obj_for_CEP.R
# │   ├── calCNVScore_for_CEP.R
# │   ├── calInferredCNA_for_CEP.R
# │   └── makingTCIDEA_for_CEP.R
# ├── README.md
# ├── calc_Chromosomal_Expression_Pattern.R
# └── example
#     ├── calCNVScore_for_CEP.R
#     ├── cell_info_tumor_example.Rds
#     ├── final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf
#     ├── log2TPM_normal_example.Rds
#     ├── log2TPM_tumor_example.Rds
#     └── refgenome_example.Rds

# --------------------------------------------------------------------------------
# /R/TCIDEA_obj_for_CEP.R:
# --------------------------------------------------------------------------------

#' TCIDEA S4 container
#'
#' Every slot is declared as "ANY", so a slot that is never assigned holds
#' NULL. Notes carried over from the original file:
#'   log.data   = log data -> log2 normalized
#'   scale.data = scaled data -> centering log2 normalized data
#' The cnv.* slots are filled by calInferredCNV(); ident and label are set by
#' newTCIDEA().
tcidea <- setClass(
  "TCIDEA",
  slots = c(
    log.data = "ANY",
    scale.data = "ANY",
    ident = "ANY",
    tSNE.data = "ANY",
    tSNE.calculate = "ANY",
    cnv.data = "ANY",
    cnv.sd = "ANY",
    cnv.mean.sq = "ANY",
    cnv.cv = "ANY",
    cnv.abmean = "ANY",
    cluster.auc = "ANY",

    ## normal reference
    normal.use = "ANY",
    normal.raw.data = "ANY",
    normal.log.data = "ANY",

    ## edge index per cluster and gene set name -> T/F
    cluster.es.esp = "ANY",
    cluster.geneset = "ANY",
    # cluster.rank = "ANY",

    ## free-text label of the object
    label = "ANY"
  )
)

#' Print a short summary of a TCIDEA object
#'
#' Prints the label and the dimensions of `log.data`; when `normal.use` is
#' TRUE the dimensions of `normal.log.data` are printed as well.
#'
#' @param object A TCIDEA object.
#' @return NULL, invisibly.
setMethod(
  f = "show",
  signature = "TCIDEA",
  definition = function(object) {
    cat(
      "An object of class",
      object@label,
      "\n",
      nrow(x = object@log.data),
      "genes across",
      ncol(x = object@log.data),
      "samples.\n"
    )
    ## normal.use is NULL unless it was set explicitly; a bare `if (NULL)`
    ## is an error, so only a real TRUE enables the second paragraph.
    if (isTRUE(object@normal.use)) {
      cat(
        "Normal reference dataset ",
        nrow(x = object@normal.log.data),
        "genes across",
        ncol(x = object@normal.log.data),
        "samples.\n"
      )
    }
    invisible(x = NULL)
  }
)

# --------------------------------------------------------------------------------
# /R/calCNVScore_for_CEP.R:
# --------------------------------------------------------------------------------

#' Score each cell's CNV profile and label it Tumor / Nontumor / Immune
#'
#' For every column of `sh` the mean of squares (MS), sum of squares (SS) and
#' standard deviation (SD) are computed and joined onto `cell_info` by cell
#' name. COR is the correlation of each cell with the mean profile of the
#' top 5% MS cells. A cell whose `celltype` is in `target.celltypes` is
#' "Tumor" when MS > cutoff.score or COR > cutoff.corr, otherwise "Nontumor";
#' every other cell is labelled "Immune". A scatter plot is written to
#' `output.dir`.
#'
#' @param sh Numeric matrix of CNV values; columns are cells and column names
#'   are the cell identifiers.
#' @param cell_info Data frame whose row names are cell identifiers; must
#'   contain a `celltype` column.
#' @param s Label inserted into the PDF file name.
#' @param levels,meta Unused; kept so existing positional calls keep working.
#' @param cutoff.corr,cutoff.score Thresholds on COR and MS.
#' @param target.celltypes Cell type name(s) eligible to be called "Tumor".
#' @param output.dir Directory receiving the PDF.
#' @return `cell_info` with the added columns Row.names, MS, SS, SD, COR and
#'   cell_index; row names are the cell identifiers.
calCNVScore <- function(sh, cell_info, s, levels, cutoff.corr, cutoff.score,
                        meta, target.celltypes, output.dir) {
  cnv <- as.matrix(sh)

  ## per-cell summary statistics
  CNV_score <- data.frame(
    MS = colMeans(cnv^2),
    SS = colSums(cnv^2),
    SD = apply(cnv, 2, sd)
  )
  CNV_score$Row.names <- rownames(CNV_score)
  cell_info$Row.names <- rownames(cell_info)
  cell_info2 <- plyr::join(cell_info, CNV_score, by = "Row.names")

  ## MS top 5% cells (always at least one, never a cell without a score)
  scored <- cell_info2[!is.na(cell_info2$MS), , drop = FALSE]
  if (nrow(scored) == 0L) {
    stop("No row name of 'cell_info' matches a column name of 'sh'.",
         call. = FALSE)
  }
  n_top <- max(1L, round(nrow(cell_info2) * 0.05))
  ## order(-MS) keeps ties in their original order
  top_MS_cells <- utils::head(scored$Row.names[order(-scored$MS)], n_top)

  ## correlation of every cell with the mean top-MS profile; the profile is
  ## computed once and the result is matched to cells by name, not position
  ave_tumor <- rowMeans(cnv[, top_MS_cells, drop = FALSE])
  cor_by_cell <- as.vector(stats::cor(cnv, ave_tumor))
  cell_info2$COR <- cor_by_cell[match(cell_info2$Row.names, colnames(cnv))]

  cell_info3 <- cell_info2
  rownames(cell_info3) <- cell_info3$Row.names

  ## classification; NA scores never count as passing a cutoff
  is_target <- cell_info3$celltype %in% target.celltypes
  passes <- (cell_info3$MS > cutoff.score) | (cell_info3$COR > cutoff.corr)
  is_tumor <- is_target & !is.na(passes) & passes

  cell_info3$cell_index <- rep("Nontumor", nrow(cell_info3))
  cell_info3$cell_index[is_tumor] <- "Tumor"
  cell_info3$cell_index[!is_target] <- "Immune"

  ## 2D plot of MS score and correlation
  expos <- ggplot2::ggplot(cell_info3, ggplot2::aes(x = MS, y = COR)) +
    ggplot2::geom_point(ggplot2::aes(fill = cell_index), size = 5, alpha = .8,
                        shape = 21, colour = "black") +
    ggplot2::scale_fill_manual(
      values = c("Tumor" = "red", "Immune" = "gray70",
                 "Nontumor" = "dodgerblue1")
    ) +
    ggplot2::geom_vline(xintercept = cutoff.score, colour = "black",
                        size = 0.5, linetype = "longdash") +
    ggplot2::geom_hline(yintercept = cutoff.corr, colour = "black",
                        size = 0.5, linetype = "longdash") +
    ggplot2::xlab("MS score") +
    ggplot2::ylab("CNV correlation") +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.title.x = ggplot2::element_text(face = "bold", size = 16),
      axis.text.x = ggplot2::element_text(size = 12)
    ) +
    ggplot2::theme(
      axis.title.y = ggplot2::element_text(face = "bold", size = 16),
      axis.text.y = ggplot2::element_text(size = 12)
    ) +
    ggplot2::theme(
      panel.border = ggplot2::element_rect(fill = NA, colour = "black",
                                           size = 2),
      legend.position = "right"
    )

  ## pass the plot explicitly instead of relying on last_plot()
  ggplot2::ggsave(
    file.path(
      output.dir,
      paste0("final_", s, "_all_cells_CNV_score_vs_cor_classification.pdf")
    ),
    plot = expos, width = 7, height = 5
  )

  cell_info3
}

#' Read a tab-separated cell type table and split the cell id
#'
#' The file must have a header and the columns `Cell` and `NEW`. `Cell` is
#' split on "_"; the first piece becomes `Sample`, the second `Barcode`.
#'
#' @param FinalCellType Path of the table.
#' @return The table with `Sample` and `Barcode` columns added.
ReadTotalCelltype <- function(FinalCellType) {
  total.celltype <- read.table(file = FinalCellType, sep = "\t", header = TRUE)

  total.celltype$Cell <- as.character(total.celltype$Cell)
  total.celltype$NEW <- as.character(total.celltype$NEW)

  pieces <- strsplit(total.celltype$Cell, split = "_", fixed = TRUE)
  total.celltype$Sample <- vapply(pieces, function(p) p[1], character(1))
  total.celltype$Barcode <- vapply(pieces, function(p) p[2], character(1))

  total.celltype
}

#' Extract the cluster labels of one sample
#'
#' Any "-A1" in `s` is removed before matching against `total$Sample`.
#'
#' @param s Sample name.
#' @param total Table as returned by ReadTotalCelltype().
#' @return Data frame with one column `cluster` (from `NEW`) and the barcodes
#'   as row names.
ReadClusterForSample <- function(s, total) {
  ## gsub() is a no-op when the suffix is absent, so no grep() test is needed
  ## (grep() returns integer(0) on no match, which broke the old `if`).
  s <- gsub("-A1", "", s, fixed = TRUE)

  in_sample <- which(total$Sample == s)
  sample.subset <- total[in_sample, , drop = FALSE]
  cat("Sample ", s, " Cell : ", nrow(sample.subset), "\n")

  df <- data.frame(cluster = sample.subset$NEW)
  rownames(df) <- sample.subset$Barcode

  df
}

# --------------------------------------------------------------------------------
# /R/calInferredCNA_for_CEP.R:
# --------------------------------------------------------------------------------

## Calculate Chromosomal expression pattern (CEP) ##

#' Run the complete CEP workflow for one data set
#'
#' Builds a TCIDEA object, infers CNV values, scores and classifies the cells
#' and writes two .Rds files (cell table and TCIDEA object) plus the plot
#' produced by calCNVScore() into `output.dir`.
#'
#' @param target.normalized Expression matrix (genes x cells) passed on as
#'   `log.data`.
#' @param sample.info Cell annotation passed to calCNVScore() as `cell_info`.
#' @param label Label used for the object and in output file names.
#' @param annotationdata Gene annotation table passed to calInferredCNV().
#' @param min.cells,MYwalk Passed to calInferredCNV().
#' @param target.celltypes Cell type name(s) eligible to be called "Tumor".
#' @param output.dir Output directory; created when it does not exist.
#' @param cutoff.score,cutoff.corr Classification thresholds; the defaults are
#'   the values that used to be hard-coded.
#' @return The classified cell table, invisibly.
runCEP <- function(target.normalized, sample.info, label, annotationdata,
                   min.cells = 10, MYwalk = 100, target.celltypes, output.dir,
                   cutoff.score = 0.02, cutoff.corr = 0.2) {
  if (!dir.exists(output.dir)) {
    dir.create(output.dir, recursive = TRUE)
  }

  ## 1. making TCIDEA object
  obj <- newTCIDEA(log.data = target.normalized, clustergroup = sample.info,
                   label = label)

  ## 2. calculate inferredCNV
  ##    Only use genes expressed > min.cells
  ##    Moving average over MYwalk genes (binning)
  obj <- calInferredCNV(obj, min.cells = min.cells, MYwalk = MYwalk,
                        z.score = TRUE, limit = TRUE,
                        annotationdata = annotationdata,
                        log.file = paste0(label, "_log.txt"),
                        use.total = FALSE)

  ## 3. MS (mean of squares) and CORR (correlation);
  ##    MS > cutoff.score or CORR > cutoff.corr : malignant cells
  final_cell_info <- calCNVScore(obj@cnv.data, obj@ident, obj@label,
                                 levels = NULL,
                                 cutoff.score = cutoff.score,
                                 cutoff.corr = cutoff.corr,
                                 meta = NULL,
                                 target.celltypes = target.celltypes,
                                 output.dir = output.dir)

  ## Save calculated info and the object
  saveRDS(final_cell_info,
          file = file.path(output.dir,
                           paste0(label, "_after_calc_CNV_score.Rds")))
  saveRDS(obj,
          file = file.path(output.dir,
                           paste0(label,
                                  "_after_calc_CNV_score_TCIDEA_obj.Rds")))

  invisible(final_cell_info)
}

#' Infer CNV values from expression data
#'
#' Steps: drop all-zero genes; keep genes with expression > 1 in more than
#' `min.cells` cells; optionally drop genes with mean <= 0.1; annotate with
#' chromosome positions (QAnno); keep chromosomes 1-22; optional row-wise
#' Z-score; optional clipping to [-3, 3]; per-chromosome running mean
#' (MyMV_Zscore); centring of every cell (column).
#'
#' @param obj TCIDEA object; `log.data` and `label` are read.
#' @param min.cells Minimum-cell threshold of the gene filter.
#' @param MYwalk Window size of the running mean.
#' @param z.score Apply row-wise Z-scoring?
#' @param limit Clip values to [-3, 3]?
#' @param annotationdata Gene annotation table with gene names as row names.
#' @param log.file Unused; kept for backward compatibility.
#' @param use.total If TRUE the mean-expression filter is skipped.
#' @return `obj` with cnv.data, cnv.sd, cnv.mean.sq, cnv.cv and cnv.abmean set.
calInferredCNV <- function(
  obj, min.cells = 10, MYwalk = 100, z.score = TRUE, limit = TRUE,
  annotationdata, log.file = "log_files.txt",
  use.total = FALSE
) {
  GTF_uniq <- annotationdata
  SC <- obj@log.data

  cat("Calculate inferred CNV in ", obj@label, "class\n")

  ### 1. Filter out unreliable genes
  SC. <- as.matrix(SC[rowMeans(SC) != 0, , drop = FALSE])
  cat("Raw data : ", nrow(SC.), "genes across", ncol(SC.), "samples\n")

  # keep genes with expr > 1 in more than min.cells cells
  SC.filter <- SC.[rowSums(SC. > 1) > min.cells, , drop = FALSE]
  cat("QC passed data : ", nrow(SC.filter), "genes across", ncol(SC.filter),
      "samples\n")

  ### 2. rm low-expressed genes
  if (use.total) {
    SC.f <- SC.filter
  } else {
    SC.f <- SC.filter[rowMeans(SC.filter) > 0.1, , drop = FALSE]
  }
  cat("2nd QC passed data (rm low-expressed genes) : ", nrow(SC.f),
      "genes across", ncol(SC.f), "samples\n")

  ### 3. "Annotation" of gene symbol with chromosomal information
  SC_anno <- QAnno(SC.f, GTF_uniq)

  ### 4. Omit X & Y & MT chromosomes and unplaced contigs.
  ## QAnno() returns chromosome_name as a number; only 1..22 are autosomes.
  ## An explicit membership test also removes rows whose chromosome is NA.
  SC_auto <- SC_anno[SC_anno$chromosome_name %in% 1:22, , drop = FALSE]
  cat("X & Y & MT chromosome removed data : ", nrow(SC_auto), "genes across",
      ncol(SC_auto) - 6, "samples\n")
  if (nrow(SC_auto) == 0L) {
    stop("No autosomal gene is left after filtering and annotation.",
         call. = FALSE)
  }

  ## the first six columns are annotation, the rest are cells
  anno <- SC_auto[, 1:6]
  expr <- SC_auto[, -c(1:6), drop = FALSE]

  ### 5. "Z-scoring" by row
  if (isTRUE(z.score)) {
    SC_SD <- data.matrix(apply(expr, 1, sd))
    expr <- Zscore(expr, SC_SD)
    cat("Making Z-scoring data\n")
  } else {
    cat("Making Not centering, Z-scoring data\n")
  }

  ### 6. Limit the relative expression values to [-3,3] # as Tirosh did
  if (limit) {
    expr[expr < (-3)] <- (-3)
    expr[expr > 3] <- 3
    cat("Limit the relative expression values : limitation is -3 ~ 3\n")
    cat("After limitation, min : ", min(expr), " max : ", max(expr), "\n")
  } else {
    cat("Not Limit the relative expression values\n")
    cat("min : ", min(expr), " max : ", max(expr), "\n")
  }
  SCZ_auto.r2 <- cbind(anno, expr)

  ### 7. Moving average of Z-score / Centering data
  ## caTools supplies the running mean used by MyMV_Zscore()
  library(caTools)

  SCZ_MV150 <- MyMV_Zscore(SCZ_auto.r2, MYwalk)
  ## centering by column (cell)
  SCZ_MV150_centering <- t(t(SCZ_MV150) - colMeans(SCZ_MV150))

  ## return cnv values and per-cell statistics
  obj@cnv.data <- SCZ_MV150_centering
  obj@cnv.sd <- calSD(SCZ_MV150_centering)
  obj@cnv.mean.sq <- meanSquare(SCZ_MV150_centering)
  obj@cnv.cv <- calCV(SCZ_MV150_centering)
  obj@cnv.abmean <- calMeanAb(SCZ_MV150_centering)

  obj
}

## calculate mean of absolute ##
#' @param matrix Numeric matrix, one column per cell.
#' @return One-column matrix "abMean" (mean of |x| per column); row names are
#'   the column names of the input. The column name is also printed.
calMeanAb <- function(matrix) {
  mean.ab <- base::matrix(
    colMeans(abs(matrix)),
    ncol = 1,
    dimnames = list(colnames(matrix), "abMean")
  )
  cat(colnames(mean.ab))

  mean.ab
}

## calculate CV ##
#' @param matrix Numeric matrix, one column per cell.
#' @return One-column matrix "CV": sd / mean(|x|) * 100 per column. The
#'   column name is also printed.
calCV <- function(matrix) {
  col_sd <- apply(matrix, 2, sd)
  cv <- base::matrix(
    col_sd / colMeans(abs(matrix)) * 100,
    ncol = 1,
    dimnames = list(colnames(matrix), "CV")
  )
  cat(colnames(cv))

  cv
}

## mean of squares : mean(squares per each values)
#' @param matrix Numeric matrix, one column per cell.
#' @return One-column matrix "MS" (mean of x^2 per column). The column name is
#'   also printed.
meanSquare <- function(matrix) {
  m.square <- base::matrix(
    colMeans(matrix^2),
    ncol = 1,
    dimnames = list(colnames(matrix), "MS")
  )
  cat(colnames(m.square))

  m.square
}


## in this case, we only provide GRCh38 reference genome.
#' Load the bundled gene annotation
#'
#' @param AnnotationLevel Genome build name; only "GRCh38" is available.
#' @return The object `GTF_uniq` stored in "GRCh38.rda" (looked up in the
#'   working directory); NULL, invisibly, after printing a note for any other
#'   value.
readGTF.addData <- function(AnnotationLevel) {
  if (identical(AnnotationLevel, "GRCh38")) {
    ## load into a private environment so nothing else is overwritten
    env <- new.env()
    load("GRCh38.rda", envir = env)
    return(get("GTF_uniq", envir = env))
  }
  cat('TCIDEA only provides GRCh38 gtf files.')
  invisible(NULL)
}

## using GTF matrix, we make chromosome table along chromosomal location ##
#' Annotate an expression matrix with chromosomal positions
#'
#' Genes present in both inputs (matched by row name) are kept. Columns 1-5
#' of `GTF_uniq` are used and column 6 is dropped. Chromosome labels are
#' recoded with gsub (X -> 23, Y -> 24, MT -> 25, GL/KI -> 26) and converted
#' with as.numeric, so any other non-numeric label becomes NA with a warning.
#'
#' @param row_gene_named_matrix Expression matrix with gene names as row names.
#' @param GTF_uniq Annotation table with gene names as row names.
#'   NOTE(review): assumed to have exactly six columns - confirm against the
#'   reference file.
#' @return Data frame ordered by chromosome and start position; the first six
#'   columns are ensembl_gene_id, gene_name, description, chromosome_name,
#'   start_position, end_position, followed by the expression columns.
QAnno <- function(row_gene_named_matrix, GTF_uniq) {
  ANNO_overlap <- intersect(rownames(row_gene_named_matrix),
                            rownames(GTF_uniq))
  ANNO_GTF <- GTF_uniq[ANNO_overlap, , drop = FALSE]
  ## drop = FALSE keeps a matrix even when a single gene overlaps
  ANNO_input <- row_gene_named_matrix[ANNO_overlap, , drop = FALSE]
  ANNO_merge <- cbind(ANNO_GTF, ANNO_input)

  MyANNO <- cbind(
    ANNO_merge[, 1],
    rownames(ANNO_merge),
    ANNO_merge[, c(2:5, 7:ncol(ANNO_merge))]
  )
  colnames(MyANNO) <- c(
    "ensembl_gene_id", "gene_name", "description", "chromosome_name",
    "start_position", "end_position", colnames(ANNO_input)
  )

  ## recode non-numeric chromosome labels, in the original order
  recode <- c(X = 23, Y = 24, MT = 25, GL = 26, KI = 26)
  chromosome <- MyANNO[, 4]
  for (label in names(recode)) {
    chromosome <- gsub(label, recode[[label]], chromosome)
  }
  MyANNO[, 4] <- chromosome

  ## lapply keeps a list of columns whatever the number of rows
  MyANNO[c(4, 5, 6)] <- lapply(MyANNO[c(4, 5, 6)], as.numeric)

  MyANNO[order(MyANNO[, 4], MyANNO[, 5]), ]
}

## calculate z-score ##
#' @param Tumor_ExpRatio Numeric matrix or data frame (genes x cells).
#' @param SD_of_TumorRatios One standard deviation per row (vector or
#'   one-column matrix).
#' @return Row-centred values divided by the row's standard deviation.
Zscore <- function(Tumor_ExpRatio, SD_of_TumorRatios) {
  ## flatten so a one-column matrix recycles down the rows for matrix input too
  row_sd <- as.numeric(as.matrix(SD_of_TumorRatios))
  (Tumor_ExpRatio - rowMeans(Tumor_ExpRatio)) / row_sd
}

## Normalize CEP to Z-score ##
#' Per-chromosome running mean of the expression values
#'
#' Rows are ordered by chromosome (the incoming order is kept inside each
#' chromosome) and only chromosomes 1-22 are used. caTools::runmean() with
#' window `MYwalk` is applied to every cell column, one chromosome at a time.
#'
#' @param annotate_matrix Data frame as built from QAnno() output: six
#'   annotation columns followed by one column per cell.
#' @param MYwalk Window size handed to caTools::runmean().
#' @return Numeric matrix (genes x cells); row names have the form
#'   "chr<chr>:<start>-<end> (<gene>)".
MyMV_Zscore <- function(annotate_matrix, MYwalk) {
  annotate_matrix$chromosome_name <-
    as.numeric(annotate_matrix$chromosome_name)

  ## keep autosomes only; NA chromosomes would otherwise leak NA rows through
  ## logical indexing and break the row-name assignment
  autosomal <- annotate_matrix[
    which(annotate_matrix$chromosome_name %in% 1:22), , drop = FALSE
  ]
  if (nrow(autosomal) == 0L) {
    stop("No gene on chromosomes 1-22 to smooth.", call. = FALSE)
  }
  sorted <- autosomal[order(autosomal$chromosome_name), , drop = FALSE]

  ## names are built from the sorted table so they always match the values
  window_names <- paste0(
    "chr", sorted[, 4], ":", sorted[, 5], "-", sorted[, 6],
    " (", sorted[, 2], ")"
  )
  values <- as.matrix(sorted[, -c(1:6), drop = FALSE])

  per_chr <- vector("list", 22L)
  for (chr in 1:22) {
    rows <- which(sorted$chromosome_name == chr)
    if (length(rows) == 0L) {
      next
    }
    block <- values[rows, , drop = FALSE]
    smoothed <- apply(block, 2, caTools::runmean, MYwalk)
    ## apply() returns a bare vector for a one-gene chromosome
    per_chr[[chr]] <- matrix(smoothed, nrow = length(rows),
                             ncol = ncol(block))
  }

  MV_output <- do.call(rbind, per_chr)
  dimnames(MV_output) <- list(window_names, colnames(values))
  MV_output
}

# calculate SD of each single-cells
#' @param matrix Numeric matrix, one column per cell.
#' @return One-column matrix "SD" with the column-wise standard deviation;
#'   row names are the column names of the input. The column name is also
#'   printed.
calSD <- function(matrix) {
  SD <- base::matrix(
    apply(matrix, 2, sd),
    ncol = 1,
    dimnames = list(colnames(matrix), "SD")
  )
  cat(colnames(SD))

  SD
}

# --------------------------------------------------------------------------------
# /R/makingTCIDEA_for_CEP.R:
# --------------------------------------------------------------------------------

#' Create TCIDEA object
#'
#' Initialize the TCIDEA object and adding option
#'
#' @param log.data Expression matrix stored in the `log.data` slot.
#' @param label Label stored in the `label` slot.
#' @param clustergroup Cell annotation stored in the `ident` slot.
#' @return A TCIDEA object with `normal.use` set to FALSE.
newTCIDEA <- function(
  log.data,
  label,
  clustergroup
) {
  ## normal.use is initialised explicitly: an unset "ANY" slot is NULL, which
  ## cannot be used as an `if` condition when the object is printed
  obj <- new(Class = "TCIDEA", log.data = log.data, label = label,
             normal.use = FALSE)
  obj@ident <- clustergroup

  obj
}

## Add normal data ##
#' Dilute the target cell population with randomly drawn normal cells
#'
#' The number of normal cells is 5 * (number of target cells) - (number of
#' tumor-sample cells), i.e. the amount that brings the target cells to 20%
#' of the combined data. It is capped at the number of available normal cells
#' (with a warning) and never negative.
#'
#' @param tumor.data Expression matrix (genes x cells) of the tumor sample.
#' @param tumor.ident Annotation of the tumor cells; row names are cell
#'   identifiers and a `celltype` column is required.
#'   NOTE(review): rbind() below needs the columns to be exactly `Index` and
#'   `celltype` - confirm against the example data.
#' @param target.celltypes Cell type name(s) counted as target cells.
#' @param normal.data Expression matrix (genes x cells) of normal cells.
#' @param seed Seed for the random draw; the default is the value that used
#'   to be hard-coded.
#' @return list(data = combined matrix restricted to shared genes,
#'   ident = combined annotation in the column order of `data`).
addNormalDataset <- function(tumor.data, tumor.ident, target.celltypes,
                             normal.data, seed = 1011) {
  # 1. calculate adding normal number #
  ep.n <- sum(tumor.ident$celltype %in% target.celltypes)
  needs.normal.n <- 5 * ep.n - nrow(tumor.ident) # to adding ep percent == 20

  available.n <- ncol(normal.data)
  if (needs.normal.n > available.n) {
    warning("Only ", available.n, " normal cells available; ",
            needs.normal.n, " were requested.", call. = FALSE)
    needs.normal.n <- available.n
  }
  needs.normal.n <- max(0, needs.normal.n)

  ## 2. Select normal data (random)
  set.seed(seed)
  random.s <- sample(colnames(normal.data), needs.normal.n)

  normal.random <- normal.data[, random.s, drop = FALSE]
  normal.ident <- data.frame(
    Index = random.s,
    celltype = rep("Normal", length(random.s)),
    stringsAsFactors = FALSE
  )
  rownames(normal.ident) <- normal.ident$Index

  ## 3. Add normal data
  intersect.gene <- intersect(rownames(tumor.data), rownames(normal.random))

  addnormal.data <- cbind(
    tumor.data[intersect.gene, , drop = FALSE],
    normal.random[intersect.gene, , drop = FALSE]
  )
  addnormal.cellinfo <- rbind(tumor.ident, normal.ident)
  addnormal.cellinfo <-
    addnormal.cellinfo[colnames(addnormal.data), , drop = FALSE]

  list(data = addnormal.data, ident = addnormal.cellinfo)
}

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # Single-cell RNA sequencing for lung adenocarcinoma
#
# In order to separate malignant tumor cells from non-malignant cells, we
# calculate CNV aberrations by inferring the perturbation of chromosomal gene
# expression.
#
# 1. Adjust the proportion of putative malignant cells to below 20%.
# 2. Filter out less informative genes (default: expressed in fewer than 10
#    cells, or mean expression below 0.1 at log2 scale).
# 3. Transform into Z-scores and limit the scale to -3 to 3.
# 4. Sort the genes by their chromosomal position and estimate CNV signals
#    using a moving window (default = 100 genes).
# 5. Summarize the CNV signal with two parameters and classify malignant and
#    non-malignant cells.
#    - CNV signals (MS, Mean of Squares): mean squares of estimates across
#      all windows.
#    - CORR (Correlation with the high CNV signal cells): correlation of the
#      CNV of each cell with the average of the top 5% cells.
#    - Cells were classified as malignant if their CNV signal (MS) > 0.02 or
#      their CNV correlation (CORR) > 0.2.
#
#
# # Example code
#
# After downloading the code, run the example data using
# 'calc_Chromosomal_Expression_Pattern.R'.
# --------------------------------------------------------------------------------
# /calc_Chromosomal_Expression_Pattern.R:
# --------------------------------------------------------------------------------
library(ggplot2)
library(pals)
library(plyr)
library(dplyr)
library(Seurat)
library(gplots)
library(RColorBrewer)

####################################################################################################
source("R/calInferredCNA_for_CEP.R") ## calculate inferredCNV value
source("R/makingTCIDEA_for_CEP.R")   ## make TCIDEA object
source("R/TCIDEA_obj_for_CEP.R")     ## initiate TCIDEA object
source("R/calCNVScore_for_CEP.R")    ## calculate CNV score (MS, Corr) with CEP result

###################################################################################################
## Example data ###################################################################################
# celltype = annotation cell types in transcriptome data
cell_annotation_with_tumor <- readRDS(file = "example/cell_info_tumor_example.Rds")
tumor_example <- readRDS(file = "example/log2TPM_tumor_example.Rds")
normal_example <- readRDS(file = "example/log2TPM_normal_example.Rds")
ref_genome_example <- readRDS(file = "example/refgenome_example.Rds")
output.dir <- file.path(getwd(), "example")
###################################################################################################
## PARAMETERS #####################################################################################
EP_cutoff <- 20          ## count if > 20% of EP -> add
target.celltypes <- "EP" ## declare name of epithelial cells in metadata
label <- "example"
###################################################################################################

## 1. check proportion of epithelial cells in tumor tissues ##
prop <- as.data.frame(table(cell_annotation_with_tumor$celltype))
prop$Percent <- prop$Freq / nrow(cell_annotation_with_tumor) * 100

## Summing gives one number even when the target type is absent (0) or when
## several target types are given, so the `if` condition below is always a
## single TRUE/FALSE.
ep_percent <- sum(prop$Percent[prop$Var1 %in% target.celltypes])

## 2. Check the proportion (adding normal cells or not)
if (ep_percent > EP_cutoff) {
  addnormal <- addNormalDataset(
    tumor.data = tumor_example,
    tumor.ident = cell_annotation_with_tumor,
    target.celltypes = target.celltypes,
    normal.data = normal_example
  )
  addnormal_example <- as.matrix(addnormal$data)
  addnormal_annotation <- addnormal$ident

  ## Sample list of EP proportion > EP_cutoff (20%)
  runCEP(target.normalized = addnormal_example,
         sample.info = addnormal_annotation,
         label = paste0(label, "_AddNormal"),
         annotationdata = ref_genome_example,
         target.celltypes = target.celltypes, output.dir = output.dir,
         min.cells = 10, MYwalk = 100)
} else {
  ## Sample list of EP proportion <= EP_cutoff (20%)
  runCEP(target.normalized = tumor_example,
         sample.info = cell_annotation_with_tumor, label = label,
         annotationdata = ref_genome_example,
         target.celltypes = target.celltypes, output.dir = output.dir,
         min.cells = 10, MYwalk = 100)
}

###################################################################################################
# --------------------------------------------------------------------------------
# /example/calCNVScore_for_CEP.R:
# --------------------------------------------------------------------------------

#' Score each cell's CNV profile and label it Tumor / Nontumor / Immune
#'
#' For every column of `sh` the mean of squares (MS), sum of squares (SS) and
#' standard deviation (SD) are computed and joined onto `cell_info` by cell
#' name. COR is the correlation of each cell with the mean profile of the
#' top 5% MS cells. A cell whose `celltype` is in `target.celltypes` is
#' "Tumor" when MS > cutoff.score or COR > cutoff.corr, otherwise "Nontumor";
#' every other cell is labelled "Immune". A scatter plot is saved as PDF.
#'
#' @param sh Numeric matrix of CNV values; columns are cells and column names
#'   are the cell identifiers.
#' @param cell_info Data frame whose row names are cell identifiers; must
#'   contain a `celltype` column.
#' @param s Label inserted into the PDF file name.
#' @param levels,meta Unused; kept so existing positional calls keep working.
#' @param cutoff.corr,cutoff.score Thresholds on COR and MS.
#' @param target.celltypes Cell type name(s) eligible to be called "Tumor".
#' @param output.dir Optional directory for the PDF; with the default NULL
#'   the file is written to the working directory.
#' @return `cell_info` with the added columns Row.names, MS, SS, SD, COR and
#'   cell_index; row names are the cell identifiers.
calCNVScore <- function(sh, cell_info, s, levels, cutoff.corr, cutoff.score,
                        meta, target.celltypes, output.dir = NULL) {
  cnv <- as.matrix(sh)

  ## per-cell summary statistics
  CNV_score <- data.frame(
    MS = colMeans(cnv^2),
    SS = colSums(cnv^2),
    SD = apply(cnv, 2, sd)
  )
  CNV_score$Row.names <- rownames(CNV_score)
  cell_info$Row.names <- rownames(cell_info)
  cell_info2 <- plyr::join(cell_info, CNV_score, by = "Row.names")

  ## MS top 5% cells (always at least one, never a cell without a score)
  scored <- cell_info2[!is.na(cell_info2$MS), , drop = FALSE]
  if (nrow(scored) == 0L) {
    stop("No row name of 'cell_info' matches a column name of 'sh'.",
         call. = FALSE)
  }
  n_top <- max(1L, round(nrow(cell_info2) * 0.05))
  ## order(-MS) keeps ties in their original order
  top_MS_cells <- utils::head(scored$Row.names[order(-scored$MS)], n_top)

  ## correlation of every cell with the mean top-MS profile; the profile is
  ## computed once and the result is matched to cells by name, not position
  ave_tumor <- rowMeans(cnv[, top_MS_cells, drop = FALSE])
  cor_by_cell <- as.vector(stats::cor(cnv, ave_tumor))
  cell_info2$COR <- cor_by_cell[match(cell_info2$Row.names, colnames(cnv))]

  cell_info3 <- cell_info2
  rownames(cell_info3) <- cell_info3$Row.names

  ## classification; NA scores never count as passing a cutoff
  is_target <- cell_info3$celltype %in% target.celltypes
  passes <- (cell_info3$MS > cutoff.score) | (cell_info3$COR > cutoff.corr)
  is_tumor <- is_target & !is.na(passes) & passes

  cell_info3$cell_index <- rep("Nontumor", nrow(cell_info3))
  cell_info3$cell_index[is_tumor] <- "Tumor"
  cell_info3$cell_index[!is_target] <- "Immune"

  ## 2D plot of MS score and correlation
  expos <- ggplot2::ggplot(cell_info3, ggplot2::aes(x = MS, y = COR)) +
    ggplot2::geom_point(ggplot2::aes(fill = cell_index), size = 5, alpha = .8,
                        shape = 21, colour = "black") +
    ggplot2::scale_fill_manual(
      values = c("Tumor" = "red", "Immune" = "gray70",
                 "Nontumor" = "dodgerblue1")
    ) +
    ggplot2::geom_vline(xintercept = cutoff.score, colour = "black",
                        size = 0.5, linetype = "longdash") +
    ggplot2::geom_hline(yintercept = cutoff.corr, colour = "black",
                        size = 0.5, linetype = "longdash") +
    ggplot2::xlab("MS score") +
    ggplot2::ylab("CNV correlation") +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.title.x = ggplot2::element_text(face = "bold", size = 16),
      axis.text.x = ggplot2::element_text(size = 12)
    ) +
    ggplot2::theme(
      axis.title.y = ggplot2::element_text(face = "bold", size = 16),
      axis.text.y = ggplot2::element_text(size = 12)
    ) +
    ggplot2::theme(
      panel.border = ggplot2::element_rect(fill = NA, colour = "black",
                                           size = 2),
      legend.position = "right"
    )

  pdf_name <- paste0("final_", s,
                     "_all_cells_CNV_score_vs_cor_classification.pdf")
  if (!is.null(output.dir)) {
    pdf_name <- file.path(output.dir, pdf_name)
  }
  ## pass the plot explicitly instead of relying on last_plot()
  ggplot2::ggsave(pdf_name, plot = expos, width = 7, height = 5)

  cell_info3
}

#' Read a tab-separated cell type table and split the cell id
#'
#' The file must have a header and the columns `Cell` and `NEW`. `Cell` is
#' split on "_"; the first piece becomes `Sample`, the second `Barcode`.
#'
#' @param FinalCellType Path of the table.
#' @return The table with `Sample` and `Barcode` columns added.
ReadTotalCelltype <- function(FinalCellType) {
  total.celltype <- read.table(file = FinalCellType, sep = "\t", header = TRUE)

  total.celltype$Cell <- as.character(total.celltype$Cell)
  total.celltype$NEW <- as.character(total.celltype$NEW)

  pieces <- strsplit(total.celltype$Cell, split = "_", fixed = TRUE)
  total.celltype$Sample <- vapply(pieces, function(p) p[1], character(1))
  total.celltype$Barcode <- vapply(pieces, function(p) p[2], character(1))

  total.celltype
}

#' Extract the cluster labels of one sample
#'
#' Any "-A1" in `s` is removed before matching against `total$Sample`.
#'
#' @param s Sample name.
#' @param total Table as returned by ReadTotalCelltype().
#' @return Data frame with one column `cluster` (from `NEW`) and the barcodes
#'   as row names.
ReadClusterForSample <- function(s, total) {
  ## gsub() is a no-op when the suffix is absent, so no grep() test is needed
  ## (grep() returns integer(0) on no match, which broke the old `if`).
  s <- gsub("-A1", "", s, fixed = TRUE)

  in_sample <- which(total$Sample == s)
  sample.subset <- total[in_sample, , drop = FALSE]
  cat("Sample ", s, " Cell : ", nrow(sample.subset), "\n")

  df <- data.frame(cluster = sample.subset$NEW)
  rownames(df) <- sample.subset$Barcode

  df
}

# --------------------------------------------------------------------------------
# /example/cell_info_tumor_example.Rds:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/cell_info_tumor_example.Rds
# --------------------------------------------------------------------------------
# /example/final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf
# --------------------------------------------------------------------------------
# /example/log2TPM_normal_example.Rds:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/log2TPM_normal_example.Rds -------------------------------------------------------------------------------- /example/log2TPM_tumor_example.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/log2TPM_tumor_example.Rds -------------------------------------------------------------------------------- /example/refgenome_example.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/refgenome_example.Rds --------------------------------------------------------------------------------