├── R
    ├── TCIDEA_obj_for_CEP.R
    ├── calCNVScore_for_CEP.R
    ├── calInferredCNA_for_CEP.R
    └── makingTCIDEA_for_CEP.R
├── README.md
├── calc_Chromosomal_Expression_Pattern.R
└── example
    ├── calCNVScore_for_CEP.R
    ├── cell_info_tumor_example.Rds
    ├── final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf
    ├── log2TPM_normal_example.Rds
    ├── log2TPM_tumor_example.Rds
    └── refgenome_example.Rds


/R/TCIDEA_obj_for_CEP.R:
--------------------------------------------------------------------------------
 1 | 
 2 | tcidea <- setClass(
 3 |   "TCIDEA",
 4 |   slots = c(
 5 |     log.data = "ANY",
 6 |     scale.data = "ANY",
 7 |     ident = "ANY",
 8 |     tSNE.data = "ANY",
 9 |     tSNE.calculate = "ANY",
10 |     cnv.data = "ANY",
11 |     cnv.sd = "ANY",
12 |     cnv.mean.sq = "ANY",
13 |     cnv.cv = "ANY",
14 |     cnv.abmean = "ANY",
15 |     cluster.auc = "ANY",
16 | 
17 |     ##
18 |     normal.use = "ANY",
19 |     normal.raw.data = "ANY",
20 |     normal.log.data = "ANY",
21 | 
22 |     ##edge index per cluster and gene set name -> T/F
23 |     cluster.es.esp = "ANY",
24 |     cluster.geneset = "ANY",
25 | #    cluster.rank = "ANY",
26 | 
27 |     ##
28 |     label = "ANY"
29 |   )
30 | )
31 | 
32 | 
33 | ##Documentation
34 | #raw.data = raw umi count data
35 | #log.data = log data -> log2 normalized
36 | #scale.data = scaled data -> centering log2 normalized data
# Console summary for TCIDEA objects: prints the label and the dimensions of
# log.data and, when a normal reference is flagged as in use, the dimensions
# of normal.log.data. Returns NULL invisibly.
setMethod(
  f = "show",
  signature = "TCIDEA",
  definition = function(object) {
    cat(
      "An object of class",
      object@label,
      "\n",
      nrow(x = object@log.data),
      "genes across",
      ncol(x = object@log.data),
      "samples.\n"
    )
    # normal.use is an untyped slot that stays NULL unless a caller sets it;
    # a bare `if (NULL)` is an error, so only an explicit TRUE enables the
    # reference summary.
    if (isTRUE(object@normal.use)) {
      cat(
        "Normal reference dataset ",
        nrow(x = object@normal.log.data),
        "genes across",
        ncol(x = object@normal.log.data),
        "samples.\n"
      )
    }
    invisible(x = NULL)
  }
)
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/R/calCNVScore_for_CEP.R:
--------------------------------------------------------------------------------
 1 | calCNVScore <- function(sh, cell_info, s, levels, cutoff.corr, cutoff.score, meta, target.celltypes,
 2 |                         output.dir){
 3 | 
 4 |   sh2 = sh; dat = t(sh2) ## already remove chromosomal location info
 5 |   CNV_score <- data.frame(MS = colMeans(sh2^2), SS = apply(sh2^2, 2, sum), SD = apply(sh2, 2, sd))
 6 |   ##
 7 |   CNV_score$Row.names <- rownames(CNV_score)
 8 |   cell_info$Row.names <- rownames(cell_info)
 9 | 
10 |   cell_info2 <- plyr::join(cell_info, CNV_score, by="Row.names") # boxplot for celltype
11 | 
12 |   ## MS top 5% cells
13 |   top_MS_cells <- arrange(cell_info2, desc(MS))[1:round(dim(cell_info2)[1]*0.05),]$Row.names  # Top 5%
14 | 
15 |   ## calculate correlation : corr using 1 cell vs. top_MS_cells
16 |   tmp <- data.frame(Ave_tumor = rowMeans(sh2[,top_MS_cells]))
17 |   for(i in 1:dim(cell_info2)[1]){
18 |     cell_info2$COR[i] <-  cor(sh2[,i, drop=FALSE], data.frame(Ave_tumor = rowMeans(sh2[,top_MS_cells])))
19 |   }
20 | 
21 | 
22 |   ##
23 |   cell_info3 <- cell_info2
24 |   rownames(cell_info3) <- cell_info3$Row.names
25 | 
26 |   tumorcells <- filter(cell_info3, ((MS > cutoff.score | COR > cutoff.corr) & celltype %in% target.celltypes))$Row.names
27 |   nontumorcells <- cell_info3$Row.names[!(cell_info3$Row.names %in% c(tumorcells))]
28 |   immunecells <- cell_info3$Row.names[cell_info3$celltype != target.celltypes]
29 | 
30 |   ## only classified tumor vs. non-tumor ##
31 |   cell_info3$cell_index <- rep("X", dim(cell_info3)[1])
32 |   cell_info3[tumorcells,]$cell_index <- "Tumor"
33 |   cell_info3[nontumorcells,]$cell_index <- "Nontumor"
34 |   cell_info3[immunecells,]$cell_index <- "Immune"
35 | 
36 |   ## 2D plot of MS score and correlation ##
37 |   expos<-ggplot(cell_info3, aes(x=MS, y= COR)) + geom_point(aes(fill=cell_index), size=5, alpha=.8, shape=21, colour="black") +
38 |     scale_fill_manual(values = c("Tumor"="red","Immune" = "gray70","Nontumor"="dodgerblue1")) +
39 |     geom_vline(xintercept = cutoff.score, colour="black", size=0.5, linetype = "longdash") + geom_hline(yintercept = cutoff.corr, colour="black", size=0.5, linetype = "longdash") +
40 |     xlab("MS score") + ylab("CNV correlation") + theme_bw() +
41 |     theme(axis.title.x = element_text(face="bold", size=16), axis.text.x  = element_text(size=12)) +
42 |     theme(axis.title.y = element_text(face="bold", size=16), axis.text.y  = element_text(size=12)) +
43 |     theme(panel.border=element_rect(fill=NA, colour="black", size=2), legend.position = 'right')
44 |   expos
45 |   ggsave(paste0(output.dir,"/", "final_",s, "_all_cells_CNV_score_vs_cor_classification.pdf"), width = 7, height = 5)
46 | 
47 |   return (cell_info3)
48 | }
49 | 
# Read the tab-separated cell type table and split each cell id into its
# sample and barcode parts.
#
# FinalCellType  path to a tab-separated file with a header line and at least
#                the columns `Cell` and `NEW`.
#
# Returns the table with `Cell` and `NEW` as character and two added columns:
# `Sample` (text before the first "_" of `Cell`) and `Barcode` (text between
# the first and second "_"; NA when `Cell` has no "_").
ReadTotalCelltype <- function(FinalCellType){

  total.celltype <- read.table(file = FinalCellType,
                               sep = "\t", header = TRUE)

  total.celltype$Cell <- as.character(total.celltype$Cell)
  total.celltype$NEW <- as.character(total.celltype$NEW)

  ## split all ids at once; this also works for a table without any rows
  id_parts <- base::strsplit(x = total.celltype$Cell, split = "_")
  total.celltype$Sample <- vapply(id_parts, function(p) p[1], character(1))
  total.celltype$Barcode <- vapply(id_parts, function(p) p[2], character(1))

  return (total.celltype)
}
65 | 
# Extract the cluster assignment of all cells of one sample.
#
# s      sample name; a "-A1" marker inside the name is removed before lookup.
# total  table with the columns `Sample`, `Barcode` and `NEW`, e.g. the
#        result of ReadTotalCelltype().
#
# Prints the number of cells found and returns a one-column data frame
# (`cluster`) whose row names are the barcodes.
ReadClusterForSample <- function(s, total){

  ## gsub() leaves names without the marker untouched, so no prior grep() test
  ## is needed (the old `if (grep(...) > 0)` failed on every such name)
  s <- gsub("-A1", "", s, fixed = TRUE)

  ##
  sample.subset <- total[which(total$Sample == s), , drop = FALSE]
  cat("Sample ",s," Cell : ", nrow(sample.subset),"\n")

  ##
  df <- data.frame(cluster = sample.subset$NEW)
  rownames(df) <- sample.subset$Barcode

  return (df)
}
81 | 
82 | 


--------------------------------------------------------------------------------
/R/calInferredCNA_for_CEP.R:
--------------------------------------------------------------------------------
  1 | ## Calculate Chromosomal expression pattern (CEP) ##
  2 | runCEP <- function(target.normalized, sample.info, label, annotationdata, min.cells = 10, MYwalk = 100,
  3 |                    target.celltypes, output.dir){
  4 |   
  5 |   ## 1. making TCIDEA object only tumor cells ##
  6 |   tcidea <- newTCIDEA(log.data = target.normalized, clustergroup = sample.info, label = label)
  7 |   
  8 |   ##2. calculate inferredCNV
  9 |   ## Proportion of epithelia cells <= EP_CUTOFF
 10 |   ## Only use genes expressed > min.cells
 11 |   ## Average of 100 genes (Binning)
 12 |   tcidea <- calInferredCNV(tcidea, min.cells = min.cells, MYwalk = MYwalk, z.score = TRUE, limit = TRUE,
 13 |                            annotationdata = annotationdata, log.file = paste0(label, "_log.txt"), use.total = FALSE)
 14 |   
 15 |   ## Calculate MS (Mean of squares) and CORR (Correlation)
 16 |   ## if MS score > cutoff.score or Correlation score > cutoff.corr : Malignant cells
 17 |   final_cell_info <- calCNVScore(tcidea@cnv.data, tcidea@ident, tcidea@label, levels, cutoff.score = 0.02, cutoff.corr = 0.2,meta = NULL,
 18 |                                  target.celltypes, output.dir)
 19 |   
 20 |   ## Save calculated info
 21 |   saveRDS(final_cell_info, file = paste0(output.dir,"/", label, "_after_calc_CNV_score.Rds")) # save final_cell_info (calculated CNV score)
 22 |   
 23 |   ## Save Object file
 24 |   saveRDS(tcidea, file = paste0(output.dir,"/", label, "_after_calc_CNV_score_TCIDEA_obj.Rds"))
 25 |   
 26 |   ## Remove object
 27 |   rm(tcidea)
 28 |   rm(final_cell_info)
 29 | }
 30 | 
 31 | 
 32 | 
 33 | calInferredCNV <- function(
 34 |   obj, min.cells = 10, MYwalk = 100, z.score = TRUE,  limit = TRUE, 
 35 |   annotationdata, log.file = "log_files.txt",
 36 |   use.total = FALSE
 37 |   ){
 38 | 
 39 |   ##
 40 |   GTF_uniq <- annotationdata
 41 | 
 42 |   ##
 43 |   SC = obj@log.data
 44 | 
 45 |   cat("Calculate inferred CNV in ", obj@label, "class\n")
 46 | 
 47 |   ### 1. Filter out unreliable genes
 48 |   SC. = as.matrix(SC[rowMeans(SC) != 0,])
 49 |   dim <-  dim(SC.)
 50 |   cat("Raw data : ", dim[1], "genes across", dim[2], "samples\n")
 51 | 
 52 |   # Filter-out many genes (expr>1, min.cells>10)
 53 |   SC.filter = SC.[rowSums(SC.>1) > min.cells,]; dim(SC.filter)
 54 |   dim <-  dim(SC.filter)
 55 |   cat("QC passed data : ", dim[1], "genes across", dim[2], "samples\n")
 56 | 
 57 |   ### 2. rm low-expressed genes
 58 |   if(use.total){
 59 |     SC.f <- SC.filter
 60 |   }else{ SC.f <- SC.filter[rowMeans(SC.filter) > 0.1,]}
 61 | 
 62 |   dim <- dim(SC.f)
 63 |   cat("2nd QC passed data (rm low-expressed genes) : ", dim[1], "genes across", dim[2], "samples\n")
 64 | 
 65 |   SC.o.substract <- SC.f
 66 |   ### 4. "Annotation" of gene symbol with chromosomal information
 67 |   SC_anno = QAnno(SC.o.substract, GTF_uniq); dim(SC_anno)
 68 | 
 69 |   ### 5. Omit X & Y & MT chromosome
 70 |   Omit_XYM = c(23,24,25,26) # c("X","Y","M","GL"|"KI")
 71 |   SC_auto = SC_anno[!grepl(paste(Omit_XYM,collapse="|") , SC_anno$chromosome_name), ]
 72 |   dim(SC_auto)
 73 | 
 74 |   dim <-  dim(SC_auto)
 75 |   cat("X & Y & MT chromosome removed data : ", dim[1], "genes across", dim[2]-6, "samples\n")
 76 | 
 77 |   NormalZ_auto.r <- NULL
 78 |   if(z.score == TRUE){
 79 |     ### 5. "Z-scoring" by row
 80 |     SC_SD = data.matrix(apply(SC_auto[,-c(1:6)], 1, sd))
 81 |     SCZ_auto.r = cbind(SC_auto[,1:6], Zscore(SC_auto[,-c(1:6)], SC_SD)); typeof(SCZ_auto.r) # "list"
 82 |     cat("Making Z-scoring data\n")
 83 |   }
 84 |   else{
 85 |     SCZ_auto.r = SC_auto
 86 |     cat("Making Not centering, Z-scoring data\n")
 87 |   }
 88 | 
 89 |   ### 6. Limit the relative expression values to [-3,3] # as Tirosh did
 90 |   summary(as.numeric(as.matrix(SCZ_auto.r[,-c(1:6)])))
 91 |   if(limit){
 92 |     SCZ_auto.r2 = SCZ_auto.r[,-c(1:6)]
 93 |     SCZ_auto.r2[SCZ_auto.r2 < (-3)] <- (-3)
 94 |     SCZ_auto.r2[SCZ_auto.r2 > 3] <- 3
 95 |     summary(as.numeric(as.matrix(SCZ_auto.r2)))
 96 |     SCZ_auto.r2 = cbind(SCZ_auto.r[,1:6], SCZ_auto.r2)
 97 |     cat("Limit the relative expression values : limitation is -3 ~ 3\n")
 98 |     #    write.table(SCZ_auto.r2, file = "LUNG_T18_zscore_by_row_after_lim.txt", sep= "\t")
 99 |     cat("After limitation, min : ",min(SCZ_auto.r2[,-c(1:6)])," max : ", max(SCZ_auto.r2[,-c(1:6)]), "\n")
100 |   }
101 |   else{
102 |     SCZ_auto.r2 = SCZ_auto.r
103 |     cat("Not Limit the relative expression values\n")
104 |     cat("min : ",min(SCZ_auto.r2[,-c(1:6)])," max : ", max(SCZ_auto.r2[,-c(1:6)]), "\n")
105 |   }
106 | 
107 | 
108 |   ### 7. Moving average of Z-score / Centering data
109 |   library(caTools) ;
110 | 
111 |   SCZ_MV150 = MyMV_Zscore(SCZ_auto.r2, MYwalk)
112 |   SCZ_MV150_centering = t(t(SCZ_MV150)-colMeans(SCZ_MV150)); typeof(SCZ_MV150_centering) # "double"
113 | 
114 |   f_SCZ_MV150 = round(SCZ_MV150, digits=3);
115 |   f_SCZ_MV150_centering = round(SCZ_MV150_centering, digits=3);  ## centering by column
116 | 
117 |   ## return cnv values. z-scored values
118 |   obj@cnv.data <- SCZ_MV150_centering
119 |   obj@cnv.sd <- calSD(SCZ_MV150_centering)
120 | 
121 |   ## add statistical values
122 |   obj@cnv.mean.sq <-  meanSquare(SCZ_MV150_centering)
123 |   obj@cnv.cv <- calCV(SCZ_MV150_centering)
124 |   obj@cnv.abmean <- calMeanAb(SCZ_MV150_centering)
125 | 
126 |   return (obj)
127 | }
128 | 
## calculate mean of absolute ##
# matrix  numeric matrix (or data frame), one column per cell.
#
# Returns a one-column matrix "abMean" holding the mean absolute value of each
# input column; row names are the input column names. An input without
# columns gives a matrix without rows. Prints the result's column name.
calMeanAb <- function(matrix){
  values <- as.matrix(matrix)

  mean.ab <- base::matrix(colMeans(abs(values)), ncol = 1,
                          dimnames = list(colnames(values), "abMean"))
  cat(colnames(mean.ab))

  return (mean.ab)
}
146 | 
## calculate CV ##
# matrix  numeric matrix (or data frame), one column per cell.
#
# Returns a one-column matrix "CV": for each input column the standard
# deviation divided by the mean of the absolute values, times 100. Row names
# are the input column names. Prints the result's column name.
calCV <- function(matrix){
  values <- as.matrix(matrix)

  column_sd <- vapply(seq_len(ncol(values)), function(j) sd(values[, j]), numeric(1))
  cv <- base::matrix(column_sd / colMeans(abs(values)) * 100, ncol = 1,
                     dimnames = list(colnames(values), "CV"))
  cat(colnames(cv))

  return (cv)
}
163 | 
## mean of squares : mean(squares per each values)
# matrix  numeric matrix (or data frame), one column per cell.
#
# Returns a one-column matrix "MS" with the mean of the squared values of each
# input column; row names are the input column names. Prints the result's
# column name.
meanSquare <- function(matrix){
  values <- as.matrix(matrix)

  m.square <- base::matrix(colMeans(values^2), ncol = 1,
                           dimnames = list(colnames(values), "MS"))
  cat(colnames(m.square))

  return(m.square)
}
185 | 
186 | 
## Only the GRCh38 reference genome is provided.
# AnnotationLevel  name of the requested annotation.
#
# For "GRCh38" the file "GRCh38.rda" is loaded from the working directory and
# its object `GTF_uniq` is returned; any other value only prints a notice.
readGTF.addData <- function(AnnotationLevel){

  is_grch38 <- AnnotationLevel == "GRCh38"

  if(!is_grch38){
    cat('TCIDEA only provides GRCh38 gtf files.')
  }
  else{
    load("GRCh38.rda")
    return(GTF_uniq)
  }
}
195 | 
## using GTF matrix, we make chromosome table along chromosomal location ##
# row_gene_named_matrix  expression values with genes as row names.
# GTF_uniq               gene annotation with genes as row names.
#                        NOTE(review): the column picks below (1, 2:5, and
#                        everything from 7 on) only line up with the six names
#                        assigned afterwards if GTF_uniq has exactly six
#                        columns - confirm against the annotation file.
#
# Returns a data frame restricted to the genes present in both inputs, with
# the columns ensembl_gene_id, gene_name, description, chromosome_name,
# start_position, end_position followed by the expression columns, sorted by
# chromosome and start position.
QAnno = function(row_gene_named_matrix, GTF_uniq){
  # genes shared by the expression data and the annotation
  ANNO_overlap = intersect(rownames(row_gene_named_matrix) , rownames(GTF_uniq))
  ANNO_GTF     = GTF_uniq[ANNO_overlap, ]
  ANNO_input   = row_gene_named_matrix[ANNO_overlap,];
  ANNO_merge   = cbind(ANNO_GTF,ANNO_input)
  # first annotation column, then the gene name taken from the row names,
  # then annotation columns 2-5 and every column from the 7th on
  MyANNO       = cbind(ANNO_merge[,1],rownames(ANNO_merge),ANNO_merge[,c(2:5 , 7:ncol(ANNO_merge))])
  colnames(MyANNO)=c("ensembl_gene_id","gene_name","description","chromosome_name","start_position","end_position",colnames(ANNO_input))
  # recode non-numeric chromosome names: X -> 23, Y -> 24, MT -> 25, GL/KI -> 26.
  # gsub() replaces the pattern wherever it occurs inside the name, so a name
  # such as "GL000009.2" becomes "26000009.2", not "26"
  MyANNO[,4]=gsub("X",23,MyANNO[,4]) ; MyANNO[,4]=gsub("Y",24,MyANNO[,4]) ; MyANNO[,4]=gsub("MT",25,MyANNO[,4]) ; MyANNO[,4]=gsub("GL",26,MyANNO[,4]) ; MyANNO[,4]=gsub("KI",26,MyANNO[,4])
  # chromosome, start and end become numeric; names that are still not
  # numbers turn into NA
  MyANNO[,c(4,5,6)] = sapply(MyANNO[,c(4,5,6)], as.numeric)
  # sort by chromosome, then by start position
  MyTABLE = MyANNO[order(MyANNO[,4] , MyANNO[,5]) , ]
  return(MyTABLE)
}
209 | 
## calculate z-score ##
# Tumor_ExpRatio     numeric matrix or data frame, one row per gene.
# SD_of_TumorRatios  standard deviation of every row.
#
# Returns the input with the row mean subtracted from every value and the
# result divided by the given standard deviations.
Zscore = function(Tumor_ExpRatio, SD_of_TumorRatios){
  row_centered <- Tumor_ExpRatio - rowMeans(Tumor_ExpRatio)
  row_centered / SD_of_TumorRatios
}
212 | 
## Moving average of the relative expression along each chromosome ##
# annotate_matrix  data frame as returned by QAnno(): six annotation columns
#                  (column 2 gene name, 4 chromosome, 5 start, 6 end; a column
#                  named chromosome_name is used for sorting) followed by one
#                  column of values per cell.
# MYwalk           window size passed to caTools::runmean().
#
# Returns a numeric matrix with one row per gene on chromosomes 1-22 (ordered
# by chromosome) and one column per cell. Row names have the form
# "chr<chromosome>:<start>-<end> (<gene>)". Genes on other or unknown (NA)
# chromosomes are left out.
MyMV_Zscore = function(annotate_matrix, MYwalk){
  annotate_matrix$chromosome_name <- as.numeric(annotate_matrix$chromosome_name)
  sorted <- annotate_matrix[order(annotate_matrix$chromosome_name), ]

  ## labels are built from the sorted table so that they always belong to the
  ## row they are attached to
  gene_labels <- paste("chr",sorted[,4],":",sorted[,5],"-",sorted[,6]," (",sorted[,2],")",sep="")
  cell_values <- as.matrix(sorted[, -c(1:6), drop = FALSE])

  ## row indices per chromosome 1..22; rows with another or a missing
  ## chromosome belong to no group
  rows_by_chr <- split(seq_len(nrow(sorted)), factor(sorted[,4], levels = 1:22))
  rows_by_chr <- rows_by_chr[lengths(rows_by_chr) > 0]

  smooth_chromosome <- function(rows){
    chr_block <- cell_values[rows, , drop = FALSE]
    averaged <- apply(chr_block, 2, caTools::runmean, MYwalk)
    ## apply() returns a plain vector for a one-gene chromosome; restore the shape
    matrix(averaged, nrow = length(rows))
  }

  MV_output <- do.call(rbind, lapply(rows_by_chr, smooth_chromosome))
  if(is.null(MV_output)){
    ## no gene on chromosomes 1-22
    MV_output <- matrix(numeric(0), nrow = 0, ncol = ncol(cell_values))
  }

  kept_rows <- unlist(rows_by_chr, use.names = FALSE)
  dimnames(MV_output) <- list(gene_labels[kept_rows], colnames(cell_values))
  return(MV_output)
}
227 | 
# calculate SD of each single-cells
# matrix  numeric matrix (or data frame), one column per cell.
#
# Returns a one-column matrix "SD" with the standard deviation of each input
# column; row names are the input column names. Prints the result's column name.
calSD <- function(matrix){
  values <- as.matrix(matrix)

  column_sd <- vapply(seq_len(ncol(values)), function(j) sd(values[, j]), numeric(1))
  SD <- base::matrix(column_sd, ncol = 1, dimnames = list(colnames(values), "SD"))
  cat(colnames(SD))

  return (SD)
}
242 | 


--------------------------------------------------------------------------------
/R/makingTCIDEA_for_CEP.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Create TCIDEA object
 3 | #'
 4 | #' Initialize the TCIDEA object and adding option
 5 | newTCIDEA <- function(
 6 |   log.data,
 7 |   label,
 8 |   clustergroup
 9 |   ){
10 |   
11 |   obj <- new(Class = "TCIDEA", log.data = log.data, label = label)
12 |   obj@ident = clustergroup
13 | 
14 |   return (obj)
15 | }
16 | 
## Add normal data ##
# Adds randomly chosen normal cells to a tumor data set so that the target
# cell types make up 20% of all cells.
#
# tumor.data        expression matrix of the tumor sample (genes x cells).
# tumor.ident       cell annotation of the tumor sample; row names are cell
#                   ids and a `celltype` column is required. It is row-bound
#                   with a table holding the columns `Index` and `celltype`,
#                   so it needs exactly these columns.
# target.celltypes  cell type label(s) whose share is adjusted.
# normal.data       expression matrix of normal cells (genes x cells).
#
# The random selection uses a fixed seed (1011), which resets the session's
# random number generator. When more normal cells are needed than are
# available, all of them are used and a warning is issued.
#
# Returns a list: `data` (genes shared by both inputs x tumor + normal cells)
# and `ident` (annotation in the column order of `data`; added cells have
# celltype "Normal").
addNormalDataset <- function(tumor.data, tumor.ident, target.celltypes,
                             normal.data){

  # 1. calculate adding normal number #
  total.n <- nrow(tumor.ident)
  ## one total over all target cell types, so several labels still give a
  ## single number
  ep.n <- sum(tumor.ident$celltype %in% target.celltypes)
  needs.normal.n <- 5 * ep.n - total.n # to adding ep percent == 20

  ## sample() fails for a negative size or one above the number of normal cells
  available.n <- ncol(normal.data)
  if(needs.normal.n > available.n){
    warning("Only ", available.n, " normal cells available, ", needs.normal.n,
            " needed; using all of them.", call. = FALSE)
    needs.normal.n <- available.n
  }
  needs.normal.n <- max(needs.normal.n, 0)

  ##2. Select normal data (random)
  set.seed(1011)
  random.s <- sample(colnames(normal.data), needs.normal.n)

  ## drop = FALSE keeps the gene names when a single cell is selected
  normal.random <- normal.data[, random.s, drop = FALSE]
  normal.ident <- data.frame(Index = random.s, celltype = rep("Normal", length(random.s)),
                             stringsAsFactors = FALSE)
  rownames(normal.ident) <- normal.ident$Index

  ##3. Add normal data
  intersect.gene <- intersect(rownames(tumor.data), rownames(normal.random))

  addnormal.data <- cbind(tumor.data[intersect.gene, , drop = FALSE],
                          normal.random[intersect.gene, , drop = FALSE])
  addnormal.cellinfo <- rbind(tumor.ident, normal.ident)
  addnormal.cellinfo <- addnormal.cellinfo[colnames(addnormal.data), , drop = FALSE]

  result <- list(data = addnormal.data, ident = addnormal.cellinfo)

  return (result)
}
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Single-cell RNA sequencing for lung adenocarcinoma
 2 | 
 3 | To separate malignant tumor cells from non-malignant cells, we infer copy number variation (CNV) aberrations from perturbations in chromosomal gene expression.
 4 | 
 5 | 1. Adjust the proportion of putative malignant cells to at most 20% (normal cells are added when the proportion is higher).
 6 | 2. Filter out less informative genes (default: genes expressed in 10 or fewer cells, or with a mean expression of 0.1 or less at log2 scale).
 7 | 3. Transform the expression values into Z-scores and limit them to the range -3 to 3.
 8 | 4. Sort the genes by their chromosomal position and estimate CNV signals with a moving average over a window of genes (default = 100 genes).
 9 | 5. Summarize the CNV signal with two parameters and classify cells as malignant or non-malignant.
10 |   - CNV signal (MS, Mean of Squares): mean of the squared estimates across all windows.
11 |   - CORR (correlation with the high-CNV-signal cells): correlation of the CNV profile of each cell with the average profile of the top 5% of cells by MS.
12 |   - Cells are classified as malignant if their CNV signal (MS) > 0.02 or their CNV correlation (CORR) > 0.2.
13 |   
14 |   
15 | # Example code
16 | 
17 | After downloading the code, run the example data with 'calc_Chromosomal_Expression_Pattern.R'.
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/calc_Chromosomal_Expression_Pattern.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(pals)
 3 | library(plyr)
 4 | library(dplyr)
 5 | library(Seurat)
 6 | library(gplots)
 7 | library(RColorBrewer)
 8 | 
 9 | ####################################################################################################
10 | source("R/calInferredCNA_for_CEP.R") ## calculate inferredCNV value
11 | source("R/makingTCIDEA_for_CEP.R") ## make TCIDEA object
12 | source("R/TCIDEA_obj_for_CEP.R") ## initiate TCIDEA object
13 | source("R/calCNVScore_for_CEP.R") ## calculate CNV score (MS, Corr) with CEP result
14 | 
15 | ###################################################################################################
16 | ## Example data ###################################################################################
17 | cell_annotation_with_tumor <- readRDS(file = "example/cell_info_tumor_example.Rds") # celltype = annotation cell types in transcriptome data
18 | tumor_example <- readRDS(file = "example/log2TPM_tumor_example.Rds")
19 | normal_example <- readRDS(file = "example/log2TPM_normal_example.Rds")
20 | ref_genome_example <- readRDS(file = "example/refgenome_example.Rds")
21 | output.dir = paste0(getwd(), "/", "example")
22 | ###################################################################################################
23 | ## PARAMETERS #####################################################################################
24 | EP_cutoff = 20 ## count if > 20% of EP -> add
25 | target.celltypes = "EP" ## declare name of epithelial cells in metadata
26 | label = "example" 
27 | ###################################################################################################
28 | 
29 | ## 1. check proportion of epithelial cells in tumor tissues ##
30 | prop <- as.data.frame(table(cell_annotation_with_tumor$celltype))
31 | prop$Percent = prop$Freq / nrow(cell_annotation_with_tumor) * 100
32 | 
33 | ##2. Check the proportion (adding normal cells or not)
34 | if(prop[prop$Var1 %in% target.celltypes,]$Percent > EP_cutoff){
35 |   
36 |   list <- addNormalDataset(tumor.data = tumor_example, tumor.ident = cell_annotation_with_tumor, target.celltypes = target.celltypes,
37 |                            normal.data = normal_example)
38 |   addnormal_example <- as.matrix(list$data); addnormal_annotation <- list$ident
39 |   
40 |   runCEP(target.normalized = addnormal_example,  
41 |          sample.info = addnormal_annotation, label = paste0(label,"_AddNormal"),
42 |          annotationdata = ref_genome_example, target.celltypes = target.celltypes, output.dir = output.dir,
43 |          min.cells = 10, MYwalk = 100) ## Sample list of EP proportion > EP_cutoff (20%)
44 | }else{
45 |   runCEP(target.normalized = tumor_example,  
46 |          sample.info = cell_annotation_with_tumor, label = label,
47 |          annotationdata = ref_genome_example,target.celltypes = target.celltypes, output.dir = output.dir,
48 |          min.cells = 10, MYwalk = 100) ## Sample list of EP proportion <= EP_cutoff (20%)
49 | }
50 | 
51 | ###################################################################################################
52 | 


--------------------------------------------------------------------------------
/example/calCNVScore_for_CEP.R:
--------------------------------------------------------------------------------
 1 | calCNVScore <- function(sh, cell_info, s, levels, cutoff.corr, cutoff.score, meta, target.celltypes){
 2 | 
 3 |   sh2 = sh; dat = t(sh2) ## already remove chromosomal location info
 4 |   CNV_score <- data.frame(MS = colMeans(sh2^2), SS = apply(sh2^2, 2, sum), SD = apply(sh2, 2, sd))
 5 |   ##
 6 |   CNV_score$Row.names <- rownames(CNV_score)
 7 |   cell_info$Row.names <- rownames(cell_info)
 8 | 
 9 |   cell_info2 <- plyr::join(cell_info, CNV_score, by="Row.names") # boxplot for celltype
10 | 
11 |   ## MS top 5% cells
12 |   top_MS_cells <- arrange(cell_info2, desc(MS))[1:round(dim(cell_info2)[1]*0.05),]$Row.names  # Top 5%
13 | 
14 |   ## calculate correlation : corr using 1 cell vs. top_MS_cells
15 |   tmp <- data.frame(Ave_tumor = rowMeans(sh2[,top_MS_cells]))
16 |   for(i in 1:dim(cell_info2)[1]){
17 |     cell_info2$COR[i] <-  cor(sh2[,i, drop=FALSE], data.frame(Ave_tumor = rowMeans(sh2[,top_MS_cells])))
18 |   }
19 | 
20 | 
21 |   ##
22 |   cell_info3 <- cell_info2
23 |   rownames(cell_info3) <- cell_info3$Row.names
24 | 
25 |   tumorcells <- filter(cell_info3, ((MS > cutoff.score | COR > cutoff.corr) & celltype %in% target.celltypes))$Row.names
26 |   nontumorcells <- cell_info3$Row.names[!(cell_info3$Row.names %in% c(tumorcells))]
27 |   immunecells <- cell_info3$Row.names[cell_info3$celltype != target.celltypes]
28 | 
29 |   ## only classified tumor vs. non-tumor ##
30 |   cell_info3$cell_index <- rep("X", dim(cell_info3)[1])
31 |   cell_info3[tumorcells,]$cell_index <- "Tumor"
32 |   cell_info3[nontumorcells,]$cell_index <- "Nontumor"
33 |   cell_info3[immunecells,]$cell_index <- "Immune"
34 | 
35 |   ## 2D plot of MS score and correlation ##
36 |   expos<-ggplot(cell_info3, aes(x=MS, y= COR)) + geom_point(aes(fill=cell_index), size=5, alpha=.8, shape=21, colour="black") +
37 |     scale_fill_manual(values = c("Tumor"="red","Immune" = "gray70","Nontumor"="dodgerblue1")) +
38 |     geom_vline(xintercept = cutoff.score, colour="black", size=0.5, linetype = "longdash") + geom_hline(yintercept = cutoff.corr, colour="black", size=0.5, linetype = "longdash") +
39 |     xlab("MS score") + ylab("CNV correlation") + theme_bw() +
40 |     theme(axis.title.x = element_text(face="bold", size=16), axis.text.x  = element_text(size=12)) +
41 |     theme(axis.title.y = element_text(face="bold", size=16), axis.text.y  = element_text(size=12)) +
42 |     theme(panel.border=element_rect(fill=NA, colour="black", size=2), legend.position = 'right')
43 |   expos
44 |   ggsave(paste0("final_",s, "_all_cells_CNV_score_vs_cor_classification.pdf"), width = 7, height = 5)
45 | 
46 |   return (cell_info3)
47 | }
48 | 
# Read the tab-separated cell type table and split each cell id into its
# sample and barcode parts.
#
# FinalCellType  path to a tab-separated file with a header line and at least
#                the columns `Cell` and `NEW`.
#
# Returns the table with `Cell` and `NEW` as character and two added columns:
# `Sample` (text before the first "_" of `Cell`) and `Barcode` (text between
# the first and second "_"; NA when `Cell` has no "_").
ReadTotalCelltype <- function(FinalCellType){

  total.celltype <- read.table(file = FinalCellType,
                               sep = "\t", header = TRUE)

  total.celltype$Cell <- as.character(total.celltype$Cell)
  total.celltype$NEW <- as.character(total.celltype$NEW)

  ## split all ids at once; this also works for a table without any rows
  id_parts <- base::strsplit(x = total.celltype$Cell, split = "_")
  total.celltype$Sample <- vapply(id_parts, function(p) p[1], character(1))
  total.celltype$Barcode <- vapply(id_parts, function(p) p[2], character(1))

  return (total.celltype)
}
64 | 
# Extract the cluster assignment of all cells of one sample.
#
# s      sample name; a "-A1" marker inside the name is removed before lookup.
# total  table with the columns `Sample`, `Barcode` and `NEW`, e.g. the
#        result of ReadTotalCelltype().
#
# Prints the number of cells found and returns a one-column data frame
# (`cluster`) whose row names are the barcodes.
ReadClusterForSample <- function(s, total){

  ## gsub() leaves names without the marker untouched, so no prior grep() test
  ## is needed (the old `if (grep(...) > 0)` failed on every such name)
  s <- gsub("-A1", "", s, fixed = TRUE)

  ##
  sample.subset <- total[which(total$Sample == s), , drop = FALSE]
  cat("Sample ",s," Cell : ", nrow(sample.subset),"\n")

  ##
  df <- data.frame(cluster = sample.subset$NEW)
  rownames(df) <- sample.subset$Barcode

  return (df)
}
80 | 
81 | 


--------------------------------------------------------------------------------
/example/cell_info_tumor_example.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/cell_info_tumor_example.Rds


--------------------------------------------------------------------------------
/example/final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/final_example_AddNormal_all_cells_CNV_score_vs_cor_classification.pdf


--------------------------------------------------------------------------------
/example/log2TPM_normal_example.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/log2TPM_normal_example.Rds


--------------------------------------------------------------------------------
/example/log2TPM_tumor_example.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/log2TPM_tumor_example.Rds


--------------------------------------------------------------------------------
/example/refgenome_example.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SGI-LungCancer/SingleCell/42e9959bd3fc2bec4343c6aba7a222364b55ff8f/example/refgenome_example.Rds


--------------------------------------------------------------------------------