├── BPscRNAseq.Rproj ├── DESCRIPTION ├── NAMESPACE ├── R ├── .DS_Store ├── airway2inferCNV.R ├── anno_bed.R ├── cal_DM.R ├── change_ID_of_exprSet.R ├── counts2exprSet.R ├── create_rData.R ├── create_rdata.Rmd ├── create_rdata.html ├── exprSet2CNV.R ├── exprSet2cellcycle.R ├── exprSet2inferCNV.R ├── get_genomic_positons.R ├── ht_cnv.R └── pam50subtyping.R ├── data ├── .DS_Store ├── airway_exprSet.rda ├── human2mouse.csv ├── human2mouse_symbols.rda ├── human_geneInfo_genecode_v25.rda ├── human_geneLength_genecode_v25.rda ├── mouse.gene.positions ├── mouse_geneInfo_genecode_vM12.rda └── mouse_geneLength_genecode_vM12.rda ├── man ├── .DS_Store ├── airway2inferCNV.Rd ├── anno_bed.Rd ├── cal_DM.Rd ├── change_ID_of_expreSet.Rd ├── counts2exprSet.Rd ├── exprSet2CNV.Rd ├── exprSet2cellcycle.Rd ├── exprSet2inferCNV.Rd ├── get_genomic_positions.Rd ├── ht_cnv.Rd └── pam50subtyping.Rd ├── readme.md └── test.pdf /BPscRNAseq.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: BPscRNAseq 2 | Type: Package 3 | Title: What the Package Does (Title Case) 4 | Version: 0.1.0 5 | Author: Who wrote it 6 | Maintainer: The package maintainer 7 | Description: More about what it does (maybe more than one line) 8 | Use four spaces when indenting paragraphs within the Description. 
#' Write the two inferCNV input files for the airway demo data
#'
#' Loads the public bulk RNA-seq \code{airway} dataset, converts its raw
#' counts with \code{counts2exprSet()} and passes the result to
#' \code{exprSet2inferCNV()} with Ensembl IDs, human gene positions and the
#' file prefix \code{airwary}.
#' inferCNV input format: https://github.com/broadinstitute/inferCNV/wiki
#'
#' @param run Logical switch; nothing happens unless it is \code{TRUE}. Default: \code{FALSE}.
#' @param dir Directory the files are written to. Default: \code{'./'}.
#'
#' @return Called for its side effect: the files
#'   \code{airwary_inferCNV_pos.txt} and \code{airwary_inferCNV_exprSet.txt}
#'   are written into \code{dir}.
#' @examples
#' \dontrun{
#' airway2inferCNV(TRUE)
#' airway2inferCNV(TRUE, '~/biosoft/scRNA_cnv/project/airway')
#' }
airway2inferCNV <- function(run=FALSE,dir='./'){
  # Opt-in switch: the default call does nothing.
  if (!run) {
    return(invisible(NULL))
  }

  library(airway)
  library(edgeR)
  library(DESeq2)
  data(airway)

  raw_counts <- assay(airway)
  normalized <- counts2exprSet(raw_counts)

  # The misspelled prefix 'airwary' is intentional here: it determines the
  # names of the files that are written, so it must not be "corrected".
  exprSet2inferCNV(normalized, geneType = 'ensembl', species = 'human',
                   prefix = 'airwary', dir = dir)

  # Follow-up command line, kept from the original source:
  # Rscript ~/biosoft/scRNA_cnv/inferCNV/scripts/inferCNV.R \
  #   --output_dir test airwary_inferCNV_exprSet.txt airwary_inferCNV_pos.txt
}
#' Annotate genomic intervals with ChIPseeker
#'
#' Turns a three-column table of positions into a \code{GRanges} object and
#' annotates every interval with \code{annotatePeak()} from ChIPseeker, using
#' a TSS window of -3000 to +3000 bp and the UCSC knownGene TxDb that belongs
#' to \code{reference}.
#'
#' @param pos Table whose first three columns are chromosome, start and end.
#' @param reference Genome build: \code{'hg38'}, \code{'hg19'} or \code{'mm10'}.
#'   Default: \code{'hg38'}.
#'
#' @return The \code{annotatePeak()} result converted to a data.frame.
#' @examples
#' \dontrun{
#' anno_bed(pos, reference = 'hg38')
#' }
anno_bed <- function(pos,reference='hg38' ){
  # Fail early and clearly; previously an unknown build only surfaced at the
  # end as "object 'peakAnno' not found".
  supported <- c('hg38', 'hg19', 'mm10')
  if (!is.character(reference) || length(reference) != 1L ||
      !reference %in% supported) {
    stop("`reference` must be one of: ", paste(supported, collapse = ", "),
         call. = FALSE)
  }

  # library() instead of require(): a missing package must stop here rather
  # than produce a confusing "could not find function" further down.
  library(ChIPseeker)
  library(GenomicRanges)

  # Only the annotation packages of the requested build are attached.
  if (reference == 'mm10') {
    library(org.Mm.eg.db)
    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
    anno_db <- "org.Mm.eg.db"
  } else {
    library(org.Hs.eg.db)
    anno_db <- "org.Hs.eg.db"
    if (reference == 'hg38') {
      library(TxDb.Hsapiens.UCSC.hg38.knownGene)
      txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
    } else {
      library(TxDb.Hsapiens.UCSC.hg19.knownGene)
      txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
    }
  }

  # Intervals carry no strand information.
  peak <- GRanges(seqnames = Rle(pos[, 1]),
                  ranges = IRanges(pos[, 2], pos[, 3]),
                  strand = rep("*", nrow(pos)))

  peak_anno <- annotatePeak(peak, tssRegion = c(-3000, 3000),
                            TxDb = txdb, annoDb = anno_db)
  as.data.frame(peak_anno)
}
# Internal helper for cal_DM(): subtract a rolling-median trend from `values`.
# Genes are ranked by `rank_by`, a rolling median of `values` is taken along
# that ranking, mapped back to the order of `gene_ids`, and subtracted.
.dm_subtract_rolling_median <- function(values, rank_by, gene_ids) {
  ord <- order(rank_by)
  ranked <- values[ord]

  # Window settings and `fill` are taken unchanged from
  # https://jdblischak.github.io/singleCellSeq/analysis/cv-adjusted-wo-19098-r2.html
  # NOTE(review): the third fill component is the string "NA", not the NA
  # constant -- kept as in the original; confirm against zoo's `fill` docs.
  trend <- zoo::rollapply(ranked, width = 50, by = 25, FUN = median,
                          fill = list("extend", "extend", "NA"))

  # Positions that are still NA receive the median of the values found at
  # exactly those positions.
  unfilled <- which(is.na(trend))
  trend[unfilled] <- median(ranked[unfilled])

  # Back from ranking order to the row order of the expression matrix.
  names(trend) <- gene_ids[ord]
  trend <- trend[match(gene_ids, names(trend))]
  stopifnot(isTRUE(all.equal(names(trend), gene_ids)))

  values - trend
}

#' Calculate DM (distance to median) values from an expression matrix
#'
#' Rows of the expression matrix are genes, columns are cells or samples, and
#' the rownames must be Ensembl IDs or gene symbols. Genes without a known
#' length in the bundled GENCODE tables are dropped. For the remaining genes
#' log10(CV^2) is computed and corrected twice by subtracting a rolling
#' median: first along genes ranked by mean expression, then along genes
#' ranked by log10 gene length.
#'
#' NOTE(review): the original documentation calls the input a cpm matrix while
#' a comment in the code asks for log2(cpm+1); confirm which one is intended.
#'
#' @param exprSet Expression matrix (genes x cells).
#' @param geneType Choose ensembl or symbol, default: ensembl
#' @param species Choose human or mouse, default: human
#'
#' @return log10cv2_adj, a numeric vector named by gene holding the DM value
#'   of every retained gene.
#' @examples
#' \dontrun{
#' cal_DM(exprSet, geneType = 'ensembl', species = 'human')
#' }
cal_DM <- function(exprSet,geneType='ensembl',species='human' ){
  # Validate up front; previously a wrong value only failed much later with
  # "object 'length_per_gene' not found".
  if (!is.character(species) || length(species) != 1L ||
      !species %in% c('human', 'mouse')) {
    stop("`species` must be 'human' or 'mouse'", call. = FALSE)
  }
  if (!is.character(geneType) || length(geneType) != 1L ||
      !geneType %in% c('ensembl', 'symbol')) {
    stop("`geneType` must be 'ensembl' or 'symbol'", call. = FALSE)
  }

  # Gene lengths: a table with the columns 'ensembl' and 'length'.
  gene_length <- if (species == 'human') {
    human_geneLength_genecode_v25
  } else {
    mouse_geneLength_genecode_vM12
  }
  if (geneType == 'symbol') {
    # Attach symbols through the gene-info table of the same species.
    gene_info <- if (species == 'human') {
      human_geneInfo_genecode_v25
    } else {
      mouse_geneInfo_genecode_vM12
    }
    gene_length <- merge(gene_length, gene_info, by = 'ensembl')
    gene_length <- gene_length[, c('symbol', 'length')]
  }

  # Keep only genes with a known length; drop = FALSE keeps a matrix even when
  # a single gene survives.
  known_ids <- gene_length[[geneType]]
  exprSet <- exprSet[rownames(exprSet) %in% known_ids, , drop = FALSE]
  gene_ids <- rownames(exprSet)
  length_per_gene <- gene_length$length[match(gene_ids, known_ids)]

  # Per-gene coefficient of variation.
  mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
  sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE)
  cv <- sd_per_gene / mean_per_gene

  # Step 1: remove the dependence of log10(CV^2) on mean expression
  # (variation tends to fall as expression rises).
  adj_for_mean <- .dm_subtract_rolling_median(log10(cv^2), mean_per_gene,
                                              gene_ids)

  # Step 2: remove the remaining dependence on log10 gene length.
  log10cv2_adj <- .dm_subtract_rolling_median(adj_for_mean,
                                              log10(length_per_gene),
                                              gene_ids)
  log10cv2_adj
}
#' Change the rownames of an expression matrix between ensembl and symbol
#'
#' Rows of the expression matrix are genes, columns are cells or samples.
#' \code{geneType} names the ID type currently used as rownames; the result
#' uses the other type. Genes that are missing from the bundled gene-info
#' table are dropped, and rows that map to the same new ID are summed.
#'
#' @param exprSet The expression matrix
#' @param geneType ID type of the current rownames, ensembl or symbol, default: ensembl
#' @param species \code{'human'} selects the human gene table; any other value
#'   selects the mouse table. Default: human
#'
#' @return A numeric matrix with one row per new ID (sorted) and the columns
#'   of \code{exprSet}. The dimensions before filtering, after filtering and
#'   after conversion are printed.
#' @examples
#' \dontrun{
#' change_ID_of_expreSet(exprSet, geneType = 'ensembl', species = 'human')
#' }
change_ID_of_expreSet <- function(exprSet,geneType='ensembl',species='human' ){
  id_types <- c('ensembl', 'symbol')
  # Previously a wrong value ended in "object 'newExprSet' not found".
  if (!is.character(geneType) || length(geneType) != 1L ||
      !geneType %in% id_types) {
    stop("`geneType` must be 'ensembl' or 'symbol'", call. = FALSE)
  }
  print(dim(exprSet))

  gene_info <- if (species == 'human') {
    human_geneInfo_genecode_v25
  } else {
    mouse_geneInfo_genecode_vM12
  }
  id_map <- gene_info[, id_types]
  from <- geneType
  to <- setdiff(id_types, geneType)

  # Keep genes that can be translated; drop = FALSE keeps a matrix even when
  # only one row is left.
  exprSet <- exprSet[rownames(exprSet) %in% id_map[[from]], , drop = FALSE]
  print(dim(exprSet))
  new_ids <- as.character(id_map[[to]][match(rownames(exprSet),
                                             id_map[[from]])])

  # Rows without a target ID are left out (split() used to drop them too).
  has_id <- !is.na(new_ids)
  values <- as.matrix(exprSet)[has_id, , drop = FALSE]
  storage.mode(values) <- "double"  # sums are doubles, as with colSums()

  # rowsum() adds up all rows that share a new ID in one pass; the rows are
  # then put in sorted ID order, the order split() + rbind() produced.
  summed <- rowsum(values, group = new_ids[has_id], reorder = FALSE)
  newExprSet <- summed[levels(factor(new_ids[has_id])), , drop = FALSE]
  print(dim(newExprSet))
  newExprSet
}
#' Transform and normalize a raw counts matrix
#'
#' Genes with a cpm above zero in fewer than two samples are removed; the
#' remaining counts are converted to log2(cpm + 1). The cpm values are
#' recomputed on the filtered matrix.
#'
#' @param counts The raw counts matrix from featureCounts or other tools
#'   (genes in rows, samples in columns).
#'
#' @return exprSet, the normalized expression matrix, log2(cpm+1)
#' @examples
#' \dontrun{
#' counts2exprSet(counts)
#' }
counts2exprSet <- function(counts){
  gene_ids <- rownames(counts)

  # A gene is kept when it is detected (cpm > 0) in at least two samples.
  keep_gene <- rowSums(edgeR::cpm(counts) > 0) >= 2

  # drop = FALSE: a single surviving gene must stay a one-row matrix.
  filtered <- counts[keep_gene, , drop = FALSE]
  rownames(filtered) <- gene_ids[keep_gene]

  # The former `exprSet[1:4,1:4]` preview is gone on purpose: inside a
  # function it printed nothing, but it raised "subscript out of bounds" for
  # inputs with fewer than four genes or samples.
  log2(edgeR::cpm(filtered) + 1)
}
# Development-only script that rebuilds the .rda files shipped in data/.
# Every section is wrapped in `if (FALSE)`, so nothing but the options() call
# runs when the file is sourced; sections are meant to be run by hand.
# NOTE(review): use_data() is called through devtools here; newer devtools
# releases provide it via the usethis package -- confirm before re-running.
options(stringsAsFactors = FALSE)

if (FALSE) {
  ls('package:BPscRNAseq')
}

## for human gencode.v25.annotation.gtf
if (FALSE) {

  # gene positions: input columns are reordered to chr/start/end/ensembl/type/symbol
  a <- read.table('data/human.gene.positions')[, c(2:4, 1, 6, 7)]
  colnames(a) <- c('chr', 'start', 'end', 'ensembl', 'type', 'symbol')
  length(unique(a$symbol))
  length(unique(a$ensembl))
  head(a)
  human_geneInfo_genecode_v25 <- a
  devtools::use_data(human_geneInfo_genecode_v25, overwrite = TRUE)

  # gene lengths
  a <- read.table('data/human_ENSG_length')
  colnames(a) <- c('ensembl', 'length')
  head(a)
  human_geneLength_genecode_v25 <- a
  devtools::use_data(human_geneLength_genecode_v25, overwrite = TRUE)
}


## for mouse gencode.vM12.annotation.gtf.gz
if (FALSE) {
  options(stringsAsFactors = FALSE)
  # gene positions: here the symbol column comes before the type column
  a <- read.table('data/mouse.gene.positions')[, c(2:4, 1, 7, 6)]
  colnames(a) <- c('chr', 'start', 'end', 'ensembl', 'symbol', 'type')
  length(unique(a$symbol))
  length(unique(a$ensembl))
  head(a)
  mouse_geneInfo_genecode_vM12 <- a
  devtools::use_data(mouse_geneInfo_genecode_vM12, overwrite = TRUE)

  # gene lengths
  a <- read.table('data/mouse_ENSG_length')
  colnames(a) <- c('ensembl', 'length')
  head(a)
  mouse_geneLength_genecode_vM12 <- a
  devtools::use_data(mouse_geneLength_genecode_vM12, overwrite = TRUE)

}

## The orthologous genes between human and mouse

if (FALSE) {
  a <- read.csv('data/human2mouse.csv', header = FALSE)
  table(a[, 1] == toupper(a[, 2]))
  colnames(a) <- c('human', 'mouse')
  # drop pairs in which either symbol is empty
  rmGenes <- apply(a, 1, function(x) sum(x == '') > 0)
  a <- a[!rmGenes, ]
  table(a[, 1] == toupper(a[, 2]))
  human2mouse_symbols <- a
  devtools::use_data(human2mouse_symbols, overwrite = TRUE)
}

## example expression matrix built from the airway dataset
if (FALSE) {
  library(BPscRNAseq)
  library(airway)
  library(edgeR)
  library(DESeq2)
  data(airway)
  airway
  counts <- assay(airway)
  counts[1:4, 1:4]; dim(counts)
  exprSet <- counts2exprSet(counts)
  exprSet[1:4, 1:4]; dim(exprSet)
  airway_exprSet <- exprSet
  devtools::use_data(airway_exprSet, overwrite = TRUE)
}
's/gene_id//g'|sed 's/gene_type//g'|sed 's/gene_name//g'|\ 23 | sed 's/;//g'| sed 's/\"//g'|perl -alne '{/(ENSG\d+)/;print "$1\t$_"}' >human.gene.positions 24 | 25 | ``` 26 | Once we got the human.gene.positions file, we should tranfer it in our R package 27 | ``` 28 | ENSG00000223972 chr1 11869 14409 ENSG00000223972.5 transcribed_unprocessed_pseudogene DDX11L1 29 | ENSG00000227232 chr1 14404 29570 ENSG00000227232.5 unprocessed_pseudogene WASH7P 30 | ENSG00000243485 chr1 29554 31109 ENSG00000243485.4 lincRNA MIR1302-2 31 | ENSG00000237613 chr1 34554 36081 ENSG00000237613.2 lincRNA FAM138A 32 | ENSG00000268020 chr1 52473 53312 ENSG00000268020.3 unprocessed_pseudogene OR4G4P 33 | ENSG00000240361 chr1 62948 63887 ENSG00000240361.1 unprocessed_pseudogene OR4G11P 34 | ENSG00000186092 chr1 69091 70008 ENSG00000186092.4 protein_coding OR4F5 35 | ENSG00000238009 chr1 89295 133723 ENSG00000238009.6 lincRNA RP11-34P13.7 36 | ENSG00000239945 chr1 89551 91105 ENSG00000239945.1 lincRNA RP11-34P13.8 37 | ENSG00000233750 chr1 131025 134836 ENSG00000233750.3 processed_pseudogene CICP27 38 | ``` 39 | 40 | ```{r,eval=F} 41 | options(stringsAsFactors = F) 42 | a=read.table('data/human.gene.positions')[,c(2:4,1,6,7)] 43 | colnames(a)=c('chr','start','end','ensembl','type','symbol') 44 | length(unique(a$symbol)) 45 | length(unique(a$ensembl)) 46 | head(a) 47 | human_geneInfo_genecode_v25=a 48 | devtools::use_data(human_geneInfo_genecode_v25, overwrite = T) 49 | ``` 50 | 51 | ## For mouse 52 | 53 | first change gencode.vM12.annotation.gtf.gz to mouse.gene.positions by the script 54 | 55 | ``` 56 | zcat gencode.vM12.annotation.gtf.gz |perl -alne '{next unless $F[2] eq "gene";print}'| \ 57 | grep -w HAVANA |cut -f 1,4,5,9| cut -d";" -f 1,2,3|sed 's/gene_id//g'|sed 's/gene_type//g'|\ 58 | sed 's/gene_name//g'|sed 's/;//g'| sed 's/\"//g'| perl -alne '{/(ENSMUSG\d+)/;print "$1\t$_"}' >mouse.gene.positions 59 | ``` 60 | Once we got the mouse.gene.positions file, we should tranfer it in our R 
package 61 | ``` 62 | ENSMUSG00000102693 chr1 3073253 3074322 ENSMUSG00000102693.1 4933401J01Rik 63 | ENSMUSG00000051951 chr1 3205901 3671498 ENSMUSG00000051951.5 Xkr4 64 | ENSMUSG00000102851 chr1 3252757 3253236 ENSMUSG00000102851.1 Gm18956 65 | ENSMUSG00000103377 chr1 3365731 3368549 ENSMUSG00000103377.1 Gm37180 66 | ENSMUSG00000104017 chr1 3375556 3377788 ENSMUSG00000104017.1 Gm37363 67 | ENSMUSG00000103025 chr1 3464977 3467285 ENSMUSG00000103025.1 Gm37686 68 | ENSMUSG00000089699 chr1 3466587 3513553 ENSMUSG00000089699.1 Gm1992 69 | ENSMUSG00000103201 chr1 3512451 3514507 ENSMUSG00000103201.1 Gm37329 70 | ENSMUSG00000103147 chr1 3531795 3532720 ENSMUSG00000103147.1 Gm7341 71 | ENSMUSG00000103161 chr1 3592892 3595903 ENSMUSG00000103161.1 Gm38148 72 | ``` 73 | 74 | ```{r,eval=F} 75 | 76 | ## for mouse gencode.vM12.annotation.gtf.gz 77 | options(stringsAsFactors = F) 78 | a=read.table('data/mouse.gene.positions')[,c(2:4,1,6)] 79 | colnames(a)=c('chr','start','end','ensembl','symbol') 80 | length(unique(a$symbol)) 81 | length(unique(a$ensembl)) 82 | head(a) 83 | mouse_geneInfo_genecode_vM12=a 84 | devtools::use_data(mouse_geneInfo_genecode_vM12, overwrite = T) 85 | 86 | ``` 87 | 88 | 89 | 90 | 91 | 92 | # gene length information based on gencode 93 | 94 | ## For human 95 | 96 | First, we computed gene lengths by taking the union of all exons within a gene based on gencode.v25.annotation.gtf 97 | 98 | ``` 99 | cat ~/reference/gtf/gencode/gencode.v25.annotation.gtf |grep -v PAR_Y |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSG\d+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]\t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_\t$c{$_}" foreach keys %c}' >>human_ENSG_length 100 | ``` 101 | Once we got the human_ENSG_length file(totally 57992 genes), we should tranfer it in our R package 102 | ``` 103 | ENSG00000252040 131 104 | ENSG00000251770 82 105 | ENSG00000261028 856 106 | ENSG00000186844 421 107 | ENSG00000234241 
1682 108 | ENSG00000144815 15589 109 | ENSG00000171236 3602 110 | ENSG00000262151 1062 111 | ENSG00000237381 1422 112 | ENSG00000164123 1835 113 | ``` 114 | 115 | ```{r,eval=F} 116 | options(stringsAsFactors = F) 117 | a=read.table('data/human_ENSG_length') 118 | colnames(a)=c( 'ensembl','length' ) 119 | head(a) 120 | human_geneLength_genecode_v25=a 121 | devtools::use_data(human_geneLength_genecode_v25, overwrite = T) 122 | ``` 123 | 124 | ## For mouse 125 | 126 | first change gencode.vM12.annotation.gtf.gz to mouse_ENSG_length by the script 127 | 128 | ``` 129 | cat gencode.vM12.annotation.gtf |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSMUSG\d+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]\t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_\t$c{$_}" foreach keys %c}' >mouse_ENSG_length 130 | 131 | ``` 132 | Once we got the mouse_ENSG_length file(totally 49585 genes), we should tranfer it in our R package 133 | ``` 134 | ENSMUSG00000019297 3684 135 | ENSMUSG00000109529 2059 136 | ENSMUSG00000026452 9834 137 | ENSMUSG00000048763 6004 138 | ENSMUSG00000023903 3363 139 | ENSMUSG00000107858 3759 140 | ENSMUSG00000057799 896 141 | ENSMUSG00000077492 131 142 | ENSMUSG00000078291 431 143 | ENSMUSG00000101288 586 144 | ``` 145 | 146 | ```{r,eval=F} 147 | ## for mouse gencode.vM12.annotation.gtf.gz 148 | options(stringsAsFactors = F) 149 | a=read.table('data/mouse_ENSG_length') 150 | colnames(a)=c( 'ensembl','length' ) 151 | head(a) 152 | mouse_geneLength_genecode_vM12=a 153 | devtools::use_data(mouse_geneLength_genecode_vM12, overwrite = T) 154 | ``` 155 | 156 | 157 | 158 | 159 | 160 | 161 | # The orthologous genes between human and mouse 162 | 163 | Firstly, I download the file **HOM_AllOrganism.rpt** from : http://www.informatics.jax.org/homology.shtml 164 | 165 | Then process it by : 166 | ``` 167 | perl -F"\t" -alne '{$key=$F[0];$h1{$key}=$F[3] if /human/;$h2{$key}=$F[3] if /mouse/;}END{print "$h1{$_},$h2{$_}" 
#' Calculate CNV based on normalized expression matrix
#'
#' Rows of the expression matrix are genes, columns are cells or samples, and
#' the rownames must be Ensembl IDs or gene symbols. Gene positions come from
#' \code{get_genomic_positions()}. Per chromosome, expression is centred per
#' gene, limited to [-3, 3] and averaged over a sliding window of 101 genes
#' (50 on each side) in genomic order.
#'
#' Chromosomes with 100 genes or fewer are skipped, and the first and last 50
#' genes of every chromosome have no full window and are not reported.
#'
#' @param exprSet The expression matrix (which should be normalized, like log2(cpm+1) or log2(tpm+1))
#' @param geneType Choose ensembl or symbol, default: ensembl
#' @param species Choose human or mouse, default: human
#'
#' @return all_cnv, a data.frame with the four position columns
#'   (gene/chr/start/end) followed by one CNV column per cell.
#' @examples
#' \dontrun{
#' exprSet2CNV(exprSet, geneType = 'ensembl', species = 'human')
#' }
exprSet2CNV <- function(exprSet,geneType='ensembl',species='human' ){
  half_window <- 50L                   # genes on each side of the centre gene
  window <- 2L * half_window + 1L      # 101 genes per window
  limit <- 3                           # relative expression is clipped to +/- limit

  pos <- get_genomic_positions(rownames(exprSet), geneType, species)
  # Keep genes present on both sides and put the matrix in genomic order.
  # drop = FALSE keeps one-gene / one-sample inputs as matrices.
  exprSet <- exprSet[rownames(exprSet) %in% pos[, 1], , drop = FALSE]
  pos <- pos[pos[, 1] %in% rownames(exprSet), , drop = FALSE]
  exprSet <- exprSet[as.character(pos[, 1]), , drop = FALSE]

  res <- cbind(pos, exprSet)
  per_chr <- lapply(split(res, res$chr), function(x) {
    n_genes <- nrow(x)
    if (n_genes <= 2L * half_window) {
      return(NULL)
    }
    centre_rows <- (half_window + 1L):(n_genes - half_window)

    # Relative expression: Er[i, j] = E[i, j] - mean(E[i, 1..n]).
    dat <- as.matrix(x[, 5:ncol(x), drop = FALSE])
    dat <- dat - rowMeans(dat)
    # Limit the influence of any single gene on the moving average.
    dat[dat > limit] <- limit
    dat[dat < -limit] <- -limit

    # Centred moving average down every column in one call; rows without a
    # full window come back as NA and are cut away by `centre_rows`.
    smoothed <- stats::filter(dat, rep(1 / window, window), sides = 2)
    cnv <- matrix(as.numeric(smoothed), nrow = n_genes)
    cnv <- cnv[centre_rows, , drop = FALSE]

    cbind(x[centre_rows, 1:4], cnv)
  })

  per_chr <- Filter(Negate(is.null), per_chr)
  if (length(per_chr) == 0L) {
    stop("no chromosome has more than ", 2L * half_window,
         " annotated genes, so no CNV window can be computed", call. = FALSE)
  }
  all_cnv <- do.call(rbind, per_chr)
  # The former `head(all_cnv[1:4,1:8])` preview was removed: it printed
  # nothing but failed for inputs with fewer than four cells.
  colnames(all_cnv)[5:ncol(all_cnv)] <- colnames(exprSet)
  all_cnv
}
#' Assign cell cycle status based on expression matrix
#'
#' Wraps the expression matrix in a \code{SingleCellExperiment} (as the
#' \code{counts} assay) and classifies every cell with \code{cyclone()} from
#' scran, using the cell-cycle marker pairs that scran ships for the chosen
#' species. When the rownames are gene symbols they are first mapped to
#' Ensembl IDs through the matching org.*.eg.db package.
#'
#' @param exprSet The expression matrix (which should be normalized, like log2(cpm+1) or log2(tpm+1))
#' @param geneType Choose ensembl or symbol, default: ensembl
#' @param species Choose human or mouse, default: human
#'
#' @return assigned A list of (phases,scores,normalized.scores) return from cyclone(scran)
#' @examples
#' \dontrun{
#' exprSet2cellcycle(exprSet, geneType = 'ensembl', species = 'human')
#' }
exprSet2cellcycle <- function(exprSet,geneType='ensembl',species='human' ){
  # Validate before loading anything; previously a wrong value only surfaced
  # at the end as "object 'assigned' not found".
  if (!is.character(species) || length(species) != 1L ||
      !species %in% c('human', 'mouse')) {
    stop("`species` must be 'human' or 'mouse'", call. = FALSE)
  }
  if (!is.character(geneType) || length(geneType) != 1L ||
      !geneType %in% c('ensembl', 'symbol')) {
    stop("`geneType` must be 'ensembl' or 'symbol'", call. = FALSE)
  }

  library(scran)
  sce <- SingleCellExperiment(list(counts = exprSet))

  # Marker pairs bundled with scran for the requested species.
  marker_file <- if (species == 'human') {
    "human_cycle_markers.rds"
  } else {
    "mouse_cycle_markers.rds"
  }
  marker_pairs <- readRDS(system.file("exdata", marker_file, package = "scran"))

  if (geneType == 'ensembl') {
    return(cyclone(sce, pairs = marker_pairs))
  }

  # Symbols: translate the rownames to Ensembl IDs for cyclone().
  if (species == 'human') {
    library(org.Hs.eg.db)
    org_db <- org.Hs.eg.db
  } else {
    library(org.Mm.eg.db)
    org_db <- org.Mm.eg.db
  }
  ensembl <- mapIds(org_db, keys = rownames(sce), keytype = "SYMBOL",
                    column = "ENSEMBL")
  cyclone(sce, pairs = marker_pairs, gene.names = ensembl)
}
#' Create two input files for inferCNV based on expression matrix
#'
#' Rows of the expression matrix are genes, columns are cells or samples, and
#' the rownames must be Ensembl IDs or gene symbols. Genes without an entry in
#' the bundled gene-position table are dropped. The positions are looked up
#' with \code{get_genomic_positions()} and written in chromosome/start order.
#' Input format expected by inferCNV: https://github.com/broadinstitute/inferCNV/wiki
#'
#' @param exprSet The expression matrix (which should be normalized, like log2(cpm+1) or log2(tpm+1))
#' @param geneType Choose ensembl or symbol, default: ensembl
#' @param species Choose human or mouse, default: human
#' @param prefix The prefix for files, default: example
#' @param dir Existing directory to put the files in, default: ./
#'
#' @return Called for its side effect: writes
#'   \code{<prefix>_inferCNV_pos.txt} (gene, chr, start, end; tab-separated,
#'   no header) and \code{<prefix>_inferCNV_exprSet.txt} (tab-separated
#'   matrix) into \code{dir}. Returns \code{NULL} invisibly.
#' @examples
#' \dontrun{
#' exprSet2inferCNV(exprSet, geneType = 'ensembl', species = 'human',
#'                  prefix = 'airwary', dir = './')
#' }
exprSet2inferCNV <- function(exprSet,geneType='ensembl',species='human',prefix='example',dir='./'){
  if (!dir.exists(dir)) {
    stop("output directory does not exist: ", dir, call. = FALSE)
  }

  # One lookup instead of a private copy of the same code:
  # get_genomic_positions() returns gene/chr/start/end sorted by chromosome
  # and start, with an all-NA row for every gene it does not know.
  pos <- get_genomic_positions(rownames(exprSet), geneType = geneType,
                               species = species)
  pos <- pos[!is.na(pos[[1]]), , drop = FALSE]

  # Only genes with a position go into the expression file; drop = FALSE
  # keeps a one-gene result as a matrix.
  exprSet <- exprSet[rownames(exprSet) %in% pos[[1]], , drop = FALSE]

  write.table(pos, file.path(dir, paste0(prefix, '_inferCNV_pos.txt')),
              row.names = FALSE, col.names = FALSE, sep = '\t', quote = FALSE)
  write.table(exprSet, file.path(dir, paste0(prefix, '_inferCNV_exprSet.txt')),
              quote = FALSE, sep = '\t')

  # Follow-up command line, kept from the original source:
  # Rscript ~/biosoft/scRNA_cnv/inferCNV/scripts/inferCNV.R \
  #   --output_dir test airwary_inferCNV_exprSet.txt airwary_inferCNV_pos.txt
  invisible(NULL)
}
5 | #' The genomic positions is human_geneInfo_genecode_v25 or mouse_geneInfo_genecode_vM12 6 | #' 7 | #' @param geneList Should be the rownames of expression matrix 8 | #' @param geneType Choose ensembl or symbol,defaults: ensembl 9 | #' @param species Choose human or mouse,defaults: human 10 | #' 11 | #' @return pos the genomic positions, columns should be : gene/chr/start/end 12 | #' @examples 13 | #' get_genomic_positions 14 | #' get_genomic_positions(rownames(exprSet),geneType='ensembl',species='human') 15 | 16 | get_genomic_positions <- function(geneList,geneType='ensembl',species='human'){ 17 | 18 | if(species=='human'){ 19 | head(human_geneInfo_genecode_v25) 20 | if(geneType=='ensembl'){ 21 | pos=human_geneInfo_genecode_v25 22 | pos=pos[match(geneList,pos$ensembl),c(4,1:3)] 23 | } 24 | if(geneType=='symbol'){ 25 | pos=human_geneInfo_genecode_v25 26 | pos=pos[match(geneList,pos$symbol),c(6,1:3)] 27 | } 28 | new_chr=gsub('chr','',pos$chr) 29 | table(new_chr) 30 | new_chr[new_chr=='X']=23 31 | new_chr[new_chr=='Y']=24 32 | new_chr=as.numeric(new_chr) 33 | pos$chr=new_chr 34 | pos=pos[order(pos$chr,pos$start),] 35 | } 36 | if(species=='mouse'){ 37 | head(mouse_geneInfo_genecode_vM12) 38 | if(geneType=='ensembl'){ 39 | pos=mouse_geneInfo_genecode_vM12 40 | pos=pos[match(geneList,pos$ensembl),c(4,1:3)] 41 | } 42 | if(geneType=='symbol'){ 43 | pos=mouse_geneInfo_genecode_vM12 44 | pos=pos[match(geneList,pos$symbol),c(5,1:3)] 45 | } 46 | new_chr=gsub('chr','',pos$chr) 47 | table(new_chr) 48 | new_chr[new_chr=='X']=20 49 | new_chr[new_chr=='Y']=21 50 | new_chr=as.numeric(new_chr) 51 | pos$chr=new_chr 52 | pos=pos[order(pos$chr,pos$start),] 53 | } 54 | return(pos) 55 | } 56 | -------------------------------------------------------------------------------- /R/ht_cnv.R: -------------------------------------------------------------------------------- 1 | #' Draw heatmap for CNV matrix 2 | #' 3 | #' CNV matrix calculated base on expression matrix by exprSet2CNV 4 | #' 5 
| #' @param cnv The CNV results from exprSet2CNV 6 | #' @param meta Choose ensembl or symbol,defaults: ensembl 7 | #' @param prefix The prefix of the filename of the PDF heatmap 8 | #' @param noise_filter A value must be atleast this much more or less than the reference to be plotted [Default 0.2] 9 | #' @param upper The maximum value for heatmap [Default 2] 10 | #' @param species Choose human or mouse,defaults: human 11 | #' 12 | #' @return heatmap 13 | #' @examples 14 | #' ht_cnv 15 | #' exprSet2CNV(exprSet,geneType='ensembl',species='human') 16 | 17 | ht_cnv <- function(cnv,meta,prefix='test',noise_filter=0.2,upper=2){ 18 | library(pheatmap) 19 | all_cnv=cnv 20 | D=t(scale(all_cnv[,5:ncol(all_cnv)] )) 21 | apply(D, 1, summary) 22 | apply(D[,1:10], 2, summary) 23 | D[D> upper]=upper 24 | D[D< -upper] = -upper 25 | D[abs(D) < noise_filter]=0 26 | 27 | dim(D) 28 | colnames(D)=paste0('genes_',1:ncol(D)) 29 | rownames(D)=colnames(exprSet) 30 | 31 | require(RColorBrewer) 32 | cols <- colorRampPalette(brewer.pal(10, "RdBu"))(256) 33 | 34 | library(stringr) 35 | annotation_row = data.frame( 36 | patients=str_split(rownames(D),'_',simplify = T)[,1] 37 | ) 38 | 39 | rownames(annotation_row) = rownames(D) 40 | 41 | annotation_col = data.frame( 42 | chr= factor(all_cnv$chr,levels = unique(all_cnv$chr)) 43 | ) 44 | rownames(annotation_col) = colnames(D) 45 | pheatmap(D,cluster_rows = T,col=rev(cols), 46 | annotation_col=annotation_col, 47 | annotation_row = annotation_row, 48 | cluster_cols = F,show_rownames=F,show_colnames=F,filename=paste0(prefix,'_cnv.pdf')) 49 | } 50 | -------------------------------------------------------------------------------- /R/pam50subtyping.R: -------------------------------------------------------------------------------- 1 | #' Pam50 subtyping based on normalized expression matrix 2 | #' 3 | #' expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or sample. 
4 | #' The rownames for this expression matrix should be ensembl IDs or gene symbols 5 | #' One should set the species as human or mouse. 6 | #' 7 | #' @param exprSet The expression matrix(which shoud be normalized,like log2(cpm+1) or log2(tpm+1)) 8 | #' @param geneType Choose ensembl or symbol,defaults: ensembl 9 | #' @param species Choose human or mouse,defaults: human 10 | #' 11 | #' @return all_cnv molecular.subtyping results 12 | #' @examples 13 | #' pam50subtyping 14 | #' pam50subtyping(exprSet,geneType='ensembl',species='human') 15 | 16 | pam50subtyping <- function(exprSet,geneType='ensembl',species='human' ){ 17 | suppressPackageStartupMessages(library(genefu)) 18 | data(pam50) 19 | pam50genes=pam50$centroids.map[c(1,3)] 20 | pam50genes[pam50genes$probe=='CDCA1',1]='NUF2' 21 | pam50genes[pam50genes$probe=='KNTC2',1]='NDC80' 22 | pam50genes[pam50genes$probe=='ORC6L',1]='ORC6' 23 | rownames(pam50genes)=pam50genes$probe 24 | 25 | 26 | 27 | # CDCA1 --> NUF2 NUF2, NDC80 Kinetochore Complex Component 28 | # KNTC2 --> NDC80 29 | # ORC6L --> ORC6 Origin Recognition Complex Subunit 6 30 | if(species=='human'){ 31 | head(human_geneInfo_genecode_v25) 32 | 33 | if(geneType=='ensembl'){ 34 | pos=human_geneInfo_genecode_v25 35 | pam50genes$ensembl=pos[match(rownames(pam50genes),pos$symbol),'ensembl'] 36 | exprSet=exprSet[rownames(exprSet) %in% pam50genes$ensembl,] 37 | ddata=t(exprSet) 38 | dannot=pam50genes[match(colnames(ddata),pam50genes$ensembl),] 39 | dannot$probe=dannot$ensembl 40 | rownames(dannot)=dannot$probe 41 | 42 | } 43 | if(geneType=='symbol'){ 44 | exprSet=exprSet[rownames(exprSet) %in% pam50genes$probe,] 45 | ddata=t(exprSet) 46 | dannot=pam50genes[match(colnames(ddata),pam50genes$probe),] 47 | } 48 | 49 | } 50 | if(species=='mouse'){ 51 | pam50genes$probe=human2mouse_symbols[match(pam50genes$probe , human2mouse_symbols[,1]),2] 52 | rownames(pam50genes)=pam50genes$probe 53 | 54 | head(mouse_geneInfo_genecode_vM12) 55 | if(geneType=='ensembl'){ 56 | 
pos=mouse_geneInfo_genecode_vM12 57 | pam50genes$ensembl=pos[match(rownames(pam50genes),pos$symbol),'ensembl'] 58 | exprSet=exprSet[rownames(exprSet) %in% pam50genes$ensembl,] 59 | ddata=t(exprSet) 60 | dannot=pam50genes[match(colnames(ddata),pam50genes$ensembl),] 61 | dannot$probe=dannot$ensembl 62 | rownames(dannot)=dannot$probe 63 | } 64 | if(geneType=='symbol'){ 65 | exprSet=exprSet[rownames(exprSet) %in% pam50genes$probe,] 66 | ddata=t(exprSet) 67 | dannot=pam50genes[match(colnames(ddata),pam50genes$probe),] 68 | } 69 | } 70 | 71 | message(paste0(nrow(dannot), ' of 50 genes are used to subtype')) 72 | PAM50Preds<-molecular.subtyping(sbt.model = "pam50",data=ddata, 73 | annot=dannot,do.mapping=TRUE) 74 | table(PAM50Preds$subtype) 75 | return(PAM50Preds) 76 | 77 | } 78 | 79 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/.DS_Store -------------------------------------------------------------------------------- /data/airway_exprSet.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/airway_exprSet.rda -------------------------------------------------------------------------------- /data/human2mouse_symbols.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/human2mouse_symbols.rda -------------------------------------------------------------------------------- /data/human_geneInfo_genecode_v25.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/human_geneInfo_genecode_v25.rda -------------------------------------------------------------------------------- /data/human_geneLength_genecode_v25.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/human_geneLength_genecode_v25.rda -------------------------------------------------------------------------------- /data/mouse_geneInfo_genecode_vM12.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/mouse_geneInfo_genecode_vM12.rda -------------------------------------------------------------------------------- /data/mouse_geneLength_genecode_vM12.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/data/mouse_geneLength_genecode_vM12.rda -------------------------------------------------------------------------------- /man/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/man/.DS_Store -------------------------------------------------------------------------------- /man/airway2inferCNV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/airway2inferCNV.R 3 | \name{airway2inferCNV} 4 | \alias{airway2inferCNV} 5 | \title{Create two input files for inferCNV for airway data} 6 | \usage{ 7 | airway2inferCNV(run = FALSE, dir = "./") 8 | } 9 | \arguments{ 10 | \item{run}{Choose TRUE or FALSE,defaults: FALSE} 11 | 12 | 
\item{dir}{Choose where to put the files,defaults: ./} 13 | } 14 | \value{ 15 | Two files \code{airwary_inferCNV_pos.txt} and \code{airwary_inferCNV_exprSet.txt} 16 | } 17 | \description{ 18 | Airway is the expression matrix from public bulk RNA-seq data. 19 | Check the document for input format at : https://github.com/broadinstitute/inferCNV/wiki 20 | The genomic positions for airway data is : human_geneInfo_genecode_v25 21 | } 22 | \examples{ 23 | airway2inferCNV 24 | airway2inferCNV(TRUE) 25 | airway2inferCNV(TRUE, '~/biosoft/scRNA_cnv/project/airway') 26 | } 27 | -------------------------------------------------------------------------------- /man/anno_bed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anno_bed.R 3 | \name{anno_bed} 4 | \alias{anno_bed} 5 | \title{Assign a position to genomic features by ChIPseeker} 6 | \usage{ 7 | anno_bed(pos, reference = "hg38") 8 | } 9 | \arguments{ 10 | \item{pos}{three columns of the positions, chromosome,start,end} 11 | 12 | \item{reference}{Choose hg19,hg38,mm10 ,defaults: hg38} 13 | } 14 | \value{ 15 | assigned results 16 | } 17 | \description{ 18 | filter SNP or INDELs in a vcf or maf file 19 | } 20 | \examples{ 21 | anno_bed 22 | anno_bed(pos,reference='hg38' ) 23 | } 24 | -------------------------------------------------------------------------------- /man/cal_DM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cal_DM.R 3 | \name{cal_DM} 4 | \alias{cal_DM} 5 | \title{Calculate DM(distance to median) values based on cpm(counts per million) expression matrix} 6 | \usage{ 7 | cal_DM(exprSet, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{exprSet}{cpm(counts per million) expression matrix} 11 | 12 | \item{geneType}{Choose ensembl or symbol,defaults: 
ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | log10cv2_adj DM(distance to median) values for each gene 18 | } 19 | \description{ 20 | expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or sample. 21 | The rownames for this expression matrix should be ensembl IDs or gene symbols 22 | One should set the species as human or mouse. 23 | The gene length information is calculated based on the GENCODE database. 24 | } 25 | \examples{ 26 | cal_DM 27 | cal_DM(exprSet,geneType='ensembl',species='human') 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/change_ID_of_expreSet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/change_ID_of_exprSet.R 3 | \name{change_ID_of_expreSet} 4 | \alias{change_ID_of_expreSet} 5 | \title{Change the colnames of expression matrix between ensembl and symbol} 6 | \usage{ 7 | change_ID_of_expreSet(exprSet, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{exprSet}{The expression matrix} 11 | 12 | \item{geneType}{Choose ensembl or symbol,defaults: ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | exprSet The expression matrix which had been changed. 18 | } 19 | \description{ 20 | expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or sample. 21 | The rownames for this expression matrix should be ensembl IDs or gene symbols 22 | One should set the species as human or mouse.
23 | The expression values will be summed if two or more genes map to one gene 24 | } 25 | \examples{ 26 | change_ID_of_expreSet 27 | change_ID_of_expreSet(exprSet,geneType='ensembl',species='human' ) 28 | } 29 | -------------------------------------------------------------------------------- /man/counts2exprSet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/counts2exprSet.R 3 | \name{counts2exprSet} 4 | \alias{counts2exprSet} 5 | \title{Transform and normalize the raw counts matrix} 6 | \usage{ 7 | counts2exprSet(counts) 8 | } 9 | \arguments{ 10 | \item{counts}{The raw counts matrix from featureCounts or other tools} 11 | } 12 | \value{ 13 | exprSet, the normalized expression matrix, log2(cpm+1) 14 | } 15 | \description{ 16 | The raw counts matrix will be filtered and log2(cpm+1) transformed. 17 | } 18 | \examples{ 19 | counts2exprSet 20 | counts2exprSet(counts) 21 | } 22 | -------------------------------------------------------------------------------- /man/exprSet2CNV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exprSet2CNV.R 3 | \name{exprSet2CNV} 4 | \alias{exprSet2CNV} 5 | \title{Calculate CNV based on normalized expression matrix} 6 | \usage{ 7 | exprSet2CNV(exprSet, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{exprSet}{The expression matrix(which should be normalized,like log2(cpm+1) or log2(tpm+1))} 11 | 12 | \item{geneType}{Choose ensembl or symbol,defaults: ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | all_cnv CNV value matrix for each gene in each cell 18 | } 19 | \description{ 20 | expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or sample.
21 | The rownames for this expression matrix should be ensembl IDs or gene symbols 22 | One should set the species as human or mouse. 23 | The genomic positions is human_geneInfo_genecode_v25 or mouse_geneInfo_genecode_vM12 24 | } 25 | \examples{ 26 | exprSet2CNV 27 | exprSet2CNV(exprSet,geneType='ensembl',species='human') 28 | } 29 | -------------------------------------------------------------------------------- /man/exprSet2cellcycle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exprSet2cellcycle.R 3 | \name{exprSet2cellcycle} 4 | \alias{exprSet2cellcycle} 5 | \title{Assign cell cycle status based on expression matrix} 6 | \usage{ 7 | exprSet2cellcycle(exprSet, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{exprSet}{The expression matrix(which shoud be normalized,like log2(cpm+1) or log2(tpm+1))} 11 | 12 | \item{geneType}{Choose ensembl or symbol,defaults: ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | assigned A list of (phases,scores,normalized.scores) return from cyclone(scran) 18 | } 19 | \description{ 20 | The rownames of expression matrix, should be ensembl IDs or gene symbols 21 | One should set the species as human or mouse. 
22 | } 23 | \examples{ 24 | exprSet2cellcycle 25 | exprSet2cellcycle(exprSet,geneType='ensembl',species='human') 26 | } 27 | -------------------------------------------------------------------------------- /man/exprSet2inferCNV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exprSet2inferCNV.R 3 | \name{exprSet2inferCNV} 4 | \alias{exprSet2inferCNV} 5 | \title{Create two input files for inferCNV based on expression matrix} 6 | \usage{ 7 | exprSet2inferCNV(exprSet, geneType = "ensembl", species = "human", 8 | prefix = "example", dir = "./") 9 | } 10 | \arguments{ 11 | \item{exprSet}{The expression matrix(which shoud be normalized,like log2(cpm+1) or log2(tpm+1))} 12 | 13 | \item{geneType}{Choose ensembl or symbol,defaults: ensembl} 14 | 15 | \item{species}{Choose human or mouse,defaults: human} 16 | 17 | \item{prefix}{The prefix for files,defaults:example} 18 | 19 | \item{dir}{Choose where to put the files,defaults: ./} 20 | } 21 | \value{ 22 | Two files \code{example_inferCNV_pos.txt} and \code{example_inferCNV_exprSet.txt} 23 | } 24 | \description{ 25 | expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or samples. 26 | The rownames for this expression matrix should be ensembl IDs or gene symbols 27 | One should set the species as human or mouse. 
28 | Check the document for input format at : https://github.com/broadinstitute/inferCNV/wiki 29 | The genomic positions for airway data is : human_geneInfo_genecode_v25 30 | } 31 | \examples{ 32 | exprSet2inferCNV 33 | exprSet2inferCNV(exprSet,geneType='ensembl',species='human',prefix='airwary',dir=dir) 34 | } 35 | -------------------------------------------------------------------------------- /man/get_genomic_positions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_genomic_positons.R 3 | \name{get_genomic_positions} 4 | \alias{get_genomic_positions} 5 | \title{Get the genomic positions based on a list of gene} 6 | \usage{ 7 | get_genomic_positions(geneList, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{geneList}{Should be the rownames of expression matrix} 11 | 12 | \item{geneType}{Choose ensembl or symbol,defaults: ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | pos the genomic positions, columns should be : gene/chr/start/end 18 | } 19 | \description{ 20 | A list of gene, should be ensembl IDs or gene symbols 21 | One should set the species as human or mouse. 
22 | The genomic positions come from human_geneInfo_genecode_v25 or mouse_geneInfo_genecode_vM12 23 | } 24 | \examples{ 25 | get_genomic_positions 26 | get_genomic_positions(rownames(exprSet),geneType='ensembl',species='human') 27 | } 28 | -------------------------------------------------------------------------------- /man/ht_cnv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ht_cnv.R 3 | \name{ht_cnv} 4 | \alias{ht_cnv} 5 | \title{Draw heatmap for CNV matrix} 6 | \usage{ 7 | ht_cnv(cnv, meta, prefix = "test", noise_filter = 0.2, upper = 2) 8 | } 9 | \arguments{ 10 | \item{cnv}{The CNV results from exprSet2CNV} 11 | 12 | \item{meta}{Currently unused by the function body} 13 | 14 | \item{prefix}{The prefix of the filename of the PDF heatmap} 15 | 16 | \item{noise_filter}{A value must be at least this much more or less than the reference to be plotted [Default 0.2]} 17 | 18 | \item{upper}{The maximum value for heatmap [Default 2]} 19 | 20 | \item{species}{Choose human or mouse,defaults: human} 21 | } 22 | \value{ 23 | heatmap 24 | } 25 | \description{ 26 | CNV matrix calculated based on expression matrix by exprSet2CNV 27 | } 28 | \examples{ 29 | ht_cnv 30 | exprSet2CNV(exprSet,geneType='ensembl',species='human') 31 | } 32 | -------------------------------------------------------------------------------- /man/pam50subtyping.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pam50subtyping.R 3 | \name{pam50subtyping} 4 | \alias{pam50subtyping} 5 | \title{Pam50 subtyping based on normalized expression matrix} 6 | \usage{ 7 | pam50subtyping(exprSet, geneType = "ensembl", species = "human") 8 | } 9 | \arguments{ 10 | \item{exprSet}{The expression matrix(which should be normalized,like log2(cpm+1) or log2(tpm+1))} 11 | 12 |
\item{geneType}{Choose ensembl or symbol,defaults: ensembl} 13 | 14 | \item{species}{Choose human or mouse,defaults: human} 15 | } 16 | \value{ 17 | PAM50Preds the molecular.subtyping results 18 | } 19 | \description{ 20 | expression matrix should be numeric values matrix in which each row is a gene, and each column is a cell or sample. 21 | The rownames for this expression matrix should be ensembl IDs or gene symbols 22 | One should set the species as human or mouse. 23 | } 24 | \examples{ 25 | pam50subtyping 26 | pam50subtyping(exprSet,geneType='ensembl',species='human') 27 | } 28 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # I hope It will be the best practice for scRNA seq data(downstream analysis) 2 | 3 | ### Upstream workflow 4 | 5 | I prefer `STAR+FeatureCounts` to generate the raw counts expression matrix . 6 | 7 | One can also download it from a published paper, such as : [Cell Rep.](https://www.ncbi.nlm.nih.gov/pubmed/29091775#) 2017 [Single-Cell RNA-Seq Analysis of Infiltrating Neoplastic Cells at the Migrating Front of Human Glioblastoma.](https://www.ncbi.nlm.nih.gov/pubmed/29091775) 8 | 9 | - The raw sequence data can be found at GEO: [GSE84465](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84465) 10 | - The expression matrix can be downloaded from: [3,589 cells in a cohort of four patients ](http://gbmseq.org/) 11 | 12 | ### Create input files for inferCNV 13 | 14 | Once we generate the raw counts expression matrix based on scRNA-seq data, such as `'GBM_raw_gene_counts.csv'` , we can use the code below to create input files for inferCNV 15 | 16 | ```R 17 | options(stringsAsFactors = F) 18 | dir='/Users/jmzeng/biosoft/scRNA_cnv/project/gbm_2017/' 19 | ## download from http://gbmseq.org/#downloadData 20 | counts=read.table(file.path(dir,'GBM_raw_gene_counts.csv')) 21 | #counts=counts[,1:100] 22 | counts[1:4,1:4];dim(counts)
23 | library(BPscRNAseq) 24 | exprSet=counts2exprSet(counts) 25 | exprSet[1:4,1:4];dim(exprSet) 26 | exprSet2inferCNV(exprSet,geneType='symbol',species='human',prefix='gbm_2017',dir=dir) 27 | ``` 28 | 29 | Then we can run inferCNV as described at https://github.com/broadinstitute/inferCNV/wiki 30 | 31 | ```Shell 32 | Rscript ~/biosoft/scRNA_cnv/inferCNV/scripts/inferCNV.R --output_dir test gbm_2017_inferCNV_exprSet.txt gbm_2017_inferCNV_pos.txt 33 | ``` 34 | 35 | There's also a more convenient way to get the example input files for inferCNV, by just running `airway2inferCNV(TRUE)` 36 | 37 | ### Calculate CNV values and draw heatmap 38 | 39 | If you don't want to use the inferCNV scripts from the Broad Institute, you can also calculate CNV in R and draw a heatmap as below: 40 | 41 | ```R 42 | library(airway) 43 | library(edgeR) 44 | library(DESeq2) 45 | data(airway) 46 | airway 47 | counts=assay(airway) 48 | 49 | counts[1:4,1:4];dim(counts) 50 | exprSet=counts2exprSet(counts) 51 | exprSet[1:4,1:4];dim(exprSet) 52 | cnv=exprSet2CNV(exprSet,geneType='ensembl',species='human') 53 | cnv[1:4,1:8];dim(cnv) 54 | ht_cnv(cnv) 55 | ``` 56 | 57 | 58 | 59 | ### Compare the results between inferCNV and my function 60 | 61 | 62 | 63 | ```R 64 | rm(list=ls()) 65 | dir='/Users/jmzeng/biosoft/scRNA_cnv/project/gbm_2014' 66 | load(file.path(dir,'GBM_for_CNV_input.Rdata')) 67 | exprSet[1:4,1:4];dim(exprSet) 68 | cnv=exprSet2CNV(exprSet,geneType='symbol',species='human') 69 | cnv[1:4,1:8];dim(cnv) 70 | ht_cnv(cnv,prefix = 'gbm_2014_jimmy') 71 | exprSet2inferCNV(exprSet,geneType='symbol',species='human',prefix='gbm_2014',dir=dir) 72 | # Rscript ~/biosoft/scRNA_cnv/inferCNV/scripts/inferCNV.R --output_dir test 73 | # gbm_2014_inferCNV_exprSet.txt gbm_2014_inferCNV_pos.txt 74 | ``` 75 | 76 | 77 | 78 | ### Calculate the DM (distance to median) values 79 | 80 | ```R 81 | library(BPscRNAseq) 82 | library(airway) 83 | library(edgeR) 84 | library(DESeq2) 85 | data(airway) 86 | airway 87 | counts=assay(airway) 88 |
counts[1:4,1:4];dim(counts) 89 | geneLists=rownames(counts) 90 | # remove lowly expressed genes whose mean normalised read counts (counts per million) are less than 10, 91 | # since we cannot distinguish biological noise from technical noise for these genes. 92 | keepGene=rowMeans(cpm(counts) ) >=10 93 | table(keepGene);dim(counts) 94 | dim(counts[keepGene,]) 95 | exprSet=counts[keepGene,] 96 | rownames(exprSet)=geneLists[keepGene] 97 | exprSet=cpm(exprSet) 98 | exprSet[1:4,1:4];dim(exprSet) 99 | DM=cal_DM(exprSet,geneType='ensembl',species='human') 100 | pheatmap::pheatmap(log2(exprSet[names(head(sort(DM),50)),]+1)) 101 | pheatmap::pheatmap(log2(exprSet[names(tail(sort(DM),50)),]+1)) 102 | 103 | ``` 104 | 105 | -------------------------------------------------------------------------------- /test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmzeng1314/BPscRNAseq/d1948ba690583be0890cc6ce42ba5f12bd1ffe42/test.pdf --------------------------------------------------------------------------------