├── Batch-pollution-remove ├── Test_dge.txt.gz └── batch-pollution-remove.R ├── Circos ├── Main_circos.r ├── celltype.NV_SRS_75.out.result └── color.list ├── Early-processing ├── Drop-seq_tools.zip ├── STAR-2.5.2a.zip ├── bbmap.zip ├── pipeline.txt └── sccpipe.zip ├── Gene-regulon-network ├── 1-1_SCENIC_AdultHuman_main.py ├── 1-2_JSD_RSS.r └── 1-3_CSI.r ├── HCL_Fig1_script-1.py ├── HCL_Fig1_script_730new.py.txt ├── Metaneighbor ├── 2017-08-28-runMN-US.R └── hm_metaneighbor.r ├── PAGA └── PAGA_BonemarrowCD34P.py ├── Pseudocell ├── FetalStomach1_500more.RData └── Pseudocell_Human.r ├── README.md ├── Scenic_R_human.R ├── Seurat-example ├── FetalThymus2_dge.txt.gz └── Tissue_seurat.R └── scHCL-build-reference ├── .Rhistory ├── 1-build-reference.R ├── 2-scHCLuse.R └── HCLREFuse.RData └── example1.RData /Batch-pollution-remove/Test_dge.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Batch-pollution-remove/Test_dge.txt.gz -------------------------------------------------------------------------------- /Batch-pollution-remove/batch-pollution-remove.R: -------------------------------------------------------------------------------- 1 | #This script is used to remove polluted genes in DGE. 
################ 1. Load the test DGE files and split cells by UMI count ################
# NOTE(review): the working directory is hard-coded; the script only runs on a
# machine that has this path.
setwd("/home/ggj/Rdata/JQ-Test/Data/Test/")
# NOTE(review): `pattern` is a regular expression, not a shell glob.
temp <- list.files(pattern = "*txt.gz")
name <- character(length(temp))
for (i in seq_along(temp)) {
  message("loading DGE")
  # The sample name is everything in the file name before "_dge".
  name[i] <- unlist(strsplit(temp[i], "_dge"))[1]
  tmpvalue <- read.table(temp[i], header = TRUE, row.names = 1)
  # Prefix each cell barcode with the sample name ("<sample>.<barcode>").
  colnames(tmpvalue) <- paste0(as.character(name[i]), ".", colnames(tmpvalue))
  # Each DGE is stored in a variable named after its sample; later sections
  # look these variables up by name (get()/assign()).
  assign(name[i], tmpvalue)
  message(paste(name[i], "is loaded"))
}

name_500less <- paste(name, "500less", sep = "_")
### build dge more than 500 UMI
name_500more <- paste(name, "500more", sep = "_")

for (i in seq_along(name)) {
  dge <- get(name[i])
  umi_per_cell <- colSums(dge)
  # "<sample>_500less": cells with more than 50 and fewer than 500 UMIs.
  # drop = FALSE keeps a data frame even when a single cell passes the filter.
  assign(name_500less[i],
         dge[, umi_per_cell < 500 & umi_per_cell > 50, drop = FALSE])
  # "<sample>_500more": cells with at least 500 UMIs.
  assign(name_500more[i], dge[, umi_per_cell >= 500, drop = FALSE])
  message(paste(name[i], "is done"))
}

################ 2. rmbatch in manual work ################
name <- "Test"
# Cell barcode = column name without the "<sample>." prefix. Only the text up
# to the first dot is removed, so barcodes that contain dots themselves stay
# intact.
anno <- data.frame(
  Cell_barcode = sub("^[^.]*\\.", "", colnames(Test_500more))
)
anno$Sample <- "Test"
anno$Batch <- "Test"
anno$Cell_id <- colnames(Test_500more)
anno$Cluster_id <- "1"
anno$Ages <- "age"
anno$Development_stage <- "Adult"
anno$Method <- "Microwell-seq"
anno$Gender <- "Female"
anno$Source <- "Test"
anno$Biomaterial <- "Test"
anno$Name <- "Test"
head(anno)
dim(anno)  # check the annotations
# 2835 12

# raw <- get(name)
more500 <- get(paste(name, "500more", sep = "_"))
less <- get(paste(name, "500less", sep = "_"))
# Full outer join on gene names; genes missing from one side become 0 below.
raw <- merge(less, more500, by = "row.names", all = TRUE)
raw <- data.frame(raw[, -1], row.names = raw$Row.names)
raw[is.na(raw)] <- 0

## determine the batch cells
par(mfrow = c(1, 1))
hist(colSums(less), breaks = 2000, xlim = c(0, 1000))
allumi <- data.frame(umi = colSums(less))
umi_order <- order(allumi$umi, decreasing = FALSE)
# Use the (up to) 500 cells with the fewest UMIs. Capping at the number of
# available cells avoids NA column names when fewer than 500 cells exist.
n_batch_cells <- min(500L, nrow(allumi))
allumi$umi[umi_order][seq_len(n_batch_cells)]  ### check the UMI: 50-500
abline(v = 276)
ss <- rownames(allumi)[umi_order][seq_len(n_batch_cells)]
less <- less[, ss, drop = FALSE]


## narrow down the gene
aa <- data.frame(gene = rowSums(less))
table(aa$gene > 10)
# Keep genes with more than 10 UMIs in total across the selected low-UMI cells.
usegene <- rownames(aa)[aa$gene > 10]
more500 <- more500[usegene, , drop = FALSE]
less <- less[usegene, , drop = FALSE]
raw <- raw[usegene, , drop = FALSE]

# Per-gene summary table; `var` is the variance of each gene across the
# >=500-UMI cells, computed in one pass instead of a row-by-row loop.
background <- data.frame(
  var = apply(as.matrix(more500), 1, var),
  cellnum_express = rowSums(more500 > 0),
  rowMean_500more = rowMeans(more500),
  row.names = rownames(more500),
  rowMeans_all = rowMeans(raw)
)
temp <- merge(background, data.frame(rowMean_less = rowMeans(less)),
              all.x = FALSE, by = "row.names")
background <- data.frame(temp[, -1], row.names = temp[, 1])
background$sd <- sqrt(background$var)

background <- background[with(background,
                              order(-rowMean_less, -rowMean_500more,
                                    -cellnum_express, -sd)), ]
# Candidate polluted genes: mean in low-UMI cells times sd in >=500-UMI cells
# must be at least 1.
background$multi <- background$rowMean_less * background$sd
background <- background[background$multi >= 1, ]

#background<-background[!grepl(x=rownames(background),pattern = "*MT-"),]
#background<-background[!grepl(x=rownames(background),pattern = "*RPS"),]
#background<-background[!grepl(x=rownames(background),pattern = "*RPL"),]

summary(background$rowMean_500more / background$rowMean_less)
summary(background$rowMeans_all / background$rowMean_less)
# NOTE(review): this plots the density of the six summary() statistics of the
# HBB counts, not of the counts themselves - confirm that is intended.
plot(density(summary(colSums(less["HBB", ]))))



## determine the coefficient and select the med between 2-5
med <- median(background$rowMeans_all / background$rowMean_less)
med
| med<-median(background$rowMean_500more/background$rowMean_less) 113 | med 114 | # 2.761137 115 | 116 | 117 | 118 | background[,"batchValue"] <- background[,"rowMean_less"]*med# value to delete 119 | background$batchValue <- round(background$batchValue) 120 | background <- background[background$batchValue>0,] 121 | 122 | 123 | dge_m<-get(paste(name,"500more",sep = "_")) 124 | m <- dge_m 125 | for (i in rownames(background)) { m[i,] <- m[i,]-background[i,"batchValue"] } 126 | sum(dge_m)# 7201740 127 | sum(m)#4157436 128 | m[m<0] <- 0 # 129 | sum(m)#5963150 130 | (sum(dge_m)-sum(m))/sum(dge_m) 131 | #0.1719848 132 | rowSums(dge_m["HBB",]> 0) 133 | rowSums(m["HBB",]>0) 134 | 135 | par(mfrow=c(2,1)) 136 | plot(density(summary(colSums(dge_m["HBB",])))) 137 | plot(density(summary(colSums(m["HBB",])))) 138 | 139 | Test_500less<-Test_500less 140 | Test_500more<-Test_500more 141 | Test_Anno<-anno 142 | Test_rm.batch <- m 143 | Test_background <- background 144 | Test_less<-less 145 | 146 | save(Test_rm.batch, 147 | Test_background, 148 | Test_less, 149 | Test_500more, 150 | Test_Anno, 151 | file = "/home/ggj/Rdata/201901/Test_500more_rmbatch.RData") 152 | -------------------------------------------------------------------------------- /Circos/Main_circos.r: -------------------------------------------------------------------------------- 1 | library(circlize) 2 | #library(migest) 3 | library(dplyr) 4 | library(gdata) 5 | library(RColorBrewer) 6 | 7 | color_species = structure(c("#2E8B57", "#FF4500"), names = c("HCL","MCA")) 8 | 9 | DF<-read.table("top_hits_SRS_75.out.result",sep="\t",head=T) 10 | 11 | 12 | #all_regions = unique(Phe$Cluster) 13 | 14 | all_regions = unique(c(as.character(DF$Cluster1), as.character(DF$Cluster2))) 15 | #color_regions = structure(rev(rainbow(length(all_regions))), names = as.character(all_regions)) 16 | # color_regions = 
structure(c("#E41A1C","#377EB8","#4DAF4A","#FCCDE5","#B3DE69","#A65628","#6A3D9A","#1B9E77","#CAB2D6","#66A61E","#D95F02","#A6761D", 17 | # "#E6AB02","#7570B3"),names = as.character(all_regions)) 18 | 19 | color.list<-read.table("color.list",head=T,sep="=") 20 | rownames(color.list)<-color.list$name 21 | color.list<-color.list[as.character(all_regions),] 22 | #color_regions = structure(rev(rainbow(length(all_regions))), names = as.character(all_regions)) 23 | color_regions = structure(as.character(color.list$color), names = as.character(all_regions)) 24 | 25 | 26 | 27 | 28 | df2 = data.frame(from=paste(DF$Species1,DF$Cluster1,sep="|"),to=paste(DF$Species2,DF$Cluster2,sep="|"),value=DF$Mean_AUROC) 29 | #df3<-factor(df2) 30 | #df2<-data.frame(Phe$Cell) 31 | #combined = unique(data.frame(regions = Phe$Cluster, species = Phe$Species, stringsAsFactors = FALSE)) 32 | combined = unique(data.frame(regions = c(as.character(DF$Cluster1), as.character(DF$Cluster2)), 33 | species = c(as.character(DF$Species1), as.character(DF$Species2)), stringsAsFactors = FALSE)) 34 | combined = combined[order(combined$species, combined$regions), ] 35 | order = paste(combined$species, combined$regions, sep = "|") 36 | grid.col = structure(color_regions[combined$regions], names = order) 37 | gap = rep(1, length(order)) 38 | gap[which(!duplicated(combined$species, fromLast = TRUE))] = 5 39 | 40 | pdf("HM-circos-new_Cluster.pdf") 41 | circos.par(gap.degree = gap,start.degree=270) 42 | chordDiagram(df2, order = order, 43 | annotationTrack = c("grid"), 44 | grid.col = grid.col, directional = FALSE, 45 | preAllocateTracks = list( 46 | track.height = 0.04, 47 | track.margin = c(0.05, 0) 48 | ) 49 | ) 50 | for(species in unique(combined$species)) { 51 | l = combined$species == species 52 | sn = paste(combined$species[l], combined$regions[l], sep = "|") 53 | highlight.sector(sn, track.index = 1, col = color_species[species], 54 | #text = species, 55 | niceFacing = TRUE) 56 | } 57 | circos.clear() 58 | 
59 | legend("bottomleft", pch = 15, col = color_regions, 60 | legend = names(color_regions), cex = 0.3) 61 | legend("bottomright", pch = 15, col = color_species, 62 | legend = names(color_species), cex = 0.6) 63 | 64 | 65 | dev.off() 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /Circos/celltype.NV_SRS_75.out.result: -------------------------------------------------------------------------------- 1 | Mouse10 MCA Mammary gland in lactation Secretory Secretory Mouse100 MCA Epithelial cell Epithelial Epithelial Mouse101 Mouse102 2 | -------------------------------------------------------------------------------- /Circos/color.list: -------------------------------------------------------------------------------- 1 | name=color 2 | Endothelial="#E41A1C" 3 | Epithelial="#377EB8" 4 | Epithelial.fetal="#3399FF" 5 | Erythroid="#4DAF4A" 6 | Fat="#FCCDE5" 7 | Germline="#A65628" 8 | Hepatocyte="#6A3D9A" 9 | Immune="#1B9E77" 10 | Muscle="#66A61E" 11 | Muscle.fetal="#B3DE69" 12 | Neuron="#D95F02" 13 | Neuron.fetal="#FF7F00" 14 | Proliferating="#A6761D" 15 | Proliferating.fetal="#BC80BD" 16 | Secretory="#E6AB02" 17 | Secretory.fetal="#8DD3C7" 18 | Stromal="#7570B3" 19 | Stromal.fetal="#CC99FF" 20 | -------------------------------------------------------------------------------- /Early-processing/Drop-seq_tools.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/Drop-seq_tools.zip -------------------------------------------------------------------------------- /Early-processing/STAR-2.5.2a.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/STAR-2.5.2a.zip -------------------------------------------------------------------------------- 
/Early-processing/bbmap.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/bbmap.zip -------------------------------------------------------------------------------- /Early-processing/pipeline.txt: -------------------------------------------------------------------------------- 1 | #1. Use the index sequence to get the raw data. The raw FASTQ files of R1 and R2 are placed in /*/data/INPUT/ 2 | $ ./github/Early_processing/sccpipe -I /*/data/INPUT/ -O /*/data/OUTPUT/ 3 | 4 | 5 | #2. Filter the reads that include the sequences CGACTCACTACAGGG, TCGGTGACACGATCG, TTTTTTTTTTTT 6 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/COL10_R1.fastq in2=/*/data/COL10_R2_001.fastq outm=/*/data/H_c1_R1_001.fastq outm2=/*/data/H_c1_R2_001.fastq fliteral=CGACTCACTACAGGG k=15 skipr2=t hdist=3 -Xmx58g 7 | 8 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/H_c1_R1_001.fastq in2=/*/data/H_c1_R2_001.fastq outm=/*/data/H_c2_R1_001.fastq outm2=/*/data/H_c2_R2_001.fastq fliteral=TCGGTGACACGATCG k=15 skipr2=t hdist=3 -Xmx58g 9 | 10 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/H_c2_R1_001.fastq in2=/*/data/H_c2_R2_001.fastq outm=/*/data/H_c3_R1_001.fastq outm2=/*/data/H_c3_R2_001.fastq fliteral=TTTTTTTTTTTT k=12 skipr2=t hdist=3 -Xmx58g 11 | 12 | 13 | #3. Convert the FASTQ files to a BAM file 14 | $ java -Xmx58g -jar ./github/Early_processing/Drop-seq_tools/3rdParty/picard/picard.jar FastqToSam F1=/*/data/H_c3_R1_001.fastq F2=/*/data/H_c3_R2_001.fastq O=/*/data/Lung.bam QUALITY_FORMAT=Standard SAMPLE_NAME=sample_name 15 | 16 | 17 | #4. Get the DGE (digital gene expression) matrix 18 | $ ./github/Early_processing/Drop-seq_tools/scHCL.sh -g /*/STAR_Reference_Human/genomeDir -r /*/STAR_Reference_Human/Homo_sapiens.GRCh38.fa -d ./github/Early_processing/Drop-seq_tools/ -o /*/data/ -t /*/data/ -s ./github/Early_processing/STAR-2.5.2a/source/STAR /*/data/Lung.bam 19 | 20 | 21 | 22 | 
-------------------------------------------------------------------------------- /Early-processing/sccpipe.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/sccpipe.zip -------------------------------------------------------------------------------- /Gene-regulon-network/1-1_SCENIC_AdultHuman_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pickle 4 | import pandas as pd 5 | import numpy as np 6 | from dask.diagnostics import ProgressBar 7 | from arboreto.utils import load_tf_names 8 | from arboreto.algo import grnboost2 9 | from arboreto.algo import genie3 10 | from numpy.core.umath_tests import inner1d 11 | from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase 12 | from pyscenic.utils import modules_from_adjacencies, load_motifs 13 | from pyscenic.prune import prune, prune2df, df2regulons 14 | from pyscenic.aucell import aucell 15 | import seaborn as sns 16 | 17 | DATA_FOLDER="./" 18 | RESOURCES_FOLDER="../resources" 19 | DATABASE_FOLDER = "../databases/" 20 | SCHEDULER="123.122.8.24:8786" 21 | DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.feather") 22 | MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl") 23 | MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hh_total_tfs.txt') 24 | SC_EXP_FNAME = os.path.join(RESOURCES_FOLDER, "Human.pse20.txt") 25 | REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons.p") 26 | MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs.csv") 27 | AUC_FNAME=os.path.join(DATA_FOLDER, "aucell.csv") 28 | Co_FNAME=os.path.join(DATA_FOLDER, "adjacencies.csv") 29 | 30 | ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T 31 | ex_matrix.shape 32 | tf_names = load_tf_names(MM_TFS_FNAME) 33 | db_fnames = glob.glob(DATABASES_GLOB) 34 | def 
name(fname): 35 | return os.path.basename(fname).split(".")[0] 36 | 37 | dbs=[RankingDatabase(fname=fname,name=name(fname)) for fname in db_fnames] 38 | 39 | #Phase1:co-expression module 40 | adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True) 41 | adjacencies.to_csv(Co_FNAME) 42 | adjacencies = pd.read_csv("/home/jingjingw/Jingjingw/Project/2018-MH-new/2019-1-18-TotalFig2-new/2_SCENIC/Human_total/adjacencies.csv") 43 | modules = list(modules_from_adjacencies(adjacencies, ex_matrix)) 44 | 45 | #Phase2: RcisTarget [Prune modules for targets with cis regulatory footprints] 46 | # Calculate a list of enriched motifs and the corresponding target genes for all modules. 47 | with ProgressBar(): 48 | df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME) 49 | 50 | # Create regulons from this table of enriched motifs. 51 | regulons = df2regulons(df) 52 | # Save the enriched motifs and the discovered regulons to disk. 53 | df.to_csv(MOTIFS_FNAME) 54 | with open(REGULONS_FNAME, "wb") as f: 55 | pickle.dump(regulons, f) 56 | 57 | regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME) 58 | 59 | #Phase3: AUCell 60 | auc_mtx = aucell(ex_matrix, regulons, num_workers=4) 61 | auc_mtx.to_csv(AUC_FNAME) 62 | sns_plot=sns.clustermap(auc_mtx, figsize=(12,12)) 63 | sns_plot.savefig("sns.clustermap.png") 64 | 65 | -------------------------------------------------------------------------------- /Gene-regulon-network/1-2_JSD_RSS.r: -------------------------------------------------------------------------------- 1 | data<-read.csv("aucell.csv") 2 | rownames(data)<-data$Cell 3 | data<-data[,-1] 4 | 5 | #n<-gsub("[....][1-9]|[...]","",colnames(data)) 6 | #n<-as.matrix(n) 7 | #data<-t(data) 8 | #all.data <- data###add 9 | #new.data<-aggregate(list(all.data[,1:length(all.data[1,])]), 10 | # list(name=n[,1]),FUN=mean) 11 | #rownames(new.data)<-new.data$name 12 | #new.data<-new.data[,-1] 13 | #data1<-new.data#normalize 14 | 15 | data1<-t(data) 16 | coln<-colnames(data1) 17 | 
# Strip the per-cell suffix "_Cell<index>" from every column name so that only
# the cell-type label remains. `[0-9]+` accepts an index of any length; the
# previous pattern covered at most three digits and left a stray digit behind
# for indices >= 1000.
coln1 <- gsub("_Cell[0-9]+", "", coln)
coln1 <- as.factor(coln1)
le <- levels(coln1)
nle <- length(le)
coln1 <- as.matrix(coln1)

# Indicator matrix: one row per cell type, one column per cell; 1 marks the
# cells that belong to the row's cell type.
Result <- matrix(0, ncol = length(coln1), nrow = nle)
for (i in seq_len(nle)) {
  tmp <- which(coln1[, 1] == as.character(le[i]))
  Result[i, tmp] <- 1
}
colnames(Result) <- colnames(data1)
rownames(Result) <- le

# Normalise every cell-type row so that it sums to 1.
Result1 <- Result / rowSums(Result)
#Result1[Result1==0]=0.000001

#write.table(Result1,file="JSD.mouse.celltype.input",sep="\t",quote=F)


#-------------------JSD
# Kullback-Leibler divergence sum(A * log(A / B)) (natural logarithm).
# Entries with A == 0 contribute 0 (the limit of a * log(a) as a -> 0) instead
# of NaN.
KLD <- function(A, B) {
  terms <- A * log(A / B)
  terms[which(A == 0)] <- 0
  sum(terms)
}

# Jensen-Shannon divergence between P and Q: the average of the two KL
# divergences to their midpoint M = (P + Q) / 2.
JSD <- function(P, Q) {
  M <- (P + Q) / 2
  0.5 * KLD(P, M) + 0.5 * KLD(Q, M)
}

Input1 <- data1
Input2 <- Result1
# Zeros are replaced by a small pseudo-value before each row is rescaled to
# sum to 1.
Input1[Input1 == 0] <- 0.0000001 # attention
Input1 <- Input1 / rowSums(Input1)
Input2[Input2 == 0] <- 0.0000001 # attention
Input2 <- Input2 / rowSums(Input2)
TFn <- nrow(Input1)
Celln <- ncol(Input1)
Celltypn <- nrow(Input2)

# JSD between every row of Input1 and every cell-type row of Input2.
JSD.result <- matrix(0, nrow = TFn, ncol = Celltypn)
for (i in seq_len(TFn)) {
  for (j in seq_len(Celltypn)) {
    JSD.result[i, j] <- JSD(Input1[i, ], Input2[j, ])
  }
}
rownames(JSD.result) <- rownames(Input1)
colnames(JSD.result) <- rownames(Input2)

# Specificity score written to disk: 1 - sqrt(JSD).
JSDR <- 1 - sqrt(JSD.result)
write.table(JSDR, file = "Human.RSS.total.out", sep = "\t", quote = FALSE)

# --------------------------------------------------------------------------------
# /Gene-regulon-network/1-3_CSI.r:
# --------------------------------------------------------------------------------
data <- read.table("Together.aucell.filter", sep = "\t")
#data<-t(data)
# Correlation between the rows of `data`.
Mcor <- cor(t(data))
n <- nrow(Mcor)

# CSI[i, j] = 1 - (number of entries in row i and in column j of Mcor that are
# strictly greater than Mcor[i, j]) / (2 * (n - 1)).
CSI <- matrix(nrow = n, ncol = n)
for (i in seq_len(n)) {
  for (j in seq_len(n)) {
    r1 <- sum(Mcor[i, ] > Mcor[i, j], na.rm = TRUE)
    c1 <- sum(Mcor[, j] > Mcor[i, j], na.rm = TRUE)
    CSI[i, j] <- 1 - (r1 + c1) / ((n - 1) * 2)
  }
}
rownames(CSI) <- rownames(data)
colnames(CSI)<-rownames(data) 17 | 18 | #tf-tf network 19 | r<-CSI 20 | lower<-as.matrix(r[lower.tri(r)]) 21 | nn<-length(r[,1]) 22 | k<-0 23 | lower.r<-matrix(nrow=length(lower)) 24 | for(i in 1:(nn-1)){ 25 | for ( j in (i+1):nn){ 26 | k=k+1 27 | tmp<-paste(rownames(r)[i],colnames(r)[j],lower[k],sep="\t") 28 | lower.r[k]=tmp 29 | 30 | } 31 | } 32 | write.table(lower.r,file="TF-TFNetwork.CSI.celltype.lower.out",sep="\t",quote=F,row.names=F,col.names=F) 33 | 34 | Lower<-read.table("TF-TFNetwork.CSI.celltype.lower.out",sep="\t") 35 | out<-Lower[which(Lower[,3]>0.7),] 36 | write.table(out,file="TFmodule.network.out",sep="\t",quote=F,row.names=F,col.names=F) 37 | 38 | 39 | library(pheatmap) 40 | library(RColorBrewer) 41 | 42 | MS<-CSI 43 | MS[MS<0.65]=0 44 | col<-colorRampPalette(c("#FAF9DA","#28245F"))(100) 45 | pdf("Total_TF-TF.065.pdf",width=11,height =10) 46 | x=pheatmap(MS, 47 | color=col, 48 | clustering_method = "ward.D2", 49 | cex=0.6, 50 | #show_rownames=FALSE, 51 | show_colnames = FALSE 52 | ) 53 | dev.off() 54 | 55 | tree_order=x$tree_row$order 56 | tree_order_name=rownames(data)[tree_order] 57 | write.table(tree_order_name,"TF-TF.order.names",sep="\t",quote=F,col.names=F,row.names = F) 58 | -------------------------------------------------------------------------------- /HCL_Fig1_script-1.py: -------------------------------------------------------------------------------- 1 | ## load the necessary packages 2 | import numpy as np 3 | import scanpy.api as sc ### using the scanpy 1.3.7 version 4 | import pandas as pd 5 | import os 6 | import pandas as pd 7 | os.chdir("./HCL/Fig1") 8 | 9 | 10 | ## merge the dataset from different tissues 11 | tissues=["AdultAdrenalGland1","AdultAdrenalGland2","AdultArtery1", 12 | "AdultAscendingColon1","AdultBladder1","AdultBladder2","AdultBoneMarrow1" , 13 | "AdultBoneMarrow2","AdultCerebellum1","AdultCervix1","AdultTransverseColon1" , 14 | "AdultDuodenum1", "AdultEpityphlon1","AdultEsophagus1","AdultFallopiantube1", 15 | 
"AdultGallbladder1", "AdultHeart1" ,"AdultHeart2", "AdultIleum2" , 16 | "AdultKidney2","AdultKidney3","AdultLiver1","AdultLiver2" , 17 | "AdultLiver4" ,"AdultLung1","AdultLung2","AdultMuscle1", 18 | "AdultOmentum1","AdultOmentum2","AdultPancreas1","AdultPeripheralBlood1", 19 | "AdultPeripheralBlood2","AdultPleura1","AdultProstate1","AdultRectum1", 20 | "AdultSigmoidColon1","AdultSpleen1","AdultStomach1","AdultStomach2","AdultTemporalLobe1", 21 | "AdultThyroid1","AdultThyroid2","AdultTrachea2","AdultUreter1" , 22 | "AdultUterus1","ChorionicVillus1","CordBlood1","CordBloodCD34P1", 23 | "FetalAdrenalGland2","FetalBrain3","FetalBrain4","FetalBrain5", 24 | "FetalCalvaria1","FetalFemaleGonad1", "FetalHeart1","FetalIntestine1", 25 | "FetalIntestine2","FetalIntestine3","FetalKidney3","FetalKidney4", 26 | "FetalKidney5","FetalLiver1","FetalLung1","FetalMaleGonad1", 27 | "FetalMaleGonad2","FetalMuscle1", "FetalPancreas1","FetalPancreas2", 28 | "FetalRib2","FetalRib3","FetalSkin2","FetalSpinalCord1", 29 | "FetalStomach1","FetalThymus1","FetalThymus2","hESC1","Placenta1" ] 30 | 31 | datause= pd.read_table("./dge/AdultAdipose1.rmbatchdge.txt",sep=" ") 32 | for tissue in tissues: 33 | new=pd.read_table("./dge/" + tissue + '.rmbatchdge.txt',sep=' ') 34 | datause=pd.merge(datause,new,left_index=True,right_index=True,how='outer') 35 | print(tissue + " is done") 36 | 37 | genes=datause.index 38 | genes=genes.tolist() 39 | cells=datause.columns 40 | cells=cells.tolist() 41 | cells=pd.DataFrame(columns=["cell"],data=cells) 42 | cells.to_csv("cells.csv",sep=",",header=False,index=False) 43 | genes=pd.DataFrame(columns=["gene"],data=genes) 44 | genes.to_csv("genes.csv",sep=",",header=False,index=False) 45 | datause=datause.fillna(0) 46 | datause.to_csv('datause.csv',sep='\t',header=False,index=False) 47 | 48 | datause.shape 49 | ##451613 × 38360 50 | 51 | 52 | ## load the data 53 | %%time 54 | adata=sc.read_csv("./datause.csv",delimiter='\t').transpose() 55 | adata.var_names = 
pd.read_csv('./genes.csv', header=None)[0] 56 | adata.obs_names = pd.read_csv('./cells.csv', header=None)[0] 57 | adata.obs['tissue']=pd.read_csv('./cellanno_new.csv',sep=",",header=None)[0].values 58 | mito_genes = [name for name in adata.var_names if name.startswith('MT-')] 59 | adata[:, mito_genes]=0 60 | 61 | adata.write('./HCL_scanpy1.h5ad') 62 | 63 | 64 | ## Filter the genes 65 | sc.pp.filter_genes(adata, min_cells=20) 66 | sc.pp.filter_cells(adata, min_genes=0) 67 | adata.obs['n_counts'] = adata.X.sum(axis=1) 68 | 69 | 70 | ## Filter the cells 71 | sc.pl.violin(adata, ['n_genes', 'n_counts'],jitter=0.4, multi_panel=True) 72 | #sc.pl.scatter(adata, x='n_counts', y='percent_mito') 73 | sc.pl.scatter(adata, x='n_counts', y='n_genes') 74 | 75 | 76 | ## Logarithmize the data. 77 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) 78 | adata.raw = sc.pp.log1p(adata, copy=True) 79 | adata.write('./HCL_scanpy2.h5ad') 80 | 81 | ## Choose variable genes 82 | adata=sc.read("./HCL_scanpy2.h5ad") 83 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.001, max_mean=15, min_disp=0.5) 84 | sc.pl.filter_genes_dispersion(filter_result) 85 | adata = adata[:, filter_result.gene_subset] 86 | adata.shape() 87 | ##451613 × 3118 88 | 89 | 90 | ## Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance. 
91 | sc.pp.log1p(adata) 92 | sc.pp.regress_out(adata, ['n_counts','ngenes']) 93 | 94 | ## scale the data 95 | sc.pp.scale(adata, max_value=10) 96 | 97 | 98 | ## PCA 99 | sc.tl.pca(adata, n_comps=100) 100 | sc.pl.pca_loadings(adata) 101 | adata.obsm['X_pca'] *= -1 # multiply by -1 to match Seurat, 102 | sc.pl.pca_scatter(adata, color='COL1A1') # visualize 103 | 104 | ## Choose sigificant PCs 105 | sc.pl.pca_variance_ratio(adata, log=True, show=100,n_pcs=100) 106 | 107 | 108 | ## Computing the neighborhood graph and do t-sne 109 | sc.pp.neighbors(adata, n_neighbors=10,n_pcs=50) 110 | sc.tl.louvain(adata, resolution=4) 111 | sc.tl.tsne(adata,use_fast_tsne=True,n_jobs=20,perplexity=100,n_pcs=50) 112 | sc.pl.tsne(adata, color='louvain',size=8,legend_loc="on data") 113 | sc.pl.tsne(adata, color='louvain',size=8) 114 | sc.pl.tsne(adata, color='tissue',size=8,legend_loc="on data") 115 | sc.pl.tsne(adata, color='tissue',size=8) 116 | 117 | adata.write('./HCL_scanpy_pc50.h5ad') 118 | 119 | ### change the cluster name 120 | new_cluster_names = list(range(1,103)) 121 | adata.rename_categories('louvain', new_cluster_names) 122 | 123 | ## Find marker genes using wilcoxon test 124 | sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon') 125 | result = adata.uns['rank_genes_groups'] 126 | groups = result['names'].dtype.names 127 | pd.DataFrame( 128 | {group + '_' + key[:1]: result[key][group] 129 | for group in groups for key in ['names', 'logfoldchanges','scores', 'pvals', 'pvals_adj']}).to_csv("HCL102_markers_wilcoxon.csv") 130 | adata.write('./HCL_scanpy_pc50_markers_wilcoxon.h5ad', compression='gzip') 131 | 132 | -------------------------------------------------------------------------------- /HCL_Fig1_script_730new.py.txt: -------------------------------------------------------------------------------- 1 | ## load the necessary packages 2 | import numpy as np 3 | import scanpy.api as sc ### using the scanpy 1.3.7 version 4 | import pandas as pd 5 | import os 6 | import 
pandas as pd 7 | os.chdir("./HCL/Fig1") 8 | 9 | 10 | ## merge the dataset from different tissues 11 | tissues=["AdultAdrenalGland2","AdultArtery1", 12 | "AdultAscendingColon1","AdultBladder1","AdultBladder2","AdultBoneMarrow1" , 13 | "AdultBoneMarrow2","AdultCerebellum1","AdultCervix1","AdultTransverseColon1" , 14 | "AdultDuodenum1", "AdultEpityphlon1","AdultEsophagus1","AdultFallopiantube1", 15 | "AdultGallbladder1", "AdultHeart1" ,"AdultHeart2", "AdultIleum2" , 16 | "AdultKidney2","AdultKidney3","AdultLiver1","AdultLiver2" , 17 | "AdultLiver4" ,"AdultLung1","AdultLung2","AdultMuscle1", 18 | "AdultOmentum1","AdultOmentum2","AdultPancreas1","AdultPeripheralBlood1", 19 | "AdultPeripheralBlood2","AdultPleura1","AdultProstate1","AdultRectum1", 20 | "AdultSigmoidColon1","AdultSpleen1","AdultStomach1","AdultStomach2","AdultTemporalLobe1", 21 | "AdultThyroid1","AdultThyroid2","AdultTrachea2","AdultUreter1" , 22 | "AdultUterus1","ChorionicVillus1","CordBlood1","CordBloodCD34P1", 23 | "FetalAdrenalGland2","FetalBrain3","FetalBrain4","FetalBrain5", 24 | "FetalCalvaria1","FetalFemaleGonad1", "FetalHeart1","FetalIntestine1", 25 | "FetalIntestine2","FetalIntestine3","FetalKidney3","FetalKidney4", 26 | "FetalKidney5","FetalLiver1","FetalLung1","FetalMaleGonad1", 27 | "FetalMaleGonad2","FetalMuscle1", "FetalPancreas1","FetalPancreas2", 28 | "FetalRib2","FetalRib3","FetalSkin2","FetalSpinalCord1", 29 | "FetalStomach1","FetalThymus1","FetalThymus2","hESC1","Placenta1" ] 30 | 31 | datause= pd.read_table("./dge/AdultAdipose1.rmbatchdge.txt",sep=" ") 32 | for tissue in tissues: 33 | new=pd.read_table("./dge/" + tissue + '.rmbatchdge.txt',sep=' ') 34 | datause=pd.merge(datause,new,left_index=True,right_index=True,how='outer') 35 | print(tissue + " is done") 36 | 37 | genes=datause.index 38 | genes=genes.tolist() 39 | cells=datause.columns 40 | cells=cells.tolist() 41 | cells=pd.DataFrame(columns=["cell"],data=cells) 42 | cells.to_csv("cells.csv",sep=",",header=False,index=False) 43 | 
genes=pd.DataFrame(columns=["gene"],data=genes) 44 | genes.to_csv("genes.csv",sep=",",header=False,index=False) 45 | datause=datause.fillna(0) 46 | datause.to_csv('datause.csv',sep='\t',header=False,index=False) 47 | 48 | datause.shape 49 | ##451613 × 38360 50 | 51 | 52 | ## load the data 53 | %%time 54 | adata=sc.read_csv("./datause.csv",delimiter='\t').transpose() 55 | adata.var_names = pd.read_csv('./genes.csv', header=None)[0] 56 | adata.obs_names = pd.read_csv('./cells.csv', header=None)[0] 57 | adata.obs['tissue']=pd.read_csv('./cellanno_new.csv',sep=",",header=None)[0].values 58 | mito_genes = [name for name in adata.var_names if name.startswith('MT-')] 59 | adata[:, mito_genes]=0 60 | 61 | adata.write('./HCL_scanpy1.h5ad') 62 | 63 | 64 | ## Filter the genes 65 | sc.pp.filter_genes(adata, min_cells=20) 66 | sc.pp.filter_cells(adata, min_genes=0) 67 | adata.obs['n_counts'] = adata.X.sum(axis=1) 68 | 69 | 70 | ## Filter the cells 71 | sc.pl.violin(adata, ['n_genes', 'n_counts'],jitter=0.4, multi_panel=True) 72 | #sc.pl.scatter(adata, x='n_counts', y='percent_mito') 73 | sc.pl.scatter(adata, x='n_counts', y='n_genes') 74 | 75 | 76 | ## Logarithmize the data. 77 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) 78 | adata.raw = sc.pp.log1p(adata, copy=True) 79 | adata.write('./HCL_scanpy2.h5ad') 80 | 81 | ## Choose variable genes 82 | adata=sc.read("./HCL_scanpy2.h5ad") 83 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.001, max_mean=15, min_disp=0.45) 84 | sc.pl.filter_genes_dispersion(filter_result) 85 | adata = adata[:, filter_result.gene_subset] 86 | adata.shape() 87 | ##451613 × 3118 88 | 89 | 90 | ## Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance. 
# Unsupervised MetaNeighbor: score how well cells of one cell type are
# identified by "votes" from cells of every other cell type.
#
# Arguments
#   vargenes  character vector of gene names; only rows of `data` whose row
#             names are in `vargenes` enter the correlation.
#   data      expression matrix or data frame, genes in rows, cells in columns.
#   celltypes vector of cell-type labels to compare.
#   pheno     data frame with one row per column of `data` (same order) and
#             the columns `Celltype` and `Study_ID`.
#
# Value
#   A symmetric length(celltypes) x length(celltypes) matrix. Entry [i, j] is
#   the mean of the two directed AUROC values (type j predicting type i and
#   type i predicting type j). Each directed value is computed only among the
#   cells of the studies that contain the tested type. Rows and columns of
#   cell types that cannot be scored (no cells, or no negative cells in their
#   studies) are NA/NaN.
run_MetaNeighbor_US <- function(vargenes, data, celltypes, pheno) {
  n_cells <- nrow(pheno)
  n_types <- length(celltypes)
  type_names <- as.character(celltypes)

  if (ncol(data) != n_cells) {
    stop("`pheno` must have exactly one row per column of `data`.",
         call. = FALSE)
  }

  # Indicator matrix: cells x cell types, 1 where the cell has that label.
  cell.labels <- matrix(0, ncol = n_types, nrow = n_cells)
  rownames(cell.labels) <- colnames(data)
  colnames(cell.labels) <- celltypes
  for (i in seq_len(n_types)) {
    cell.labels[pheno$Celltype %in% type_names[i], i] <- 1
  }

  # Cell-cell Spearman correlation on the selected genes, converted to ranks
  # over the whole matrix and scaled to (0, 1]; NA correlations get rank 0.
  use_gene <- rownames(data) %in% vargenes
  cor.dat <- cor(data[use_gene, , drop = FALSE], method = "spearman")
  rank.dat <- cor.dat
  rank.dat[] <- rank(cor.dat, ties.method = "average", na.last = "keep")
  rank.dat[is.na(rank.dat)] <- 0
  rank.dat <- rank.dat / max(rank.dat)

  # Vote of every cell for every cell type: the cell's summed similarity to
  # the cells carrying that label, divided by the cell's total similarity.
  sumin <- rank.dat %*% cell.labels
  sumall <- matrix(colSums(rank.dat), ncol = ncol(sumin), nrow = nrow(sumin))
  predicts <- sumin / sumall

  cell.NV <- matrix(0, ncol = n_types, nrow = n_types)
  colnames(cell.NV) <- colnames(cell.labels)
  rownames(cell.NV) <- colnames(cell.labels)

  for (i in seq_len(n_types)) {
    is_type <- pheno$Celltype %in% type_names[i]
    if (!any(is_type)) {
      # No cell carries this label, so no AUROC can be computed for it.
      warning("No cells of type '", type_names[i], "' in `pheno`.",
              call. = FALSE)
      cell.NV[i, ] <- NA_real_
      next
    }

    # Restrict to the studies that contain this cell type.
    study <- unique(pheno[is_type, "Study_ID"])
    in_study <- pheno$Study_ID %in% study

    # Rank the votes within those studies, one cell type (column) at a time.
    # drop = FALSE keeps the matrix shape when a study has a single cell.
    votes <- abs(predicts[in_study, , drop = FALSE])
    ranked <- votes
    for (j in seq_len(ncol(votes))) {
      ranked[, j] <- rank(votes[, j], na.last = "keep",
                          ties.method = "average")
    }

    positive <- is_type[in_study]
    np <- sum(positive)   # cells of the tested type
    nn <- sum(!positive)  # all other cells of the same studies

    # AUROC from the rank sum of the positives (Mann-Whitney U form).
    p <- colSums(ranked[positive, , drop = FALSE], na.rm = TRUE)
    cell.NV[i, ] <- (p / np - (np + 1) / 2) / nn
  }

  # Average the two directions so that the result is symmetric.
  (cell.NV + t(cell.NV)) / 2
}
var.dat=apply(dat.sub,1,var) 60 | quant.med=unique(quantile(med.dat,prob=seq(0,1,length=11),type=5)) 61 | genes.list=vector("list",length=length(quant.med)) 62 | for(i in 1:length(quant.med)){ 63 | if(i==1){ 64 | filt1=med.dat<=quant.med[i] 65 | var.temp=var.dat[filt1] 66 | quant.var=quantile(var.temp,na.rm=T) 67 | filt2=var.temp>quant.var[4]###### total is 4;TF is3 68 | genes.list[[i]]=names(var.temp)[filt2] 69 | } 70 | else { 71 | filt1=med.dat<=quant.med[i]&med.dat>quant.med[i-1] 72 | var.temp=var.dat[filt1] 73 | quant.var=quantile(var.temp,na.rm=T) 74 | filt2=var.temp>quant.var[4]###### 75 | genes.list[[i]]=names(var.temp)[filt2] 76 | } 77 | } 78 | temp=length(genes.list) 79 | var.genes1[[j]]=unlist(genes.list[1:temp-1]) 80 | j=j+1 81 | } 82 | var.genes=Reduce(intersect, var.genes1) 83 | return(var.genes) 84 | } 85 | 86 | 87 | get_top_hits <- function(cell.NV, pheno, threshold=0.95, filename) { 88 | 89 | type_by_study=table(pheno[,c("Celltype","Study_ID")]) 90 | m<-match(rownames(cell.NV),rownames(type_by_study)) 91 | f.a=!is.na(m) 92 | f.b=m[f.a] 93 | cell.NV=cell.NV[f.a,f.a] 94 | type_by_study=type_by_study[f.b,] 95 | 96 | for(i in 1:dim(type_by_study)[2]){ 97 | filt=type_by_study[,i]!=0 98 | cell.NV[filt,filt]=0 99 | } 100 | 101 | diag(cell.NV)=0 102 | temp=vector() 103 | for(i in 1:dim(cell.NV)[1]){ 104 | temp=c(temp,which.max(cell.NV[i,])) 105 | } 106 | temp=cbind(rownames(cell.NV),temp) 107 | for(i in 1:dim(cell.NV)[1]){ 108 | temp[i,2]=cell.NV[i,as.numeric(temp[i,2])] 109 | } 110 | 111 | recip=temp[duplicated(temp[,2]),] 112 | filt=as.numeric(temp[,2])>=threshold 113 | recip=rbind(recip,temp[filt,]) 114 | recip=cbind(recip,c(rep("Reciprocal_top_hit",each=dim(recip)[1]-sum(filt)),rep(paste("Above",threshold,sep="_"),each=sum(filt)))) 115 | recip=recip[!duplicated(recip[,2]),] 116 | 117 | recip2=cbind(rownames(recip),recip[,1:3]) 118 | colnames(recip2)=c("Celltype_1","Celltype_2","Mean_AUROC","Match_type") 119 | rownames(recip2)=NULL 120 | 
recip=recip2[order(recip2[,3],decreasing=T),] 121 | recip2=as.data.frame(recip) 122 | recip2[,3]=round(as.numeric(as.character(recip2[,3])),2) 123 | write.table(recip,file=filename,sep="\t",quote=F) 124 | return(recip2) 125 | } 126 | 127 | 128 | -------------------------------------------------------------------------------- /Metaneighbor/hm_metaneighbor.r: -------------------------------------------------------------------------------- 1 | 2 | human<-readRDS("./HCL_all_v2_pse20.CPM.rds") 3 | 4 | mouse<-readRDS("./MCA_V2.PSUDOCELL20.rds") 5 | 6 | orth<-read.table("./Human_Mouse_one-one.orth",sep="\t") 7 | #orth<-as.matrix(orth) 8 | mouse<-as.data.frame(mouse) 9 | mouse.orth<-mouse[as.character(orth[,4]),] 10 | human<-as.data.frame(human) 11 | human.orth<-human[as.character(orth[,2]),] 12 | 13 | 14 | data<-cbind(mouse.orth,human.orth) 15 | rownames(data)<-orth[,1] 16 | data[is.na(data)]<-0 17 | 18 | P<-read.table("./MCA_V2.psudocell20.phe",sep="\t",head=T) 19 | P2<-read.table("./HCL_v2.pse20.phe",sep="\t",head=T) 20 | P1<-rbind(P,P2) 21 | colnames(P1)<-c("Sample_ID","Study_ID","Celltype") 22 | data1<-data[,as.character(P1$Sample_ID)] 23 | 24 | 25 | source("2017-08-28-runMN-US.R") 26 | #library(gplots) 27 | #library(RColorBrewer) 28 | 29 | celltypes1 <-unique(as.character(P1$Celltype)) 30 | 31 | 32 | var.genes1=get_variable_genes(data1,P1) 33 | length(var.genes1) 34 | write.table(var.genes1,"var.genes_75.out",sep="\t",quote=F)#####-------- 35 | celltype.NV=run_MetaNeighbor_US(var.genes1,data1,celltypes1,P1) 36 | write.table(celltype.NV,file="celltype.NV_SRS_75.out",sep="\t",quote=F)###--------- 37 | 38 | cols=rev(colorRampPalette(brewer.pal(11,"RdYlBu"))(100)) 39 | breaks=seq(0,1,length=101) 40 | pdf("celltype.NV_SRS_75.pdf") #########-------------------------- 41 | heatmap.2(celltype.NV,trace="none",density.info="none",col=cols,breaks=breaks,cexRow=0.3,cexCol=0.3) 42 | dev.off() 43 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.9,filename="top_hits_SRS_75.out") 
44 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.8,filename="top_hits_SRS_0.8_75.out") 45 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.7,filename="top_hits_SRS_0.7_75.out") 46 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.6,filename="top_hits_SRS_0.6_75.out") 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /PAGA/PAGA_BonemarrowCD34P.py: -------------------------------------------------------------------------------- 1 | load("/home/ggj/HCA/RData/pbmc/old/AdultBoneMarrowCD34P1_Seurat.RData") 2 | setwd("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/") 3 | dim(pbmc@data) 4 | #17364 11781 5 | setwd("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga") 6 | aa<-as.data.frame(as.matrix(pbmc@assays$RNA@counts)) 7 | write.csv(aa[,rownames(anno)],file = "./wt.dge.csv",quote = F) 8 | anno<-FetchData(pbmc,vars = "ident") 9 | write.csv(anno,file = "./wt_embryo.anno.csv",quote = F) 10 | table(pbmc@ident) 11 | 12 | vargene<-pbmc@var.genes 13 | gene<-rownames(pbmc@raw.data) 14 | genefilter<-gene%in%vargene 15 | write.csv(genefilter,file = "./genefilter.csv",quote = F,row.names = F) 16 | 17 | 18 | anno<-FetchData(pbmc,vars="ident") 19 | anno$shuchu<-ifelse(anno$ident%in%c(1,2,6,12,14,18,21),"FALSE","TRUE") 20 | write.csv(anno$shuchu,file="nonimmunecell.csv",quote=F,row.names=F) 21 | 22 | anno$shuchu<-ifelse(anno$ident%in%c(1,2,6,12,14,18,21),"TRUE","FALSE") 23 | write.csv(anno$shuchu,file="immunecell.csv",quote=F,row.names=F) 24 | 25 | #####################################python 26 | import numpy as np 27 | import scanpy.api as sc 28 | import pandas as pd 29 | import os 30 | import pandas as pd 31 | 32 | os.chdir("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/") 33 | adata=sc.read_csv("wt.dge.csv",delimiter=',').transpose() 34 | adata.var_names = pd.read_csv('gene.csv', header=None)[0] 35 | 36 | datause= 
pd.read_table("wt.dge.csv",sep=",",index_col=0) 37 | adata=sc.AnnData(datause.T) 38 | 39 | 40 | mito_genes = [name for name in adata.var_names if name.startswith('mt-')] 41 | #adata[:, mito_genes]=0 42 | 43 | 44 | 45 | adata.obs['cluster']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[0].values 46 | adata.obs['type']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[1].values 47 | adata.obs['batch']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[2].values 48 | adata.obs['cluster1']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[3].values 49 | 50 | 51 | 52 | sc.pp.filter_genes(adata, min_cells=3) 53 | sc.pp.filter_cells(adata, min_genes=0) 54 | adata.obs['n_counts'] = adata.X.sum(axis=1) 55 | 56 | 57 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) 58 | sc.pp.log1p(adata) 59 | adata.raw = adata 60 | 61 | adata.write('./wt1.h5ad', compression='gzip') 62 | 63 | 64 | gene_filter1=pd.read_csv("genefilter.csv") 65 | gene_filter=gene_filter1['x'].values 66 | sc.pl.filter_genes_dispersion(filter_result) 67 | adata = adata[:, gene_filter] 68 | 69 | cc=adata.var_names 70 | cc=cc.values 71 | bb=list(set(cc).intersection(set(gene_filter))) 72 | len(bb) 73 | 74 | 75 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.01, max_mean=15, min_disp=0.4) 76 | import collections 77 | collections.Counter(filter_result.gene_subset) 78 | #Counter({False: 14819, True: 1750}) 79 | sc.pl.filter_genes_dispersion(filter_result) 80 | adata = adata[:, filter_result.gene_subset] 81 | 82 | 83 | sc.pp.regress_out(adata, ['n_counts']) 84 | 85 | ## scale the data 86 | sc.pp.scale(adata, max_value=10) 87 | 88 | 89 | ### PCA 90 | sc.tl.pca(adata, n_comps=50) 91 | sc.pl.pca_loadings(adata) 92 | # visualize 93 | adata.obsm['X_pca'] *= -1 # multiply by -1 to match Seurat 94 | sc.pl.pca_scatter(adata, color='COL1A1') 95 | # PC 96 | sc.pl.pca_variance_ratio(adata, log=True, show=50,n_pcs=50) 97 | ## 25 98 | adata 99 | 100 | 101 | 102 | 103 | 104 | 
sc.pp.neighbors(adata, n_neighbors=10,n_pcs=25) 105 | sc.tl.louvain(adata, resolution=1) 106 | sc.tl.tsne(adata,use_fast_tsne=True,n_jobs=20,perplexity=200,n_pcs=25) 107 | sc.pl.tsne(adata, color='louvain',size=8,legend_loc="on data") 108 | 109 | 110 | adata.obs['type']=adata.obs['type'].astype('category') 111 | 112 | 113 | sc.pl.tsne(adata, color='louvain',size=8) 114 | sc.pl.tsne(adata, color='tissue',size=8,legend_loc="on data") 115 | sc.pl.tsne(adata, color='tissue',size=8) 116 | adata.write('./wt_cluster.h5ad', compression='gzip') 117 | 118 | 119 | 120 | sc.pp.neighbors(adata,n_pcs=25) 121 | sc.tl.umap(adata, min_dist=0.1) 122 | sc.pl.umap(adata, color='type', title='UMAP', legend_loc='on data', legend_fontsize=5) 123 | sc.pl.umap(adata, color='louvain', title='UMAP', legend_loc='on data', legend_fontsize=5) 124 | 125 | 126 | 127 | #sc.tl.paga(adata, groups='type', model='v1.0') 128 | sc.tl.paga(adata, groups='type') 129 | 130 | sc.pl.paga( 131 | adata, 132 | layout='fr', 133 | threshold=0.01, 134 | fontsize=8, 135 | node_size_scale=1, 136 | node_size_power=0.7, 137 | max_edge_width=0.7) 138 | 139 | import matplotlib.pyplot as pl 140 | 141 | sc.tl.paga(adata, groups='type') 142 | 143 | sc.tl.draw_graph(adata,init_pos="paga",layout="fa",maxiter=500) 144 | sc.pl.draw_graph(adata, color='type',title='Force Atlas 2', legend_loc='on data', legend_fontsize=5,palette=sc.pl.palettes.default_20) 145 | #,save="iter1000.pdf") 146 | sc.pl.tsne(adata, color='louvain',size=8,palette=sc.pl.palettes.godsnot_64,legend_loc="on data") 147 | 148 | sc.pl.draw_graph(adata, color='cluster',title='Force Atlas 2', legend_loc='on data', legend_fontsize=5,palette=sc.pl.palettes.default_20,size=8) 149 | 150 | adata.write('./wt_paga.h5ad', compression='gzip') 151 | 152 | 153 | 154 | adata1=adata.copy() 155 | sc.tl.paga(adata1, groups='louvain') 156 | 157 | 158 | 159 | ################################ remove immune cells 160 | adata.obs['donor_tf']= 
pd.read_csv("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/nonimmunecell.csv",sep=",",header=None)[0].values 161 | #adata.obs['cluster_num']= pd.read_csv('/home/ggj/NEW/HCA/SPRING/PBCD34/cluster.csv',sep=",",header=None)[0].values 162 | 163 | cluster = adata[adata.obs["donor_tf"]] 164 | #6118 × 1750 165 | 166 | sc.pp.neighbors(cluster,n_pcs=25) 167 | sc.tl.umap(cluster, min_dist=0.1) 168 | 169 | sc.tl.paga(cluster, groups='type') 170 | sc.pl.paga( 171 | cluster, 172 | layout='fr', 173 | threshold=0.01, 174 | fontsize=8, 175 | node_size_scale=1, 176 | node_size_power=0.7, 177 | max_edge_width=0.7) 178 | 179 | import matplotlib.pyplot as pl 180 | sc.tl.draw_graph(cluster,init_pos="paga",layout="fa",maxiter=500) 181 | sc.pl.draw_graph(cluster, color='type', legend_loc='on data',legend_fontsize=5,palette=sc.pl.palettes.default_26,size=8) 182 | 183 | cluster.write('./wt_nonimmune_paga.h5ad', compression='gzip') 184 | 185 | 186 | ################################ immune cells 187 | adata.obs['donor_tf']= pd.read_csv("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/immunecell.csv",sep=",",header=None)[0].values 188 | immune = adata[adata.obs["donor_tf"]] 189 | #5663 × 1750 190 | 191 | sc.pp.neighbors(immune,n_pcs=25) 192 | sc.tl.umap(immune, min_dist=0.1) 193 | 194 | sc.tl.paga(immune, groups='type') 195 | sc.pl.paga( 196 | immune, 197 | layout='fr', 198 | threshold=0.01, 199 | fontsize=8, 200 | node_size_scale=1, 201 | node_size_power=0.7, 202 | max_edge_width=0.7) 203 | 204 | import matplotlib.pyplot as pl 205 | sc.tl.draw_graph(immune,init_pos="paga",layout="fa",maxiter=500) 206 | sc.pl.draw_graph(immune, color='type', legend_loc='on data',legend_fontsize=5,palette=sc.pl.palettes.default_26,size=8) 207 | immune.write('./wt_immune_paga.h5ad', compression='gzip') 208 | -------------------------------------------------------------------------------- /Pseudocell/FetalStomach1_500more.RData: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Pseudocell/FetalStomach1_500more.RData
--------------------------------------------------------------------------------
/Pseudocell/Pseudocell_Human.r:
--------------------------------------------------------------------------------
# Build "pseudocells": within each cluster, cells are shuffled, grouped into
# consecutive chunks and averaged.
load("/home/jingjingw/Jingjingw/Project/2018-MH-new/Pseudocell/FetalStomach1_500more.RData")
name <- "FetalStomach1"
outfile1 <- "Human_FetalStomach1_pseudocell20.Rds"
outfile2 <- "Human_FetalStomach1_pseudocell20.pheno.out"


# <name>_pbmc holds the expression table, <name>_Anno1 the cell annotation.
Inter <- get(paste(name, "pbmc", sep = "_"))
Inter[Inter < 0] <- 0
idd <- get(paste(name, "Anno1", sep = "_"))
Inter.id <- cbind(rownames(idd), idd$Cluster_id)

rownames(Inter.id) <- rownames(idd)
colnames(Inter.id) <- c("CellID", "Celltype")
Inter.id <- as.data.frame(Inter.id)
# as.data.frame() only produces factor columns on R < 4.0. Convert explicitly
# so that levels() below is never NULL (which made the loop run over 1:0).
Inter.id$Celltype <- factor(Inter.id$Celltype)
# Index by name: a factor used as an index would select by its integer codes.
Inter1 <- Inter[, as.character(Inter.id$CellID)]
Inter <- as.matrix(Inter1)
pseudocell.size <- 20 ## 10 test
cell_types <- levels(Inter.id$Celltype)
new_ids_list <- vector("list", length(cell_types))
for (i in seq_along(cell_types)) {
  cluster_id <- cell_types[i]
  cluster_cells <- rownames(Inter.id[Inter.id$Celltype == cluster_id, ])
  # floor(k / size) for k = 1, 2, ...: chunk 0 receives size - 1 cells and
  # every later full chunk receives `size` cells.
  pseudo_ids <- floor(seq_along(cluster_cells) / pseudocell.size)
  pseudo_ids <- paste0(cluster_id, "_Cell", pseudo_ids)
  # Random assignment of cells to chunks (no seed is set, so runs differ).
  names(pseudo_ids) <- sample(cluster_cells)
  new_ids_list[[i]] <- pseudo_ids
}

new_ids <- unlist(new_ids_list)
new_ids <- as.data.frame(new_ids)
new_ids_length <- table(new_ids)

new_colnames <- rownames(new_ids) ###add
all.data <- Inter[, as.character(new_colnames)] ###add
all.data <- t(all.data)###add

# Mean expression of every gene within each pseudocell.
new.data <- aggregate(list(all.data),
                      list(name = new_ids[, 1]), FUN = mean)
rownames(new.data) <- new.data$name
new.data <- new.data[, -1]

new_ids_length <- as.matrix(new_ids_length)##
# Keep pseudocells built from at least 10 cells. A logical mask is used
# because x[-which(cond), ] drops every row when no row matches cond.
keep <- new_ids_length[, 1] >= 10##
new_good_ids <- new_ids_length[keep, , drop = FALSE]##
result <- t(new.data)[, rownames(new_good_ids), drop = FALSE]
colnames(result) <- paste("Human", colnames(result), sep = "")
rownames(result) <- rownames(Inter)
#saveRDS(result,file=outdir1[i]) ###
saveRDS(result, file = outfile1) ###
# Strip the "_Cell<number>" suffix to recover the cluster label.
cellty <- gsub("_Cell[0-9]+", "", colnames(result))
new.phe <- paste(colnames(result), 'HumanFetal', cellty, sep = "\t")

#write.table(new.phe,file=outdir2[i],quote=F,row.names=F) ###

write.table(new.phe, file = outfile2, quote = FALSE, row.names = FALSE) ###

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HCL
Codes used in Human Cell Landscape(HCL)

--------------------------------------------------------------------------------
/Scenic_R_human.R:
--------------------------------------------------------------------------------
#############
library(Seurat)
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/")
#load your DGE
exprMat <- readRDS("./HCL_v2.pse20.SRS_17000.rds")
# Number of detected genes (count > 0) per column of the expression matrix.
gene <- data.frame(colSums(exprMat > 0))
#load the annotation for each cell
ident <- read.table("./HCL_v2.pse20.SRS_17000.phe", header = TRUE, row.names = 1)
dir.create("SCENIC")
setwd("SCENIC/")
# Join annotation and gene counts on the cell name, then keep two columns.
cellInfo <- merge(ident, gene, by = "row.names", all = TRUE)
cellInfo <- data.frame(cellInfo[, -1], row.names = cellInfo$Row.names)
cellInfo[is.na(cellInfo)] <- 0
cellInfo <- cellInfo[, -1]
colnames(cellInfo) <- c('CellType', 'nGene')
cellInfo$cluster <- gsub("Human", "", cellInfo$CellType)
dim(exprMat)
head(cellInfo)
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
dir.create("int")
saveRDS(cellInfo, file = "./int/cellInfo.Rds")
table(cellInfo$cluster)
# One colour per label 'Human1' ... 'Human102', in that order.
colVars <- list(CellType = setNames(
  c('#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6', '#a30059', '#ffdbe5',
    '#7a4900', '#0000a6', '#63ffac', '#b79762', '#004d43', '#8fb0ff', '#997d87', '#5a0007',
    '#809693', '#feffe6', '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
    '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa', '#d16100', '#ddefff',
    '#000035', '#7b4f4b', '#a1c299', '#300018', '#0aa6d8', '#013349', '#00846f', '#372101',
    '#ffb500', '#c2ffed', '#a079bf', '#cc0744', '#c0b9b2', '#c2ff99', '#001e09', '#00489c',
    '#6f0062', '#0cbd66', '#eec3ff', '#456d75', '#b77b68', '#7a87a1', '#788d66', '#885578',
    '#fad09f', '#ff8a9a', '#d157a0', '#bec459', '#456648', '#0086ed', '#886f4c', '#34362d',
    '#b4a8bd', '#00a6aa', '#452c2c', '#636375', '#a3c8c9', '#ff913f', '#938a81', '#575329',
    '#00fecf', '#b05b6f', '#8cd0ff', '#3b9700', '#04f757', '#c8a1a1', '#1e6e00', '#7900d7',
    '#a77500', '#6367a9', '#a05837', '#6b002c', '#772600', '#d790ff', '#9b9700', '#549e79',
    '#fff69f', '#201625', '#72418f', '#bc23ff', '#99adc0', '#3a2465', '#922329', '#5b4534',
    '#fde8dc', '#404e55', '#0089a3', '#cb7e98', '#a4e804', '#324e72'),
  paste0("Human", 1:102)))
# Keep only the labels that actually occur in the annotation.
colVars$CellType <- colVars$CellType[intersect(names(colVars$CellType), cellInfo$CellType)]
saveRDS(colVars, file = "./int/colVars.Rds")
plot.new(); legend(0, 1, fill = colVars$CellType, legend = names(colVars$CellType))
library(SCENIC)
org <- "hgnc" # or mgi, or dmel
dbDir <- "/home/ggj/Rdata/201906/Human/databases/" # RcisTarget databases location
myDatasetTitle <- "SCENIC example on Human1.1" # choose a name for your analysis
dbs <- c("hg19-500bp-upstream-10species.mc9nr.feather", "hg19-tss-centered-5kb-10species.mc9nr.feather")
names(dbs) <- c("500bp", "5kb")
scenicOptions <- initializeScenic(org = org, dbDir = dbDir, dbs = dbs, datasetTitle = myDatasetTitle, nCores = 10)

setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
# scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
scenicOptions@inputDatasetInfo$colVars <- "int/colVars.Rds"
saveRDS(scenicOptions, file = "./int/scenicOptions.Rds")



###
### Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
runCorrelation(exprMat_filtered, scenicOptions)
##1 load AUCell matrix from PyScenic
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data")
regulonAUC <- importAUCfromText("./aucell.csv")
regulonAUC
dim(regulonAUC)
# The working directory is example_data at this point, but the int/ folder was
# created inside SCENIC/ and the later steps run from SCENIC/, so the file has
# to be written to SCENIC/int/.
saveRDS(regulonAUC, file = "./SCENIC/int/3.4_regulonAUC.Rds")

##2 run SCENIC tSNE
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
nPcs <- c(50)
scenicOptions@settings$seed <- 123 # same seed for all of them
# Run t-SNE with different settings:
fileNames <- tsneAUC(scenicOptions, aucType = "AUC", nPcs = nPcs, perpl = c(5, 15, 50))
fileNames <- tsneAUC(scenicOptions, aucType = "AUC", nPcs = nPcs, perpl = c(5, 15, 50), onlyHighConf = TRUE, filePrefix = "int/tSNE_oHC")
# Plot as pdf (individual files in int/):
fileNames <- paste0("int/", grep(".Rds", grep("tSNE_", list.files("int"), value = TRUE), value = TRUE))
par(mfrow = c(length(nPcs), 3))
fileNames <- paste0("int/", grep(".Rds", grep("tSNE_AUC", list.files("int"), value = TRUE, perl = TRUE), value = TRUE))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend = FALSE, cex = .5)
# Using only "high-confidence" regulons (normally similar)
scenicOptions@settings$defaultTsne$aucType <- "AUC"
scenicOptions@settings$defaultTsne$dims <- 50
scenicOptions@settings$defaultTsne$perpl <- 50
saveRDS(scenicOptions, file = "int/scenicOptions.Rds")
scenicOptions <- readRDS("./int/scenicOptions.Rds")
# Better if it is logged/normalized
aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat) # default t-SNE
savedSelections <- shiny::runApp(aucellApp)
# Save the modified thresholds:
newThresholds <- savedSelections$thresholds
# check.names = FALSE keeps the column headers exactly as written in the file.
# With the default, "-" and "(+)" are turned into dots and the old unanchored
# pattern "\\..." then rewrote the first dot of names such as "NKX2.1...".
auc <- read.csv("/home/ggj/github/HCL/HCL/Scenic_R/example_data/aucell.csv", row.names = 1, check.names = FALSE)
usethrethold <- apply(auc, 2, summary)
# Only a trailing "..." (a mangled "(+)") is restored; other names are untouched.
colnames(usethrethold) <- sub("\\.\\.\\.$", "(+)", colnames(usethrethold))
dim(usethrethold)
# Row 6 of summary() is the maximum: threshold = half of each column's maximum.
A0.5 <- 0.5 * usethrethold[6, ]
names(A0.5)
auc[1:5, 1:5]
# Names present on one side only (both results should be empty).
setdiff(colnames(usethrethold), rownames(regulonAUC))
setdiff(rownames(regulonAUC), colnames(usethrethold))
#runSCENIC aucell binarize
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
#0.5*max
newThresholds <- A0.5
scenicOptions@fileNames$int["aucell_thresholds", 1] <- "int/newThresholds.Rds"
saveRDS(newThresholds, file = getIntName(scenicOptions, "aucell_thresholds"))
saveRDS(scenicOptions, file = "int/scenicOptions.Rds")
runSCENIC_4_aucell_binarize(scenicOptions)

--------------------------------------------------------------------------------
/Seurat-example/FetalThymus2_dge.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Seurat-example/FetalThymus2_dge.txt.gz
--------------------------------------------------------------------------------
/Seurat-example/Tissue_seurat.R:
--------------------------------------------------------------------------------
setwd("./FetalThymus2")

## Reading files(dges)
a <- read.table("./FetalThymus2_dge.txt.gz", row.names = 1, header = TRUE)
FetalThymus2 <- a

name <- "FetalThymus2"
###### build dge below 500 UMI and dge more than 500 UMI
FetalThymus2_500less <- FetalThymus2[, colSums(FetalThymus2) < 500 & colSums(FetalThymus2) > 100]
FetalThymus2_500more <- FetalThymus2[, colSums(FetalThymus2) >= 500]

# Annotation table: one row per cell that passed the 500-UMI cut.
FetalThymus2_Anno <- data.frame(
  Cell_barcode = colnames(FetalThymus2_500more),
  Sample = replicate(ncol(FetalThymus2_500more), "FetalThymus"),
  Batch = replicate(ncol(FetalThymus2_500more), "FetalThymus2")
)
# Cell ids become "FetalThymus_2.<barcode>".
colnames(FetalThymus2_500more) <- paste("2", colnames(FetalThymus2_500more), sep = ".")
colnames(FetalThymus2_500more) <- paste("FetalThymus", colnames(FetalThymus2_500more), sep = "_")
FetalThymus2_Anno[, "Cell_id"] <- colnames(FetalThymus2_500more)
FetalThymus2_Anno[, "Cluster_id"] <- replicate(ncol(FetalThymus2_500more), "1")
FetalThymus2_Anno$Ages <- "10W"
FetalThymus2_Anno$Development_stage <- "Fetus"
FetalThymus2_Anno$Method <- rep("Microwell-seq")
FetalThymus2_Anno$Gender <- "Male"
FetalThymus2_Anno$Source <- rep("HCL")
FetalThymus2_Anno$Biomaterial <- rep("FetalThymus")
FetalThymus2_Anno$Name <- rep("FetalThymus2_10W")



## make background
name <- "FetalThymus2"
name_background <- paste(name, "background", sep = "_")
name_500more <- paste(name, "500more", sep = "_")
name_500less <- paste(name, "500less", sep = "_")

## check the data condition
par(mfrow = c(2, 1))
hist(colSums(FetalThymus2_500more), breaks = 200)
hist(colSums(FetalThymus2_500more > 0), breaks = 200)
abline(v = 300)
summary(colSums(FetalThymus2_500more > 0))
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#71.0 386.0 440.0 473.8 530.0 1561.0

library(Seurat)
seqwell <- CreateSeuratObject(
  raw.data = Matrix(as.matrix(FetalThymus2_500more), sparse = TRUE),
  min.cells = 3, min.genes = 300, names.delim = "\\."
) ##no normarlize

dim(seqwell@data)
#19211 9801
# Fraction of each cell's raw counts that comes from "MT-" genes.
mito.genes <- grep(pattern = "^MT-", x = rownames(x = seqwell@data), value = TRUE)
percent.mito <- colSums(seqwell@raw.data[mito.genes, ]) / colSums(seqwell@raw.data)
seqwell <- AddMetaData(object = seqwell, metadata = percent.mito, col.name = "percent.mito")
VlnPlot(object = seqwell, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
par(mfrow = c(1, 2))
GenePlot(object = seqwell, gene1 = "nUMI", gene2 = "percent.mito")
GenePlot(object = seqwell, gene1 = "nUMI", gene2 = "nGene")
seqwell <- FilterCells(
  object = seqwell, subset.names = c("nGene", "percent.mito"),
  low.thresholds = c(300, -Inf), high.thresholds = c(2500, 0.2)
)
seqwell <- NormalizeData(
  object = seqwell, normalization.method = "LogNormalize",
  scale.factor = 10000
)
seqwell <- ScaleData(seqwell, vars.to.regress = c("nUMI", "percent.mito"), do.par = TRUE, num.cores = 8)
par(mfrow = c(1, 1))
seqwell <- FindVariableGenes(
  object = seqwell, mean.function = ExpMean, dispersion.function = LogVMR,
  x.low.cutoff = 0.01,
  x.high.cutoff = 6, y.cutoff = 0.5
)
length(seqwell@var.genes)# 1893
#hv.genes <- head(rownames(seqwell@hvg.info), 2000)

pbmc <- seqwell
rm(seqwell)

# Drop variable genes whose names match the RPS / RPL / MT patterns.
# NOTE(review): these patterns are unanchored, so any gene name containing
# "MT" (not only "MT-" genes) is removed - confirm this is intended.
var.gene <- pbmc@var.genes
var.gene <- var.gene[!grepl(pattern = "*RPS", x = var.gene)]
var.gene <- var.gene[!grepl(pattern = "*RPL", x = var.gene)]
var.gene <- var.gene[!grepl(pattern = "*MT", x = var.gene)]
length(var.gene)
#2076

# Perform linear dimensional reduction
pbmc <- RunPCA(
  object = pbmc, pc.genes = var.gene, pcs.compute = 50, do.print = TRUE,
  pcs.print = 1:5, genes.print = 5
)
# Determine statistically significant principal components
pbmc <- JackStraw(object = pbmc, num.replicate = 100, num.pc = 40, num.cores = 8, do.par = TRUE)
# JackStrawPlot compares the p-value distribution of each PC with a uniform
# distribution (dashed line); "significant" PCs show an enrichment of genes
# with low p-values (solid curve above the dashed line).
JackStrawPlot(object = pbmc, PCs = 1:40)#25
# A more ad hoc alternative: plot the standard deviations of the principal
# components with PCElbowPlot and cut where the curve shows a clear elbow.
PCElbowPlot(object = pbmc, num.pc = 50)#14
# Heatmaps of PCs 1-15, 16-30 and 31-50, each drawn from 500 cells.
PCHeatmap(
  object = pbmc, pc.use = 1:15, cells.use = 500, do.balanced = TRUE,
  label.columns = FALSE, use.full = FALSE
)
PCHeatmap(
  object = pbmc, pc.use = 16:30, cells.use = 500, do.balanced = TRUE,
  label.columns = FALSE, use.full = FALSE
)
PCHeatmap(
  object = pbmc, pc.use = 31:50, cells.use = 500, do.balanced = TRUE,
  label.columns = FALSE, use.full = FALSE
)


# Run Non-linear dimensional reduction (tSNE)
# tSNE is used here for visualisation only: clustering is done on the PCs, and
# cells of one graph-based cluster are expected to co-localise on the tSNE
# plot. The tSNE is computed from PCs as well; Seurat also supports computing
# it from scaled gene expression through the genes.use argument.
pbmc <- RunTSNE(object = pbmc, dims.use = 1:20, do.fast = TRUE)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 1, label.size = 5)

# Graph-based clustering on PCs 1-15 at several resolutions; each resolution
# is stored in its own "res.<value>" metadata column.
pbmc <- FindClusters(
  object = pbmc, reduction.type = "pca", dims.use = 1:15, save.SNN = TRUE,
  resolution = c(0.6, 0.8, 1, 1.4, 2, 2.5, 4), force.recalc = TRUE, k.param = 15
)

# FIt-SNE embedding on PCs 1-12, stored under the reduction name "FItSNE".
pbmc <- RunTSNE(
  object = pbmc, reduction.use = "pca", dims.use = 1:12, tsne.method = "FIt-SNE",
  nthreads = 8, reduction.name = "FItSNE", reduction.key = "FItSNE_",
  fast_tsne_path = "/home/ggj/Documents/tools/FIt-SNE-master/bin/fast_tsne",
  max_iter = 2000, perplexity = 100
)
# One plot per clustering resolution (trailing numbers are the author's notes).
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.0.6", do.label = TRUE
) + ggtitle("res.0.6") #17
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.0.8", do.label = TRUE
) + ggtitle("res.0.8")#17
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.1", do.label = TRUE
) + ggtitle("res.1")#19
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.1.4", do.label = TRUE
) + ggtitle("res.1.4") #19
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.2", do.label = TRUE
) + ggtitle("res.2") #23
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.2.5", do.label = TRUE
) + ggtitle("res.2.5")#27
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, group.by = "res.4", do.label = TRUE
) + ggtitle("res.4") #32
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE, do.return = TRUE,
  pt.size = 1, do.label = TRUE
)




# Compare clusters 0 and 1 at resolution 0.6.
pbmc <- SetAllIdent(pbmc, id = "res.0.6")
aa <- FindMarkers(pbmc, 0, 1)# merge

# Compare clusters 2 and 1 at resolution 0.8.
pbmc <- SetAllIdent(pbmc, id = "res.0.8")
aa <- FindMarkers(pbmc, 2, 1) # merge


# Continue with the resolution-1.4 clustering and relabel its clusters.
pbmc <- SetAllIdent(pbmc, id = "res.1.4")

current.cluster.ids <- 0:13
# Identity mapping; it is immediately replaced by the mapping on the next line.
new.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
# Clusters 0-8 are merged into cluster 1; clusters 9-13 become 2-6.
new.cluster.ids <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6)
pbmc@ident <- plyr::mapvalues(
  pbmc@ident,
  from = current.cluster.ids,
  to = new.cluster.ids
)
table(pbmc@ident)
DimPlot(
  object = pbmc, reduction.use = "FItSNE", no.legend = FALSE,
  do.return = TRUE, pt.size = 1, do.label = TRUE
)


save.image("./FetalThymus2.RData")

# Positive markers for every cluster, sorted by cluster, then decreasing
# average log fold change, then adjusted p-value.
pbmc.markers <- FindAllMarkers(
  pbmc,
  only.pos = TRUE,
  thresh.use = 0.25,
  min.pct = 0.15
)
pbmc.markers <- pbmc.markers[
  with(pbmc.markers, order(cluster, -avg_logFC, p_val_adj)),
]
table(pbmc.markers$cluster)
library(gdata)
WriteXLS::WriteXLS(pbmc.markers, "./markers.xlsx")

# Heatmap of the 20 markers with the highest avg_logFC in each cluster.
library(dplyr)
top20 <- pbmc.markers %>%
  group_by(cluster) %>%
  top_n(20, avg_logFC)
DoHeatmap(
  pbmc,
  genes.use = top20$gene, slim.col.label = TRUE, remove.key = TRUE,
  cex.row = 3, group.cex = 5
)
save.image("./FetalThymus2.RData")
save(pbmc, pbmc.markers, FetalThymus2_Anno, file = "./FetalThymus2_pbmc.RData")

# --------------------------------------------------------------------------------
# /scHCL-build-reference/.Rhistory:
# --------------------------------------------------------------------------------
load("~/Rdata/201806/Agingmouse/OneYearThymus/OneYearThymus.RData")
FeaturePlot(pbmc,c("Cd4"),reduction.use = "FItSNE")
FeaturePlot(pbmc,c("Cd4","Cd8a1"),reduction.use = "FItSNE")
FeaturePlot(pbmc,c("Cd4","Cd8a"),reduction.use = "FItSNE")
FeaturePlot(pbmc,c("Cd4","Cd8a","Cd8b1"),reduction.use = "FItSNE")

# --------------------------------------------------------------------------------
# /scHCL-build-reference/1-build-reference.R:
# --------------------------------------------------------------------------------
library(Seurat)
library(dplyr)
library(Matrix)

# get_tissue_arv
# Build pseudo-cell expression profiles for one cluster.
#
# The raw counts of the cluster's cells are scaled to 100,000 counts per cell.
# Pseudo-cells are then formed as row means over sampled cells:
#   * cell_counts >= 300       : floor(n_cells / 100) pseudo-cells, each the
#                                mean of 100 cells drawn without replacement;
#   * 100 <= cell_counts < 300 : 3 pseudo-cells, each the mean of 100 cells
#                                drawn with replacement;
#   * cell_counts < 100        : 3 pseudo-cells, each the mean of all cells.
# Three pseudo-cells are picked at random and rounded down with floor().
#
# Args:
#   cell_counts: number of cells in the cluster; selects the sampling scheme.
#   subcount:    cluster label, matched against the values of `object@ident`.
#   object:      object exposing the @data, @raw.data and @ident slots.
#                Defaults to the global `pbmc`, which is what this function
#                always read before the parameter existed.
#
# Returns:
#   A list of three elements:
#     [[1]] one-column matrix: per-gene mean of the three pseudo-cells;
#     [[2]] genes x 3 matrix of the floored pseudo-cells (columns "X<k>");
#     [[3]] number of genes whose mean in [[1]] is greater than zero.
get_tissue_arv <- function(cell_counts, subcount, object = pbmc) {
  in_cluster <- which(as.character(object@ident) == as.character(subcount))
  # Index the column names instead of subsetting the matrix: a one-cell
  # cluster would otherwise be dropped to a vector and lose its name.
  cell_names <- colnames(object@data)[in_cluster]
  if (length(cell_names) == 0) {
    stop("no cells found for cluster '", subcount, "'", call. = FALSE)
  }

  counts <- as.matrix(object@raw.data[, cell_names, drop = FALSE])
  scaled <- t(t(counts) / colSums(counts)) * 100000
  n_cells <- ncol(scaled)

  if (cell_counts >= 300) {
    n_pseudo <- floor(n_cells / 100)
    draw_size <- 100
    with_replacement <- FALSE
  } else if (cell_counts >= 100) {
    n_pseudo <- 3
    draw_size <- 100
    with_replacement <- TRUE
  } else {
    n_pseudo <- 3
    draw_size <- n_cells
    with_replacement <- FALSE
  }
  if (n_pseudo < 3) {
    stop(
      "cluster '", subcount, "' yields fewer than 3 pseudo-cells; ",
      "cell_counts does not match the number of cells found",
      call. = FALSE
    )
  }

  pseudo_cells <- matrix(
    NA_real_,
    nrow = nrow(scaled), ncol = n_pseudo,
    dimnames = list(rownames(scaled), paste0("X", seq_len(n_pseudo)))
  )
  for (k in seq_len(n_pseudo)) {
    picked <- sample.int(n_cells, draw_size, replace = with_replacement)
    pseudo_cells[, k] <- rowMeans(scaled[, picked, drop = FALSE])
  }

  kept <- floor(pseudo_cells[, sample.int(n_pseudo, 3), drop = FALSE])
  cluster_mean <- matrix(
    apply(kept, 1, mean),
    ncol = 1,
    dimnames = list(rownames(kept), NULL)
  )
  list(cluster_mean, kept, as.numeric(sum(cluster_mean > 0)))
}

# get pseudo-cells of 100-cell data
# Run get_tissue_arv() on every cluster of `pbmc` and bind the results.
#
# Args:
#   pbmc: object exposing the @data, @raw.data and @ident slots.
#
# Returns:
#   A list of three elements:
#     [[1]] genes x (3 * clusters) matrix of pseudo-cells; columns are named
#           "<i>_X<k>", where <i> is the position of the cluster in
#           table(pbmc@ident);
#     [[2]] genes x clusters matrix of per-cluster means; columns are named by
#           the same position <i>;
#     [[3]] numeric vector: genes with a positive mean, one entry per cluster.
get_tissue_sample_data <- function(pbmc) {
  cluster_sizes <- table(pbmc@ident)
  cluster_labels <- names(cluster_sizes)
  n_clusters <- length(cluster_sizes)
  if (n_clusters == 0) {
    stop("`pbmc@ident` contains no clusters", call. = FALSE)
  }

  pseudo_cells <- vector("list", n_clusters)
  cluster_means <- vector("list", n_clusters)
  tissue_sumgene <- numeric(n_clusters)
  for (i in seq_len(n_clusters)) {
    # Pass the cluster's own label, not its position, so that labels that do
    # not run 1..n (e.g. 0-based ones) select the cells that were counted.
    xx <- get_tissue_arv(
      as.numeric(cluster_sizes[i]),
      cluster_labels[i],
      object = pbmc
    )
    colnames(xx[[2]]) <- paste0(i, "_", colnames(xx[[2]]))
    cluster_means[[i]] <- xx[[1]]
    pseudo_cells[[i]] <- xx[[2]]
    tissue_sumgene[i] <- xx[[3]]
  }

  tissue_sample <- do.call(cbind, cluster_means)
  tissue_data <- do.call(cbind, pseudo_cells)
  colnames(tissue_sample) <- seq_len(n_clusters)
  rownames(tissue_sample) <- rownames(tissue_data)
  list(tissue_data, tissue_sample, tissue_sumgene)
}

#### The Seurat object including your clustering information is saved in the
#### RData file; set the directory holding the RData files as the working directory.
# How to build a Seurat object?
# Please read the instructions of the Seurat R package.
setwd("/media/ggj/SHYbeifen/HCLREFuse.RData/")
tissuedata <- list.files(pattern = "*.RData")
tissuenames <- reshape2::colsplit(
  tissuedata,
  pattern = "_pbmc.RData",
  names = c("tissue", "c")
)$tissue
total_tissue_gene <- data.frame()
total_tissue_data <- data.frame()

# Load every tissue file and accumulate its outputs across tissues.
# xx[[1]], xx[[2]] and xx[[3]] are the three elements returned by
# get_tissue_sample_data(): pseudo-cell columns, per-cluster mean columns and
# the per-cluster count of expressed genes.
for (i in seq_along(tissuenames)) {
  message(paste0("Loading ", tissuedata[i]))
  load(tissuedata[i])
  message("Finish Loading")
  xx <- get_tissue_sample_data(pbmc)
  colnames(xx[[1]]) <- paste0(tissuenames[i], "_", colnames(xx[[1]]))
  colnames(xx[[2]]) <- paste0(tissuenames[i], "_", colnames(xx[[2]]))
  message("Staring mearge")
  if (i == 1) {
    total_tissue_gene <- xx[[1]]
    total_tissue_data <- xx[[2]]
    tissue_sumgene <- xx[[3]]
  } else {
    # Outer join on gene names; genes absent from one side are filled with 0.
    total_tissue_gene <- merge(
      total_tissue_gene, xx[[1]],
      by = "row.names", all = TRUE, sort = TRUE
    )
    rownames(total_tissue_gene) <- total_tissue_gene[, 1]
    total_tissue_gene <- total_tissue_gene[, -1]
    total_tissue_gene[is.na(total_tissue_gene)] <- 0

    total_tissue_data <- merge(
      total_tissue_data, xx[[2]],
      by = "row.names", all = TRUE, sort = TRUE
    )
    rownames(total_tissue_data) <- total_tissue_data[, 1]
    total_tissue_data <- total_tissue_data[, -1]
    total_tissue_data[is.na(total_tissue_data)] <- 0

    tissue_sumgene <- c(tissue_sumgene, xx[[3]])
  }
  rm(pbmc)
  message(paste0("Finish ", tissuenames[i]))
}

# One entry per cluster, labelled with the matching per-cluster column name.
# (The original used colnames(tissuedata), which is NULL for a character
# vector, so the rows were left unlabelled.)
tissue_sumgene <- data.frame(
  tissue_sumgene,
  row.names = colnames(total_tissue_data)
)
hist(tissue_sumgene$tissue_sumgene, breaks = 100, main = "average", xlab = "ngene")
hist(colSums(total_tissue_gene > 0), breaks = 100, main = "sample", xlab = "ngene")
summary(tissue_sumgene$tissue_sumgene)
summary(colSums(total_tissue_gene > 0))

#############################
# Label every pseudo-cell column with the cluster column it belongs to: each
# cluster name is repeated once per pseudo-cell column starting with
# "<cluster name>_".
all_tissue_gene <- colnames(total_tissue_gene)
subs_group <- colnames(total_tissue_data)
names_id <- unlist(lapply(subs_group, function(group) {
  id <- grep(pattern = paste0("^", group, "_"), all_tissue_gene)
  rep(group, length(id))
}))
allclusterID <- data.frame(colnames(total_tissue_gene), names_id)
rownames(allclusterID) <- allclusterID$colnames.total_tissue_gene.

### run Seurat for the next step to calculate the differential gene test
library(Seurat)
library(dplyr)
library(Matrix)
pbmc <- CreateSeuratObject(
  raw.data = total_tissue_gene, min.cells = 3, min.genes = 20,
  project = "nolog"
)
pbmc <- AddMetaData(object = pbmc, metadata = allclusterID)
pbmc <- SetAllIdent(pbmc, id = "names_id")
pbmc@ident
pbmc <- NormalizeData(
  object = pbmc, normalization.method = "LogNormalize",
  scale.factor = 100000
)
idnetname <- rownames(table(pbmc@ident))
save(
  idnetname, pbmc,
  file = "/media/ggj/SHYbeifen/HCLREFuse.RData/ref/Marker/human_ref_pbmc.RData"
)
pbmc.markers <- FindAllMarkers(
  object = pbmc, only.pos = TRUE, min.pct = 0.15,
  thresh.use = 0.15
)
pbmc.markers <- pbmc.markers[
  order(pbmc.markers$cluster, -pbmc.markers$avg_logFC, pbmc.markers$p_val),
]

# get feature genes: choose the top 20 genes per cluster, without duplicates
library(dplyr)
top20 <- pbmc.markers %>%
  group_by(cluster) %>%
  top_n(20, avg_logFC)
top20 <- top20$gene
top20 <- top20[!duplicated(top20)]
# NOTE(review): `reference` is not created anywhere in this script; it must
# already exist in the workspace. Confirm which expression matrix is intended.
ref_exp <- reference[top20, ]
save.image("/home/ggj/Rdata/201810/NewReference/ref-exp.RData")

# --------------------------------------------------------------------------------
# /scHCL-build-reference/2-scHCLuse.R:
# --------------------------------------------------------------------------------
# load reference
load("/home/ggj/Rdata/201810/NewReference/ref-exp.RData")
# start scHCL
# load the matrix of new cells: rows are gene names, columns are cell names
Test_dge <- read.table("./_dge_sample.csv", sep = ",", header = TRUE, row.names = 1)
# log-normalize the uploaded dge
Test_dge <- as.matrix(t(t(Test_dge) / colSums(Test_dge)) * 100000)
Test_dge <- log(Test_dge + 1)

# The log-transformed reference must exist before the test matrix is built,
# because the test matrix takes its rows from it.
ref <- log(ref_exp + 1)

# Test expression restricted to the reference genes; genes missing from the
# uploaded matrix (and any NA values) are set to 0.
tst <- matrix(
  0,
  nrow = nrow(ref), ncol = ncol(Test_dge),
  dimnames = list(rownames(ref), colnames(Test_dge))
)
shared_genes <- intersect(rownames(ref), rownames(Test_dge))
tst[shared_genes, ] <- Test_dge[shared_genes, , drop = FALSE]
tst[is.na(tst)] <- 0

# Correlate every reference column with every test cell and keep, for each
# cell, the best-matching reference column and its correlation.
cors <- cor(ref, tst)
cors[is.na(cors)] <- 0
cor1 <- cors
cor1m <- apply(cor1, 2, max)
cor1S <- apply(cors, 2, function(x) rownames(cors)[which.max(x)])
cor1r <- cbind(cor1S, cor1m)
# scHCL results are in the scHCL data frame
scHCL <- data.frame(cor1r)
colnames(scHCL) <- c("scHCL_result", "cors_log")

# --------------------------------------------------------------------------------
# /scHCL-build-reference/HCLREFuse.RData/example1.RData:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/scHCL-build-reference/HCLREFuse.RData/example1.RData
# --------------------------------------------------------------------------------