├── Batch-pollution-remove
    ├── Test_dge.txt.gz
    └── batch-pollution-remove.R
├── Circos
    ├── Main_circos.r
    ├── celltype.NV_SRS_75.out.result
    └── color.list
├── Early-processing
    ├── Drop-seq_tools.zip
    ├── STAR-2.5.2a.zip
    ├── bbmap.zip
    ├── pipeline.txt
    └── sccpipe.zip
├── Gene-regulon-network
    ├── 1-1_SCENIC_AdultHuman_main.py
    ├── 1-2_JSD_RSS.r
    └── 1-3_CSI.r
├── HCL_Fig1_script-1.py
├── HCL_Fig1_script_730new.py.txt
├── Metaneighbor
    ├── 2017-08-28-runMN-US.R
    └── hm_metaneighbor.r
├── PAGA
    └── PAGA_BonemarrowCD34P.py
├── Pseudocell
    ├── FetalStomach1_500more.RData
    └── Pseudocell_Human.r
├── README.md
├── Scenic_R_human.R
├── Seurat-example
    ├── FetalThymus2_dge.txt.gz
    └── Tissue_seurat.R
└── scHCL-build-reference
    ├── .Rhistory
    ├── 1-build-reference.R
    ├── 2-scHCLuse.R
    └── HCLREFuse.RData
        └── example1.RData


/Batch-pollution-remove/Test_dge.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Batch-pollution-remove/Test_dge.txt.gz


--------------------------------------------------------------------------------
/Batch-pollution-remove/batch-pollution-remove.R:
--------------------------------------------------------------------------------
  1 | #This script is used to remove polluted genes in DGE.
  2 | ################1.load Test DGE and select cells with 500 or more UMIs
  3 | setwd("/home/ggj/Rdata/JQ-Test/Data/Test/")
  4 | temp <- list.files(pattern="*txt.gz")
  5 | name <- character()
  6 | for(i in seq_along(temp)){  # seq_along: the body is skipped when no *txt.gz file is found
  7 |   message("loading DGE")
  8 |   name[i] <- unlist(strsplit(temp[i],"_dge"))[1]  # sample name = file name up to "_dge"
  9 |   tmpvalue<-read.table(temp[i],header=TRUE,row.names=1)
 10 |   assign(name[i],tmpvalue)
 11 |   message(paste(name[i],"is loaded"))
 12 | }
 13 | for(i in seq_along(temp)) {  # prefix every column (cell) name with its sample name
 14 |   dge<-get(name[i])
 15 |   colnames( dge ) <- paste0(as.character(name[i]),".",colnames(dge))
 16 |   assign(name[i], dge)
 17 | }
 18 | 
 19 | name_500less <- name
 20 | name_500less <- paste(name_500less,"500less", sep="_")
 21 | 
 22 | for(i in seq_along(name)){
 23 |   dge <- get(name[i]);
 24 |   temp <- dge[,colSums(dge)<500 & colSums(dge)> 50, drop=FALSE]  # cells with 50 < column sum < 500; drop=FALSE keeps a table even if one cell passes
 25 |   assign(name_500less[i], temp)
 26 | }
 27 | 
 28 | ### build dge with 500 or more UMI
 29 | name_500more <- name
 30 | name_500more <- paste(name_500more,"500more", sep="_")
 31 | 
 32 | for(i in seq_along(name_500more)){
 33 |   dge <- get(name[i]);
 34 |   temp <- dge[,colSums(dge)>=500, drop=FALSE]  # drop=FALSE: a single passing cell must not collapse to a vector
 35 |   assign(name_500more[i], temp)
 36 |   message(paste(name[i],"is done"))
 37 | }
 38 | ################2.rmbatch in manual work
 39 | name<-"Test"
 40 | anno<-data.frame(matrix(unlist(strsplit(colnames(Test_500more),"\\.")),ncol = 2,byrow = T)[,2] )
 41 | colnames(anno)<-"Cell_barcode"
 42 | anno$Sample<-"Test"
 43 | anno$Batch<-"Test"
 44 | anno$Cell_id<-colnames(Test_500more)
 45 | anno$Cluster_id<-"1"
 46 | anno$Ages<-"age"
 47 | anno$Development_stage<-"Adult"
 48 | anno$Method<-"Microwell-seq"
 49 | anno$Gender<-"Female"
 50 | anno$Source<-"Test"
 51 | anno$Biomaterial<-"Test"
 52 | anno$Name<-"Test"
 53 | head(anno)
 54 | dim(anno)##k the annotations
 55 | # 2835   12
 56 | 
 57 | #raw<-get(name)
 58 | more500<-get(paste(name,"500more",sep = "_"))
 59 | less<-get(paste(name,"500less",sep = "_"))
 60 | raw<-merge(less,more500,by="row.names",all=T)
 61 | raw<-data.frame(raw[,-1],row.names = raw$Row.names)
 62 | raw[is.na(raw)]<-0
 63 | ## determine the batch cells
 64 | par(mfrow=c(1,1))
 65 | hist(colSums(less),breaks = 2000,xlim = c(0,1000))
 66 | allumi<-data.frame(umi=colSums(less))
 67 | ssa<-allumi[with(allumi,order(umi,decreasing = F)),]## check the order
 68 | ssa[1:500]   ### check the UMI:50-500, use 500 cells
 69 | abline(v= 276 )
 70 | rm(ssa)
 71 | ss<-head(rownames(allumi)[with(allumi,order(umi,decreasing = FALSE))], 500)  # up to 500 lowest-UMI cells; head() avoids NA names when fewer than 500 exist
 72 | less<-less[,ss]
 73 | 
 74 | 
 75 | ## narrow down the gene
 76 | aa<-data.frame(gene=rowSums(less))
 77 | table(aa$gene>10)
 78 | usegene<-rownames(aa)[aa$gene>10]
 79 | more500<-more500[usegene,]
 80 | less<-less[usegene,]
 81 | raw<-raw[usegene,]
 82 | background <- data.frame(var=replicate(1,n = nrow(more500)),
 83 |                          cellnum_express =rowSums(more500>0),
 84 |                          rowMean_500more =rowMeans(more500),
 85 |                          row.names = rownames(more500)
 86 |                          ,rowMeans_all=rowMeans(raw)
 87 | )
 88 | temp <- merge(background,data.frame(rowMean_less =rowMeans(less)),all.x=F, by="row.names")
 89 | background <- data.frame(temp[,-1],row.names = temp[,1])
 90 | for (m in rownames(background)){
 91 |   background[m,"var"] <- var(as.numeric(more500[m,]))
 92 |   background[m,"sd"] <- sqrt(background[m,"var"])
 93 | }
 94 | 
 95 | background <- background[with(background,order(-rowMean_less,-rowMean_500more,-cellnum_express, -sd)),]
 96 | background$multi<-background$rowMean_less*background$sd
 97 | background<-background[background$multi>=1  ,]
 98 | 
 99 | #background<-background[!grepl(x=rownames(background),pattern = "*MT-"),]
100 | #background<-background[!grepl(x=rownames(background),pattern = "*RPS"),]
101 | #background<-background[!grepl(x=rownames(background),pattern = "*RPL"),]
102 | 
103 | summary(background$rowMean_500more/background$rowMean_less)
104 | summary(background$rowMeans_all/background$rowMean_less)
105 | plot(density(summary(colSums(less["HBB",]))))
106 | 
107 | 
108 | 
109 | ## determine the coefficient and select the med between 2-5
110 | med<-median(background$rowMeans_all/background$rowMean_less)
111 | med
112 | med<-median(background$rowMean_500more/background$rowMean_less)
113 | med
114 | # 2.761137
115 | 
116 | 
117 | 
118 | background[,"batchValue"] <- background[,"rowMean_less"]*med#  value to delete
119 | background$batchValue <- round(background$batchValue) 
120 | background <- background[background$batchValue>0,]
121 | 
122 | 
123 | dge_m<-get(paste(name,"500more",sep = "_"))
124 | m <- dge_m
125 | for (i in rownames(background)) { m[i,] <- m[i,]-background[i,"batchValue"] }
126 | sum(dge_m)# 7201740
127 | sum(m)#4157436
128 | m[m<0] <- 0  # 
129 | sum(m)#5963150
130 | (sum(dge_m)-sum(m))/sum(dge_m)
131 | #0.1719848
132 | rowSums(dge_m["HBB",]> 0)
133 | rowSums(m["HBB",]>0)
134 | 
135 | par(mfrow=c(2,1))
136 | plot(density(summary(colSums(dge_m["HBB",]))))
137 | plot(density(summary(colSums(m["HBB",]))))
138 | 
139 | Test_500less<-Test_500less
140 | Test_500more<-Test_500more
141 | Test_Anno<-anno
142 | Test_rm.batch <- m
143 | Test_background <- background
144 | Test_less<-less
145 | 
146 | save(Test_rm.batch,
147 |      Test_background,
148 |      Test_less,
149 |      Test_500more,
150 |      Test_Anno,
151 |      file = "/home/ggj/Rdata/201901/Test_500more_rmbatch.RData")
152 | 


--------------------------------------------------------------------------------
/Circos/Main_circos.r:
--------------------------------------------------------------------------------
 1 | library(circlize)
 2 | #library(migest)
 3 | library(dplyr)
 4 | library(gdata)
 5 | library(RColorBrewer)
 6 | 
 7 | color_species = structure(c("#2E8B57", "#FF4500"), names = c("HCL","MCA"))
 8 | 
 9 | DF<-read.table("top_hits_SRS_75.out.result",sep="\t",head=T)
10 | 
11 | 
12 | #all_regions = unique(Phe$Cluster)
13 | 
14 | all_regions = unique(c(as.character(DF$Cluster1), as.character(DF$Cluster2)))
15 | #color_regions = structure(rev(rainbow(length(all_regions))), names = as.character(all_regions))
16 | # color_regions = structure(c("#E41A1C","#377EB8","#4DAF4A","#FCCDE5","#B3DE69","#A65628","#6A3D9A","#1B9E77","#CAB2D6","#66A61E","#D95F02","#A6761D",
17 | #  "#E6AB02","#7570B3"),names = as.character(all_regions))
18 | 
19 | color.list<-read.table("color.list",header=TRUE,sep="=")  # two columns: name, color
20 | rownames(color.list)<-color.list$name
21 | color.list<-color.list[as.character(all_regions),]  # reorder rows to match all_regions
22 | #color_regions = structure(rev(rainbow(length(all_regions))), names = as.character(all_regions))
23 | color_regions = structure(as.character(color.list$color), names = as.character(all_regions))
24 | 
25 | 
26 | 
27 | 
28 | df2 = data.frame(from=paste(DF$Species1,DF$Cluster1,sep="|"),to=paste(DF$Species2,DF$Cluster2,sep="|"),value=DF$Mean_AUROC)
29 | #df3<-factor(df2)
30 | #df2<-data.frame(Phe$Cell)
31 | #combined = unique(data.frame(regions = Phe$Cluster, species = Phe$Species, stringsAsFactors = FALSE))
32 | combined = unique(data.frame(regions = c(as.character(DF$Cluster1), as.character(DF$Cluster2)), 
33 |     species = c(as.character(DF$Species1), as.character(DF$Species2)), stringsAsFactors = FALSE))
34 | combined = combined[order(combined$species, combined$regions), ]
35 | order = paste(combined$species, combined$regions, sep = "|")
36 | grid.col = structure(color_regions[combined$regions], names = order)
37 | gap = rep(1, length(order))
38 | gap[which(!duplicated(combined$species, fromLast = TRUE))] = 5
39 | 
40 | pdf("HM-circos-new_Cluster.pdf")
41 | circos.par(gap.degree = gap,start.degree=270)
42 | chordDiagram(df2, order = order, 
43 | 	annotationTrack = c("grid"),
44 |     grid.col = grid.col, directional = FALSE,
45 |     preAllocateTracks = list(
46 |         track.height = 0.04,
47 |         track.margin = c(0.05, 0)
48 |     )
49 | )
50 | for(species in unique(combined$species)) {
51 |     l = combined$species == species
52 |     sn = paste(combined$species[l], combined$regions[l], sep = "|")
53 |     highlight.sector(sn, track.index = 1, col = color_species[species], 
54 |         #text = species, 
55 |         niceFacing = TRUE)
56 | }
57 | circos.clear()
58 | 
59 | legend("bottomleft", pch = 15, col = color_regions, 
60 |     legend = names(color_regions), cex = 0.3)
61 | legend("bottomright", pch = 15, col = color_species, 
62 |     legend = names(color_species), cex = 0.6)
63 | 
64 | 
65 | dev.off()
66 | 
67 | 
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/Circos/celltype.NV_SRS_75.out.result:
--------------------------------------------------------------------------------
1 | Mouse10	MCA	Mammary gland in lactation	Secretory	Secretory	Mouse100	MCA	Epithelial cell	Epithelial	Epithelial	Mouse101	Mouse102
2 | 


--------------------------------------------------------------------------------
/Circos/color.list:
--------------------------------------------------------------------------------
 1 | name=color
 2 | Endothelial="#E41A1C"
 3 | Epithelial="#377EB8"
 4 | Epithelial.fetal="#3399FF"
 5 | Erythroid="#4DAF4A"
 6 | Fat="#FCCDE5"
 7 | Germline="#A65628"
 8 | Hepatocyte="#6A3D9A"
 9 | Immune="#1B9E77"
10 | Muscle="#66A61E"
11 | Muscle.fetal="#B3DE69"
12 | Neuron="#D95F02"
13 | Neuron.fetal="#FF7F00"
14 | Proliferating="#A6761D"
15 | Proliferating.fetal="#BC80BD"
16 | Secretory="#E6AB02"
17 | Secretory.fetal="#8DD3C7"
18 | Stromal="#7570B3"
19 | Stromal.fetal="#CC99FF"
20 | 


--------------------------------------------------------------------------------
/Early-processing/Drop-seq_tools.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/Drop-seq_tools.zip


--------------------------------------------------------------------------------
/Early-processing/STAR-2.5.2a.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/STAR-2.5.2a.zip


--------------------------------------------------------------------------------
/Early-processing/bbmap.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/bbmap.zip


--------------------------------------------------------------------------------
/Early-processing/pipeline.txt:
--------------------------------------------------------------------------------
 1 | #1. Use the index sequence to get the raw data. The raw fastq files of R1 and R2 are placed in /*/data/INPUT/
 2 | $ ./github/Early_processing/sccpipe -I /*/data/INPUT/  -O /*/data/OUTPUT/
 3 | 
 4 | 
 5 | #2. Filter the reads that contain the sequences CGACTCACTACAGGG, TCGGTGACACGATCG and TTTTTTTTTTTT
 6 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/COL10_R1.fastq in2=/*/data/COL10_R2_001.fastq outm=/*/data/H_c1_R1_001.fastq outm2=/*/data/H_c1_R2_001.fastq fliteral=CGACTCACTACAGGG k=15 skipr2=t hdist=3 -Xmx58g
 7 | 
 8 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/H_c1_R1_001.fastq in2=/*/data/H_c1_R2_001.fastq outm=/*/data/H_c2_R1_001.fastq outm2=/*/data/H_c2_R2_001.fastq fliteral=TCGGTGACACGATCG k=15 skipr2=t hdist=3 -Xmx58g
 9 | 
10 | $ ./github/Early_processing/bbmap/bbduk2.sh in=/*/data/H_c2_R1_001.fastq in2=/*/data/H_c2_R2_001.fastq outm=/*/data/H_c3_R1_001.fastq outm2=/*/data/H_c3_R2_001.fastq fliteral=TTTTTTTTTTTT k=12 skipr2=t hdist=3 -Xmx58g
11 | 
12 | 
13 | #3.fastq to the bam file
14 | $ java -Xmx58g -jar ./github/Early_processing/Drop-seq_tools/3rdParty/picard/picard.jar  FastqToSam F1=/*/data/H_c3_R1_001.fastq F2=/*/data/H_c3_R2_001.fastq  O=/*/data/Lung.bam QUALITY_FORMAT=Standard SAMPLE_NAME=sample_name
15 | 
16 | 
17 | #4.get dge(Digital gene expression)
18 | $ ./github/Early_processing/Drop-seq_tools/scHCL.sh  -g /*/STAR_Reference_Human/genomeDir -r /*/STAR_Reference_Human/Homo_sapiens.GRCh38.fa -d ./github/Early_processing/Drop-seq_tools/ -o /*/data/ -t /*/data/ -s ./github/Early_processing/STAR-2.5.2a/source/STAR  /*/data/Lung.bam
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/Early-processing/sccpipe.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Early-processing/sccpipe.zip


--------------------------------------------------------------------------------
/Gene-regulon-network/1-1_SCENIC_AdultHuman_main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | import pickle
 4 | import pandas as pd
 5 | import numpy as np
 6 | from dask.diagnostics import ProgressBar
 7 | from arboreto.utils import load_tf_names
 8 | from arboreto.algo import grnboost2
 9 | from arboreto.algo import genie3
10 | from numpy.core.umath_tests import inner1d
11 | from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
12 | from pyscenic.utils import modules_from_adjacencies, load_motifs
13 | from pyscenic.prune import prune, prune2df, df2regulons
14 | from pyscenic.aucell import aucell
15 | import seaborn as sns
16 | 
17 | DATA_FOLDER="./"
18 | RESOURCES_FOLDER="../resources"
19 | DATABASE_FOLDER = "../databases/"
20 | SCHEDULER="123.122.8.24:8786"
21 | DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.feather")
22 | MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
23 | MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hh_total_tfs.txt')
24 | SC_EXP_FNAME = os.path.join(RESOURCES_FOLDER, "Human.pse20.txt")
25 | REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons.p")
26 | MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs.csv")
27 | AUC_FNAME=os.path.join(DATA_FOLDER, "aucell.csv")
28 | Co_FNAME=os.path.join(DATA_FOLDER, "adjacencies.csv")
29 | 
30 | ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
31 | ex_matrix.shape
32 | tf_names = load_tf_names(MM_TFS_FNAME)
33 | db_fnames = glob.glob(DATABASES_GLOB)
34 | def name(fname):
35 | 	return os.path.basename(fname).split(".")[0]
36 | 
37 | dbs=[RankingDatabase(fname=fname,name=name(fname)) for fname in db_fnames]
38 | 
39 | #Phase1:co-expression module
40 | adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
41 | adjacencies.to_csv(Co_FNAME)
42 | adjacencies = pd.read_csv("/home/jingjingw/Jingjingw/Project/2018-MH-new/2019-1-18-TotalFig2-new/2_SCENIC/Human_total/adjacencies.csv")
43 | modules = list(modules_from_adjacencies(adjacencies, ex_matrix))
44 | 
45 | #Phase2: RcisTarget [Prune modules for targets with cis regulatory footprints]
46 | # Calculate a list of enriched motifs and the corresponding target genes for all modules.
47 | with ProgressBar():
48 |     df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
49 | 
50 | # Create regulons from this table of enriched motifs.
51 | regulons = df2regulons(df)
52 | # Save the enriched motifs and the discovered regulons to disk.
53 | df.to_csv(MOTIFS_FNAME)
54 | with open(REGULONS_FNAME, "wb") as f:
55 |     pickle.dump(regulons, f)
56 | 
57 | regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
58 |     
59 | #Phase3: AUCell    
60 | auc_mtx = aucell(ex_matrix, regulons, num_workers=4)
61 | auc_mtx.to_csv(AUC_FNAME)
62 | sns_plot=sns.clustermap(auc_mtx, figsize=(12,12))   
63 | sns_plot.savefig("sns.clustermap.png") 
64 |     
65 | 


--------------------------------------------------------------------------------
/Gene-regulon-network/1-2_JSD_RSS.r:
--------------------------------------------------------------------------------
 1 | data<-read.csv("aucell.csv")
 2 | rownames(data)<-data$Cell
 3 | data<-data[,-1]
 4 | 
 5 | #n<-gsub("[....][1-9]|[...]","",colnames(data))
 6 | #n<-as.matrix(n)
 7 | #data<-t(data)
 8 | #all.data <- data###add
 9 | #new.data<-aggregate(list(all.data[,1:length(all.data[1,])]),
10 | #	list(name=n[,1]),FUN=mean)
11 | #rownames(new.data)<-new.data$name
12 | #new.data<-new.data[,-1]
13 | #data1<-new.data#normalize
14 | 
15 | data1<-t(data)  # transpose so that the columns are the cells named in data$Cell
16 | coln<-colnames(data1)
17 | coln1<-gsub("_Cell[0-9]+","",coln)  # strip the whole "_Cell<number>" suffix, whatever its digit count
18 | coln1<-as.factor(coln1)
19 | le<-levels(coln1)
20 | nle<-length(le)
21 | coln1<-as.matrix(coln1)
22 | Result<-matrix(0,ncol=length(coln1),nrow=nle)  # one row per label, one column per cell
23 | for (i in seq_len(nle)){
24 | 	tmp<-which(coln1[,1] == as.character(le[i]))
25 | 	Result[i,tmp]<-1
26 | }
27 | colnames(Result)<-colnames(data1)
28 | rownames(Result)<-le
29 | 
30 | Result1<-Result/(rowSums(Result))
31 | #Result1[Result1==0]=0.000001
32 | 
33 | #write.table(Result1,file="JSD.mouse.celltype.input",sep="\t",quote=F)
34 | 
35 | 
36 | #-------------------JSD
37 | KLD=function(A,B){  # Kullback-Leibler divergence of A from B (natural log); zero entries give NaN/Inf
38 | 	sum(A*log(A/B))
39 | }
40 | JSD=function(P,Q){  # Jensen-Shannon divergence between P and Q
41 | 	M=(P+Q)/2  # element-wise midpoint of the two inputs
42 | 	jsd=0.5*KLD(P,M)+0.5*KLD(Q,M)  # average of the two KL divergences to the midpoint
43 | 	return (jsd)
44 | }
45 | 
46 | Input1<-data1
47 | Input2<-Result1
48 | Input1[Input1==0]<-0.0000001 #attention
49 | Input1<-Input1/rowSums(Input1)
50 | Input2[Input2==0]<-0.0000001#attention
51 | Input2<-Input2/rowSums(Input2)
52 | TFn<-length(Input1[,1])
53 | Celln<-length(Input1[1,])
54 | Celltypn<-length(Input2[,1])
55 | JSD.result<-matrix(0,nrow=TFn,ncol=Celltypn)
56 | for (i in 1:TFn){
57 | 	for(j in 1:Celltypn){
58 | 		jsd1<-JSD(Input1[i,],Input2[j,])
59 | 		JSD.result[i,j]=jsd1	
60 | 	}
61 | 
62 | }
63 | rownames(JSD.result)<-rownames(Input1)
64 | colnames(JSD.result)<-rownames(Input2)
65 | 
66 | JSDR<-1-sqrt(JSD.result)
67 | write.table(JSDR,file="Human.RSS.total.out",sep="\t",quote=F)
68 | 


--------------------------------------------------------------------------------
/Gene-regulon-network/1-3_CSI.r:
--------------------------------------------------------------------------------
 1 | data<-read.table("Together.aucell.filter",sep="\t")
 2 | #data<-t(data)
 3 | Mcor<-cor(t(data))
 4 | n<-length(Mcor[,1])
 5 | 
 6 | CSI<-matrix(nrow=n,ncol=n)
 7 | for (i in 1:n){
 8 | 	for(j in 1:n){
 9 | 			r1<-length(which(Mcor[i,]>Mcor[i,j]))
10 | 			c1<-length(which(Mcor[,j]>Mcor[i,j]))
11 | 			CSI[i,j]<-1-(r1+c1)/((n-1)*2)
12 | 		}
13 | 	
14 | 	}
15 | rownames(CSI)<-rownames(data)
16 |  colnames(CSI)<-rownames(data)
17 |  
18 | #tf-tf network
19 | r<-CSI	 
20 | lower<-as.matrix(r[lower.tri(r)])
21 | nn<-length(r[,1])
22 | k<-0
23 | lower.r<-matrix(nrow=length(lower))
24 | for(i in 1:(nn-1)){
25 | 	for ( j in (i+1):nn){
26 | 		k=k+1
27 | 		tmp<-paste(rownames(r)[i],colnames(r)[j],lower[k],sep="\t")
28 | 		lower.r[k]=tmp
29 | 		
30 | 		}
31 | 	}
32 | write.table(lower.r,file="TF-TFNetwork.CSI.celltype.lower.out",sep="\t",quote=F,row.names=F,col.names=F)
33 | 
34 | Lower<-read.table("TF-TFNetwork.CSI.celltype.lower.out",sep="\t")
35 | out<-Lower[which(Lower[,3]>0.7),]
36 | write.table(out,file="TFmodule.network.out",sep="\t",quote=F,row.names=F,col.names=F)
37 | 
38 | 
39 | library(pheatmap)
40 | library(RColorBrewer)
41 | 
42 | MS<-CSI
43 | MS[MS<0.65]=0
44 | col<-colorRampPalette(c("#FAF9DA","#28245F"))(100)
45 | pdf("Total_TF-TF.065.pdf",width=11,height =10)
46 | x=pheatmap(MS,
47 | 		color=col,
48 | 		clustering_method = "ward.D2",
49 | 		cex=0.6,
50 | 		#show_rownames=FALSE,
51 |          show_colnames = FALSE
52 | )
53 | dev.off()
54 | 
55 | tree_order=x$tree_row$order
56 | tree_order_name=rownames(data)[tree_order]
57 | write.table(tree_order_name,"TF-TF.order.names",sep="\t",quote=F,col.names=F,row.names = F)
58 | 


--------------------------------------------------------------------------------
/HCL_Fig1_script-1.py:
--------------------------------------------------------------------------------
  1 | ## load the necessary packages
  2 | import numpy as np
  3 | import scanpy.api as sc  ### using the scanpy 1.3.7 version
  4 | import pandas as pd
  5 | import os
  6 | import pandas as pd
  7 | os.chdir("./HCL/Fig1")
  8 | 
  9 | 
 10 | ## merge the dataset from different tissues
 11 | tissues=["AdultAdrenalGland1","AdultAdrenalGland2","AdultArtery1",         
 12 | "AdultAscendingColon1","AdultBladder1","AdultBladder2","AdultBoneMarrow1" ,    
 13 | "AdultBoneMarrow2","AdultCerebellum1","AdultCervix1","AdultTransverseColon1" ,         
 14 | "AdultDuodenum1", "AdultEpityphlon1","AdultEsophagus1","AdultFallopiantube1",  
 15 | "AdultGallbladder1", "AdultHeart1" ,"AdultHeart2", "AdultIleum2" ,         
 16 | "AdultKidney2","AdultKidney3","AdultLiver1","AdultLiver2" ,         
 17 | "AdultLiver4" ,"AdultLung1","AdultLung2","AdultMuscle1",         
 18 | "AdultOmentum1","AdultOmentum2","AdultPancreas1","AdultPeripheralBlood1",
 19 | "AdultPeripheralBlood2","AdultPleura1","AdultProstate1","AdultRectum1",         
 20 | "AdultSigmoidColon1","AdultSpleen1","AdultStomach1","AdultStomach2","AdultTemporalLobe1",   
 21 | "AdultThyroid1","AdultThyroid2","AdultTrachea2","AdultUreter1" ,        
 22 | "AdultUterus1","ChorionicVillus1","CordBlood1","CordBloodCD34P1",      
 23 | "FetalAdrenalGland2","FetalBrain3","FetalBrain4","FetalBrain5",          
 24 | "FetalCalvaria1","FetalFemaleGonad1", "FetalHeart1","FetalIntestine1",      
 25 | "FetalIntestine2","FetalIntestine3","FetalKidney3","FetalKidney4",         
 26 | "FetalKidney5","FetalLiver1","FetalLung1","FetalMaleGonad1",      
 27 | "FetalMaleGonad2","FetalMuscle1", "FetalPancreas1","FetalPancreas2",       
 28 | "FetalRib2","FetalRib3","FetalSkin2","FetalSpinalCord1",     
 29 | "FetalStomach1","FetalThymus1","FetalThymus2","hESC1","Placenta1" ]
 30 | 
 31 | datause= pd.read_table("./dge/AdultAdipose1.rmbatchdge.txt",sep=" ")
 32 | for tissue in tissues:
 33 |     new=pd.read_table("./dge/" + tissue + '.rmbatchdge.txt',sep=' ')
 34 |     datause=pd.merge(datause,new,left_index=True,right_index=True,how='outer')
 35 |     print(tissue + " is done")
 36 | 
 37 | genes=datause.index
 38 | genes=genes.tolist()
 39 | cells=datause.columns
 40 | cells=cells.tolist()
 41 | cells=pd.DataFrame(columns=["cell"],data=cells)
 42 | cells.to_csv("cells.csv",sep=",",header=False,index=False)
 43 | genes=pd.DataFrame(columns=["gene"],data=genes)
 44 | genes.to_csv("genes.csv",sep=",",header=False,index=False)
 45 | datause=datause.fillna(0)
 46 | datause.to_csv('datause.csv',sep='\t',header=False,index=False)
 47 | 
 48 | datause.shape
 49 | ##451613 × 38360
 50 | 
 51 | 
 52 | ## load the data
 53 | %%time
 54 | adata=sc.read_csv("./datause.csv",delimiter='\t').transpose()
 55 | adata.var_names = pd.read_csv('./genes.csv', header=None)[0]
 56 | adata.obs_names = pd.read_csv('./cells.csv', header=None)[0]
 57 | adata.obs['tissue']=pd.read_csv('./cellanno_new.csv',sep=",",header=None)[0].values
 58 | mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
 59 | adata[:, mito_genes]=0
 60 | 
 61 | adata.write('./HCL_scanpy1.h5ad')
 62 | 
 63 | 
 64 | ## Filter the genes
 65 | sc.pp.filter_genes(adata, min_cells=20)
 66 | sc.pp.filter_cells(adata, min_genes=0)
 67 | adata.obs['n_counts'] = adata.X.sum(axis=1)
 68 | 
 69 | 
 70 | ## Filter the cells
 71 | sc.pl.violin(adata, ['n_genes', 'n_counts'],jitter=0.4, multi_panel=True)
 72 | #sc.pl.scatter(adata, x='n_counts', y='percent_mito')
 73 | sc.pl.scatter(adata, x='n_counts', y='n_genes')
 74 | 
 75 | 
 76 | ## Logarithmize the data.
 77 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
 78 | adata.raw = sc.pp.log1p(adata, copy=True)
 79 | adata.write('./HCL_scanpy2.h5ad')
 80 | 
 81 | ## Choose variable genes
 82 | adata=sc.read("./HCL_scanpy2.h5ad")
 83 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.001, max_mean=15, min_disp=0.5)
 84 | sc.pl.filter_genes_dispersion(filter_result)
 85 | adata = adata[:, filter_result.gene_subset]
 86 | adata.shape()
 87 | ##451613 × 3118
 88 | 
 89 | 
 90 | ## Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.
 91 | sc.pp.log1p(adata)
 92 | sc.pp.regress_out(adata, ['n_counts','ngenes'])
 93 | 
 94 | ## scale the data
 95 | sc.pp.scale(adata, max_value=10)
 96 | 
 97 | 
 98 | ## PCA
 99 | sc.tl.pca(adata, n_comps=100)
100 | sc.pl.pca_loadings(adata)
101 | adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat, 
102 | sc.pl.pca_scatter(adata, color='COL1A1') # visualize
103 | 
104 | ## Choose sigificant PCs
105 | sc.pl.pca_variance_ratio(adata, log=True,  show=100,n_pcs=100)
106 | 
107 | 
108 | ## Computing the neighborhood graph and do t-sne
109 | sc.pp.neighbors(adata, n_neighbors=10,n_pcs=50)
110 | sc.tl.louvain(adata, resolution=4)
111 | sc.tl.tsne(adata,use_fast_tsne=True,n_jobs=20,perplexity=100,n_pcs=50)
112 | sc.pl.tsne(adata, color='louvain',size=8,legend_loc="on data")
113 | sc.pl.tsne(adata, color='louvain',size=8)
114 | sc.pl.tsne(adata, color='tissue',size=8,legend_loc="on data")
115 | sc.pl.tsne(adata, color='tissue',size=8)
116 | 
117 | adata.write('./HCL_scanpy_pc50.h5ad')
118 | 
119 | ### change the cluster name
120 | new_cluster_names = list(range(1,103))
121 | adata.rename_categories('louvain', new_cluster_names)
122 | 
123 | ## Find marker genes using wilcoxon test
124 | sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon')
125 | result = adata.uns['rank_genes_groups']
126 | groups = result['names'].dtype.names
127 | pd.DataFrame(
128 |     {group + '_' + key[:1]: result[key][group]
129 |     for group in groups for key in ['names', 'logfoldchanges','scores', 'pvals', 'pvals_adj']}).to_csv("HCL102_markers_wilcoxon.csv")
130 | adata.write('./HCL_scanpy_pc50_markers_wilcoxon.h5ad', compression='gzip')
131 | 
132 | 


--------------------------------------------------------------------------------
/HCL_Fig1_script_730new.py.txt:
--------------------------------------------------------------------------------
  1 | ## load the necessary packages
  2 | import numpy as np
  3 | import scanpy.api as sc  ### using the scanpy 1.3.7 version
  4 | import pandas as pd
  5 | import os
  6 | import pandas as pd
  7 | os.chdir("./HCL/Fig1")
  8 | 
  9 | 
 10 | ## merge the dataset from different tissues
 11 | tissues=["AdultAdrenalGland2","AdultArtery1",         
 12 | "AdultAscendingColon1","AdultBladder1","AdultBladder2","AdultBoneMarrow1" ,    
 13 | "AdultBoneMarrow2","AdultCerebellum1","AdultCervix1","AdultTransverseColon1" ,         
 14 | "AdultDuodenum1", "AdultEpityphlon1","AdultEsophagus1","AdultFallopiantube1",  
 15 | "AdultGallbladder1", "AdultHeart1" ,"AdultHeart2", "AdultIleum2" ,         
 16 | "AdultKidney2","AdultKidney3","AdultLiver1","AdultLiver2" ,         
 17 | "AdultLiver4" ,"AdultLung1","AdultLung2","AdultMuscle1",         
 18 | "AdultOmentum1","AdultOmentum2","AdultPancreas1","AdultPeripheralBlood1",
 19 | "AdultPeripheralBlood2","AdultPleura1","AdultProstate1","AdultRectum1",         
 20 | "AdultSigmoidColon1","AdultSpleen1","AdultStomach1","AdultStomach2","AdultTemporalLobe1",   
 21 | "AdultThyroid1","AdultThyroid2","AdultTrachea2","AdultUreter1" ,        
 22 | "AdultUterus1","ChorionicVillus1","CordBlood1","CordBloodCD34P1",      
 23 | "FetalAdrenalGland2","FetalBrain3","FetalBrain4","FetalBrain5",          
 24 | "FetalCalvaria1","FetalFemaleGonad1", "FetalHeart1","FetalIntestine1",      
 25 | "FetalIntestine2","FetalIntestine3","FetalKidney3","FetalKidney4",         
 26 | "FetalKidney5","FetalLiver1","FetalLung1","FetalMaleGonad1",      
 27 | "FetalMaleGonad2","FetalMuscle1", "FetalPancreas1","FetalPancreas2",       
 28 | "FetalRib2","FetalRib3","FetalSkin2","FetalSpinalCord1",     
 29 | "FetalStomach1","FetalThymus1","FetalThymus2","hESC1","Placenta1" ]
 30 | 
 31 | datause= pd.read_table("./dge/AdultAdipose1.rmbatchdge.txt",sep=" ")
 32 | for tissue in tissues:
 33 |     new=pd.read_table("./dge/" + tissue + '.rmbatchdge.txt',sep=' ')
 34 |     datause=pd.merge(datause,new,left_index=True,right_index=True,how='outer')
 35 |     print(tissue + " is done")
 36 | 
 37 | genes=datause.index
 38 | genes=genes.tolist()
 39 | cells=datause.columns
 40 | cells=cells.tolist()
 41 | cells=pd.DataFrame(columns=["cell"],data=cells)
 42 | cells.to_csv("cells.csv",sep=",",header=False,index=False)
 43 | genes=pd.DataFrame(columns=["gene"],data=genes)
 44 | genes.to_csv("genes.csv",sep=",",header=False,index=False)
 45 | datause=datause.fillna(0)
 46 | datause.to_csv('datause.csv',sep='\t',header=False,index=False)
 47 | 
 48 | datause.shape
 49 | ##451613 × 38360
 50 | 
 51 | 
 52 | ## load the data
 53 | %%time
 54 | adata=sc.read_csv("./datause.csv",delimiter='\t').transpose()
 55 | adata.var_names = pd.read_csv('./genes.csv', header=None)[0]
 56 | adata.obs_names = pd.read_csv('./cells.csv', header=None)[0]
 57 | adata.obs['tissue']=pd.read_csv('./cellanno_new.csv',sep=",",header=None)[0].values
 58 | mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
 59 | adata[:, mito_genes]=0
 60 | 
 61 | adata.write('./HCL_scanpy1.h5ad')
 62 | 
 63 | 
 64 | ## Filter the genes
 65 | sc.pp.filter_genes(adata, min_cells=20)
 66 | sc.pp.filter_cells(adata, min_genes=0)
 67 | adata.obs['n_counts'] = adata.X.sum(axis=1)
 68 | 
 69 | 
 70 | ## Filter the cells
 71 | sc.pl.violin(adata, ['n_genes', 'n_counts'],jitter=0.4, multi_panel=True)
 72 | #sc.pl.scatter(adata, x='n_counts', y='percent_mito')
 73 | sc.pl.scatter(adata, x='n_counts', y='n_genes')
 74 | 
 75 | 
 76 | ## Logarithmize the data.
 77 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
 78 | adata.raw = sc.pp.log1p(adata, copy=True)
 79 | adata.write('./HCL_scanpy2.h5ad')
 80 | 
 81 | ## Choose variable genes
 82 | adata=sc.read("./HCL_scanpy2.h5ad")
 83 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.001, max_mean=15, min_disp=0.45)
 84 | sc.pl.filter_genes_dispersion(filter_result)
 85 | adata = adata[:, filter_result.gene_subset]
 86 | adata.shape()
 87 | ##451613 × 3118
 88 | 
 89 | 
 90 | ## Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.
 91 | #sc.pp.log1p(adata)
 92 | sc.pp.regress_out(adata, ['n_counts','ngenes'])
 93 | 
 94 | ## scale the data
 95 | sc.pp.scale(adata, max_value=10)
 96 | 
 97 | 
 98 | ## PCA
 99 | sc.tl.pca(adata, n_comps=100)
100 | sc.pl.pca_loadings(adata)
101 | adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat, 
102 | sc.pl.pca_scatter(adata, color='COL1A1') # visualize
103 | 
104 | ## Choose sigificant PCs
105 | sc.pl.pca_variance_ratio(adata, log=True,  show=100,n_pcs=100)
106 | 
107 | 
108 | ## Computing the neighborhood graph and do t-sne
109 | sc.pp.neighbors(adata, n_neighbors=15,n_pcs=50)
110 | sc.tl.louvain(adata, resolution=3.5)
111 | sc.tl.tsne(adata,use_fast_tsne=True,n_jobs=20,perplexity=100,n_pcs=50)
112 | sc.pl.tsne(adata, color='louvain',size=8,legend_loc="on data")
113 | sc.pl.tsne(adata, color='louvain',size=8)
114 | sc.pl.tsne(adata, color='tissue',size=8,legend_loc="on data")
115 | sc.pl.tsne(adata, color='tissue',size=8)
116 | 
117 | adata.write('./HCL_scanpy_pc50.h5ad')
118 | 
119 | ### change the cluster name
120 | new_cluster_names = list(range(1,103))
121 | adata.rename_categories('louvain', new_cluster_names)
122 | 
123 | ## Find marker genes using wilcoxon test
124 | sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon',n_genes=200)
125 | result = adata.uns['rank_genes_groups']
126 | groups = result['names'].dtype.names
127 | pd.DataFrame(
128 |     {group + '_' + key[:1]: result[key][group]
129 |     for group in groups for key in ['names', 'logfoldchanges','scores', 'pvals', 'pvals_adj']}).to_csv("HCL102_markers_wilcoxon.csv")
130 | adata.write('./HCL_scanpy_pc50_markers_wilcoxon.h5ad', compression='gzip')
131 | 
132 | 


--------------------------------------------------------------------------------
/Metaneighbor/2017-08-28-runMN-US.R:
--------------------------------------------------------------------------------
  1 | run_MetaNeighbor_US<-function(vargenes, data, celltypes, pheno){
  2 |   
  3 |   cell.labels=matrix(0,ncol=length(celltypes),nrow=dim(pheno)[1])
  4 |   rownames(cell.labels)=colnames(data)
  5 |   colnames(cell.labels)=celltypes
  6 |   for(i in 1:length(celltypes)){
  7 |     type=celltypes[i]
  8 |     m<-match(pheno$Celltype,type)
  9 |     cell.labels[!is.na(m),i]=1
 10 |   }
 11 |   
 12 |   m<-match(rownames(data),vargenes)
 13 |   cor.dat=cor(data[!is.na(m),],method="s")
 14 |   rank.dat=cor.dat*0
 15 |   rank.dat[]=rank(cor.dat,ties.method="average",na.last = "keep")
 16 |   rank.dat[is.na(rank.dat)]=0
 17 |   rank.dat=rank.dat/max(rank.dat)
 18 |   sumin    =  (rank.dat) %*% cell.labels
 19 |   sumall   = matrix(apply(rank.dat,2,sum), ncol = dim(sumin)[2], nrow=dim(sumin)[1])
 20 |   predicts = sumin/sumall
 21 |   
 22 |   cell.NV=matrix(0,ncol=length(celltypes),nrow=length(celltypes))
 23 |   colnames(cell.NV)=colnames(cell.labels)
 24 |   rownames(cell.NV)=colnames(cell.labels)
 25 |   
 26 |   for(i in 1:dim(cell.labels)[2]){
 27 |     predicts.temp=predicts
 28 |     m<-match(pheno$Celltype,colnames(cell.labels)[i])
 29 |     study=unique(pheno[!is.na(m),"Study_ID"])
 30 |     m<-match(pheno$Study_ID,study)
 31 |     pheno2=pheno[!is.na(m),]
 32 |     predicts.temp=predicts.temp[!is.na(m),]
 33 |     predicts.temp=apply(abs(predicts.temp), 2, rank,na.last="keep",ties.method="average")
 34 |     filter=matrix(0,ncol=length(celltypes),nrow=dim(pheno2)[1])
 35 |     m<-match(pheno2$Celltype,colnames(cell.labels)[i])
 36 |     filter[!is.na(m),1:length(celltypes)]=1
 37 |     negatives = which(filter == 0, arr.ind=T)
 38 |     positives = which(filter == 1, arr.ind=T)
 39 |     predicts.temp[negatives] <- 0
 40 |     np = colSums(filter,na.rm=T)
 41 |     nn = apply(filter,2,function(x) sum(x==0,na.rm=T))
 42 |     p =  apply(predicts.temp,2,sum,na.rm=T)
 43 |     cell.NV[i,]= (p/np - (np+1)/2)/nn
 44 |   }
 45 |   
 46 |   cell.NV=(cell.NV+t(cell.NV))/2
 47 |   return(cell.NV)
 48 |   
 49 | }
 50 | 
 51 | get_variable_genes<-function(data, pheno) {
 52 | var.genes1=vector("list")
 53 | experiment=unique(pheno$Study_ID)
 54 | j=1
 55 | for(exp in experiment){
 56 |   dat.sub=data[,pheno$Study_ID==exp]
 57 |     genes.list=vector("list")
 58 |     med.dat=apply(dat.sub,1,median)
 59 |     var.dat=apply(dat.sub,1,var)
 60 |     quant.med=unique(quantile(med.dat,prob=seq(0,1,length=11),type=5))
 61 |     genes.list=vector("list",length=length(quant.med))
 62 |     for(i in 1:length(quant.med)){
 63 |       if(i==1){
 64 |         filt1=med.dat<=quant.med[i]
 65 |         var.temp=var.dat[filt1]
 66 |         quant.var=quantile(var.temp,na.rm=T)
 67 |         filt2=var.temp>quant.var[4]###### total is 4;TF is3
 68 |         genes.list[[i]]=names(var.temp)[filt2]
 69 |       }
 70 |       else {
 71 |         filt1=med.dat<=quant.med[i]&med.dat>quant.med[i-1]
 72 |         var.temp=var.dat[filt1]
 73 |         quant.var=quantile(var.temp,na.rm=T)
 74 |         filt2=var.temp>quant.var[4]######
 75 |         genes.list[[i]]=names(var.temp)[filt2]
 76 |       }
 77 |     }
 78 |     temp=length(genes.list)
 79 |     var.genes1[[j]]=unlist(genes.list[1:temp-1])
 80 |     j=j+1
 81 | }
 82 | var.genes=Reduce(intersect, var.genes1)
 83 | return(var.genes)
 84 | }
 85 | 
 86 | 
# Report the best cross-study match of every cell type in a score matrix.
#
# Arguments:
#   cell.NV    square cell-type score matrix with cell type names as dimnames.
#   pheno      data frame with the columns `Celltype` and `Study_ID`.
#   threshold  scores at or above this value are reported as "Above_<threshold>".
#   filename   path of the tab-separated table that is written.
#
# Returns a data frame with the columns Celltype_1, Celltype_2, Mean_AUROC
# (rounded to two decimals) and Match_type, ordered by decreasing score. The
# table written to `filename` holds the same rows with unrounded scores.
get_top_hits <- function(cell.NV, pheno, threshold=0.95, filename) {
  
  # Counts of samples per cell type and study; keep only the cell types of
  # cell.NV that occur in pheno, with the table rows in the same order.
  type_by_study=table(pheno[,c("Celltype","Study_ID")])
  m<-match(rownames(cell.NV),rownames(type_by_study))
  f.a=!is.na(m)
  f.b=m[f.a]
  cell.NV=cell.NV[f.a,f.a]
  type_by_study=type_by_study[f.b,]
  
  # Zero the scores between cell types that occur in the same study, so only
  # cross-study pairs can become a top hit.
  for(i in 1:dim(type_by_study)[2]){
    filt=type_by_study[,i]!=0
    cell.NV[filt,filt]=0
  }
  
  diag(cell.NV)=0
  # Column index of the highest score in each row.
  temp=vector()
  for(i in 1:dim(cell.NV)[1]){
    temp=c(temp,which.max(cell.NV[i,]))
  }
  # temp becomes a character matrix: column 1 is the row's cell type, column 2
  # is first the index of its best hit and then (loop below) the score itself.
  # NOTE(review): the row names presumably come from the names which.max()
  # attaches, i.e. the name of the best-hit column - confirm.
  temp=cbind(rownames(cell.NV),temp)
  for(i in 1:dim(cell.NV)[1]){
    temp[i,2]=cell.NV[i,as.numeric(temp[i,2])]
  }
  
  # Rows whose score string repeats an earlier row are labelled reciprocal top
  # hits; rows at or above the threshold are appended and labelled separately.
  recip=temp[duplicated(temp[,2]),]
  filt=as.numeric(temp[,2])>=threshold
  recip=rbind(recip,temp[filt,])
  recip=cbind(recip,c(rep("Reciprocal_top_hit",each=dim(recip)[1]-sum(filt)),rep(paste("Above",threshold,sep="_"),each=sum(filt))))
  # Keep one row per distinct score string.
  recip=recip[!duplicated(recip[,2]),]
  
  # Celltype_1 is taken from the row names, Celltype_2 from column 1.
  recip2=cbind(rownames(recip),recip[,1:3])
  colnames(recip2)=c("Celltype_1","Celltype_2","Mean_AUROC","Match_type")
  rownames(recip2)=NULL
  # NOTE(review): recip2 is a character matrix here, so this orders the scores
  # as strings rather than as numbers.
  recip=recip2[order(recip2[,3],decreasing=T),]
  recip2=as.data.frame(recip)
  recip2[,3]=round(as.numeric(as.character(recip2[,3])),2)
  write.table(recip,file=filename,sep="\t",quote=F)
  return(recip2)
}
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/Metaneighbor/hm_metaneighbor.r:
--------------------------------------------------------------------------------
 1 | 
 2 | human<-readRDS("./HCL_all_v2_pse20.CPM.rds")
 3 | 
 4 | mouse<-readRDS("./MCA_V2.PSUDOCELL20.rds")
 5 | 
 6 | orth<-read.table("./Human_Mouse_one-one.orth",sep="\t")
 7 | #orth<-as.matrix(orth)
 8 | mouse<-as.data.frame(mouse)
 9 | mouse.orth<-mouse[as.character(orth[,4]),]
10 | human<-as.data.frame(human)
11 | human.orth<-human[as.character(orth[,2]),]
12 | 
13 | 
14 | data<-cbind(mouse.orth,human.orth)
15 | rownames(data)<-orth[,1]
16 | data[is.na(data)]<-0
17 | 
18 | P<-read.table("./MCA_V2.psudocell20.phe",sep="\t",head=T)
19 | P2<-read.table("./HCL_v2.pse20.phe",sep="\t",head=T)
20 | P1<-rbind(P,P2)
21 | colnames(P1)<-c("Sample_ID","Study_ID","Celltype")
22 | data1<-data[,as.character(P1$Sample_ID)]
23 | 
24 | 
25 | source("2017-08-28-runMN-US.R")
26 | #library(gplots)
27 | #library(RColorBrewer)
28 | 
29 | celltypes1 <-unique(as.character(P1$Celltype))
30 | 
31 | 
32 | var.genes1=get_variable_genes(data1,P1)
33 | length(var.genes1)
34 | write.table(var.genes1,"var.genes_75.out",sep="\t",quote=F)#####--------
35 | celltype.NV=run_MetaNeighbor_US(var.genes1,data1,celltypes1,P1)
36 | write.table(celltype.NV,file="celltype.NV_SRS_75.out",sep="\t",quote=F)###---------
37 | 
38 | cols=rev(colorRampPalette(brewer.pal(11,"RdYlBu"))(100))
39 | breaks=seq(0,1,length=101)
40 | pdf("celltype.NV_SRS_75.pdf")  #########--------------------------
41 | heatmap.2(celltype.NV,trace="none",density.info="none",col=cols,breaks=breaks,cexRow=0.3,cexCol=0.3)
42 | dev.off()
43 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.9,filename="top_hits_SRS_75.out") 
44 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.8,filename="top_hits_SRS_0.8_75.out")
45 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.7,filename="top_hits_SRS_0.7_75.out")
46 | top_hits=get_top_hits(celltype.NV,P1,threshold=0.6,filename="top_hits_SRS_0.6_75.out")
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/PAGA/PAGA_BonemarrowCD34P.py:
--------------------------------------------------------------------------------
  1 | load("/home/ggj/HCA/RData/pbmc/old/AdultBoneMarrowCD34P1_Seurat.RData")
  2 | setwd("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/")
  3 | dim(pbmc@data)
  4 | #17364 11781
  5 | setwd("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga")
  6 | aa<-as.data.frame(as.matrix(pbmc@assays$RNA@counts))
  7 | write.csv(aa[,rownames(anno)],file = "./wt.dge.csv",quote = F)
  8 | anno<-FetchData(pbmc,vars = "ident")
  9 | write.csv(anno,file = "./wt_embryo.anno.csv",quote = F)
 10 | table(pbmc@ident)
 11 | 
 12 | vargene<-pbmc@var.genes
 13 | gene<-rownames(pbmc@raw.data)
 14 | genefilter<-gene%in%vargene
 15 | write.csv(genefilter,file = "./genefilter.csv",quote = F,row.names = F)
 16 | 
 17 | 
 18 | anno<-FetchData(pbmc,vars="ident")
 19 | anno$shuchu<-ifelse(anno$ident%in%c(1,2,6,12,14,18,21),"FALSE","TRUE")
 20 | write.csv(anno$shuchu,file="nonimmunecell.csv",quote=F,row.names=F)
 21 | 
 22 | anno$shuchu<-ifelse(anno$ident%in%c(1,2,6,12,14,18,21),"TRUE","FALSE")
 23 | write.csv(anno$shuchu,file="immunecell.csv",quote=F,row.names=F)
 24 | 
 25 | #####################################python
 26 | import numpy as np
 27 | import scanpy.api as sc
 28 | import pandas as pd
 29 | import os
 30 | import pandas as pd
 31 | 
 32 | os.chdir("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/")
 33 | adata=sc.read_csv("wt.dge.csv",delimiter=',').transpose()
 34 | adata.var_names = pd.read_csv('gene.csv', header=None)[0]
 35 | 
 36 | datause= pd.read_table("wt.dge.csv",sep=",",index_col=0)
 37 | adata=sc.AnnData(datause.T)
 38 | 
 39 | 
 40 | mito_genes = [name for name in adata.var_names if name.startswith('mt-')]
 41 | #adata[:, mito_genes]=0
 42 | 
 43 | 
 44 | 
 45 | adata.obs['cluster']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[0].values
 46 | adata.obs['type']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[1].values
 47 | adata.obs['batch']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[2].values
 48 | adata.obs['cluster1']= pd.read_csv('wt_embryo.anno.csv',sep=",",header=None)[3].values
 49 | 
 50 | 
 51 | 
 52 | sc.pp.filter_genes(adata, min_cells=3)
 53 | sc.pp.filter_cells(adata, min_genes=0)
 54 | adata.obs['n_counts'] = adata.X.sum(axis=1)
 55 | 
 56 | 
 57 | sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
 58 | sc.pp.log1p(adata)
 59 | adata.raw = adata
 60 | 
 61 | adata.write('./wt1.h5ad', compression='gzip')
 62 | 
 63 | 
 64 | gene_filter1=pd.read_csv("genefilter.csv")
 65 | gene_filter=gene_filter1['x'].values
 66 | sc.pl.filter_genes_dispersion(filter_result)
 67 | adata = adata[:, gene_filter]
 68 | 
 69 | cc=adata.var_names
 70 | cc=cc.values
 71 | bb=list(set(cc).intersection(set(gene_filter)))
 72 | len(bb)
 73 | 
 74 | 
 75 | filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.01, max_mean=15, min_disp=0.4)
 76 | import collections
 77 | collections.Counter(filter_result.gene_subset)
 78 | #Counter({False: 14819, True: 1750})
 79 | sc.pl.filter_genes_dispersion(filter_result)
 80 | adata = adata[:, filter_result.gene_subset]
 81 | 
 82 | 
 83 | sc.pp.regress_out(adata, ['n_counts'])
 84 | 
 85 | ## scale the data
 86 | sc.pp.scale(adata, max_value=10)
 87 | 
 88 | 
 89 | ### PCA
 90 | sc.tl.pca(adata, n_comps=50)
 91 | sc.pl.pca_loadings(adata)
 92 | # visualize
 93 | adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
 94 | sc.pl.pca_scatter(adata, color='COL1A1')
 95 | # PC
 96 | sc.pl.pca_variance_ratio(adata, log=True,  show=50,n_pcs=50)
 97 | ## 25
 98 | adata
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | sc.pp.neighbors(adata, n_neighbors=10,n_pcs=25)
105 | sc.tl.louvain(adata, resolution=1)
106 | sc.tl.tsne(adata,use_fast_tsne=True,n_jobs=20,perplexity=200,n_pcs=25)
107 | sc.pl.tsne(adata, color='louvain',size=8,legend_loc="on data")
108 | 
109 | 
110 | adata.obs['type']=adata.obs['type'].astype('category')
111 | 
112 | 
113 | sc.pl.tsne(adata, color='louvain',size=8)
114 | sc.pl.tsne(adata, color='tissue',size=8,legend_loc="on data")
115 | sc.pl.tsne(adata, color='tissue',size=8)
116 | adata.write('./wt_cluster.h5ad', compression='gzip')
117 | 
118 | 
119 | 
120 | sc.pp.neighbors(adata,n_pcs=25)
121 | sc.tl.umap(adata, min_dist=0.1)
122 | sc.pl.umap(adata, color='type', title='UMAP', legend_loc='on data', legend_fontsize=5)
123 | sc.pl.umap(adata, color='louvain', title='UMAP', legend_loc='on data', legend_fontsize=5)
124 | 
125 | 
126 | 
127 | #sc.tl.paga(adata, groups='type', model='v1.0')
128 | sc.tl.paga(adata, groups='type')
129 | 
130 | sc.pl.paga(
131 |     adata,
132 |     layout='fr',
133 |     threshold=0.01,
134 |     fontsize=8,
135 |     node_size_scale=1,
136 |     node_size_power=0.7,
137 |     max_edge_width=0.7)
138 | 
139 | import matplotlib.pyplot as pl
140 | 
141 | sc.tl.paga(adata, groups='type')
142 | 
143 | sc.tl.draw_graph(adata,init_pos="paga",layout="fa",maxiter=500)
144 | sc.pl.draw_graph(adata, color='type',title='Force Atlas 2', legend_loc='on data', legend_fontsize=5,palette=sc.pl.palettes.default_20)
145 | #,save="iter1000.pdf")
146 | sc.pl.tsne(adata, color='louvain',size=8,palette=sc.pl.palettes.godsnot_64,legend_loc="on data")
147 | 
148 | sc.pl.draw_graph(adata, color='cluster',title='Force Atlas 2', legend_loc='on data', legend_fontsize=5,palette=sc.pl.palettes.default_20,size=8)
149 | 
150 | adata.write('./wt_paga.h5ad', compression='gzip')
151 | 
152 | 
153 | 
154 | adata1=adata.copy()
155 | sc.tl.paga(adata1, groups='louvain')
156 | 
157 | 
158 | 
159 | ################################ remove immune cells
160 | adata.obs['donor_tf']= pd.read_csv("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/nonimmunecell.csv",sep=",",header=None)[0].values
161 | #adata.obs['cluster_num']= pd.read_csv('/home/ggj/NEW/HCA/SPRING/PBCD34/cluster.csv',sep=",",header=None)[0].values
162 | 
163 | cluster = adata[adata.obs["donor_tf"]]
164 | #6118 × 1750 
165 | 
166 | sc.pp.neighbors(cluster,n_pcs=25)
167 | sc.tl.umap(cluster, min_dist=0.1)
168 | 
169 | sc.tl.paga(cluster, groups='type')
170 | sc.pl.paga(
171 |     cluster,
172 |     layout='fr',
173 |     threshold=0.01,
174 |     fontsize=8,
175 |     node_size_scale=1,
176 |     node_size_power=0.7,
177 |     max_edge_width=0.7)
178 | 
179 | import matplotlib.pyplot as pl
180 | sc.tl.draw_graph(cluster,init_pos="paga",layout="fa",maxiter=500)
181 | sc.pl.draw_graph(cluster, color='type',  legend_loc='on data',legend_fontsize=5,palette=sc.pl.palettes.default_26,size=8)
182 | 
183 | cluster.write('./wt_nonimmune_paga.h5ad', compression='gzip')
184 | 
185 | 
186 | ################################  immune cells
187 | adata.obs['donor_tf']= pd.read_csv("/home/ggj/NEW/DifferentiationForce/Data/20200111_embryo/wt/paga/immunecell.csv",sep=",",header=None)[0].values
188 | immune = adata[adata.obs["donor_tf"]]
189 | #5663 × 1750 
190 | 
191 | sc.pp.neighbors(immune,n_pcs=25)
192 | sc.tl.umap(immune, min_dist=0.1)
193 | 
194 | sc.tl.paga(immune, groups='type')
195 | sc.pl.paga(
196 |     immune,
197 |     layout='fr',
198 |     threshold=0.01,
199 |     fontsize=8,
200 |     node_size_scale=1,
201 |     node_size_power=0.7,
202 |     max_edge_width=0.7)
203 | 
204 | import matplotlib.pyplot as pl
205 | sc.tl.draw_graph(immune,init_pos="paga",layout="fa",maxiter=500)
206 | sc.pl.draw_graph(immune, color='type',  legend_loc='on data',legend_fontsize=5,palette=sc.pl.palettes.default_26,size=8)
207 | immune.write('./wt_immune_paga.h5ad', compression='gzip')
208 | 


--------------------------------------------------------------------------------
/Pseudocell/FetalStomach1_500more.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Pseudocell/FetalStomach1_500more.RData


--------------------------------------------------------------------------------
/Pseudocell/Pseudocell_Human.r:
--------------------------------------------------------------------------------
 1 | load("/home/jingjingw/Jingjingw/Project/2018-MH-new/Pseudocell/FetalStomach1_500more.RData")
 2 | name<-"FetalStomach1"
 3 | outfile1<-"Human_FetalStomach1_pseudocell20.Rds"
 4 | outfile2<-"Human_FetalStomach1_pseudocell20.pheno.out"
 5 | 
 6 | 
 7 | 
 8 | Inter<-get(paste(name,"pbmc",sep = "_"))
 9 | Inter[Inter<0]=0
10 | idd<-get(paste(name,"Anno1",sep = "_"))
11 | Inter.id<-cbind(rownames(idd),idd$Cluster_id)
12 | 
13 | rownames(Inter.id)<-rownames(idd)
14 | colnames(Inter.id)<-c("CellID","Celltype")
15 | Inter.id<-as.data.frame(Inter.id)
16 | Inter1<-Inter[,Inter.id$CellID]
17 | Inter<-as.matrix(Inter1)
18 | pseudocell.size = 20 ## 10 test
19 | new_ids_list = list()
20 | for (i in 1:length(levels(Inter.id$Celltype))) {
21 | 	cluster_id = levels(Inter.id$Celltype)[i]
22 | 	cluster_cells <- rownames(Inter.id[Inter.id$Celltype == cluster_id,])
23 | 	cluster_size <- length(cluster_cells)		
24 | 	pseudo_ids <- floor(seq_along(cluster_cells)/pseudocell.size)
25 | 	pseudo_ids <- paste0(cluster_id, "_Cell", pseudo_ids)
26 | 	names(pseudo_ids) <- sample(cluster_cells)	
27 | 	new_ids_list[[i]] <- pseudo_ids		
28 | 	}
29 | 	
30 | new_ids <- unlist(new_ids_list)
31 | new_ids <- as.data.frame(new_ids)
32 | new_ids_length <- table(new_ids)
33 | 
34 | new_colnames <- rownames(new_ids)  ###add
35 | all.data<-Inter[,as.character(new_colnames)] ###add
36 | all.data <- t(all.data)###add
37 | 
38 | new.data<-aggregate(list(all.data[,1:length(all.data[1,])]),
39 | 	list(name=new_ids[,1]),FUN=mean)
40 | rownames(new.data)<-new.data$name
41 | new.data<-new.data[,-1]
42 | 
43 | new_ids_length<-as.matrix(new_ids_length)##
44 | short<-which(new_ids_length<10)##
45 | new_good_ids<-as.matrix(new_ids_length[-short,])##
46 | result<-t(new.data)[,rownames(new_good_ids)]
47 | colnames(result)<-paste("Human",colnames(result),sep="")
48 | rownames(result)<-rownames(Inter)
49 | #saveRDS(result,file=outdir1[i]) ###
50 | saveRDS(result,file=outfile1) ###
51 | cellty<-gsub("[_]Cell[0-9]|[_]Cell[0-9][0-9]|[_]Cell[0-9][0-9][0-9]|[_]Cell[0-9][0-9][0-9][0-9]|[_]Cell[0-9][0-9][0-9][0-9][0-9]","",colnames(result))
52 | new.phe<-paste(colnames(result),'HumanFetal',cellty,sep="\t")
53 | 
54 | #write.table(new.phe,file=outdir2[i],quote=F,row.names=F) ###
55 | 
56 | write.table(new.phe,file=outfile2,quote=F,row.names=F) ###
57 | 
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HCL
Code used in the Human Cell Landscape (HCL) project.
3 | 


--------------------------------------------------------------------------------
/Scenic_R_human.R:
--------------------------------------------------------------------------------
  1 | #############
  2 | library(Seurat)
  3 | setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/")
  4 | #load your DGE
  5 | exprMat <- readRDS("./HCL_v2.pse20.SRS_17000.rds")
  6 | gene<-data.frame(colSums(exprMat>0))
  7 | #load the annotation for each cell
  8 | ident<-read.table("./HCL_v2.pse20.SRS_17000.phe",header =T,row.names = 1)
  9 | dir.create("SCENIC")
 10 | setwd("SCENIC/")
 11 | cellInfo <-merge(ident,gene,by="row.names",all=T)
 12 | cellInfo<-data.frame(cellInfo[,-1],row.names = cellInfo$Row.names)
 13 | cellInfo[is.na(cellInfo)]<-0
 14 | cellInfo<-cellInfo[,-1]
 15 | colnames(cellInfo)<- c('CellType','nGene')
 16 | cellInfo$cluster<-gsub("Human","",cellInfo$CellType)
 17 | dim(exprMat)
 18 | head(cellInfo)
 19 | setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
 20 | dir.create("int")
 21 | saveRDS(cellInfo, file="./int/cellInfo.Rds")
 22 | table(cellInfo$cluster)
 23 | colVars <- list(CellType=c('Human1'='#ffff00',
 24 |                            'Human2'='#1ce6ff',
 25 |                            'Human3'='#ff34ff',
 26 |                            'Human4'='#ff4a46',
 27 |                            'Human5'='#008941',
 28 |                            'Human6'='#006fa6',
 29 |                            'Human7'='#a30059',
 30 |                            'Human8'='#ffdbe5',
 31 |                            'Human9'='#7a4900',
 32 |                            'Human10'='#0000a6',
 33 |                            'Human11'='#63ffac',
 34 |                            'Human12'='#b79762',
 35 |                            'Human13'='#004d43',
 36 |                            'Human14'='#8fb0ff',
 37 |                            'Human15'='#997d87',
 38 |                            'Human16'='#5a0007',
 39 |                            'Human17'='#809693',
 40 |                            'Human18'='#feffe6',
 41 |                            'Human19'='#1b4400',
 42 |                            'Human20'='#4fc601',
 43 |                            'Human21'='#3b5dff',
 44 |                            'Human22'='#4a3b53',
 45 |                            'Human23'='#ff2f80',
 46 |                            'Human24'='#61615a',
 47 |                            'Human25'='#ba0900',
 48 |                            'Human26'='#6b7900',
 49 |                            'Human27'='#00c2a0',
 50 |                            'Human28'='#ffaa92',
 51 |                            'Human29'='#ff90c9',
 52 |                            'Human30'='#b903aa',
 53 |                            'Human31'='#d16100',
 54 |                            'Human32'='#ddefff',
 55 |                            'Human33'='#000035',
 56 |                            'Human34'='#7b4f4b',
 57 |                            'Human35'='#a1c299',
 58 |                            'Human36'='#300018',
 59 |                            'Human37'='#0aa6d8',
 60 |                            'Human38'='#013349',
 61 |                            'Human39'='#00846f',
 62 |                            'Human40'='#372101',
 63 |                            'Human41'='#ffb500',
 64 |                            'Human42'='#c2ffed',
 65 |                            'Human43'='#a079bf',
 66 |                            'Human44'='#cc0744',
 67 |                            'Human45'='#c0b9b2',
 68 |                            'Human46'='#c2ff99',
 69 |                            'Human47'='#001e09',
 70 |                            'Human48'='#00489c',
 71 |                            'Human49'='#6f0062',
 72 |                            'Human50'='#0cbd66',
 73 |                            'Human51'='#eec3ff',
 74 |                            'Human52'='#456d75',
 75 |                            'Human53'='#b77b68',
 76 |                            'Human54'='#7a87a1',
 77 |                            'Human55'='#788d66',
 78 |                            'Human56'='#885578',
 79 |                            'Human57'='#fad09f',
 80 |                            'Human58'='#ff8a9a',
 81 |                            'Human59'='#d157a0',
 82 |                            'Human60'='#bec459',
 83 |                            'Human61'='#456648',
 84 |                            'Human62'='#0086ed',
 85 |                            'Human63'='#886f4c',
 86 |                            'Human64'='#34362d',
 87 |                            'Human65'='#b4a8bd',
 88 |                            'Human66'='#00a6aa',
 89 |                            'Human67'='#452c2c',
 90 |                            'Human68'='#636375',
 91 |                            'Human69'='#a3c8c9',
 92 |                            'Human70'='#ff913f',
 93 |                            'Human71'='#938a81',
 94 |                            'Human72'='#575329',
 95 |                            'Human73'='#00fecf',
 96 |                            'Human74'='#b05b6f',
 97 |                            'Human75'='#8cd0ff',
 98 |                            'Human76'='#3b9700',
 99 |                            'Human77'='#04f757',
100 |                            'Human78'='#c8a1a1',
101 |                            'Human79'='#1e6e00',
102 |                            'Human80'='#7900d7',
103 |                            'Human81'='#a77500',
104 |                            'Human82'='#6367a9',
105 |                            'Human83'='#a05837',
106 |                            'Human84'='#6b002c',
107 |                            'Human85'='#772600',
108 |                            'Human86'='#d790ff',
109 |                            'Human87'='#9b9700',
110 |                            'Human88'='#549e79',
111 |                            'Human89'='#fff69f',
112 |                            'Human90'='#201625',
113 |                            'Human91'='#72418f',
114 |                            'Human92'='#bc23ff',
115 |                            'Human93'='#99adc0',
116 |                            'Human94'='#3a2465',
117 |                            'Human95'='#922329',
118 |                            'Human96'='#5b4534',
119 |                            'Human97'='#fde8dc',
120 |                            'Human98'='#404e55',
121 |                            'Human99'='#0089a3',
122 |                            'Human100'='#cb7e98',
123 |                            'Human101'='#a4e804',
124 |                            'Human102'='#324e72'))
125 | colVars$CellType <- colVars$CellType[intersect(names(colVars$CellType), cellInfo$CellType)]
126 | saveRDS(colVars, file="./int/colVars.Rds")
127 | plot.new(); legend(0,1, fill=colVars$CellType, legend=names(colVars$CellType))
# SCENIC run: initialise the options, run the co-expression step, import the
# AUCell matrix computed with pySCENIC, draw the t-SNEs and binarise the
# regulon activity with a threshold of half the maximum AUC per regulon.
library(SCENIC)
org <- "hgnc" # or mgi, or dmel
dbDir <- "/home/ggj/Rdata/201906/Human/databases/" # RcisTarget databases location
myDatasetTitle <- "SCENIC example on Human1.1" # choose a name for your analysis
dbs <- c("hg19-500bp-upstream-10species.mc9nr.feather", "hg19-tss-centered-5kb-10species.mc9nr.feather")
names(dbs) <- c("500bp", "5kb")
scenicOptions <- initializeScenic(org = org, dbDir = dbDir, dbs = dbs, datasetTitle = myDatasetTitle, nCores = 10)

# Every relative "int/..." path below is resolved against this directory, so
# the working directory is set once and not changed again.
setwd("/home/ggj/github/HCL/HCL/Scenic_R/example_data/SCENIC/")
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
scenicOptions@inputDatasetInfo$colVars <- "int/colVars.Rds"
saveRDS(scenicOptions, file = "./int/scenicOptions.Rds")


###
### Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
runCorrelation(exprMat_filtered, scenicOptions)

##1 load AUCell matrix from PyScenic
# Read through its absolute path so the working directory stays in SCENIC/.
# The matrix was previously saved after switching to example_data/, i.e. into
# example_data/int/, which this script never creates.
aucell.file <- "/home/ggj/github/HCL/HCL/Scenic_R/example_data/aucell.csv"
regulonAUC <- importAUCfromText(aucell.file)
regulonAUC
dim(regulonAUC)
saveRDS(regulonAUC, file = "./int/3.4_regulonAUC.Rds")

##2 run SCENIC tSNE
nPcs <- c(50)
scenicOptions@settings$seed <- 123 # same seed for all of them
# Run t-SNE with different settings:
fileNames <- tsneAUC(scenicOptions, aucType = "AUC", nPcs = nPcs, perpl = c(5, 15, 50))
# Using only "high-confidence" regulons (normally similar)
fileNames <- tsneAUC(scenicOptions, aucType = "AUC", nPcs = nPcs, perpl = c(5, 15, 50), onlyHighConf = TRUE, filePrefix = "int/tSNE_oHC")
# Compare the t-SNEs computed on all regulons (files named int/tSNE_AUC*.Rds):
par(mfrow = c(length(nPcs), 3))
fileNames <- paste0("int/", grep(".Rds", grep("tSNE_AUC", list.files("int"), value = TRUE, perl = TRUE), value = TRUE))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend = FALSE, cex = .5)
# Default t-SNE used by the later steps
scenicOptions@settings$defaultTsne$aucType <- "AUC"
scenicOptions@settings$defaultTsne$dims <- 50
scenicOptions@settings$defaultTsne$perpl <- 50
saveRDS(scenicOptions, file = "int/scenicOptions.Rds")

# Optional manual inspection of the thresholds in a Shiny app. It only makes
# sense in an interactive session, and its selections are replaced by the
# 0.5 * max rule below.
if (interactive()) {
  # Better if it is logged/normalized
  aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat) # default t-SNE
  savedSelections <- shiny::runApp(aucellApp)
  # Save the modified thresholds:
  newThresholds <- savedSelections$thresholds
}

# Per-regulon summary of the AUC values. check.names = FALSE keeps the regulon
# names exactly as written in the file (e.g. "TF(+)"); repairing the mangled
# names with gsub("\\...", ...) also rewrote any other "." in a name.
auc <- read.csv(aucell.file, row.names = 1, check.names = FALSE)
usethrethold <- apply(auc, 2, summary)
dim(usethrethold)
A0.5 <- 0.5 * usethrethold["Max.", ]
names(A0.5)
auc[1:5, 1:5]
# Both differences should be empty when the regulon names agree.
setdiff(colnames(usethrethold), rownames(regulonAUC))
setdiff(rownames(regulonAUC), colnames(usethrethold))

#runSCENIC aucell binarize
#0.5*max
newThresholds <- A0.5
scenicOptions@fileNames$int["aucell_thresholds", 1] <- "int/newThresholds.Rds"
saveRDS(newThresholds, file = getIntName(scenicOptions, "aucell_thresholds"))
saveRDS(scenicOptions, file = "int/scenicOptions.Rds")
runSCENIC_4_aucell_binarize(scenicOptions)
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------
/Seurat-example/FetalThymus2_dge.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/Seurat-example/FetalThymus2_dge.txt.gz


--------------------------------------------------------------------------------
/Seurat-example/Tissue_seurat.R:
--------------------------------------------------------------------------------
  1 | setwd("./FetalThymus2")
  2 | 
  3 | ## Reading files(dges)
  4 | a<-read.table("./FetalThymus2_dge.txt.gz",row.names = 1,header = T)
  5 | FetalThymus2<-a
  6 | 
  7 | name<-"FetalThymus2"
  8 | ###### bulid dge below 500 UMI and dge more than 500 UMI
  9 | FetalThymus2_500less <- FetalThymus2[,colSums(FetalThymus2)<500 & colSums(FetalThymus2)> 100]
 10 | FetalThymus2_500more<-FetalThymus2[,colSums(FetalThymus2)>=500]
 11 | 
 12 | FetalThymus2_Anno <- data.frame(Cell_barcode= colnames(FetalThymus2_500more),
 13 |                                Sample      = replicate("FetalThymus",n=ncol(FetalThymus2_500more)),
 14 |                                Batch       = replicate("FetalThymus2",n=ncol(FetalThymus2_500more)))
 15 | colnames(FetalThymus2_500more) <- paste("2",colnames(FetalThymus2_500more),sep = ".")                           
 16 | colnames(FetalThymus2_500more) <- paste("FetalThymus",colnames(FetalThymus2_500more),sep = "_")                           
 17 | FetalThymus2_Anno[,"Cell_id"]  <- colnames(FetalThymus2_500more)
 18 | FetalThymus2_Anno[,"Cluster_id"] = replicate("1",n=ncol(FetalThymus2_500more))
 19 | FetalThymus2_Anno$Ages<-"10W"
 20 | FetalThymus2_Anno$Development_stage<-"Fetus"
 21 | FetalThymus2_Anno$Method<-rep("Microwell-seq")
 22 | FetalThymus2_Anno$Gender<-"Male"
 23 | FetalThymus2_Anno$Source<-rep("HCL")
 24 | FetalThymus2_Anno$Biomaterial<-rep("FetalThymus")
 25 | FetalThymus2_Anno$Name<-rep("FetalThymus2_10W")
 26 | 
 27 | 
 28 | 
 29 | ##  Names for the background / 500more / 500less objects (not referenced again in this script)
 30 | name<-"FetalThymus2"
 31 | name_background <- paste(name,"background", sep="_")
 32 | name_500more  <-  paste(name,"500more", sep="_")
 33 | name_500less  <-  paste(name,"500less", sep="_")
 34 | 
 35 | ## Check the data: total counts per column and number of non-zero rows per column
 36 | par(mfrow=c(2,1))  # two stacked plot panels
 37 | hist(colSums(FetalThymus2_500more),breaks = 200)  # total counts per column (cell)
 38 | hist(colSums(FetalThymus2_500more>0),breaks = 200)  # non-zero rows (detected genes) per column
 39 | abline(v=300)  # vertical line at 300; the same value is passed as min.genes when the Seurat object is created
 40 | summary(colSums(FetalThymus2_500more>0))  # the two lines below are the author's recorded output
 41 | #Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 42 | #71.0   386.0   440.0   473.8   530.0  1561.0 
 43 | 
 44 | library(Seurat)  # NOTE(review): raw.data=, FilterCells, FindVariableGenes below presumably need the Seurat 2.x API - confirm
 45 | seqwell <- CreateSeuratObject(raw.data = Matrix(as.matrix(FetalThymus2_500more),sparse=T)
 46 |                               ,min.cells = 3,min.genes = 300,names.delim = "\\.")  ## no normalization at this step
 47 | # NOTE(review): Matrix() is called without library(Matrix); presumably attached together with Seurat - confirm
 48 | dim(seqwell@data)
 49 | #19211  9801
 50 | mito.genes <- grep(pattern = "^MT-", x = rownames(x = seqwell@data), value = TRUE)  # row names starting with "MT-"
 51 | percent.mito <- colSums(seqwell @raw.data[mito.genes, ])/colSums(seqwell @raw.data)  # fraction (0-1) of raw counts in those genes
 52 | seqwell <- AddMetaData(object = seqwell, metadata = percent.mito, col.name = "percent.mito")
 53 | VlnPlot(object = seqwell , features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
 54 | par(mfrow = c(1, 2))  # two side-by-side panels for the scatter plots
 55 | GenePlot(object = seqwell , gene1 = "nUMI", gene2 = "percent.mito")
 56 | GenePlot(object = seqwell, gene1 = "nUMI", gene2 = "nGene")
 57 | seqwell<- FilterCells(object = seqwell , subset.names = c("nGene", "percent.mito"), 
 58 |                       low.thresholds = c(300, -Inf), high.thresholds = c(2500, 0.2))  # nGene bounds 300 / 2500, percent.mito upper bound 0.2
 59 | seqwell <- NormalizeData(object = seqwell, normalization.method = "LogNormalize", 
 60 |                          scale.factor = 10000)
 61 | seqwell<- ScaleData(seqwell,vars.to.regress=c("nUMI", "percent.mito"), do.par = TRUE, num.cores =8)  # regress out nUMI and percent.mito
 62 | par(mfrow=c(1,1))  # back to a single plot panel
 63 | seqwell<- FindVariableGenes(object = seqwell, mean.function = ExpMean, dispersion.function = LogVMR 
 64 |                             ,x.low.cutoff = 0.01, 
 65 |                             x.high.cutoff = 6, y.cutoff = 0.5)
 66 | length(seqwell @var.genes)# 1893
 67 | #hv.genes <- head(rownames(seqwell@hvg.info), 2000)
 68 | 
 69 | pbmc<-seqwell  # the analysis continues under the name pbmc
 70 | rm(seqwell)
 71 | # Remove variable genes whose names match RPS, RPL or MT before they are passed to RunPCA
 72 | var.gene<-pbmc@var.genes
 73 | var.gene<-var.gene[!grepl(pattern = "*RPS",x=var.gene)]  # NOTE(review): leading "*" is glob-style; pattern is unanchored, so presumably any name containing "RPS" is dropped - confirm
 74 | var.gene<-var.gene[!grepl(pattern = "*RPL",x=var.gene)]  # same remark for "RPL"
 75 | var.gene<-var.gene[!grepl(pattern = "*MT",x=var.gene)]  # NOTE(review): presumably drops every name containing "MT", not only the "^MT-" genes used for percent.mito - confirm
 76 | length(var.gene)
 77 | #2076  NOTE(review): larger than the 1893 recorded for var.genes above, so presumably noted from a different run
 78 | 
 79 | # Perform linear dimensional reduction (PCA on the filtered variable genes, 50 components)
 80 | pbmc <- RunPCA(object = pbmc, pc.genes = var.gene, pcs.compute = 50, do.print = TRUE, 
 81 |                pcs.print = 1:5, genes.print = 5)
 82 | # Determine statistically significant principal components
 83 | pbmc <- JackStraw(object = pbmc, num.replicate = 100, num.pc = 40, num.cores = 8,do.par = TRUE)
 84 | # JackStrawPlot compares the distribution of p-values for each PC with a uniform distribution (dashed line); significant PCs show a strong enrichment of genes with low p-values (solid curve above the dashed line). The trailing number on the next line is the author's recorded value (presumably the PC cut-off read from the plot).
 85 | JackStrawPlot(object = pbmc, PCs = 1:40)#25
 86 | # A more ad hoc method is to plot the standard deviations of the principal components and draw the cut-off where there is a clear elbow; PCElbowPlot does this. The trailing number on the next line is the author's recorded value (presumably the elbow position).
 87 | PCElbowPlot(object = pbmc,num.pc = 50)#14
 88 | PCHeatmap(object = pbmc, pc.use = 1:15, cells.use = 500, do.balanced = TRUE, 
 89 |           label.columns = FALSE, use.full = FALSE)  # heatmaps for PCs 1-15
 90 | PCHeatmap(object = pbmc, pc.use = 16:30, cells.use = 500, do.balanced = TRUE, 
 91 |           label.columns = FALSE, use.full = FALSE)  # heatmaps for PCs 16-30
 92 | PCHeatmap(object = pbmc, pc.use = 31:50, cells.use = 500, do.balanced = TRUE, 
 93 |           label.columns = FALSE, use.full = FALSE)  # heatmaps for PCs 31-50
 94 | 
 95 | 
 96 | # Run non-linear dimensional reduction (tSNE)
 97 | # tSNE is used to visualize the data; clustering is not done on the tSNE components. Cells within the graph-based clusters should co-localize on the tSNE plot because tSNE places cells with similar local neighborhoods in high-dimensional space together in low-dimensional space. Note that this script passes different PC ranges to the calls below: 1:20 for the first tSNE, 1:15 for FindClusters and 1:12 for the FIt-SNE run.
 98 | pbmc <- RunTSNE(object = pbmc, dims.use = 1:20, do.fast = TRUE)
 99 | TSNEPlot(object = pbmc,do.label = T, pt.size = 1,label.size = 5)
100 | # Cluster on the PCA reduction at seven resolutions; the DimPlot calls below group by the matching res.<resolution> columns
101 | pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:15, save.SNN = TRUE, 
102 |                      resolution =c(0.6,0.8,1,1.4,2,2.5,4),force.recalc = T,k.param=15)
103 | # Second embedding computed with FIt-SNE and stored under the reduction name "FItSNE"; needs the fast_tsne binary at the hard-coded path
104 | pbmc <- RunTSNE(object = pbmc, reduction.use = "pca", dims.use = 1:12, tsne.method = "FIt-SNE", 
105 |                 nthreads = 8, reduction.name = "FItSNE", reduction.key = "FItSNE_", 
106 |                 fast_tsne_path = "/home/ggj/Documents/tools/FIt-SNE-master/bin/fast_tsne", 
107 |                 max_iter = 2000,perplexity=100)
108 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
109 |         pt.size = 1,group.by = "res.0.6",do.label = T)+ggtitle("res.0.6") #17 (author's recorded number, presumably the cluster count at this resolution; same for the numbers below)
110 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
111 |         pt.size = 1,group.by = "res.0.8",do.label = T) +ggtitle("res.0.8")#17
112 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
113 |         pt.size = 1,group.by = "res.1",do.label = T) +ggtitle("res.1")#19
114 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
115 |         pt.size = 1,group.by = "res.1.4",do.label = T)+ggtitle("res.1.4") #19
116 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
117 |         pt.size = 1,group.by = "res.2",do.label = T)+ggtitle("res.2") #23
118 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
119 |         pt.size = 1,group.by = "res.2.5",do.label = T) +ggtitle("res.2.5")#27
120 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
121 |         pt.size = 1,group.by = "res.4",do.label = T)+ggtitle("res.4") #32
122 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
123 |         pt.size = 1,do.label = T)  # no group.by: grouped by the current identity
124 | 
125 | 
126 | 
127 | 
128 | pbmc<-SetAllIdent(pbmc,id="res.0.6")  # use the res.0.6 clustering as identity
129 | aa<-FindMarkers(pbmc,0,1)# merge  (clusters 0 and 1 compared; "merge" is the author's note)
130 | 
131 | pbmc<-SetAllIdent(pbmc,id="res.0.8")  # use the res.0.8 clustering as identity
132 | aa<-FindMarkers(pbmc,2,1)# merge  (clusters 2 and 1 compared; overwrites the previous aa)
133 | 
134 | # Final identities: start from res.1.4 and relabel its clusters 0-13
135 | pbmc<-SetAllIdent(pbmc,id="res.1.4")
136 | 
137 | current.cluster.ids <- 0:13
138 | new.cluster.ids <-c(0,1,2,3,4,5,6,7,8,9,10,11,12,13)  # identity mapping; overwritten by the next line before use
139 | new.cluster.ids <-c(1,1,1,1,1,1,1,1,1,2,3,4,5,6)  # clusters 0-8 become 1; clusters 9-13 become 2-6
140 | pbmc@ident <- plyr::mapvalues(pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
141 | table(pbmc@ident)  # cells per relabelled cluster
142 | DimPlot(object = pbmc, reduction.use = "FItSNE", no.legend = F, do.return = TRUE, 
143 |         pt.size = 1,do.label = T)  # FItSNE embedding grouped by the current (relabelled) identity
144 | 
145 | 
146 | save.image("./FetalThymus2.RData")  # snapshot of the whole workspace before marker detection
147 | pbmc.markers<-FindAllMarkers(pbmc, only.pos = TRUE,  thresh.use = 0.25,min.pct = 0.15)  # positive markers only
148 | pbmc.markers <- pbmc.markers[with(pbmc.markers, order(cluster,-avg_logFC, p_val_adj)),]  # by cluster, then descending avg_logFC, then p_val_adj
149 | table(pbmc.markers$cluster)  # markers per cluster
150 | library(gdata)  # NOTE(review): no gdata function is visibly called below - possibly unneeded, confirm
151 | WriteXLS::WriteXLS(pbmc.markers,"./markers.xlsx")
152 | library(dplyr)
153 | pbmc.markers %>% group_by(cluster) %>% top_n(20, avg_logFC) ->top20  # top 20 rows per cluster by avg_logFC
154 | DoHeatmap(pbmc, genes.use = top20$gene,  slim.col.label = TRUE, remove.key = TRUE,cex.row =3,group.cex=5)
155 | save.image("./FetalThymus2.RData")  # overwrite the snapshot, now including the markers
156 | save(pbmc,pbmc.markers,FetalThymus2_Anno,file = "./FetalThymus2_pbmc.RData")  # only the object, its markers and the annotation table
157 | 
158 | 


--------------------------------------------------------------------------------
/scHCL-build-reference/.Rhistory:
--------------------------------------------------------------------------------
1 | load("~/Rdata/201806/Agingmouse/OneYearThymus/OneYearThymus.RData")  # restore a saved workspace; presumably provides the `pbmc` plotted below
2 | FeaturePlot(pbmc,c("Cd4"),reduction.use = "FItSNE")  # the calls below plot features on the "FItSNE" reduction
3 | FeaturePlot(pbmc,c("Cd4","Cd8a1"),reduction.use = "FItSNE")
4 | FeaturePlot(pbmc,c("Cd4","Cd8a"),reduction.use = "FItSNE")
5 | FeaturePlot(pbmc,c("Cd4","Cd8a","Cd8b1"),reduction.use = "FItSNE")
6 | 


--------------------------------------------------------------------------------
/scHCL-build-reference/1-build-reference.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | library(dplyr)
  3 | library(Matrix)
  4 | # get_tissue_arv
  5 | # Builds three pseudo-cell profiles (and their per-gene mean) for one cluster.
  6 | get_tissue_arv <- function(cell_counts, subcount)  {
  7 |   # cell_counts: number of cells in the cluster (selects the sampling scheme below).
  8 |   # subcount:    identity value compared against pbmc@ident to pick the cluster's cells.
  9 |   # `pbmc` is not an argument: it is looked up outside the function (slots @data, @raw.data, @ident).
 10 |   # Returns list(genes x 1 mean matrix, genes x 3 pseudo-cell matrix, number of genes with mean > 0).
 11 |   # Take the names by logical index so a one-cell cluster is not collapsed to an unnamed vector.
 12 |   cluster_cells <- colnames(pbmc@data)[pbmc@ident==subcount]
 13 |   counts <- as.matrix(pbmc@raw.data[,cluster_cells,drop = FALSE])
 14 |   scaled <- t(t(counts)/colSums(counts))*100000  # each cell rescaled to a total of 100000
 15 |   n_cells <- ncol(scaled)
 16 |   if (cell_counts >= 300) {          # floor(n/100) draws of 100 cells without replacement
 17 |     n_draws <- floor(n_cells/100); draw_size <- 100; with_replacement <- FALSE
 18 |   } else if (cell_counts >= 100) {   # 3 draws of 100 cells with replacement
 19 |     n_draws <- 3; draw_size <- 100; with_replacement <- TRUE
 20 |   } else {                           # 3 draws that each use every cell once
 21 |     n_draws <- 3; draw_size <- n_cells; with_replacement <- FALSE
 22 |   }
 23 |   # Columns are named X1..Xk; get_tissue_sample_data() prefixes these names with the cluster.
 24 |   single_sample_in_ref <- matrix(NA_real_,nrow = nrow(scaled),ncol = n_draws,
 25 |                                  dimnames = list(rownames(scaled),paste0("X",seq_len(n_draws))))
 26 |   for (i in seq_len(n_draws)) {
 27 |     picked <- sample.int(n_cells,draw_size,replace = with_replacement)
 28 |     single_sample_in_ref[,i] <- rowMeans(scaled[,picked,drop = FALSE])
 29 |   }
 30 |   # Keep three pseudo-cells (in random order) and round their values down.
 31 |   keep <- sample.int(ncol(single_sample_in_ref),3,replace = FALSE)
 32 |   single_sample_in_ref <- floor(single_sample_in_ref[,keep,drop = FALSE])
 33 |   sample_tissue_mean <- as.matrix(apply(single_sample_in_ref,1,mean))
 34 |   sumgene_sample_tissue <- sum(sample_tissue_mean>0)
 35 |   return(list(sample_tissue_mean,single_sample_in_ref,as.numeric(sumgene_sample_tissue)))
 36 | }
 37 | # get pseudocell data (draws of 100 cells) for every cluster of one object
 38 | get_tissue_sample_data <- function(pbmc){
 39 |   # pbmc: object with an @ident slot; get_tissue_arv() reads the global `pbmc`, so pass that same object.
 40 |   # Returns list(pseudo-cell matrix, per-cluster mean matrix, genes with mean > 0 per cluster).
 41 |   ident_counts <- table(pbmc@ident)
 42 |   # Match clusters by label: positions 1..n equal the labels only when cluster ids start at 1.
 43 |   ident_labels <- names(ident_counts)
 44 |   per_cluster <- lapply(seq_along(ident_labels),function(k) {
 45 |     xx <- get_tissue_arv(as.numeric(ident_counts[k]),ident_labels[k])
 46 |     colnames(xx[[2]]) <- paste0(ident_labels[k],"_",colnames(xx[[2]]))
 47 |     xx
 48 |   })
 49 |   tissue_sample <- do.call(cbind,lapply(per_cluster,`[[`,1))  # one mean column per cluster
 50 |   tissue_data <- do.call(cbind,lapply(per_cluster,`[[`,2))    # pseudo-cell columns of every cluster
 51 |   tissue_sumgene <- unlist(lapply(per_cluster,`[[`,3))
 52 |   dimnames(tissue_sample) <- list(rownames(tissue_data),ident_labels)
 53 |   return(list(tissue_data,tissue_sample,tissue_sumgene)) }
 54 | 
 55 | #### The Seurat objects with clustering information are stored in *_pbmc.RData files; set the working directory to the folder that holds them
 56 | # To build a Seurat object, see the instructions of the Seurat R package
 57 | setwd("/media/ggj/SHYbeifen/HCLREFuse.RData/")
 58 | tissuedata <- list.files(pattern="*.RData")  # character vector of file names
 59 | tissuenames <- reshape2::colsplit(tissuedata,pattern="_pbmc.RData",names=c("tissue","c"))$tissue  # file name with "_pbmc.RData" split off
 60 | total_tissue_gene = data.frame()  # receives xx[[1]]: the pseudo-cell columns returned by get_tissue_sample_data()
 61 | total_tissue_data = data.frame()  # receives xx[[2]]: the per-cluster mean columns
 62 | for(i in 1:length(tissuenames)){
 63 |   message(paste0("Loading ",tissuedata[i]))
 64 |   load(tissuedata[i])  # each file is expected to define `pbmc`, which is used and removed below
 65 |   message("Finish Loading")
 66 |   xx = get_tissue_sample_data(pbmc)
 67 |   colnames(xx[[1]]) <- paste0(tissuenames[i],"_",colnames(xx[[1]]))  # prefix column names with the tissue name
 68 |   colnames(xx[[2]]) <- paste0(tissuenames[i],"_",colnames(xx[[2]]))
 69 |   message("Staring mearge")
 70 |   if(i==1){total_tissue_gene = xx[[1]]; total_tissue_data = xx[[2]];tissue_sumgene <-xx[[3]]}
 71 |   else{ total_tissue_gene=merge(total_tissue_gene,xx[[1]],by="row.names",all=T,sort=T); rownames(total_tissue_gene)=total_tissue_gene[,1] ; total_tissue_gene=total_tissue_gene[,-1]; total_tissue_gene[is.na(total_tissue_gene)]=0 ; # outer join on row names; rows missing on one side are filled with 0
 72 |   total_tissue_data=merge(total_tissue_data,xx[[2]],by="row.names",all=T,sort=T); rownames(total_tissue_data)=total_tissue_data[,1] ; total_tissue_data=total_tissue_data[,-1]; total_tissue_data[is.na(total_tissue_data)]=0 ; # same join for the per-cluster means
 73 |   tissue_sumgene <-c(tissue_sumgene,xx[[3]])
 74 |   }
 75 |   rm(pbmc)	
 76 |   message(paste0("Finish ",tissuenames[i]))	  }
 77 | tissue_sumgene<-data.frame(tissue_sumgene,row.names = colnames(tissuedata))  # NOTE(review): tissuedata is a character vector, so colnames() is NULL here; presumably colnames(total_tissue_data) was meant - confirm
 78 | hist(tissue_sumgene$tissue_sumgene,breaks = 100,main="average",xlab = "ngene")  # genes with mean > 0, one value per cluster
 79 | hist(colSums(total_tissue_gene>0),breaks = 100,main="sample",xlab = "ngene")  # non-zero genes per pseudo-cell column
 80 | summary(tissue_sumgene$tissue_sumgene)
 81 | summary(colSums(total_tissue_gene>0))
 82 | 
 83 | ############################# Map every pseudo-cell column to the cluster column it was derived from
 84 | all_tissue_gene <- colnames(total_tissue_gene)  # despite the name, these are the pseudo-cell column names
 85 | subs_group <- colnames(total_tissue_data)  # one name per cluster (<tissue>_<cluster>)
 86 | names_id <- c()
 87 | for(i in 1:length(subs_group)) { id=grep(pattern=paste0("^",subs_group[i],"_"),all_tissue_gene); names_id = c(names_id,rep(subs_group[i],length(id)))  }  # repeat each cluster name once per pseudo-cell column that starts with it
 88 | allclusterID<-data.frame(colnames(total_tissue_gene),names_id)  # paired by position; presumably relies on both column sets sharing the same cluster order - verify
 89 | rownames(allclusterID)<-allclusterID$colnames.total_tissue_gene.
 90 | 
 91 | ### Run Seurat on the pseudo-cells to find differentially expressed (marker) genes per cluster
 92 | library(Seurat)
 93 | library(dplyr)
 94 | library(Matrix)
 95 | pbmc <- CreateSeuratObject(raw.data =total_tissue_gene, min.cells = 3, min.genes = 20, 
 96 |                            project = "nolog")
 97 | pbmc <- AddMetaData(object = pbmc, metadata =allclusterID)
 98 | pbmc<-SetAllIdent(pbmc,id="names_id")  # identity = cluster name of each pseudo-cell
 99 | pbmc@ident
100 | pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
101 |                       scale.factor = 100000)
102 | idnetname<-rownames(table(pbmc@ident))  # the distinct identity names
103 | save(idnetname,pbmc,file = "/media/ggj/SHYbeifen/HCLREFuse.RData/ref/Marker/human_ref_pbmc.RData")
104 | pbmc.markers <- FindAllMarkers(object =pbmc, only.pos = TRUE, min.pct = 0.15, 
105 |                                thresh.use = 0.15)
106 | pbmc.markers<-pbmc.markers[order(pbmc.markers$cluster,-pbmc.markers$avg_logFC,pbmc.markers$p_val  ),]  # by cluster, then descending avg_logFC, then p_val
107 | 
108 | # Get the feature genes: top 20 genes per cluster by avg_logFC, duplicates removed
109 | library(dplyr)
110 | pbmc.markers %>% group_by(cluster) %>% top_n(20,avg_logFC) -> top20
111 | top20 <- top20$gene
112 | top20 <- top20[!duplicated(top20)]
113 | ref_exp <- reference[top20,]  # NOTE(review): `reference` is never assigned in this script; presumably the per-cluster mean matrix (total_tissue_data) - confirm
114 | save.image("/home/ggj/Rdata/201810/NewReference/ref-exp.RData")  # whole workspace; 2-scHCLuse.R loads this file for ref_exp
115 | 


--------------------------------------------------------------------------------
/scHCL-build-reference/2-scHCLuse.R:
--------------------------------------------------------------------------------
 1 | # Annotate new cells with scHCL: correlate every test cell with every reference column
 2 | # over the reference genes and report the best-matching reference column per cell.
 3 | # Load the reference workspace; it must provide ref_exp (rows: reference genes).
 4 | load("/home/ggj/Rdata/201810/NewReference/ref-exp.RData")
 5 | # Log-transform the reference before its first use (it used to be read before being assigned).
 6 | ref<-log(ref_exp+1)
 7 | # Load the matrix of new cells: rows are gene names, columns are cell names.
 8 | Test_dge<-read.table("./_dge_sample.csv",sep = ",",header = T,row.names = 1)
 9 | # Log-normalize the uploaded DGE: scale every cell to a total of 100000, then log(x + 1).
10 | Test_dge<-as.matrix(t(t(Test_dge)/colSums(Test_dge))*100000)
11 | Test_dge<-log(Test_dge+1)
12 | # Align the test cells to the reference genes; genes absent from the test data stay 0.
13 | tst <- matrix(0,nrow = nrow(ref),ncol = ncol(Test_dge),
14 |               dimnames = list(rownames(ref),colnames(Test_dge)))
15 | shared_genes <- intersect(rownames(ref),rownames(Test_dge))
16 | tst[shared_genes,] <- Test_dge[shared_genes,,drop = FALSE]
17 | tst[is.na(tst)]<-0
18 | # Correlation matrix: one row per reference column, one column per test cell.
19 | cors <- cor(ref,tst)
20 | cors[is.na(cors)]<-0
21 | cor1<-cors
22 | cor1m<-apply(cor1,2,max) # highest correlation of each test cell
23 | cor1S<-apply(cors,2,function(x) rownames(cors)[which.max(x)]) # best-matching reference column of each test cell
24 | cor1r<-cbind(cor1S,cor1m)
25 | # The scHCL result is stored in the scHCL data frame.
26 | scHCL<-data.frame(cor1r)
27 | colnames(scHCL)<-c("scHCL_result" ,  "cors_log")


--------------------------------------------------------------------------------
/scHCL-build-reference/HCLREFuse.RData/example1.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ggjlab/HCL/7e33ba7e9b3cfd1662fb80de09dec7b7e50c085d/scHCL-build-reference/HCLREFuse.RData/example1.RData


--------------------------------------------------------------------------------