├── README.md
└── analysis_scripts
    ├── 01_prelim.filters.R
    ├── 02_prelim.cluster.R
    ├── 03_filter_from_prelim_cluster.R
    ├── 04_cluster.R
    ├── 05_label_transfer_of_aged_cells.R
    ├── 06_annotate_and_filter_from_second_cluster.R
    ├── 07_UMAP.R
    ├── 08a_subcluster_immune.R
    ├── 08b_subcluster_tanycytes_ependymal.R
    ├── 09a_degenes_age_mast_subclass.R
    ├── 09b_degenes_age_mast_supertype_nn.R
    ├── 10a_gsea_gprofiler_subclass.R
    ├── 10b_gsea_gprofiler_nnsupertype.R
    ├── 11_GO_matrix_gprofiler.R
    ├── 12_augur.R
    ├── 13_pb_edgeR.R
    ├── functions
        ├── degenes_age_mast_simplified_qcscore.R
        ├── run_umap.py
        └── scrattch.mapping
        │   ├── HANN.R
        │   ├── HANN_build.R
        │   ├── HANN_prepareTaxonomy.R
        │   └── HANN_utils.R
    └── r_objects
        ├── ABC_MWB_taxonomy
            ├── AIT13.0_mouse
            │   └── cl.clean.rda
            └── AIT21.0_mouse
            │   └── cl.clean.rda
        ├── broad.roi.key.rda
        ├── col.age.rda
        ├── col.broi.rda
        ├── genotype.genes.rda
        ├── sex.genes.rda
        └── supertype.exclude.rda


/README.md:
--------------------------------------------------------------------------------
1 | # mouse_aging_scRNAseq
2 | Analysis scripts related to Jin et al manuscript on scRNA seq in aging mouse brain
3 | 


--------------------------------------------------------------------------------
/analysis_scripts/01_prelim.filters.R:
--------------------------------------------------------------------------------
 1 | library(scrattch.hicat)
 2 | library(dplyr)
 3 | 
 4 | ## Description/Goal:
 5 | ## Filter data at the single-cell level using loose filter criteria
 6 | 
 7 | #################################################
 8 | ## Load data files
 9 | #################################################
10 | load("samp.dat_bothages_qcscore_20220705.rda") ## loading in cell metadata file
11 | 
12 | ## Filter by very loose criteria
13 | qc.cut = 50 ## qc score cutoff
14 | gc.cut = 1000 ## gene detection cutoff
15 | doub.cut = 0.3 ## doublet score cutoff
16 | 
17 | samp.dat.filtered = samp.dat[samp.dat$gene.counts.0 > gc.cut &
18 |                                samp.dat$qc.score > qc.cut &
19 |                                samp.dat$doublet_score < 0.3,]
20 | 
21 | save(samp.dat.filtered, file = "samp.dat.filtered_prelim.20220705.rda") ## save new filtered meta-data object for later


--------------------------------------------------------------------------------
/analysis_scripts/02_prelim.cluster.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Perform first round of clustering
 3 | 
 4 | 
 5 | library(dplyr)
 6 | library(scrattch.hicat)
 7 | library(scrattch.bigcat)
 8 | library(bigstatsr)
 9 | library(Matrix)
10 | library(RcppParallel)
11 | library(matrixStats)
12 | 
13 | 
14 | #################################################
15 | ## Load data files
16 | #################################################
17 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## cell by gene file-backed matrix (see github.com/alleninstitute/scrattch.bigcat for examples for assembly)
18 | load("samp.dat.filtered_prelim.20220705.rda") ## filtered metadata file from previous step
19 | 
20 | select.cells = samp.dat.filtered$sample_id
21 | 
22 | 
23 | #################################################
24 | ## First clustering round
25 | #################################################
26 | ## Run clustering
27 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50) ## Looser settings
28 | 
29 | set.seed(123)
30 | resulti = iter_clust_big(big.dat = big.dat, select.cells = select.cells, de.param =de.param, prefix="./prelim_cluster/cluster_0220705",
31 |                          max.cl.size=200, split.size = 50, verbose=1, sampleSize=10000)
32 | 
33 | save(resulti, file="./prelim_cluster/resulti_cluster_0220705.rda")
34 | 
35 | cl = resulti$cl ## de novo clusters
36 | markers = resulti$markers ## marker genes for all clusters
37 | 
38 | ## Subsample results and merge
39 | sampled.cells = sample_cells(cl, 100)
40 | sampled.cells = sample(sampled.cells, 200000, replace = F)
41 | norm.dat = get_logNormal(big.dat, sampled.cells)
42 | select.markers = intersect(markers, big.dat$row_id)
43 | 
44 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50)
45 | merge.result = merge_cl(norm.dat=norm.dat, cl=cl, rd.dat.t=norm.dat[select.markers,], de.param=de.param, verbose=TRUE)
46 | 
47 | 
48 | 
49 | #################################################
50 | ## Save objects for future use
51 | #################################################
52 | save(merge.result, file="merge.result_th300_cluster_0220705.rda")
53 | save(norm.dat, file = "norm.dat_cl100_ss200Kcells.rda")
54 | 


--------------------------------------------------------------------------------
/analysis_scripts/03_filter_from_prelim_cluster.R:
--------------------------------------------------------------------------------
  1 | ## Description/Goal:
  2 | ## Use preliminary clustering results to perform additional round of filtering of clusters 
  3 | ## before second and final round of clustering
  4 | 
  5 | ## Note:
  6 | ## This script uses an older version of ABC-MWB annotations (AIT13.0) - we recommend using
  7 | ## the latest version that has been published in Yao, 2023 Nature, but are including the older
  8 | ## version here for transparancy
  9 | 
 10 | 
 11 | library(dplyr)
 12 | library(scrattch.hicat)
 13 | options(stringsAsFactors = F)
 14 | 
 15 | 
 16 | #################################################
 17 | ## Load data files
 18 | #################################################
 19 | ## Load objects from prior scripts
 20 | load("./prelim_cluster_objects/merge.result_th300_cluster_0220705.rda")
 21 | load("samp.dat.filtered_prelim.20220705.rda")
 22 | 
 23 | ## Load annotations that already exist for adult cells (these were an older version of ABC-MWB annotations)
 24 | load("r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda") ## loads objects cl.clean and cl.df.clean
 25 | 
 26 | 
 27 | #################################################
 28 | ## Combine adult annotations with preliminary cluster results
 29 | #################################################
 30 | ## Subset ABC-MWB results and label missing cells (assumed to be junk)
 31 | roi.keep = unique(samp.dat.filtered$roi)
 32 | 
 33 | cells.anno = intersect(samp.dat.filtered$sample_id, names(cl.clean))
 34 | cells.adult = samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "adult"]
 35 | cells.junk = setdiff(cells.adult, cells.anno)
 36 | 
 37 | anno.df.u19 = data.frame(sample_id = c(cells.anno, cells.junk))
 38 | anno.df.u19$cl.u19 = cl.clean[anno.df.u19$sample_id]
 39 | anno.df.u19$cl.u19[is.na(anno.df.u19$cl.u19)] = "missing.or.junk"
 40 | 
 41 | head(anno.df.u19)
 42 | tail(anno.df.u19)
 43 | 
 44 | anno.df.u19 = left_join(anno.df.u19, cl.df.clean[,c("cl", "cluster_label", "Level1_label", "Level2_label", "subclass_label", "class_label")], by = c("cl.u19" = "cl"))
 45 | anno.df.u19$cluster_label = as.character(anno.df.u19$cluster_label)
 46 | anno.df.u19[is.na(anno.df.u19)] = "missing.or.junk"
 47 | 
 48 | head(anno.df.u19)
 49 | tail(anno.df.u19)
 50 | 
 51 | 
 52 | ## Add cl to my data
 53 | row.names(samp.dat.filtered) = samp.dat.filtered$sample_id
 54 | cl = merge.result$cl
 55 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id]
 56 | 
 57 | ## Add U19 labels
 58 | samp.dat.filtered = left_join(samp.dat.filtered, anno.df.u19)
 59 | 
 60 | 
 61 | cl.df  = samp.dat.filtered %>% group_by(cl) %>% summarize(ncell = n(), ## this object summarizes many QC stats for each de novo cluster and will be used to visualize and determine filter criteria
 62 |                                                           med.gc = median(gene.counts.0),
 63 |                                                           med.qc = median(qc.score),
 64 |                                                           med.doub = median(doublet_score),
 65 |                                                           age_prop = sum(age_cat == "aged") / n(),
 66 |                                                           adult_prop = sum(age_cat == "adult") / n(),
 67 |                                                           U19_id_prop = 1 - (sum(is.na(cluster_label)) / n()),
 68 |                                                           max.cluster.U19 = names(which.max(table(cluster_label))),
 69 |                                                           max.cluster.U19.prop = (max(table(cluster_label)) / n()),
 70 |                                                           max.subclass.U19 = names(which.max(table(subclass_label))),
 71 |                                                           max.subclass.U19.prop = (max(table(subclass_label)) / n()),
 72 |                                                           max.class.U19 = names(which.max(table(class_label))),
 73 |                                                           max.class.U19.prop = (max(table(class_label)) / n()),
 74 |                                                           max.level1.U19 = names(which.max(table(Level1_label))),
 75 |                                                           max.level1.U19.prop = (max(table(Level1_label)) / n()),
 76 |                                                           max.level2.U19 = names(which.max(table(Level2_label))),
 77 |                                                           max.level2.U19.prop = (max(table(Level2_label)) / n()),
 78 |                                                           max.donor.prop = (max(table(library_prep)) / n()))
 79 | 
 80 | 
 81 | #################################################
 82 | ## Vis QC metrics
 83 | #################################################
 84 | library(gridExtra)
 85 | library(ggplot2)
 86 | 
 87 | ## Gene counts
 88 | p2 = cl.df %>%
 89 |   ggplot(aes(x = med.gc))+
 90 |   geom_histogram(bins = 50)+
 91 |   scale_x_log10()+
 92 |   geom_vline(xintercept = 2000)+
 93 |   geom_vline(xintercept = 3000, color = "red")+
 94 |   geom_vline(xintercept = 5500, color = "blue")+
 95 |   ggtitle("Median Gene Counts")+
 96 |   facet_wrap(~max.class.U19, ncol = 2, scales = "free_y")+
 97 |   theme_bw()
 98 | 
 99 | p1 = cl.df %>%
100 |   ggplot(aes(x = med.gc))+
101 |   geom_histogram(bins = 50)+
102 |   scale_x_log10()+
103 |   geom_vline(xintercept = 2000)+
104 |   geom_vline(xintercept = 3000, color = "red")+
105 |   geom_vline(xintercept = 5500, color = "blue")+
106 |   ggtitle("Median Gene Counts")+
107 |   facet_wrap(~max.class.U19, ncol = 2)+
108 |   theme_bw()
109 | 
110 | grid.arrange(p1, p2, nrow = 1)
111 | 
112 | ## QC Score
113 | p2 = cl.df %>%
114 |   ggplot(aes(x = med.qc))+
115 |   geom_histogram(bins = 50)+
116 |   # scale_x_log10()+
117 |   geom_vline(xintercept = 150)+
118 |   geom_vline(xintercept = 300, color = "red")+
119 |   ggtitle("Median QC Score")+
120 |   facet_wrap(~max.class.U19, ncol = 2, scales = "free_y")+
121 |   theme_bw()
122 | 
123 | p1 = cl.df %>%
124 |   ggplot(aes(x = med.qc))+
125 |   geom_histogram(bins = 50)+
126 |   # scale_x_log10()+
127 |   geom_vline(xintercept = 150)+
128 |   geom_vline(xintercept = 300, color = "red")+
129 |   ggtitle("Median QC Score")+
130 |   facet_wrap(~max.class.U19, ncol = 2)+
131 |   theme_bw()
132 | 
133 | grid.arrange(p1, p2, nrow = 1)
134 | 
135 | 
136 | #################################################
137 | ## Label LQ clusters
138 | #################################################
139 | ## Cluster-level QC and GC cutoffs for neurons versus non-neurons
140 | tmp1 = cl.df$cl[cl.df$max.class.U19 %in% c("NN", "IMN") & (cl.df$med.gc < 2000 | cl.df$med.qc < 100)] 
141 | tmp.neurons = c("Gaba", "Glut", "Hybrid", "Sero", "Dopa")
142 | tmp2 = cl.df$cl[cl.df$max.class.U19 %in% tmp.neurons & (cl.df$med.gc < 3000 | cl.df$med.qc < 250)]
143 | cl.lq.qc = unique(c(tmp1, tmp2))
144 | 
145 | ## Clusters that are mostly mapping to "junk"
146 | cl.lq.junk = cl.df$cl[cl.df$max.cluster.U19 == "missing.or.junk" & cl.df$U19_id_prop > 0.05]
147 | 
148 | ## Clusters that are biased towards a single donor
149 | cl.lq.donor = cl.df$cl[cl.df$max.donor.prop > 0.8]
150 | 
151 | ## Other manually annotated LQ clusters:
152 | cl.lq.manual = c(
153 |                  7051, ## Oligo cluster with endo markers
154 |                  7191, 7193, ## Astro + Microglia doublets
155 |                  7214, 7215, ## Astro + Endo doublets
156 |                  595 ## Astro + Gluta doublet (Slc17a7)
157 |                  )
158 | 
159 | 
160 | cl.lq = unique(c(cl.lq.qc, cl.lq.junk, cl.lq.donor, cl.lq.manual))
161 | 
162 | cl.df$keep = T
163 | cl.df$keep[cl.df$cl %in% cl.lq] = F
164 | 
165 | sum(cl.df$keep) ## number of remaining clusters
166 | sum(cl.df$ncell) ## number of starting cells
167 | sum(cl.df$ncell[cl.df$keep == T]) ## number of cells remaining after new filters
168 | 
169 | save(cl.df, file = "./prelim_cluster_objects/cl.df_U19.max.2022-09-15.rda") ## save cl.df object for later
170 | 
171 | 
172 | #################################################
173 | ## Filter LQ clusters
174 | #################################################
175 | ## Filter
176 | cl = merge.result$cl
177 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id]
178 | samp.dat.filtered = left_join(samp.dat.filtered, cl.df)
179 | 
180 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$keep == T,]
181 | 
182 | save(anno.df.clean, file = "anno.df.clean_20220915.rda")
183 | 
184 | 
185 | 
186 | 
187 | ## Create anno.df (remove weird geno/donors)
188 | weird.donor = "483935"
189 | weird.geno = c("Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt", "CX3CR1-GFP/CX3CR1-GFP")
190 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$external_donor_name != weird.donor,]
191 | anno.df.clean = anno.df.clean[!anno.df.clean$full_genotype %in% weird.geno,]
192 | row.names(anno.df.clean) = anno.df.clean$sample_id
193 | save(anno.df.clean, file = "anno.df.clean_20220915.rda")
194 | 


--------------------------------------------------------------------------------
/analysis_scripts/04_cluster.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Perform final round of clustering
 3 | 
 4 | 
 5 | library(dplyr)
 6 | library(scrattch.hicat)
 7 | library(scrattch.bigcat)
 8 | library(bigstatsr)
 9 | library(Matrix)
10 | library(RcppParallel)
11 | library(matrixStats)
12 | 
13 | 
14 | #################################################
15 | ## Load data
16 | #################################################
17 | load("big.dat_Aging.16ROI.V3.20220701.rda")
18 | load("anno.df.clean_20220915.rda") ## most recent metadata object with filtered cells
19 | 
20 | select.cells = anno.df.clean$sample_id
21 | 
22 | 
23 | #################################################
24 | ## Second clustering round
25 | #################################################
26 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50)
27 | 
28 | set.seed(123)
29 | resulti = iter_clust_big(big.dat = big.dat, select.cells = select.cells, de.param =de.param, prefix="./cluster_0916/cluster_intermed_files",
30 |                          max.cl.size=200, split.size = 50, verbose=1, sampleSize=10000)
31 | 
32 | save(resulti, file=paste0("./cluster_0916/resulti_cluster.rda"))
33 | 
34 | 
35 | ## Merge clusters
36 | load("./cluster_0916/resulti_cluster.rda")
37 | cl = resulti$cl
38 | markers = resulti$markers
39 | 
40 | sampled.cells = sample_cells(cl, 100)
41 | # sampled.cells = sample(sampled.cells, 200000, replace = F)
42 | norm.dat = get_logNormal(big.dat, sampled.cells)
43 | select.markers = intersect(markers, big.dat$row_id)
44 | 
45 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=100)
46 | merge.result = merge_cl(norm.dat=norm.dat, cl=cl, rd.dat.t=norm.dat[select.markers,], de.param=de.param, verbose=TRUE)
47 | 
48 | 
49 | 
50 | #################################################
51 | ## Save objects for future use
52 | #################################################
53 | save(merge.result, file= "./cluster_0916/merge.result_th300_mincells100.rda") ## clustering results
54 | save(norm.dat, file = "./cluster_0916/norm.dat_cluster0916_cl100.rda") ## subsampled cell-by-gene matrix
55 | 


--------------------------------------------------------------------------------
/analysis_scripts/05_label_transfer_of_aged_cells.R:
--------------------------------------------------------------------------------
  1 | ## Description/Goal:
  2 | ## Performing label tansfer for all aged cells from ABC-WMB taxonomy (AIT21.0)
  3 | 
  4 | ## Note:
  5 | ## This script was run on an HPC job scheduler, and prior to the publishing of scrattch.mapping package
  6 | ## We recomment installing github.com/alleninstitute/scrattch.mapping package to perform
  7 | ## label transfer, but are including original scripts here for the sake of transparency
  8 | 
  9 | ####################################################################
 10 | ## These arguments were used for indexing on an HPC job scheduler ##
 11 | args = commandArgs(TRUE)
 12 | array.index = args[1]
 13 | array.index = as.integer(array.index)
 14 | ####################################################################
 15 | 
 16 | 
 17 | ## These functions have since been implemented in github.com/alleninstitute/scrattch-mapping
 18 | ## but are also included in github repo for this project
 19 | source("functions/scrattch.mapping/HANN.R")
 20 | source("functions/scrattch.mapping/HANN_build.R")
 21 | source("functions/scrattch.mapping/HANN_utils.R")
 22 | source("functions/scrattch.mapping/HANN_prepareTaxonomy.R")
 23 | 
 24 | 
 25 | #################################################
 26 | ## Load data files
 27 | #################################################
 28 | library(scrattch.bigcat)
 29 | library(openblasctl) 
 30 | openblas_set_num_threads(1)
 31 | 
 32 | load("big.dat_Aging.16ROI.V3.20231208.rda")
 33 | load("samp.dat_bothages_qcscore_20231208.rda")
 34 | 
 35 | ## Select cells to keep
 36 | keep.cells = samp.dat$sample_id[samp.dat$age_cat == "aged"] ## Only select aged cells
 37 | 
 38 | 
 39 | #################################################
 40 | ## Run mapping
 41 | #################################################
 42 | library(dplyr)
 43 | library(scrattch.bigcat)
 44 | library(Matrix)
 45 | library(data.table)
 46 | library(arrow)
 47 | 
 48 | ## Breaking the data into parts for parallel processing on HPC
 49 | chunksize = 50000
 50 | start = seq(1,length(keep.cells), chunksize)
 51 | end = c(start[-1] - 1, length(keep.cells))
 52 | print(c(array.index, start[array.index], end[array.index]))
 53 | 
 54 | 
 55 | ## Perform label transfer
 56 | select.cells = keep.cells[c(start[array.index]:end[array.index])]
 57 | 
 58 | print("Get norm.dat...")
 59 | qdat = get_cols(big.dat, select.cells)
 60 | 
 61 | print("Start mapping...")
 62 | mapped = run_mapping_on_taxonomy(
 63 |   qdat,                       # qdat : log normalized count matrix (gene x cell)
 64 |   Taxonomy='AIT21.0_mouse', # taxonomy
 65 |   TaxHome='/allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/Taxonomies/',
 66 |   prefix='10X_cells_v3',      # prefix for your data platform (SS, 10X_cells_v2, 10X_cells_v3, 10X_nuclei_v3,MFISH)
 67 |   prebuild=FALSE,             # if the taxonomy is available for your platform, use it.
 68 |   newbuild=FALSE,              #
 69 |   mapping.method='flat', # 'flat','hierarchy' (AIT12.0_mouse: only flat)
 70 |   nlevel=2,
 71 |   mc.cores=10,                # lower this to 5 if the run fails due to the memory
 72 |   iter=100,
 73 |   blocksize=5000)
 74 | 
 75 | fn.tmp = paste0("mapping_aged_fmap_AIT21.0_allcells/map.results_part", array.index, "of", length(start), ".rds")
 76 | print(paste0("Saving as ", fn.tmp, "..."))
 77 | saveRDS(mapped, file = fn.tmp)
 78 | 
 79 | 
 80 | #################################################
 81 | ## Assemble mapping results
 82 | #################################################
 83 | fmap.files = list.files("./mapping_aged_fmap_AIT21.0/", pattern = "*.rds", full.names = T)
 84 | fmap.result = c()
 85 | 
 86 | for(i in fmap.files){
 87 |   
 88 |   tmp.fmap = readRDS(i)
 89 |   fmap.result = rbind(fmap.result, tmp.fmap$best.map.df)
 90 |   
 91 | }
 92 | 
 93 | head(fmap.result)
 94 | names(fmap.result)[-1] = paste0(names(fmap.result)[-1], "_AIT21.0")
 95 | ref.cl.df = tmp.fmap$cl.df
 96 | 
 97 | saveRDS(fmap.result, file = "mapping_aged_fmap_AIT21.0/map.result_bestmap.allcells.20230915.rds") ## Mapping results
 98 | save(ref.cl.df, file = "mapping_aged_fmap_AIT21.0/ref.cl.df.rda") ## Saves the annotation table for the reference that was used (ABC-WMB taxonomy)
 99 | 
100 | 


--------------------------------------------------------------------------------
/analysis_scripts/06_annotate_and_filter_from_second_cluster.R:
--------------------------------------------------------------------------------
  1 | ## Description/Goal:
  2 | ## Annotate and filter based on latest clustering
  3 | 
  4 | library(plyr)
  5 | library(dplyr)
  6 | library(scrattch.hicat)
  7 | options(stringsAsFactors = F)
  8 | 
  9 | 
 10 | #################################################
 11 | ## Load data
 12 | #################################################
 13 | ## Load annotations that already exist from ABC-MWB
 14 | load("r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda")
 15 | remove(cl.df.clean)
 16 | load("mapping_aged_fmap_AIT21.0_allcells/ref.cl.df.rda")
 17 | 
 18 | ## Load clustering results
 19 | load("./cluster_0916/merge.result_th300_mincells100.rda")
 20 | load("samp.dat_bothages_qcscore_20231208.rda")
 21 | 
 22 | cl = merge.result$cl
 23 | samp.dat.filtered = samp.dat[samp.dat$sample_id %in% names(cl),]
 24 | 
 25 | 
 26 | 
 27 | #################################################
 28 | ## Combine adult annotations with  cluster results
 29 | #################################################
 30 | ## Subset ABC-MWB results and label missing cells (assumed to be junk)
 31 | roi.keep = unique(samp.dat.filtered$roi)
 32 | 
 33 | cells.anno = intersect(samp.dat.filtered$sample_id, names(cl.clean))
 34 | cells.adult = samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "adult"]
 35 | cells.junk = setdiff(cells.adult, cells.anno)
 36 | 
 37 | anno.df.u19 = data.frame(sample_id = c(cells.anno, cells.junk))
 38 | anno.df.u19$cl.u19 = cl.clean[anno.df.u19$sample_id]
 39 | anno.df.u19$cl.u19[is.na(anno.df.u19$cl.u19)] = "missing.or.junk"
 40 | 
 41 | head(anno.df.u19)
 42 | tail(anno.df.u19)
 43 | 
 44 | ref.cl.df$cl = as.character(ref.cl.df$cl)
 45 | anno.df.u19 = left_join(anno.df.u19, ref.cl.df[,c("cl", "cluster_id_label", "supertype_label", "subclass_label", "class_label")], by = c("cl.u19" = "cl"))
 46 | anno.df.u19[is.na(anno.df.u19)] = "missing.or.junk"
 47 | 
 48 | head(anno.df.u19)
 49 | tail(anno.df.u19)
 50 | dim(anno.df.u19)
 51 | 
 52 | 
 53 | ## Add mapped aged labels
 54 | load("mapping_aged_fmap_AIT21.0_allcells//ref.cl.df.rda")
 55 | map.result = readRDS("mapping_aged_fmap_AIT21.0_allcells/map.result_bestmap.allcells.20231219.rds")
 56 | map.result =  left_join(map.result, ref.cl.df[,c("cl", "cluster_id_label", "supertype_label", "subclass_label", "class_label")], by = c("best.cl_AIT21.0" = "cl"))
 57 | names(map.result)[names(map.result) == "best.cl._AIT21.0"] = "cl.u19"
 58 | common = intersect(names(map.result), names(anno.df.u19))
 59 | map.result = map.result[map.result$sample_id %in% samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "aged"],]
 60 | 
 61 | anno.df.u19 = rbind(anno.df.u19[,common], map.result[,common])
 62 | 
 63 | 
 64 | ## Add cl to my data
 65 | row.names(samp.dat.filtered) = samp.dat.filtered$sample_id
 66 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id]
 67 | samp.dat.filtered = samp.dat.filtered[!is.na(samp.dat.filtered$cl),]
 68 | 
 69 | ## Add U19 labels
 70 | samp.dat.filtered = left_join(samp.dat.filtered, anno.df.u19)
 71 | 
 72 | cl.df  = samp.dat.filtered %>% dplyr::group_by(cl) %>% dplyr::summarize(ncell = n(), 
 73 |                                                                         med.gc = median(gene.counts.0),
 74 |                                                                         med.qc = median(qc.score),
 75 |                                                                         med.doub = median(doublet_score),
 76 |                                                                         age_prop = sum(age_cat == "aged") / n(),
 77 |                                                                         adult_prop = sum(age_cat == "adult") / n(),
 78 |                                                                         
 79 |                                                                         max.roi = names(which.max(table(roi))),
 80 |                                                                         max.roi.prop = (max(table(roi)) / n()),
 81 |                                                                         
 82 |                                                                         max.class = names(which.max(table(class_label))),
 83 |                                                                         max.class_2 = names(sort(table(class_label), decreasing = T))[2],
 84 |                                                                         max.class.prop = (max(table(class_label)) / n()),
 85 |                                                                         max.class.prop_2 = sort((table(class_label)) / n(), decreasing = T)[2],
 86 |                                                                         
 87 |                                                                         max.subclass = names(which.max(table(subclass_label))),
 88 |                                                                         max.subclass_2 = names(sort(table(subclass_label), decreasing = T))[2],
 89 |                                                                         max.subclass.prop = (max(table(subclass_label)) / n()),
 90 |                                                                         max.subclass.prop_2 = sort((table(subclass_label)) / n(), decreasing = T)[2],
 91 |                                                                         
 92 |                                                                         max.supertype = names(which.max(table(supertype_label))),
 93 |                                                                         max.supertype_2 = names(sort(table(supertype_label), decreasing = T))[2],
 94 |                                                                         max.supertype.prop = (max(table(supertype_label)) / n()),
 95 |                                                                         max.supertype.prop_2 = sort((table(supertype_label)) / n(), decreasing = T)[2],
 96 |                                                                         
 97 |                                                                         max.cluster = names(which.max(table(cluster_id_label))),
 98 |                                                                         max.cluster_2 = names(sort(table(cluster_id_label), decreasing = T))[2],
 99 |                                                                         max.cluster.prop = (max(table(cluster_id_label)) / n()),
100 |                                                                         max.cluster.prop_2 = sort((table(cluster_id_label)) / n(), decreasing = T)[2],
101 |                                                                         
102 |                                                                         max.lib.prop = (max(table(library_prep)) / n()))
103 | 
104 | 
105 | 
106 | hist(cl.df$max.class.prop)
107 | hist(cl.df$max.subclass.prop)
108 | 
109 | 
110 | #################################################
111 | ## Vis QC metrics
112 | #################################################
113 | library(ggplot2)
114 | library(gridExtra)
115 | 
116 | 
117 | ## Gene counts
118 | p2 = cl.df %>%
119 |   ggplot(aes(x = med.gc))+
120 |   geom_histogram(bins = 50)+
121 |   scale_x_log10()+
122 |   geom_vline(xintercept = 2000)+
123 |   geom_vline(xintercept = 3000, color = "red")+
124 |   geom_vline(xintercept = 5000, color = "blue")+
125 |   geom_vline(xintercept = 5500, color = "dark green")+
126 |   ggtitle("Median Gene Counts")+
127 |   facet_wrap(~max.class, ncol = 2, scales = "free_y")+
128 |   theme_bw()
129 | 
130 | p1 = cl.df %>%
131 |   ggplot(aes(x = med.gc))+
132 |   geom_histogram(bins = 50)+
133 |   scale_x_log10()+
134 |   geom_vline(xintercept = 2000)+
135 |   geom_vline(xintercept = 3000, color = "red")+
136 |   geom_vline(xintercept = 5000, color = "blue")+
137 |   geom_vline(xintercept = 5500, color = "dark green")+
138 |   ggtitle("Median Gene Counts")+
139 |   facet_wrap(~max.class, ncol = 2)+
140 |   theme_bw()
141 | 
142 | grid.arrange(p1, p2, nrow = 1)
143 | 
144 | 
145 | 
146 | ## QC Score
147 | p2 = cl.df %>%
148 |   ggplot(aes(x = med.qc))+
149 |   geom_histogram(bins = 50)+
150 |   # scale_x_log10()+
151 |   geom_vline(xintercept = 150)+
152 |   geom_vline(xintercept = 300, color = "red")+
153 |   ggtitle("Median QC Score")+
154 |   facet_wrap(~max.class, ncol = 2, scales = "free_y")+
155 |   theme_bw()
156 | 
157 | p1 = cl.df %>%
158 |   ggplot(aes(x = med.qc))+
159 |   geom_histogram(bins = 50)+
160 |   # scale_x_log10()+
161 |   geom_vline(xintercept = 150)+
162 |   geom_vline(xintercept = 300, color = "red")+
163 |   ggtitle("Median QC Score")+
164 |   facet_wrap(~max.class, ncol = 2)+
165 |   theme_bw()
166 | 
167 | grid.arrange(p1, p2, nrow = 1)
168 | 
169 | 
170 | 
171 | 
172 | #################################################
173 | ## Label LQ clusters
174 | #################################################
175 | ## Cluster-level QC and GC cutoffs for different classes
176 | tmp = c("Immune", "Vascular", "Astro-Epen")
177 | tmp.nn = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 2000 | cl.df$med.qc < 150)]
178 | tmp = c("OPC-Oligo")
179 | tmp.oligo = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 3000 | cl.df$med.qc < 150)]
180 | tmp = c("OB-CR Glut", "DG-IMN Glut", "OB-IMN GABA")
181 | tmp.imn = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 3000 | cl.df$med.qc < 150)]
182 | tmp = setdiff(unique(ref.cl.df$class_label), c("Immune", "Vascular", "OPC-Oligo", "Astro-Epen", "OB-CR Glut", "DG-IMN Glut", "OB-IMN GABA"))
183 | tmp.neuron = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 5500 | cl.df$med.qc < 300)]
184 | 
185 | cl.lq.qc = unique(c(tmp.nn, tmp.imn, tmp.neuron, tmp.oligo))
186 | 
187 | ## Clusters that are mostly mapping to "junk"
188 | cl.lq.junk = cl.df$cl[cl.df$max.subclass == "missing.or.junk" | cl.df$max.supertype == "missing.or.junk"]
189 | 
190 | ## Clusters that are biased towards a single donor
191 | cl.lq.donor = cl.df$cl[cl.df$max.donor.prop > 0.8]
192 | 
193 | ## Other manually annotated LQ clusters:
194 | cl.lq.manual = c("2108" #mapped to half neurons and half microglia
195 | )
196 | 
197 | ## Exclude off target supertypes
198 | load("r_objects/supertype.exclude.rda")
199 | cl.offtarget = cl.df$cl[cl.df$max.supertype %in% supertype.exclude]
200 | tmp = cl.df[cl.df$cl %in% cl.offtarget,]
201 | 
202 | ## Ambiguous mapping
203 | hist(cl.df$max.class.prop)
204 | tmp.cl.df = cl.df[cl.df$max.class.prop < 0.7,]
205 | cl.ambmappingjunk = tmp.cl.df$cl[tmp.cl.df$max.class_2 == "missing.or.junk" & tmp.cl.df$max.class.prop_2 > 0.3]
206 | tmp = cl.df[cl.df$cl %in% unique(c(cl.ambmappingjunk)),]
207 | 
208 | cl.lq = unique(c(cl.lq.qc, cl.lq.junk, cl.lq.donor, cl.lq.manual, cl.offtarget, cl.ambmappingjunk))
209 | 
210 | 
211 | #################################################
212 | ## Final filter
213 | #################################################
214 | cl.df$keep = T
215 | cl.df$keep[cl.df$cl %in% cl.lq] = F
216 | 
217 | sum(cl.df$keep) ## number of remaining clusters
218 | sum(cl.df$ncell) ## number of starting cells
219 | sum(cl.df$ncell[cl.df$keep == T]) ## number of cells remaining after new filters
220 | 
221 | cl.df.clean = cl.df[cl.df$keep == T,]
222 | 
223 | save(cl.df, file = "./cluster_0916/cl.df_v231221.rda")
224 | 
225 | ## Assign class
226 | cl.df.clean$class_label = cl.df.clean$max.class
227 | table(cl.df.clean$class_label)
228 | 
229 | ## Assign subclass
230 | cl.df.clean$subclass_label = cl.df.clean$max.subclass
231 | table(cl.df.clean$subclass_label)
232 | 
233 | ## Assign supertype
234 | cl.df.clean$supertype_label = cl.df.clean$max.supertype
235 | table(cl.df.clean$supertype_label)
236 | 
237 | 
238 | #################################################
239 | ## Finding hierarchy discrepencies - correct each manually
240 | #################################################
241 | load("mapping_aged_fmap_AIT21.0_allcells//ref.cl.df.rda")
242 | tmp = unique(cl.df.clean[,c("class_label", "subclass_label", "supertype_label")])
243 | tmp.ref = unique(ref.cl.df[,c("class_label", "subclass_label", "supertype_label")])
244 | setdiff(tmp, tmp.ref)
245 | 
246 | ##--> cl292
247 | cl.df.clean$class_label[cl.df.clean$cl == 292] = "MB Glut" #changing to MB Glut because its similar in prop to former (P Glut) and max ROI is also MB
248 | ##--> cl502
249 | cl.df.clean$supertype_label[cl.df.clean$cl == 502] = "RHP-COA Ndnf Gaba_6" #changing to top supertype of the matching subclass
250 | ##--> cl629
251 | cl.df.clean$supertype_label[cl.df.clean$cl == 629] = "STN-PSTN Pitx2 Glut_2" #changing to top supertype of the matching subclass
252 | ##--> cl3678
253 | cl.df.clean$supertype_label[cl.df.clean$cl == 3678] = "L4/5 IT CTX Glut_1" #changing to top supertype of the matching subclass
254 | ##--> cl3415
255 | cl.df.clean$supertype_label[cl.df.clean$cl == 3415] = "OB-STR-CTX Inh IMN_1" #changing to top supertype of the matching subclass; all aged cells in this cluster are also mapping to OB rather than DIR
256 | 
257 | tmp = unique(cl.df.clean[,c("class_label", "subclass_label", "supertype_label")])
258 | tmp.ref = unique(ref.cl.df[,c("class_label", "subclass_label", "supertype_label")])
259 | setdiff(tmp, tmp.ref) #should be empty now
260 | 
261 | ## Correcting order
262 | class_label = intersect(unique(ref.cl.df$class_label), unique(cl.df.clean$class_label))
263 | class_id = 1:length(class_label)
264 | class.df = data.frame(class_label = class_label, class_id = class_id)
265 | 
266 | subclass_label = intersect(unique(ref.cl.df$subclass_label), unique(cl.df.clean$subclass_label))
267 | subclass_id = 1:length(subclass_label)
268 | subclass.df = data.frame(subclass_label = subclass_label, subclass_id = subclass_id)
269 | 
270 | supertype_label = intersect(unique(ref.cl.df$supertype_label), unique(cl.df.clean$supertype_label))
271 | supertype_id = 1:length(supertype_label)
272 | supertype.df = data.frame(supertype_label = supertype_label, supertype_id = supertype_id)
273 | 
274 | cl.df.clean = left_join(cl.df.clean, class.df)
275 | cl.df.clean = left_join(cl.df.clean, subclass.df)
276 | cl.df.clean = left_join(cl.df.clean, supertype.df)
277 | 
278 | 
279 | ## Add colors
280 | class.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_class_colors.csv")
281 | subclass.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_subclass_colors.csv")
282 | supertype.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_supertype_colors.csv")
283 | 
284 | cl.df.clean = left_join(cl.df.clean, class.colors)
285 | cl.df.clean = left_join(cl.df.clean, subclass.colors)
286 | cl.df.clean = left_join(cl.df.clean, supertype.colors)
287 | 
288 | 
289 | ## Update cl numbers
290 | cl.df.clean = cl.df.clean[order(cl.df.clean$supertype_id, cl.df.clean$subclass_id, cl.df.clean$class_id),]
291 | cl.df.clean$cl_0916 = cl.df.clean$cl
292 | cl.df.clean$cl = 1:nrow(cl.df.clean)
293 | cl.df.clean$cluster_label = paste0(cl.df.clean$cl, "_", cl.df.clean$supertype_label)
294 | 
295 | save(cl.df.clean, file = "cluster_0916/cl.df.clean_v231221.rda")
296 | 
297 | 
298 | #################################################
299 | ## Make new annotated metadata object (anno.df.clean)
300 | #################################################
301 | ## Load clustering results and new cl.df.clean
302 | load("./cluster_0916/merge.result_th300_mincells100.rda")
303 | load("./cluster_0916/cl.df.clean_v231221.rda")
304 | 
305 | ## Load original cell metadata file and key for broad rois
306 | load("samp.dat_bothages_qcscore_20231208.rda")
307 | load("r_objects/broad.roi.key.rda")
308 | 
309 | ## Add newest cluster label to metadata file
310 | cl = merge.result$cl
311 | select.cells = names(cl)
312 | row.names(samp.dat) = samp.dat$sample_id
313 | samp.dat.filtered = samp.dat[select.cells,]
314 | samp.dat.filtered$cl_0916 = cl[row.names(samp.dat.filtered)]
315 | 
316 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$cl_0916 %in% cl.df.clean$cl_0916,]
317 | anno.df.clean = left_join(anno.df.clean, cl.df.clean)
318 | anno.df.clean = left_join(anno.df.clean, broad.key)
319 | 
320 | save(anno.df.clean, file = "anno.df.clean_cluster0916_v231221_20231221.rda") 
321 | 
322 | 


--------------------------------------------------------------------------------
/analysis_scripts/07_UMAP.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Compute UMAP
 3 | 
 4 | library(scrattch.bigcat)
 5 | library(scrattch.hicat)
 6 | library(MatrixGenerics)
 7 | 
 8 | #################################################
 9 | ## Load data
10 | #################################################
11 | load("cluster_0916/merge.result_th300_mincells100.rda") ## clustering results
12 | load("cluster_0916/norm.dat_cluster0916_cl100.rda") ## subsampled cell by gene matrix
13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat
14 | load("anno.df.clean_cluster0916_v1020_20221020.rda") ## metadata
15 | 
16 | ## Markers
17 | cl = merge.result$cl
18 | markers = merge.result$markers
19 | select.markers = intersect(markers, row.names(norm.dat))
20 | load("genotype.genes.rda")
21 | load("/allen/programs/celltypes/workgroups/rnaseqanalysis/yzizhen/10X_analysis/sex.genes.rda")
22 | select.markers = setdiff(select.markers, sex.genes)
23 | select.markers = setdiff(select.markers, genotype.genes)
24 | 
25 | 
26 | ## Rm.eigen
27 | rm.eigen = as.matrix(setNames(log2(anno.df.clean$gene.counts.0), anno.df.clean$sample_id), ncol=1)
28 | rownames(rm.eigen) = anno.df.clean$sample_id
29 | colnames(rm.eigen) = "log2Genes"
30 | save(rm.eigen, file = "rm.eigen.rda") ## save for later use
31 | 
32 | #################################################
33 | ## PCA
34 | #################################################
35 | set.seed(123)
36 | 
37 | ## PCA all
38 | select.cells = anno.df.clean$sample_id
39 | rd.result = rd_PCA_big(big.dat, norm.dat[select.markers, ], select.cells = select.cells, max.dim=150, method="elbow", mc.cores=20)
40 | rd.dat = filter_RD(rd.result$rd.dat, rm.eigen, 0.7)
41 | save(rd.dat, file = "./cluster_0916//rd.result_refdat_2022-10-21.rda") ## save PCA results as R object
42 | write.csv(rd.dat, file = "./cluster_0916/rd.result_refdat_2022-10-21.csv") ## save PCA results as csv which will be used for UMAP computation in Python
43 | 
44 | 
45 | #####################################
46 | ## UMAP
47 | #####################################
48 | ## These lines of script call run_umap.py script for UMAP computation
49 | tmp.in = "./cluster_0916/rd.result_refdat_2022-10-21.csv"
50 | tmp.out = "./cluster_0916/umap.2d_global_k10_d0.4_2022-10-21.csv"
51 | tmp.system = paste0("/functions/run_umap.py -i ", tmp.in, " -o ", tmp.out, " -k 10 -c 2 -d 0.4")
52 | system(tmp.system)


--------------------------------------------------------------------------------
/analysis_scripts/08a_subcluster_immune.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Extract only immune cells and cluster them separately
 3 | 
 4 | 
 5 | library(dplyr)
 6 | library(scrattch.hicat)
 7 | library(scrattch.bigcat)
 8 | 
 9 | #################################################
10 | ## Load data
11 | #################################################
12 | load("anno.df.clean_cluster0916_v1020_20221020.rda")
13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat
14 | 
15 | ## Create smaller cell-by-gene matrix with only immune cells
16 | select.cells = anno.df.clean$sample_id[anno.df.clean$class_label == "Immune"]
17 | norm.dat = get_logNormal(big.dat, select.cells)
18 | 
19 | 
20 | 
21 | #################################################
22 | ## Clustering
23 | #################################################
24 | set.seed(123)
25 | 
26 | ## Cluster
27 | resulti = iter_clust(norm.dat = norm.dat[,select.cells], select.cells = select.cells, "./data_immune/",
28 |                      max.cl.size = 200, split.size = 50, verbose = 1)
29 | 
30 | ## Merge
31 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50, padj.th = 0.01, min.genes = 3) ## Looser settings
32 | merge.result = merge_cl(norm.dat=norm.dat, cl=resulti$cl, rd.dat.t=norm.dat[resulti$markers,select.cells], de.param=de.param, verbose=TRUE) ## 10
33 | 
34 | ## Save for later
35 | save(resulti, file = "./data_immune/resulti.rda")
36 | save(merge.result, file = "./data_immune/merge.result.rda")
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/analysis_scripts/08b_subcluster_tanycytes_ependymal.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Extract only tanycytes and ependymal cells and cluster them separately
 3 | 
 4 | 
 5 | library(dplyr)
 6 | library(scrattch.hicat)
 7 | library(scrattch.bigcat)
 8 | 
 9 | #################################################
10 | ## Load data
11 | #################################################
12 | load("anno.df.clean_cluster0916_v1020_20221020.rda")
13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat
14 | 
15 | ## Create smaller cell-by-gene matrix with only tanycytes and ependymal cells
16 | select.cells = anno.df.clean$sample_id[anno.df.clean$subclass_label %in% c("Tanycyte", "Epenedymal")]
17 | norm.dat = get_logNormal(big.dat, select.cells)
18 | 
19 | 
20 | #################################################
21 | ## Clustering
22 | #################################################
23 | set.seed(123)
24 | 
25 | ## Cluster
26 | resulti = iter_clust(norm.dat = norm.dat[,select.cells], select.cells = select.cells, "./data_tanepen/",
27 |                      max.cl.size = 200, split.size = 50, verbose = 1)
28 | 
29 | ## Merge
30 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50, padj.th = 0.01, min.genes = 3) ## Looser settings
31 | merge.result = merge_cl(norm.dat=norm.dat, cl=resulti$cl, rd.dat.t=norm.dat[resulti$markers,select.cells], de.param=de.param, verbose=TRUE) ## 10
32 | 
33 | ## Save for later
34 | save(resulti, file = "./data_tanepen/resulti.rda")
35 | save(merge.result, file = "./data_tanepen/merge.result.rda")


--------------------------------------------------------------------------------
/analysis_scripts/09a_degenes_age_mast_subclass.R:
--------------------------------------------------------------------------------
  1 | ## Description/Goal:
  2 | ## Running MAST at the subclass level to find age-DE genes
  3 | 
  4 | library(MAST)
  5 | library(scrattch.hicat)
  6 | library(dplyr)
  7 | library(ggplot2)
  8 | library(data.table)
  9 | library(bigstatsr)
 10 | library(scrattch.bigcat)
 11 | 
 12 | 
 13 | ################################################################################
 14 | ## Load data
 15 | ################################################################################
 16 | ## Cleaned metadata data file
 17 | load("anno.df.clean_cluster0916_v231221_20240206.rda")
 18 | 
 19 | ## Scrattch.bigcat compatible file-backed-matrix
 20 | load("big.dat_Aging.16ROI.V3.20231208.rda")
 21 | 
 22 | 
 23 | 
 24 | ################################################################################
 25 | ## Deciding which subclasses to include for analysis
 26 | # ################################################################################
 27 | # -- Criteria:
 28 | # -- ncell > 100 for each age_cat
 29 | 
 30 | tmp.age.count = c(table(anno.df.clean$subclass_label[anno.df.clean$age_cat == "aged"]))
 31 | tmp.adult.count = c(table(anno.df.clean$subclass_label[anno.df.clean$age_cat == "adult"]))
 32 | tmp.age.count2 = tmp.age.count[tmp.age.count > 100]
 33 | tmp.adult.count2 = tmp.adult.count[tmp.adult.count > 100]
 34 | 
 35 | tmp.keep = intersect(names(tmp.age.count2), names(tmp.adult.count2))
 36 | 
 37 | check.remove = setdiff(unique(anno.df.clean$subclass_label), tmp.keep)
 38 | 
 39 | subclass.keep = tmp.keep
 40 | 
 41 | save(subclass.keep, file = "subclass.keep_cluster0916_v20231221.rda")
 42 | 
 43 | 
 44 | 
 45 | ################################################################################
 46 | ## MAST per subclass (using parallel function)
 47 | ################################################################################
 48 | source("functions/degenes_age_mast_simplified_qcscore.R") ## custom function for running MAST in parallel
 49 | setwd("MAST_simplified/")
 50 | 
 51 | 
 52 | de.gene.results = get_age_degenes_mast_big_p(anno.df.clean, big.dat, groups = subclass.keep, split.by = "subclass_label", save.as.tmp = F,
 53 |                                     maxcells = 2000, freq_expressed = 0.1, mc.cores = 15)
 54 | 
 55 | saveRDS(de.gene.results, file = "de.gene.result_cluster0916_v231221_subclass.rds")
 56 | 
 57 | 
 58 | 
 59 | ################################################################################
 60 | ## Assemble MAST results
 61 | ################################################################################
 62 | ## Load results
 63 | load("anno.df.clean_cluster0916_v231221_20240206.rda")
 64 | results = readRDS("MAST_simplified/de.gene.result_cluster0916_v231221_subclass.rds")
 65 | 
 66 | ## Exclude genes
 67 | load("r_objects/sex.genes.rda") #sex-biased genes
 68 | load("r_objects/genotype.genes.rda") #genotype-biased genes
 69 | load("r_objects/iegenes.rda") #immediate early genes
 70 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes)
 71 | 
 72 | ## Cutoffs for age-DE genes
 73 | fcthreshold = 1
 74 | pthreshold = 0.01
 75 | 
 76 | 
 77 | ## Create summarized objects with results
 78 | de.gene.results = lapply(results, function(x) x[["result"]])
 79 | 
 80 | for(i in names(de.gene.results)){
 81 |   de.gene.results[[i]]$subclass = i
 82 | }
 83 | de.results.long0 = do.call(rbind, de.gene.results)
 84 | saveRDS(de.results.long0, file = paste0(opath, "de.results.long0_allresults.rds"))
 85 | 
 86 | 
 87 | de.gene.results = lapply(de.gene.results, function(x) x[x$padjust < pthreshold & abs(x$coef) > fcthreshold,])
 88 | de.gene.results = lapply(de.gene.results, function(x) x[order(x$coef, decreasing = T),])
 89 | de.gene.results = lapply(de.gene.results, function(x) x[!x$primerid %in% genes.exclude,]) ## excluding sex, genotype, iegenes
 90 | sort(unlist(lapply(de.gene.results, nrow)))
 91 | subclass.results = as.data.frame(sort(unlist(lapply(de.gene.results, nrow))))
 92 | names(subclass.results) = "de.gene.count"
 93 | subclass.results$subclass = row.names(subclass.results)
 94 | 
 95 | tmp = unlist(lapply(results, function(x) length(x$select.cells)))
 96 | names(tmp) = names(results)
 97 | subclass.results$ncell.tested = tmp[subclass.results$subclass]
 98 | 
 99 | saveRDS(subclass.results, file = paste0(opath, "subclasswide_degene_summary_pcut", pthreshold, "_fc", fcthreshold,  "_excludedgenes.rds"))
100 | 
101 | de.results.long = de.results.long0[de.results.long0$padjust < pthreshold & abs(de.results.long0$coef) > fcthreshold,]
102 | saveRDS(de.results.long, file = paste0(opath, "de.results.long_pcut", pthreshold, "_fc", fcthreshold,  ".rds"))
103 | 
104 | 
105 | ################################################################################
106 | ## Visualize results
107 | ################################################################################
108 | opath = "MAST_simplified"
109 | 
110 | tmp.level1 = unique(anno.df.clean[,c("class_label", "class_id")])
111 | tmp.level1 = tmp.level1[order(tmp.level1$class_id),]
112 | level1.order = tmp.level1$class_label
113 | tmp = unique(anno.df.clean[,c("subclass_label", "class_color", "class_label")])
114 | level1.color = tmp$class_color
115 | names(level1.color) = tmp$subclass_label
116 | 
117 | subclass.results1 = left_join(subclass.results, tmp, by = c("subclass" = "subclass_label"))
118 | row.names(subclass.results1) = subclass.results1$subclass
119 | # subclass.results1$subclass_color = level1.color[subclass.results1$subclass]
120 | subclass.results1 = subclass.results1[order(subclass.results1$subclass),]
121 | subclass.results1$class_label = factor(subclass.results1$class_label, levels = level1.order)
122 | subclass.results1 = subclass.results1[order(subclass.results1$class_label),]
123 | subclass.results1$subclass = factor(subclass.results1$subclass, levels = subclass.results1$subclass)
124 | 
125 | ## Barplot of nDEgene count
126 | p1 = subclass.results1 %>%
127 |   ggplot(aes(y = de.gene.count, x = subclass, fill = subclass))+
128 |   geom_bar(stat = "identity", width = 0.8)+
129 |   scale_fill_manual(values = level1.color)+
130 |   theme_classic()+
131 |   theme(legend.position = "none")+
132 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5))
133 | 
134 | 
135 | de.results.long$subclass = factor(de.results.long$subclass, levels = subclass.results1$subclass)
136 | de.results.long$direction = "positive"
137 | de.results.long$direction[de.results.long$coef < 0] = "negative"
138 | de.results.long$abs.coef = abs(de.results.long$coef)
139 | 
140 | 
141 | ## Dot plot of coeficients
142 | p2 = de.results.long %>%
143 |   ggplot(aes(x = subclass, y = abs.coef, color = subclass, shape = direction))+
144 |   geom_jitter(size = 0.8, width = 0.05, alpha = 0.8)+
145 |   scale_color_manual(values = level1.color)+
146 |   scale_shape_manual(values = c(4, 1))+
147 |   theme_classic()+
148 |   theme(legend.position = "none")+
149 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5))
150 | 
151 | p4 = de.results.long %>%
152 |   ggplot(aes(x = subclass, y = -log10(padjust), color = subclass, size = 5))+
153 |   geom_jitter(size = 0.5, width = 0.1)+
154 |   scale_color_manual(values = level1.color)+
155 |   theme_bw()+
156 |   theme(legend.position = "none")+
157 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5))
158 | 
159 | 
160 | p3 = subclass.results1 %>%
161 |   ggplot(aes(y = de.gene.count, x = subclass, fill = subclass))+
162 |   geom_bar(stat = "identity", width = 0.8)+
163 |   scale_fill_manual(values = level1.color)+
164 |   geom_jitter(data = de.results.long, aes(x = subclass, y = -30*abs.coef, color = subclass, shape = direction), size = 0.8, width = 0.05, alpha = 0.8)+
165 |   scale_shape_manual(values = c(4, 1))+
166 |   scale_color_manual(values = level1.color)+
167 |   theme_classic()+
168 |   theme(legend.position = "none")+
169 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5))
170 | 
171 | 
172 | ggsave(p1, filename = paste0(opath, "barplot_ndegene_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5)
173 | ggsave(p2, filename = paste0(opath, "dotplot_signifcoef_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5)
174 | ggsave(p4, filename = paste0(opath, "dotplot_signifpval_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5)
175 | ggsave(p3, filename = paste0(opath, "combdotbar_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5)
176 | 


--------------------------------------------------------------------------------
/analysis_scripts/09b_degenes_age_mast_supertype_nn.R:
--------------------------------------------------------------------------------
  1 | ## Description/Goal:
  2 | ## Running MAST at the supertype level in non-neuronals to find age-DE genes
  3 | 
  4 | library(MAST)
  5 | library(scrattch.hicat)
  6 | library(dplyr)
  7 | library(ggplot2)
  8 | library(data.table)
  9 | library(bigstatsr)
 10 | library(scrattch.bigcat)
 11 | 
 12 | 
 13 | ################################################################################
 14 | ## Load data
 15 | ################################################################################
 16 | ## Cleaned metadata data file
 17 | load("anno.df.clean_cluster0916_v231221_20240206.rda")
 18 | 
 19 | ## Loading cell by gene matrix in dgC sparse matrix format (not included in github)
 20 | load("norm.dat_objects/nonneuronal-imn/norm.dat.rda")
 21 | 
 22 | 
 23 | ################################################################################
 24 | ## Deciding which subclasses to include for analysis
 25 | # ################################################################################
 26 | anno.df.select = anno.df.clean[grepl("NN", anno.df.clean$supertype_label),]
 27 | tmp = as.data.frame(table(anno.df.select[,c("supertype_label", "age_cat")]))
 28 | tmp = tmp[tmp$Freq > 100,]
 29 | supertype.keep = names(table(tmp$supertype_label)[table(tmp$supertype_label) == 2])
 30 | common = intersect(colnames(norm.dat), anno.df.clean$sample_id)
 31 | anno.df.select = anno.df.clean[anno.df.clean$sample_id %in% common,]
 32 | 
 33 | 
 34 | ################################################################################
 35 | ## MAST per supertype (using parallel function)
 36 | ################################################################################
 37 | source("functions/degenes_age_mast_simplified_qcscore.R") ## custom function for running MAST in parallel
 38 | setwd("MAST_nnsupertype/")
 39 | 
 40 | de.gene.results = get_age_degenes_mast_p(anno.df.select, norm.dat, groups = supertype.keep, split.by = "supertype_label", save.as.tmp = F,
 41 |                                              maxcells = 2000, freq_expressed = 0.1, mc.cores = 10)
 42 | 
 43 | saveRDS(de.gene.results, file = "de.gene.result_cluster0916_v231221_nnsupertype.rds")
 44 | 
 45 | 
 46 | ################################################################################
 47 | ## Assemble MAST results
 48 | ################################################################################
 49 | opath = "MAST_nnsupertype"
 50 | 
 51 | ## Load results
 52 | load("anno.df.clean_cluster0916_v231221_20240206.rda")
 53 | results = readRDS("MAST_nnsupertype/de.gene.result_cluster0916_v231221_nnsupertype.rds")
 54 | 
 55 | ## Exclude genes
 56 | load("r_objects/sex.genes.rda") #sex-biased genes
 57 | load("r_objects/genotype.genes.rda") #genotype-biased genes
 58 | load("r_objects/iegenes.rda") #immediate early genes
 59 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes)
 60 | 
 61 | ## Cutoffs for age-DE genes
 62 | fcthreshold = 1
 63 | pthreshold = 0.01
 64 | 
 65 | ## Assemble results
 66 | de.gene.results = lapply(results, function(x) x[["result"]])
 67 | 
 68 | for(i in names(de.gene.results)){
 69 |   de.gene.results[[i]]$subclass = i
 70 | }
 71 | de.results.long0 = do.call(rbind, de.gene.results)
 72 | saveRDS(de.results.long0, file = paste0(opath, "de.results.long0_allresults.rds"))
 73 | 
 74 | de.gene.results = lapply(de.gene.results, function(x) x[x$padjust < pthreshold & abs(x$coef) > fcthreshold,])
 75 | de.gene.results = lapply(de.gene.results, function(x) x[order(x$coef, decreasing = T),])
 76 | de.gene.results = lapply(de.gene.results, function(x) x[!x$primerid %in% genes.exclude,]) ## excluding sex, genotype, iegenes
 77 | sort(unlist(lapply(de.gene.results, nrow)))
 78 | subclass.results = as.data.frame(sort(unlist(lapply(de.gene.results, nrow))))
 79 | names(subclass.results) = "de.gene.count"
 80 | subclass.results$subclass = row.names(subclass.results)
 81 | 
 82 | tmp = unlist(lapply(results, function(x) length(x$select.cells)))
 83 | names(tmp) = names(results)
 84 | subclass.results$ncell.tested = tmp[subclass.results$subclass]
 85 | 
 86 | saveRDS(subclass.results, file = paste0(opath, "supertype_degene_summary_pcut", pthreshold, "_fc", fcthreshold,  "_excludegenes.rds"))
 87 | 
 88 | de.results.long = de.results.long0[de.results.long0$padjust < pthreshold & abs(de.results.long0$coef) > fcthreshold,]
 89 | saveRDS(de.results.long, file = paste0(opath, "de.results.long_pcut", pthreshold, "_fc", fcthreshold,  ".rds"))
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 


--------------------------------------------------------------------------------
/analysis_scripts/10a_gsea_gprofiler_subclass.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Gene set enrichment analysis with gprofiler2 at the subclass level
 3 | 
 4 | library(gprofiler2)
 5 | library(dplyr)
 6 | 
 7 | #################################################
 8 | ## Run gprofiler
 9 | #################################################
10 | ## Load in age-DE genes from MAST
11 | de.results = readRDS("MAST_simplified/de.gene.result_cluster0916_v231221_subclass.rds")
12 | de.results = lapply(de.results, function(x) x[["result"]])
13 | 
14 | ## Remove certain genes
15 | load("sex.genes.rda")
16 | load("genotype.genes.rda")
17 | load("iegenes.rda")
18 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes)
19 | 
20 | de.results = lapply(de.results, function(x) x[!x$primerid %in% genes.exclude,])
21 | 
22 | ## Signif cutoffs
23 | pval.cut = 0.01
24 | logfc.cut = 1
25 | 
26 | ## Positive only
27 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef > 0,])
28 | tmp = unlist(lapply(de.genes, nrow))
29 | de.genes = de.genes[names(tmp[tmp >= 3])]
30 | gp.result.pos = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01))
31 | 
32 | 
33 | ## Negative only
34 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef < 0,])
35 | tmp = unlist(lapply(de.genes, nrow))
36 | de.genes = de.genes[names(tmp[tmp >= 3])]
37 | gp.result.neg = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01))
38 | 
39 | 
40 | save(gp.result.pos, gp.result.neg, file = "gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda")
41 | 
42 | 
43 | 
44 | #################################################
45 | ## Consolidate results
46 | #################################################
47 | load("gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda")
48 | 
49 | for(i in names(gp.result.pos)){
50 |   print(i)
51 |   tmp = gp.result.pos[[i]]$result
52 |   if(is.null(tmp)){
53 |     gp.result.pos[[i]] = NULL
54 |     next
55 |   }
56 |   tmp$subclass = i
57 |   tmp$direction = "pos"
58 |   gp.result.pos[[i]] = tmp
59 | }
60 | 
61 | for(i in names(gp.result.neg)){
62 |   print(i)
63 |   tmp = gp.result.neg[[i]]$result
64 |   if(is.null(tmp)){
65 |     gp.result.neg[[i]] = NULL
66 |     next
67 |   }
68 |   tmp$subclass = i
69 |   tmp$direction = "neg"
70 |   gp.result.neg[[i]] = tmp
71 | }
72 | 
73 | tmp.pos = do.call(rbind, gp.result.pos)
74 | tmp.neg = do.call(rbind, gp.result.neg)
75 | 
76 | go.results.all = rbind(data.frame(tmp.pos), data.frame(tmp.neg))
77 | go.results.all = apply(go.results.all,2,as.character)
78 | 
79 | write.csv(go.results.all, file = "gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv")


--------------------------------------------------------------------------------
/analysis_scripts/10b_gsea_gprofiler_nnsupertype.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Gene set enrichment analysis with gprofiler2 at the subclass level
 3 | 
 4 | library(gprofiler2)
 5 | library(dplyr)
 6 | 
 7 | 
 8 | #################################################
 9 | ## Run gprofiler
10 | #################################################
11 | ## Load in age-DE genes from MAST
12 | de.results = readRDS("MAST_nnsupertype/de.gene.result_cluster0916_v231221_nnsupertype.rds")
13 | de.results = lapply(de.results, function(x) x[["result"]])
14 | 
15 | ## Remove certain genes
16 | load("sex.genes.rda")
17 | load("genotype.genes.rda")
18 | load("iegenes.rda")
19 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes)
20 | test = lapply(de.results, function(x) x[!x$primerid %in% genes.exclude,])
21 | unlist(lapply(de.results, nrow))
22 | unlist(lapply(test, nrow))
23 | de.results = test
24 | 
25 | ## Signif cutoffs
26 | pval.cut = 0.01
27 | logfc.cut = 1
28 | 
29 | ## Positive only
30 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef > 0,])
31 | tmp = unlist(lapply(de.genes, nrow))
32 | de.genes = de.genes[names(tmp[tmp >= 3])]
33 | gp.result.pos = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01))
34 | 
35 | 
36 | ## Negative only
37 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef < 0,])
38 | tmp = unlist(lapply(de.genes, nrow))
39 | de.genes = de.genes[names(tmp[tmp >= 3])]
40 | gp.result.neg = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01))
41 | 
42 | 
43 | save(gp.result.pos, gp.result.neg, file = "gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda")
44 | 
45 | 
46 | 
47 | #################################################
48 | ## Consolidate results
49 | #################################################
50 | ## Check and add columns
51 | load("gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda")
52 | 
53 | for(i in names(gp.result.pos)){
54 |   print(i)
55 |   tmp = gp.result.pos[[i]]$result
56 |   if(is.null(tmp)){
57 |     gp.result.pos[[i]] = NULL
58 |     next
59 |   }
60 |   tmp$subclass = i
61 |   tmp$direction = "pos"
62 |   gp.result.pos[[i]] = tmp
63 | }
64 | 
65 | 
66 | 
67 | for(i in names(gp.result.neg)){
68 |   print(i)
69 |   tmp = gp.result.neg[[i]]$result
70 |   if(is.null(tmp)){
71 |     gp.result.neg[[i]] = NULL
72 |     next
73 |   }
74 |   tmp$subclass = i
75 |   tmp$direction = "neg"
76 |   gp.result.neg[[i]] = tmp
77 | }
78 | 
79 | tmp.pos = do.call(rbind, gp.result.pos)
80 | tmp.neg = do.call(rbind, gp.result.neg)
81 | 
82 | go.results.all = rbind(data.frame(tmp.pos), data.frame(tmp.neg))
83 | go.results.all = apply(go.results.all,2,as.character)
84 | 
85 | write.csv(go.results.all, file = "gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv")
86 | 


--------------------------------------------------------------------------------
/analysis_scripts/11_GO_matrix_gprofiler.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Compute and plot gene enrichment matrix
 3 | 
 4 | library(pheatmap)
 5 | library(RColorBrewer)
 6 | 
 7 | #################################################
 8 | ## Load data and combine
 9 | #################################################
10 | go.results.all = read.csv("gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv")
11 | go.results.nn = read.csv("gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv")
12 | go.results.all = go.results.all[!grepl(" NN", go.results.all$subclass),]
13 | go.results.all0 = rbind(go.results.all, go.results.nn)
14 | 
15 | 
16 | #################################################
17 | ## Filter
18 | #################################################
19 | ## Which data sources to use
20 | source.keep = c("GO:MF", "GO:BP", "GO:CC")
21 | go.results.all = go.results.all0[go.results.all0$source %in% source.keep,]
22 | 
23 | ## Which terms/subclasses/supertypes to keep
24 | termfreq = 2 #filters terms with frequency less than this
25 | subclassfreq = 6 #filters celltypes with frequency less than this
26 | goterm.keep = names(table(go.results.all$term_name)[table(go.results.all$term_name) >= termfreq])
27 | go.results.all = go.results.all[go.results.all$term_name %in% goterm.keep,]
28 | subclass.keep = names(table(go.results.all$subclass)[table(go.results.all$subclass) >= subclassfreq])
29 | go.results.all = go.results.all[go.results.all$subclass %in% subclass.keep,]
30 | setdiff(unique(go.results.all0$subclass), subclass.keep) # check subclasses that got filtered out
31 | 
32 | ## Filter out broad terms
33 | nterm_cutoff = 2000 #filters terms with greater than this number of gene members (to filter out terms that are too broad)
34 | terms.tmp = unique(go.results.all[,c("term_name", "term_size")])
35 | unique(go.results.all$term_name[go.results.all$term_size > nterm_cutoff]) #which terms would be thrown out
36 | go.results.all = go.results.all[go.results.all$term_size <= nterm_cutoff,]
37 | 
38 | ## Final counts of terms/subclasses
39 | length(unique(go.results.all$term_name))
40 | length(unique(go.results.all$subclass))
41 | 
42 | go.mat = matrix(0, nrow = length(unique(go.results.all$subclass)), ncol = length(unique(go.results.all$term_name)))
43 | row.names(go.mat) = unique(go.results.all$subclass)
44 | colnames(go.mat) = unique(go.results.all$term_name)
45 | 
46 | for(n in 1:nrow(go.results.all)){
47 |   x = go.results.all$subclass[n]
48 |   y = go.results.all$term_name[n]
49 |   tmp = -log10(go.results.all$p_value[n])
50 |   if(go.results.all$direction[n] == "pos"){
51 |     go.mat[x,y] = tmp
52 |   }
53 |   if(go.results.all$direction[n] == "neg"){
54 |     go.mat[x,y] = -tmp
55 |   }
56 | }
57 | 
58 | 
59 | #################################################
60 | ## Plot
61 | #################################################
62 | ## Make objects for colors
63 | load("col.broi.rda")
64 | tmp = "#808080"
65 | names(tmp) = "NN"
66 | col.broi = c(col.broi, tmp)
67 | subclass.roi.summary = anno.df.clean %>% group_by(subclass_label) %>% summarize(max.broad.roi = names(which.max(table(broad_roi))))
68 | subclass.roi.summary = subclass.roi.summary[!grepl("NN", subclass.roi.summary$subclass_label),]
69 | tmp = data.frame(subclass_label = unique(go.results.nn$subclass), max.broad.roi = "NN")
70 | subclass.roi.summary = rbind(subclass.roi.summary, tmp)
71 | subclass.roi.summary = subclass.roi.summary[subclass.roi.summary$subclass_label %in% subclass.keep,]
72 | 
73 | anno.df = data.frame(max.broad.roi = subclass.roi.summary$max.broad.roi)
74 | row.names(anno.df) = subclass.roi.summary$subclass_label
75 | 
76 | anno.list = list(max.broad.roi = col.broi)
77 | 
78 | 
79 | 
80 | ## Setting limit on max score to make colors easier to interpret
81 | max = 10
82 | go.mat[go.mat > max] = max
83 | go.mat[go.mat < (-1*max)] = (-1*max)
84 | 
85 | 
86 | ## Plot heatmap
87 | pheatmap(go.mat, fontsize_row = 6, fontsize_col = 6, #scale = "row",
88 |          annotation_row = anno.df, annotation_colors = anno.list,
89 |          filename = "figures_go/gprofiler_gomat.pdf", width = 15, height = 10, show_colnames = T,
90 |          color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
91 |          clustering_method = "ward.D", ##complete, ward.D, and ward.D2 are usually the best ones
92 |          border_color = NA)


--------------------------------------------------------------------------------
/analysis_scripts/12_augur.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Run augur on subclasses
 3 | 
 4 | library(Augur)
 5 | 
 6 | #################################################
 7 | ## Modify select_variance function in Augur
 8 | #################################################
 9 | library(godmode) ## this package used for running modified version of select_variance()
10 | source("cust_functions/select_variance_MOD.R")
11 | godmode:::assignAnywhere("select_variance", select_variance2)
12 | 
13 | #################################################
14 | ## Load data and combine
15 | #################################################
16 | load("anno.df.clean_cluster0916_v231221_20231221.rda")
17 | load("subclass.keep_cluster0916_v20231221.rda") ## subclasses included in MAST analysis
18 | load("big.dat_Aging.16ROI.V3.20231208.rda")
19 | load("cluster_0916/norm.dat_cluster0916_v20231221_subclass_ss1200.rda")
20 | 
21 | #################################################
22 | ## Run Augur
23 | #################################################
24 | anno.df.select = anno.df.clean[anno.df.clean$sample_id %in% colnames(norm.dat),]
25 | anno.df.select = anno.df.select[anno.df.select$subclass_label %in% subclass.keep,]
26 | anno.df.select$subclass_label = as.character(anno.df.select$subclass_label)
27 | min(table(anno.df.select$subclass_label, anno.df.select$age_cat))
28 | 
29 | meta.df = anno.df.select[,c("subclass_label", "age_cat")]
30 | row.names(meta.df) = anno.df.select$sample_id
31 | 
32 | norm.dat.select = norm.dat[,row.names(meta.df)]
33 | 
34 | augur = calculate_auc(norm.dat.select, meta.df, cell_type_col = "subclass_label", label_col = "age_cat", 
35 |                       feature_perc = 0.8, var_quantile = 0.9, subsample_size = 200,
36 |                       n_threads = 25)
37 | 
38 | saveRDS(augur, file = "augur/augur.result_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds")
39 | 
40 | #################################################
41 | ## Check results and save smaller objects for later
42 | #################################################
43 | augur = readRDS("augur/augur.result_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds")
44 | rank.all = augur$AUC
45 | aucresults.all = augur$results
46 | save(rank.all, aucresults.all, file = "augur/augur.objects_allcells.rda")
47 | 
48 | load("augur/augur.objects_allcells.rda")
49 | 
50 | rank.all = rank.all[order(rank.all$cell_type),]
51 | names(rank.all) = c("cell_type", "auc.all")
52 | rank.summary = rank.summary[order(rank.summary$auc.all, decreasing = T),]
53 | saveRDS(rank.summary,  file = "augur/rank.summary_all.m.f_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds")
54 | 


--------------------------------------------------------------------------------
/analysis_scripts/13_pb_edgeR.R:
--------------------------------------------------------------------------------
 1 | ## Description/Goal:
 2 | ## Run pseudobulk analysis on all subclasses
 3 | 
 4 | 
 5 | ###################
 6 | ## Load data
 7 | ###################
 8 | library(scrattch.hicat)
 9 | library(scrattch.bigcat)
10 | library(dplyr)
11 | 
12 | load("anno.df.clean_cluster0916_v230915_20230915.rda")
13 | load("cluster_0916/norm.dat_cluster0916_v230915_subclass_ss1500.rda") ## subsampled cell by gene matrix
14 | load("big.dat_Aging.16ROI.V3.20220701.rda")
15 | load("subclass.keep_cluster0916_v230915.rda")
16 | 
17 | ###################
18 | ## Compute pseudobulk means (means per library per subclass)
19 | ###################
20 | anno.df.select = anno.df.clean[anno.df.clean$subclass_label %in% subclass.keep,]
21 | cl = paste(anno.df.select$subclass_label, anno.df.select$age_cat, anno.df.select$library_prep, sep = "__")
22 | length(unique(cl))
23 | names(cl) = anno.df.select$sample_id
24 | 
25 | tmp.summary = anno.df.select %>% dplyr::group_by(subclass_label, age_cat, library_prep) %>% dplyr::summarize(n = n())
26 | tmp.summary.keep = tmp.summary[tmp.summary$n >= 10,]
27 | tmp.summary.keep$cl = paste(tmp.summary.keep$subclass_label, tmp.summary.keep$age_cat, tmp.summary.keep$library_prep, sep = "__")
28 | 
29 | cl.clean = cl[cl %in% tmp.summary.keep$cl]
30 | 
31 | cl.means = get_cl_stats_big(big.dat, cl.clean, max.cl.size = 200, stats = "means", mc.cores = 5)
32 | 
33 | save(cl.means, file = "pb_edgeR/cl.means")
34 | 
35 | 
36 | 
37 | ###################
38 | ## Pseudobulk analysis with EdgeR
39 | ###################
40 | library(edgeR)
41 | library(statmod)
42 | 
43 | load("pb_edgeR/cl.means")
44 | load("subclass.keep_cluster0916_v230915.rda")
45 | 
46 | cl.means = cl.means$means
47 | tmp.split = strsplit(colnames(cl.means), "__")
48 | subclass_label = unlist(lapply(tmp.split, function(x) x[[1]]))
49 | age_cat = unlist(lapply(tmp.split, function(x) x[[2]]))
50 | library_prep = unlist(lapply(tmp.split, function(x) x[[3]]))
51 | meta.df = data.frame(sample = colnames(cl.means), subclass_label = subclass_label, age_cat = age_cat, library_prep = library_prep)
52 | 
53 | qlf.result = c()
54 | lrt.result = c()
55 | 
56 | for(i in subclass.keep){
57 |   
58 |   print(i)
59 |   
60 |   sample.keep = grep(i, colnames(cl.means))
61 |   meta.select = meta.df[sample.keep,]
62 |   
63 |   age = meta.select$age_cat
64 |   # y.select = DGEList(counts = cl.means[,sample.keep], group = age)
65 |   
66 |   tmp.counts = 2^cl.means[,sample.keep] - 1
67 |   y.select.counts = DGEList(counts = tmp.counts, group = age)
68 |   
69 |   design = model.matrix(~age)
70 |   row.names(design) = colnames(y.select.counts)
71 |   
72 |   y.select.counts = estimateDisp(y.select.counts, design, robust = T)
73 |   # plotBCV(y.select.counts)
74 |   
75 |   print("Running quasi-likelihood...")
76 |   fit = glmQLFit(y.select.counts, design)
77 |   qlf = glmQLFTest(fit, coef = 2)
78 |   # topTags(qlf, n = 20)
79 |   
80 |   print("Running LRT...")
81 |   fit = glmFit(y.select.counts, design)
82 |   lrt = glmLRT(fit, coef = 2)
83 |   # topTags(lrt, n = 20)
84 |   
85 |   print("Saving results...")
86 |   tmp.qlf = list(result = qlf, dgelist = y.select.counts)
87 |   tmp.lrt = list(result = lrt, dgelist = y.select.counts)
88 |   qlf.result[i] = list(tmp.qlf)
89 |   lrt.result[i] = list(tmp.lrt)
90 |   
91 | }
92 | 
93 | 
94 | saveRDS(qlf.result, file = "pb_edgeR/qlf.result_subclass_cluster0916_v230915.rds")
95 | saveRDS(lrt.result, file = "pb_edgeR/lrt.result_subclass_cluster0916_v230915.rds")
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/analysis_scripts/functions/degenes_age_mast_simplified_qcscore.R:
--------------------------------------------------------------------------------
  1 | get_age_degenes_mast = function(anno.df, norm.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
  2 | 	
  3 | 	set.seed(2010)
  4 | 
  5 | 	require(MAST)
  6 | 	require(dplyr)
  7 | 	require(scrattch.hicat)
  8 | 	require(data.table)
  9 | 
 10 | 	## Rename objects
 11 | 	anno.df.select = anno.df
 12 | 	row.names(anno.df.select) = anno.df.select$sample_id
 13 | 	group.tmp = id
 14 | 	dat.select = norm.dat
 15 | 	print(paste0("Starting ", group.tmp, "..."))
 16 | 
 17 | 
 18 |     ## Exclude or include
 19 |     if(exclude){
 20 |     	anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,]
 21 |     } else {
 22 |     	anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,]
 23 | 
 24 |     }
 25 | 
 26 |     ## Subsample to the same size per age group, up to maxcells
 27 |     if(is.null(select.cells)){
 28 |       tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"]
 29 |       tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"]
 30 |       tmp.max.age = min(length(tmp.age), maxcells)
 31 |       tmp.max.adult = min(length(tmp.adult), maxcells)
 32 |       select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult))
 33 |       anno.tmp = anno.tmp[select.cells,]
 34 |       anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters
 35 | 
 36 |       ## Escape for error
 37 |       tmp = c(table(anno.tmp[,c("sex", "age_cat")]))
 38 |       if(sum(tmp == 0) > 0){
 39 |         stop("Error: Missing data for one of the sexes and/or ages.")
 40 |         # break
 41 |       }
 42 |     }
 43 | 
 44 | 
 45 |     ## Subset data
 46 |     dat.tmp = dat.select[,select.cells]
 47 |     anno.tmp = anno.tmp[select.cells,]
 48 |     
 49 | 
 50 |     ## Make MAST object
 51 |     row.names(anno.tmp) = anno.tmp$sample_id
 52 |     sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp)
 53 | 
 54 |   # ## Thresholding ##SKIPPING FOR NOW
 55 |   # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30)
 56 |   # par(mfrow=c(5,4))
 57 |   # plot(thres)
 58 | 
 59 |     ## Freq cutoff
 60 |     if(is.null(select.genes)){
 61 |       select.genes = freq(sca) > freq_expressed
 62 | 
 63 |     }
 64 |     print(paste0("Ncells: ", length(select.cells)))
 65 |     print(paste0("Ngenes: ", sum(select.genes)))
 66 |     sca = sca[select.genes,]
 67 |     
 68 |     ## Model
 69 |     print("Run model")
 70 |     colData(sca)$age_cat<-factor(colData(sca)$age_cat)
 71 |     colData(sca)$sex<-factor(colData(sca)$sex)
 72 |     colData(sca)$roi<-factor(colData(sca)$roi)
 73 |     colData(sca)$broad_roi<-factor(colData(sca)$broad_roi)
 74 |     colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan)
 75 |     colData(sca)$full_genotype<-factor(colData(sca)$full_genotype)
 76 |     colData(sca)$log.gene.counts.0 = scale(log(colData(sca)$gene.counts.0))
 77 |     colData(sca)$z.qc.score = scale(colData(sca)$qc.score)
 78 |     
 79 |     zlm <- zlm(~age_cat + sex + log.gene.counts.0 + z.qc.score, sca)
 80 | 
 81 | 
 82 |     ## ----> LTR
 83 |     summaryAge<- summary(zlm, doLRT='age_cataged')
 84 |     summaryDt.age <- summaryAge$datatable
 85 |     fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
 86 |                           summaryDt.age[contrast=='age_cataged' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
 87 |     fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')]
 88 |     fcHurdle.age[split.by] = group.tmp
 89 |     
 90 |     ## ----> Save results
 91 |     result = list(select.cells = select.cells, result = fcHurdle.age)
 92 |     if(save.as.tmp){
 93 |       save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda"))
 94 |     }
 95 |     return(result)
 96 | 
 97 | }
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | get_age_degenes_mast_big = function(anno.df, big.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
104 | 	
105 | 	set.seed(2010)
106 | 
107 | 	require(MAST)
108 | 	require(dplyr)
109 | 	require(scrattch.hicat)
110 | 	require(data.table)
111 | 
112 | 	## Rename objects
113 | 	anno.df.select = anno.df
114 | 	row.names(anno.df.select) = anno.df.select$sample_id
115 | 	group.tmp = id
116 | 	print(paste0("Starting ", group.tmp, "..."))
117 | 
118 | 
119 |     ## Exclude or include
120 |     if(exclude){
121 |     	anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,]
122 |     } else {
123 |     	anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,]
124 | 
125 |     }
126 | 
127 |     ## Subsample to the same size per age group, up to maxcells (if no preselected cells)
128 |     if(is.null(select.cells)){
129 |       tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"]
130 |       tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"]
131 |       tmp.max.age = min(length(tmp.age), maxcells)
132 |       tmp.max.adult = min(length(tmp.adult), maxcells)
133 |       select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult))
134 |       anno.tmp = anno.tmp[select.cells,]
135 |       anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters
136 | 
137 |       ## Escape for error 
138 |       tmp = c(table(anno.tmp[,c("sex", "age_cat")]))
139 |       if(sum(tmp == 0) > 0){
140 |         stop("Error: Missing data for one of the sexes and/or ages.")
141 |         # break
142 |       }
143 |     }
144 | 
145 | 
146 |     ## Subsample data
147 |     dat.tmp = get_logNormal(big.dat, select.cells)
148 | 
149 |     ## Make MAST object
150 |     row.names(anno.tmp) = anno.tmp$sample_id
151 |     sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp)
152 | 
153 |   # ## Thresholding ##SKIPPING FOR NOW
154 |   # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30)
155 |   # par(mfrow=c(5,4))
156 |   # plot(thres)
157 | 
158 |     ## Freq cutoff
159 |     if(is.null(select.genes)){
160 |       select.genes = freq(sca) > freq_expressed
161 |     }
162 |     print(paste0("Ncells: ", length(select.cells)))
163 |     print(paste0("Ngenes: ", sum(select.genes)))
164 |     sca = sca[select.genes,]
165 |     
166 |     ## Model
167 |     print("Run model")
168 |     colData(sca)$age_cat<-factor(colData(sca)$age_cat)
169 |     colData(sca)$sex<-factor(colData(sca)$sex)
170 |     colData(sca)$roi<-factor(colData(sca)$roi)
171 |     colData(sca)$broad_roi<-factor(colData(sca)$broad_roi)
172 |     colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan)
173 |     colData(sca)$full_genotype<-factor(colData(sca)$full_genotype)
174 |     colData(sca)$log.gene.counts.0 = scale(log(colData(sca)$gene.counts.0))
175 |     colData(sca)$z.qc.score = scale(colData(sca)$qc.score)
176 |     
177 |     zlm <- zlm(~age_cat + sex + log.gene.counts.0 + z.qc.score, sca)
178 |     
179 | 
180 | 
181 |     ## ----> LTR
182 |     summaryAge<- summary(zlm, doLRT='age_cataged')
183 |     summaryDt.age <- summaryAge$datatable
184 |     fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
185 |                           summaryDt.age[contrast=='age_cataged' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
186 |     fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')]
187 |     fcHurdle.age[split.by] = group.tmp
188 |     
189 |     ## ----> Save results
190 |     result = list(select.cells = select.cells, result = fcHurdle.age)
191 |     if(save.as.tmp){
192 |     	save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda"))
193 |     }
194 |     return(result)
195 | 
196 | }
197 | 
198 | 
199 | # 
200 | # 
201 | # 
202 | # 
203 | # 
204 | # get_sex_intergenes_mast_big = function(anno.df, big.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
205 | #   
206 | #   set.seed(2010)
207 | #   
208 | #   require(MAST)
209 | #   require(dplyr)
210 | #   require(scrattch.hicat)
211 | #   require(data.table)
212 | #   
213 | #   ## Rename objects
214 | #   anno.df.select = anno.df
215 | #   row.names(anno.df.select) = anno.df.select$sample_id
216 | #   group.tmp = id
217 | #   print(paste0("Starting ", group.tmp, "..."))
218 | #   
219 | #   
220 | #   ## Exclude or include
221 | #   if(exclude){
222 | #     anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,]
223 | #   } else {
224 | #     anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,]
225 | #     
226 | #   }
227 | #   
228 | #   ## Subsample to the same size per age group, up to maxcells (if no preselected cells)
229 | #   if(is.null(select.cells)){
230 | #     tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"]
231 | #     tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"]
232 | #     tmp.max.age = min(length(tmp.age), maxcells)
233 | #     tmp.max.adult = min(length(tmp.adult), maxcells)
234 | #     select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult))
235 | #     anno.tmp = anno.tmp[select.cells,]
236 | #     anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters
237 | #     
238 | #     ## Escape for error 
239 | #     tmp = c(table(anno.tmp[,c("sex", "age_cat")]))
240 | #     if(sum(tmp == 0) > 0){
241 | #       stop("Error: Missing data for one of the sexes and/or ages.")
242 | #       # break
243 | #     }
244 | #   }
245 | #   
246 | #   
247 | #   ## Subsample data
248 | #   dat.tmp = get_logNormal(big.dat, select.cells)
249 | #   
250 | #   ## Make MAST object
251 | #   row.names(anno.tmp) = anno.tmp$sample_id
252 | #   sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp)
253 | #   
254 | #   # ## Thresholding ##SKIPPING FOR NOW
255 | #   # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30)
256 | #   # par(mfrow=c(5,4))
257 | #   # plot(thres)
258 | #   
259 | #   ## Freq cutoff
260 | #   if(is.null(select.genes)){
261 | #     select.genes = freq(sca) > freq_expressed
262 | #   }
263 | #   print(paste0("Ncells: ", length(select.cells)))
264 | #   print(paste0("Ngenes: ", sum(select.genes)))
265 | #   sca = sca[select.genes,]
266 | #   
267 | #   ## Model
268 | #   print("Run model")
269 | #   colData(sca)$age_cat<-factor(colData(sca)$age_cat)
270 | #   colData(sca)$sex<-factor(colData(sca)$sex)
271 | #   colData(sca)$roi<-factor(colData(sca)$roi)
272 | #   colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan)
273 | #   colData(sca)$full_genotype<-factor(colData(sca)$full_genotype)
274 | #   
275 | #   
276 | #   
277 | #   
278 | #   
279 | #   if(length(unique(anno.tmp$full_genotype)) == 1){
280 | #     if(length(unique(anno.tmp$facs_population_plan)) == 1){
281 | #       if(length(unique(anno.tmp$roi)) == 1){
282 | #         zlm <- zlm(~age_cat + sex + age_cat:sex, sca)
283 | #       } else{
284 | #         zlm <- zlm(~age_cat + sex + roi + age_cat:sex, sca)
285 | #       }
286 | #     } else{
287 | #       if(length(unique(anno.tmp$roi)) == 1){
288 | #         zlm <- zlm(~age_cat + sex + facs_population_plan + age_cat:sex, sca)
289 | #       } else{
290 | #         zlm <- zlm(~age_cat + sex + roi + facs_population_plan + age_cat:sex, sca)
291 | #       }
292 | #     }
293 | #   } else{
294 | #     if(length(unique(anno.tmp$facs_population_plan)) == 1){
295 | #       if(length(unique(anno.tmp$roi)) == 1){
296 | #         zlm <- zlm(~age_cat + sex + full_genotype + age_cat:sex, sca)
297 | #       } else{
298 | #         zlm <- zlm(~age_cat + sex + roi + full_genotype + age_cat:sex, sca)
299 | #       }
300 | #     } else{
301 | #       if(length(unique(anno.tmp$roi)) == 1){
302 | #         zlm <- zlm(~age_cat + sex + facs_population_plan + full_genotype + age_cat:sex, sca)
303 | #       } else{
304 | #         zlm <- zlm(~age_cat + sex + roi + facs_population_plan + full_genotype + age_cat:sex, sca)
305 | #       }
306 | #     }
307 | #   }
308 | #   
309 | #   
310 | #   
311 | #   
312 | #   ## ----> LTR
313 | #   summaryAge<- summary(zlm, doLRT='age_cataged:sexM')
314 | #   summaryDt.age <- summaryAge$datatable
315 | #   fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged:sexM' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
316 | #                         summaryDt.age[contrast=='age_cataged:sexM' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
317 | #   fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')]
318 | #   fcHurdle.age[split.by] = group.tmp
319 | #   
320 | #   ## ----> Save results
321 | #   result = list(select.cells = select.cells, result = fcHurdle.age)
322 | #   if(save.as.tmp){
323 | #     save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda"))
324 | #   }
325 | #   return(result)
326 | #   
327 | # }
328 | 
329 | 
330 | 
331 | 
332 | 
333 | 
334 | 
335 | 
336 | get_age_degenes_mast_p = function(anno.df, norm.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
337 | 
338 | 	if (mc.cores == 1) {
339 |         
340 |         de.list = list()
341 | 
342 |         for (n in 1:length(groups)) {
343 | 
344 |         	id = groups[n]
345 |             de.list[id] = list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
346 | 
347 |         }
348 | 
349 |     }else {
350 | 
351 | 	    library(doMC)
352 | 	    registerDoMC(cores = mc.cores)
353 | 	    de.list = foreach::foreach(i = groups, .combine='c') %dopar% {
354 | 	                 list(get_age_degenes_mast(anno.df, norm.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
355 | 	              }
356 | 	    names(de.list) = groups
357 | 
358 | 
359 |     }
360 | 
361 |     return(de.list)
362 | 
363 | }
364 | 
365 | 
366 | 
367 | 
368 | 
369 | 
370 | 
371 | get_age_degenes_mast_big_p = function(anno.df, big.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
372 | 
373 | 	if (mc.cores == 1) {
374 |         
375 |         de.list = list()
376 | 
377 |         for (n in 1:length(groups)) {
378 | 
379 |         	id = groups[n]
380 |             de.list[id] = list(get_age_degenes_mast_big(anno.df, big.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
381 | 
382 |         }
383 | 
384 |     }else {
385 | 
386 | 	    library(doMC)
387 | 	    registerDoMC(cores = mc.cores)
388 | 	    de.list = foreach::foreach(i = groups, .combine='c') %dopar% {
389 | 	                 list(get_age_degenes_mast_big(anno.df, big.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
390 | 	              }
391 | 	    names(de.list) = groups
392 | 
393 | 
394 |     }
395 | 
396 |     return(de.list)
397 | 
398 | }
399 | 
400 | 
401 | 
402 | # 
403 | # 
404 | # get_sex_intergenes_mast_big_p = function(anno.df, big.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){
405 | #   
406 | #   if (mc.cores == 1) {
407 | #     
408 | #     de.list = list()
409 | #     
410 | #     for (n in 1:length(groups)) {
411 | #       
412 | #       id = groups[n]
413 | #       de.list[id] = list(get_sex_intergenes_mast_big(anno.df, big.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
414 | #       
415 | #     }
416 | #     
417 | #   }else {
418 | #     
419 | #     library(doMC)
420 | #     registerDoMC(cores = mc.cores)
421 | #     de.list = foreach::foreach(i = groups, .combine='c') %dopar% {
422 | #       list(get_sex_intergenes_mast_big(anno.df, big.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells))
423 | #     }
424 | #     names(de.list) = groups
425 | #     
426 | #     
427 | #   }
428 | #   
429 | #   return(de.list)
430 | #   
431 | # }
432 | # 
433 | # 
434 | 
435 | 
436 | 
437 | 
438 | 
439 | 
440 | 
441 | get_age_degenes_perm = function(anno.df, norm.dat, id, split.by, nperm = 50, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL){
442 |   
443 |   select.cells.age = anno.df$sample_id[anno.df$age_cat == "aged"]
444 |   select.cells.adult = anno.df$sample_id[anno.df$age_cat == "adult"]
445 |   
446 |   set.seed(2010)
447 |   
448 |   perm.mat.age = replicate(nperm, sample(select.cells.age, maxcells, replace = F))
449 |   perm.mat.adult = replicate(nperm, sample(select.cells.adult, maxcells, replace = F))
450 |   perm.mat = rbind(perm.mat.age, perm.mat.adult)
451 |   
452 |   if (mc.cores == 1) {
453 |     
454 |     de.list = list()
455 |     
456 |     for (n in 1:nperm) {
457 |       
458 |       pname = paste0("p",n)
459 |       de.list[pname] = list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = perm.mat[,n]))
460 |       
461 |     }
462 |     
463 |   }else {
464 |     
465 |     library(doMC)
466 |     registerDoMC(cores = mc.cores)
467 |     de.list = foreach::foreach(i = 1:nperm, .combine='c') %dopar% {
468 |       list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = perm.mat[,i]))
469 |     }
470 |     names(de.list) = paste0("p",1:nperm)
471 |     
472 |     
473 |   }
474 |   
475 |   return(de.list)
476 |   
477 | }
478 | 


--------------------------------------------------------------------------------
/analysis_scripts/functions/run_umap.py:
--------------------------------------------------------------------------------
 1 | #!/home/user/miniconda3/bin/python3
 2 | 
 3 | import sys, getopt
 4 | 
 5 | from pathlib import Path
 6 | import umap
 7 | import pandas as pd
 8 | import re
 9 | 
10 | def run_umap(fin, fout, nc = 2, nn=25, md=0.4, niter=1):
11 |     df=pd.read_csv(fin, index_col=0)
12 |     for i in range(niter):
13 |         embedding2d = umap.UMAP(n_components=nc, metric='correlation', n_neighbors=nn, min_dist=md, verbose=True).fit_transform(df.values)
14 |         rd = pd.DataFrame(embedding2d)
15 |         rd.index = df.index
16 |         if niter ==1:
17 |             rd.to_csv(fout)
18 |         else:
19 |             rd.to_csv(Path(re.sub("csv", str(i)+".csv", str(fout))))
20 |             
21 | def run_umap_init(fin, finit, fout, nc = 2, nn=25, md=0.3, niter=1):
22 |     df1=pd.read_csv(fin, index_col=0)
23 |     df2=pd.read_csv(finit, index_col=0)
24 |     for i in range(niter):
25 |         embedding2d = umap.UMAP(n_components=nc, metric='correlation', n_neighbors=nn, min_dist=md, verbose=True, init = df2.values).fit_transform(df1.values)
26 |         rd = pd.DataFrame(embedding2d)
27 |         rd.index = df1.index
28 |         if niter ==1:
29 |             rd.to_csv(fout)
30 |         else:
31 |             rd.to_csv(fout+i)
32 | 
33 | 
34 | def main(argv):
35 |     fin = ''
36 |     fout = ''
37 |     finit = ''
38 |     nc=2
39 |     nn = 25
40 |     md=0.4
41 |     niter=1
42 |     try:
43 |         opts, args = getopt.getopt(argv, "hc:d:n:k:i:o:t:",["infile=", "outfile=", "init_file="])
44 |     except getopt.GetoptError:
45 |         print('run_umap.py -i <fin> -o <fout> -t <finit> -c <nc> -d <md> -k <nn> -n <niter>')
46 |         sys.exit(2)
47 |     for opt, arg in opts:
48 |         if opt == 'h':
49 |             print('run_umap.py -i <fin> -o <fout> -t <finit> -c <nc> -d <md> -k <nn> -n <niter>')            
50 |         if opt in ("-i", "--infile"):
51 |             fin = arg
52 |         if opt in ("-o", "--outfile"):
53 |             fout = arg
54 |         if opt in ("-t", "--init_outfile"):
55 |             finit = arg
56 |         if opt == "-c":
57 |             nc = int(arg)
58 |         if opt == "-k":
59 |             nn = int(arg)
60 |         if opt == "-d":
61 |             md = float(arg)
62 |         if opt in ("-n"):
63 |             niter = int(arg)
64 |     print("fin:",fin, "finit:", finit,"fout:", fout,"niter:", niter,"\n")
65 |     if finit =='':
66 |             run_umap(fin,fout, nc = nc, nn=nn, md=md, niter=niter)
67 |     else:
68 |         run_umap_init(fin,finit,fout, nc = nc, nn=nn, md=md, niter=niter)
69 |         
70 | 
71 | if __name__ == "__main__":
72 |    main(sys.argv[1:])
73 | 
74 | 


--------------------------------------------------------------------------------
/analysis_scripts/functions/scrattch.mapping/HANN_prepareTaxonomy.R:
--------------------------------------------------------------------------------
  1 | #' run_prepareTaxonomy : call prepareTaxonomy() from h5ad file
  2 | #'
  3 | #' @param h5adFN     filename of anndata with uns[['HANN']]
  4 | #' @export
  5 | run_prepareTaxonomy <- function( h5adFN ) {
  6 |    library(anndata)
  7 |    library(reticulate)
  8 |    library(Matrix)  
  9 |    adata = read_h5ad(h5adFN)
 10 |    norm.dat = t(adata$X)
 11 |    rownames(norm.dat) = adata$var_names
 12 |    colnames(norm.dat) = adata$obs_names
 13 |    if (sum(names(cl) != adata$obs_names) > 0) { 
 14 |       print("cl and adata$obs_names do not match")
 15 |    }
 16 |    cl           = adata$uns[['HANN']]$cl
 17 |    names(cl)    = adata$obs_names
 18 |    cl.df        = adata$uns[['HANN']]$cl.df
 19 |    cl.df        = py_to_r(cl.df)
 20 | 
 21 |    cl.hierarchy = adata$uns[['HANN']]$cl.hierarchy
 22 |    AIT.str      = adata$uns[['HANN']]$taxonomyName
 23 |    taxonomy.dir = adata$uns[['HANN']]$taxonomyDir
 24 | 
 25 |    AIT.dir = prepareTaxonomy ( count        = norm.dat,
 26 |                                cl           = cl,
 27 |                                cl.df        = cl.df,
 28 |                                cl.hierarchy = cl.hierarchy,
 29 |                                AIT.str      = AIT.str,
 30 |                                taxonomy.dir = taxonomy.dir )
 31 |    print(paste0("Taxonomy is ready in", AIT.dir))
 32 | }
 33 | #' Prepare taxonomy for optimized tree mapping
 34 | #'
 35 | #' This function writes an anndata object in the correct format for all downstream mapping algoriths.
 36 | #'
 37 | #' @param counts count[gene x cell]
 38 | #' @param cl assigned cluster
 39 | #' @param cl.df.users cluster annotation 
 40 | #' @param cl.hierarchy : hierarhcy in clusters : cluster(cl)/subclass_label/neighborhood/root
 41 | #' @param AIT.str taxonomy id
 42 | #' @param taxonomy.dir Output directly to write h5ad file
 43 | #'
 44 | #' @import patchseqtools
 45 | #' @import scrattch.hicat
 46 | #'
 47 | #' @return A copy of the anndata object that is also written to disk
 48 | #'
 49 | #' @export
 50 | prepareTaxonomy <- function (
 51 |    count, 
 52 |    cl, cl.df.users, cl.hierarchy, AIT.str,
 53 |    taxonomy.dir = "/allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/Taxonomies/",
 54 |    h5ad_out = FALSE ) {
 55 |    library(scrattch.bigcat)
 56 |    library(dplyr)
 57 | 
 58 |    ## Define reference-taxonomy folder and filenames
 59 |    AIT.dir = file.path(taxonomy.dir, AIT.str)
 60 |    if (!file.exists(AIT.dir)) dir.create(AIT.dir)
 61 |    de.dir  = file.path(AIT.dir, "de_parquet")
 62 |    sum.dir = file.path(AIT.dir, "de_summary")
 63 |    dat.dir = file.path(AIT.dir, "data_parquet")
 64 |    h5ad.FN = file.path(AIT.dir, paste0(AIT.str, ".h5ad"))
 65 |    pairs.FN = file.path(AIT.dir, "pairs.parquet")
 66 |    cl.bin.FN = file.path(AIT.dir, "cl.bin.rda")
 67 | 
 68 | 
 69 |    ## convert norm.data matrix to big.dat in parquet, if big.dat is not available (default)
 70 |    count = count[, sample_cells(cl,200)]
 71 |    cl = cl[colnames(count)]
 72 |    count.sparse = as(count, 'sparseMatrix')
 73 |    library(data.table)
 74 |    big.dat=convert_big.dat_parquet(count.sparse,  dir=dat.dir)
 75 | 
 76 |    print("## cluster stat")
 77 |    cl.stats = get_cl_stats_big(big.dat, cl=cl, stats=c("means","present","sqr_means"), mc.cores=15)
 78 |    save(cl.stats, file=file.path(AIT.dir, "cl.stats.big.rda"))
 79 |    cl.means = cl.stats$means
 80 |    save(cl.means, file=file.path(AIT.dir, "cl.means.rda"))
 81 | 
 82 |    cl.df = rearrange_cl.df (cl.df.users, cl.hierarchy) 
 83 |    save(cl.df, file=file.path(AIT.dir, "cl.df.rda"))
 84 | 
 85 |    print("## calculate all-pairwise DE genes ")
 86 |    if (file.exists(de.dir)) system(paste0("rm -r ", de.dir))
 87 |    if (file.exists(sum.dir)) system(paste0("rm -r ", sum.dir))
 88 | 
 89 |    de.result = prep_parquet_de_all_pairs(norm.dat=NULL, cl=cl, cl.bin=NULL, mc.cores=30, 
 90 |                   pairs.fn=pairs.FN,  cl.bin.fn=cl.bin.FN, cl.means=cl.stats$means, 
 91 |                   cl.present=cl.stats$present, cl.sqr.means=cl.stats$sqr_means, 
 92 |                   out.dir=de.dir, summary.dir=sum.dir, de.param=de_param())
 93 |    return(AIT.dir) 
 94 | }
 95 | # de_param default values
 96 | #     de_param(
 97 | #       low.th = 1,
 98 | #       padj.th = 0.01,
 99 | #       lfc.th = 1,
100 | #       q1.th = 0.5,
101 | #       q2.th = NULL,
102 | #       q.diff.th = 0.7,
103 | #       de.score.th = 150,
104 | #       min.cells = 4,
105 | #       min.genes = 5
106 | #     )
107 | 
108 | #' Rearrange cl.df to internal format labels
109 | #'
110 | #' This function writes an anndata object in the correct format for all downstream mapping algoriths.
111 | #'
112 | #' @param counts count[gene x cell]
113 | #' @param cl assigned cluster
114 | #' @return A copy of the anndata object that is also written to disk
115 | #'
116 | #' @export
117 | rearrange_cl.df <- function (cl.df, cl.hierarchy) {
118 | 
119 |    nlevel=length(cl.hierarchy)
120 |    if (nlevel==2) {
121 |       hier_label = c('root', 'cl')
122 |       cl         = cl.df[, cl.hierarchy[2]]
123 |       cluster_id = cl
124 |       df = data.frame(cl, cluster_id)
125 |    }
126 |    if (nlevel==3) {
127 |       hier_label     = c('root', 'subclass_label', 'cl')
128 |       subclass_label = cl.df[, cl.hierarchy[2]]
129 |       cl             = cl.df[, cl.hierarchy[3]]
130 |       cluster_id     = cl
131 |       df = data.frame(cl, cluster_id, subclass_label)
132 |    }
133 |    if (nlevel==4) {
134 |       hier_label = c('root', 'class_label', 'subclass_label', 'cl')
135 |       class_label    = cl.df[, cl.hierarchy[2]]
136 |       subclass_label = cl.df[, cl.hierarchy[3]]
137 |       cl             = cl.df[, cl.hierarchy[4]]
138 |       cluster_id = cl
139 |       df = data.frame(cl, cluster_id, subclass_label, class_label)
140 |    }
141 |    library(dplyr)
142 |    rownames(df) = df$cl
143 |    return(df)
144 | }
145 | 
146 | 


--------------------------------------------------------------------------------
/analysis_scripts/functions/scrattch.mapping/HANN_utils.R:
--------------------------------------------------------------------------------
   1 | #' INFO -- PLEASE ADD --
   2 | #'
   3 | #' @param in.df to_be_added
   4 | #' @param cl.df to_be_added
   5 | #'
   6 | #' @return ???
   7 | #'
   8 | #' @keywords internal
   9 | l2norm <- function(X, by="column")
  10 | {
  11 |   if (by=="column") {
  12 |     l2norm <- sqrt(Matrix::colSums(X^2))
  13 |     if (!any(l2norm==0)) {
  14 |       X=sweep(X, 2, l2norm, "/", check.margin=FALSE)
  15 |     }
  16 |     else{
  17 |       warning("L2 norms of zero detected for distance='Cosine, no transformation")
  18 |     }
  19 |     X = X 
  20 |   } else {
  21 |     l2norm <- sqrt(Matrix::rowSums(X^2))
  22 |     if (!any(l2norm==0)) {      
  23 |       X= X/l2norm
  24 |     }
  25 |     else{
  26 |       warning("L2 norms of zero detected for distance='Cosine'")
  27 |       X = X/ pmax(l2norm,1)
  28 |     }
  29 |   }
  30 | }
  31 | 
  32 | #' Title
  33 | #'
  34 | #' @param cn
  35 | #' @param direction
  36 | #' @param include.self
  37 | #'
  38 | #' @return
  39 | #' @export
  40 | #'
  41 | #' @examples
  42 | create_pairs <- function(cn1, cn2=cn1,direction="nondirectional", include.self = FALSE)
  43 |   {
  44 |     cn1=as.character(cn1)
  45 |     cn2=as.character(cn2)
  46 |     cl.n1 = length(cn1)
  47 |     cl.n2 = length(cn2)
  48 |     pairs = cbind(rep(cn1, rep(cl.n2,cl.n1)), rep(cn2, cl.n1))
  49 |     if(!identical(cn1,cn2) & direction!="unidirectional"){
  50 |       down.pairs = cbind(rep(cn2, rep(cl.n1,cl.n2)), rep(cn1, cl.n2))
  51 |       pairs = rbind(pairs, down.pairs)
  52 |     }
  53 |     if(direction=="nondirectional"){
  54 |       pairs = pairs[pairs[,1]<=pairs[,2],,drop=F]
  55 |     }
  56 |     if(!include.self){
  57 |       pairs = pairs[pairs[,1]!=pairs[,2],,drop=F]
  58 |     }
  59 |     colnames(pairs)=c("P1","P2")
  60 |     row.names(pairs) = paste0(pairs[,1],"_",pairs[,2])
  61 |     return(pairs)
  62 |   }
  63 | 
  64 | 
  65 | #' Compute differential expression summary statistics based on a differential results data.frame and de_param().
  66 | #' 
  67 | #' @param df A data.frame of pairwise differential expression results (i.e. from \code{score_selected_pairs()}).
  68 | #' @param de.param A list of differential gene expression parameters from \code{de_param()}
  69 | #' @param cl.size1 Optional: The number of samples in the first/high cluster
  70 | #' @param cl.size2 Optional: The number of samples in the second/low cluster
  71 | #' 
  72 | #' @results A list of filtered differential expression results containing:
  73 | #' \itemize{
  74 | #' \item{score} The deScore value, equal to the sum of the -log10(p-values) of differentially expressed genes, with a cap of 20 per gene.
  75 | #' \item{up.score} The deScore value for up-regulated genes.
  76 | #' \item{down.score} The deScore value for down-regulated genes.
  77 | #' \item{num} The number of differentially expressed genes
  78 | #' \item{up.num} The number of up-regulated genes
  79 | #' \item{down.num} The number of down-regulated genes
  80 | #' \item{genes} Gene symbols for differentially expressed genes.
  81 | #' \item{up.genes} Gene symbols for up-regulated genes.
  82 | #' \item{down.genes} Gene symbols for down-regulated genes.
  83 | #' \item{de.df} The df used as input, filtered for differentially expressed genes.
  84 | #' }
  85 | #' 
  86 | de_stats_pair <- function(df,
  87 |                           de.param = de_param(), 
  88 |                           cl.size1 = NULL, 
  89 |                           cl.size2 = NULL,
  90 |                           select.genes = NULL,
  91 |                           return.df = FALSE) {  
  92 |   df <- df[order(df$pval, -abs(df$lfc)), ]
  93 |   
  94 |   select <- with(df, which(padj < de.param$padj.th & abs(lfc) > de.param$lfc.th))
  95 |   select <- row.names(df)[select]
  96 |   
  97 |   if(!is.null(select.genes)){
  98 |     select <- select[select %in% select.genes]
  99 |   }
 100 |   
 101 |   if(is.null(select) | length(select) == 0){
 102 |     return(null_de())
 103 |   }
 104 |   
 105 |   up <- select[df[select, "lfc"] > 0]
 106 |   down <- select[df[select, "lfc"] < 0]
 107 |   df <- df[select, ]
 108 |   
 109 |   if(!is.null(de.param$q.diff.th) & is.null(df$q.diff)) {
 110 |     
 111 |     df$q.diff <- with(df, abs(q1 - q2) / pmax(q1, q2))
 112 |     df$q.diff[is.na(df$q.diff)] <- 0
 113 |     
 114 |   }
 115 |   
 116 |   if(!is.null(de.param$q1.th)) {
 117 |     up <- with(df[up, , drop = FALSE], up[q1 > de.param$q1.th])
 118 |     if(!is.null(cl.size1)){
 119 |       up <- with(df[up, , drop = FALSE], up[q1 * cl.size1 >= de.param$min.cells])
 120 |     }
 121 |     
 122 |     down <- with(df[down, , drop = FALSE], down[q2 > de.param$q1.th])
 123 |     if(!is.null(cl.size2)) {
 124 |       down <- with(df[down, , drop = FALSE], down[q2 * cl.size2 >= de.param$min.cells])
 125 |     }
 126 |   }
 127 |   
 128 |   if(!is.null(de.param$q2.th)) {
 129 |     up <- with(df[up, , drop = FALSE], up[q2 < de.param$q2.th])
 130 |     down <- with(df[down, , drop = FALSE], down[q1 < de.param$q2.th])
 131 |   }
 132 |   
 133 |   if(!is.null(de.param$q.diff.th)){
 134 |     up <- with(df[up, , drop = FALSE], up[abs(q.diff) > de.param$q.diff.th])
 135 |     down <- with(df[down, , drop = FALSE], down[abs(q.diff) > de.param$q.diff.th])
 136 |   }
 137 |   
 138 |   select <- c(up, down)
 139 |   
 140 |   if(length(select) == 0){
 141 |     return(null_de())
 142 |   } else {
 143 | 
 144 |     up.genes = setNames(-log10(df[up,"padj"]), up)
 145 |     down.genes = setNames(-log10(df[down,"padj"]), down)
 146 |     
 147 |     tmp = up.genes
 148 |     tmp[tmp > 20] = 20
 149 |     up.score <- sum(tmp)
 150 |     tmp = down.genes
 151 |     tmp[tmp > 20] = 20
 152 |     down.score <- sum(tmp)    
 153 |    
 154 |     result=list(
 155 |       up.genes=up.genes,
 156 |       down.genes=down.genes,
 157 |       up.score = up.score,
 158 |       down.score = down.score,
 159 |       score = up.score + down.score,
 160 |       up.num = length(up.genes),
 161 |       down.num = length(down.genes),
 162 |       num = length(up.genes) + length(down.genes)
 163 |       )
 164 |     
 165 | 
 166 |     if(return.df){
 167 |       result$de.df = df[select,]
 168 |     }
 169 |     return(result)
 170 |   } 
 171 | }
 172 | 
 173 | 
 174 | #' Get KNN
 175 | #'
 176 | #' @param dat  to_be_added
 177 | #' @param ref.dat  to_be_added
 178 | #' @param k  to_be_added
 179 | #' @param method  to_be_added
 180 | #' @param dim  to_be_added
 181 | #'
 182 | #' @return to_be_added
 183 | #'
 184 | #' @keywords internal
 185 | get_knn <- function(dat, ref.dat, k, method ="cor", dim=NULL,index=NULL, build.index=FALSE, transposed=TRUE, return.distance=FALSE)
 186 |   {
 187 |     if(transposed){
 188 |       cell.id = colnames(dat)
 189 |     }
 190 |     else{
 191 |       cell.id= row.names(dat)
 192 |     }    
 193 |     if(transposed){
 194 |       if(is.null(index)){
 195 |         ref.dat = Matrix::t(ref.dat)
 196 |       }
 197 |       dat = Matrix::t(dat)
 198 |     }
 199 |     if(method=="RANN"){
 200 |       library(RANN)
 201 |       knn.result = RANN::nn2(ref.dat, dat, k=k)
 202 |     }
 203 |     else if(method %in% c("Annoy.Euclidean", "Annoy.Cosine","cor")){
 204 |       library(BiocNeighbors)      
 205 |       if(is.null(index)){
 206 |         if(method=="cor"){
 207 |           ref.dat = ref.dat - rowMeans(ref.dat)
 208 |           ref.dat = l2norm(ref.dat,by = "row")
 209 |         }
 210 |         if (method=="Annoy.Cosine"){
 211 |           ref.dat = l2norm(ref.dat,by = "row")
 212 |         }
 213 |         if(build.index){
 214 |           index= buildAnnoy(ref.dat)
 215 |         }
 216 |       }
 217 |       if (method=="Annoy.Cosine"){
 218 |         dat = l2norm(dat,by="row")
 219 |       }
 220 |       if (method=="cor"){
 221 |         dat = dat - rowMeans(dat)
 222 |         dat = l2norm(dat,by = "row")
 223 |       }
 224 |       knn.result = queryAnnoy(X= ref.dat, query=dat, k=k, precomputed = index)
 225 |     }
 226 |     else{
 227 |       stop(paste(method, "method unknown"))
 228 |     }    
 229 |     knn.index= knn.result[[1]]
 230 |     knn.distance = knn.result[[2]]
 231 |     row.names(knn.index) = row.names(knn.distance)=cell.id
 232 |     if(!return.distance){
 233 |       return(knn.index)
 234 |     }
 235 |     else{
 236 |       list(index=knn.index, distance=knn.distance)
 237 |     }
 238 |   }
 239 | 
 240 | 
 241 | #' get knn batch
 242 | #'
 243 | #' @param dat  to_be_added
 244 | #' @param ref.dat  to_be_added
 245 | #' @param k  to_be_added
 246 | #' @param method  to_be_added
 247 | #' @param dim  to_be_added
 248 | #' @param batch.size  to_be_added
 249 | #' @param mc.cores  to_be_added
 250 | #'
 251 | #' @return to_be_added
 252 | #'
 253 | #' @keywords internal
 254 | get_knn_batch <- function(dat, ref.dat, k=1, method="cor", dim=NULL, batch.size, mc.cores=1, return.distance=FALSE,...)
 255 |   {
 256 |     if(return.distance){
 257 |       fun = "knn_combine"
 258 |     }
 259 |     else{
 260 |       fun = "rbind"
 261 |     }
 262 |     results <- batch_process(x=1:ncol(dat), batch.size=batch.size, mc.cores=mc.cores, .combine=fun, FUN=function(bin){
 263 |       get_knn(dat=dat[row.names(ref.dat),bin,drop=F], ref.dat=ref.dat, k=k, method=method, dim=dim,return.distance=return.distance, ...)
 264 |     })
 265 |     return(results)
 266 |   }
 267 | 
 268 | knn_combine <- function(result.1, result.2)
 269 | {
 270 |   knn.index = rbind(result.1[[1]], result.2[[1]])
 271 |   knn.distance = rbind(result.1[[2]], result.2[[2]])
 272 |   return(list(knn.index, knn.distance))
 273 | }
 274 | 
 275 | #' Batch process
 276 | #'
 277 | #' @param x  to_be_added
 278 | #' @param batch.size  to_be_added
 279 | #' @param FUN  to_be_added
 280 | #' @param mc.cores  to_be_added
 281 | #' @param .combine  to_be_added
 282 | #' @param ... 
 283 | #'
 284 | #' @return to_be_added
 285 | #'
 286 | #' @keywords internal
 287 | batch_process <- function(x, batch.size, FUN, mc.cores=1, .combine="c",...)
 288 |   {
 289 |     require(foreach)
 290 |     require(doMC)
 291 |     if (mc.cores == 1) {
 292 |       registerDoSEQ()
 293 |     }
 294 |     else {
 295 |       registerDoMC(cores=mc.cores)
 296 |       #on.exit(parallel::stopCluster(), add = TRUE)
 297 |     }
 298 |     bins = split(x, floor((1:length(x))/batch.size))
 299 |     results= foreach(i=1:length(bins), .combine=.combine) %dopar% FUN(bins[[i]],...)
 300 |     return(results)
 301 |   }
 302 | 
 303 | #' INFO -- PLEASE ADD --
 304 | #'
 305 | #' @param in.df to_be_added
 306 | #' @param cl.df to_be_added
 307 | #'
 308 | #' @return ???
 309 | #'
 310 | #' @keywords internal
 311 | build_train_index <- function(cl.dat, method= c("Annoy.Cosine","cor","Annoy.Euclidean"),fn=tempfile(fileext=".idx"))
 312 |   {
 313 |     library(BiocNeighbors)
 314 |     method = method[1]
 315 |     ref.dat = Matrix::t(cl.dat)
 316 |     if(method=="cor"){
 317 |       ref.dat = ref.dat - rowMeans(ref.dat)
 318 |       ref.dat = l2norm(ref.dat,by = "row")
 319 |     }
 320 |     if (method=="Annoy.Cosine"){
 321 |       ref.dat = l2norm(ref.dat,by = "row")
 322 |     }
 323 |     index= buildAnnoy(ref.dat, fname=fn)
 324 |     return(index)    
 325 |   }
 326 | 
 327 | #' INFO -- PLEASE ADD --
 328 | #'
 329 | #' @param in.df to_be_added
 330 | #' @param cl.df to_be_added
 331 | #'
 332 | #' @return ???
 333 | #'
 334 | #' @keywords internal
 335 | build_train_index_bs <- function(cl.dat, method= c("Annoy.Cosine","cor","Annoy.Euclidean"),sample.markers.prop=0.8, iter=100, mc.cores=10,fn=tempfile(fileext=".idx"))
 336 |   {
 337 |     library(BiocNeighbors)
 338 |     require(doMC)
 339 |     require(foreach)
 340 |     registerDoMC(cores=mc.cores)
 341 | 
 342 |     if (is.na(sample.markers.prop)) {
 343 |        index = build_train_index(cl.dat = cl.dat, method=method, fn=paste0(fn, ".",1))
 344 |        return(list(list(cl.dat=cl.dat, index=index)))
 345 |     } else {
 346 |        ###for each cluster, find markers that discriminate it from other types
 347 |        train.dat <- foreach(i=1:iter, .combine="c") %dopar% {
 348 |          cat(i, '\r')
 349 |          train.markers = sample(row.names(cl.dat), round(nrow(cl.dat) * sample.markers.prop))
 350 |          train.cl.dat = cl.dat[train.markers,]
 351 |          index = build_train_index(cl.dat = train.cl.dat, method=method, fn = paste0(fn, ".",i))
 352 |          return(list(list(cl.dat=train.cl.dat, index=index)))
 353 |        }   
 354 |     }
 355 |   }
 356 | 
 357 | #' INFO -- PLEASE ADD --
 358 | #'
 359 | #' @param in.df to_be_added
 360 | #' @param cl.df to_be_added
 361 | #'
 362 | #' @return ???
 363 | #'
 364 | #' @keywords internal
 365 | map_cells_knn <- function(topk=1,test.dat, cl.dat, train.index=NULL, method = c("Annoy.Cosine","cor"), batch.size=5000, mc.cores=1)
 366 |   {
 367 | 
 368 |     cl.knn = get_knn_batch(test.dat, cl.dat, k=topk, index=train.index, method=method, transposed=TRUE, batch.size=batch.size, mc.cores=mc.cores,return.distance=TRUE)
 369 |     knn.index = cl.knn[[1]]
 370 |     knn.dist = cl.knn[[2]]
 371 |     map.df = data.frame(sample_id=colnames(test.dat), cl = colnames(cl.dat)[knn.index], dist = knn.dist)
 372 |     return(map.df)
 373 |   }
 374 | 
 375 | #' INFO -- PLEASE ADD --
 376 | #'
 377 | #' @param in.df to_be_added
 378 | #' @param cl.df to_be_added
 379 | #'
 380 | #' @return ???
 381 | #'
 382 | #' @keywords internal
 383 | map_cells_knn_big <- function(big.dat, cl.dat, select.cells, train.index=NULL, method = c("Annoy.Cosine","cor"), batch.size=10000, mc.cores=10)
 384 |   {    
 385 |     cl.knn =  get_knn_batch_big(big.dat, cl.dat, select.cells=select.cells, k=1, index=train.index, method=method, transposed=TRUE, batch.size=batch.size, mc.cores=mc.cores,return.distance=TRUE)
 386 |     knn.index = cl.knn[[1]]
 387 |     knn.dist = cl.knn[[2]]
 388 |     map.df = data.frame(sample_id=select.cells, cl = colnames(cl.dat)[knn.index], dist = knn.dist)
 389 |     return(map.df)
 390 |   }
 391 | 
 392 | #' INFO -- PLEASE ADD --
 393 | #'
 394 | #' @param in.df to_be_added
 395 | #' @param cl.df to_be_added
 396 | #'
 397 | #' @return ???
 398 | #'
 399 | #' @keywords internal
 400 | map_cells_knn_bs <- function(topk=1, test.dat, iter=100,cl.dat=NULL,train.index.bs=NULL, method = c("Annoy.Cosine","cor"), mc.cores=20, ...)
 401 |   {
 402 |     require(doMC)
 403 |     require(foreach)
 404 |     mc.cores = min(mc.cores, length(train.index.bs))
 405 |     registerDoMC(cores=mc.cores)
 406 |     ###for each cluster, find markers that discriminate it from other types
 407 |     if(!is.null(train.index.bs)){
 408 |       iter = length(train.index.bs)
 409 |     } else{
 410 |       idx = match(rownames(cl.dat), rownames(test.dat))
 411 |       if (any(is.na(idx))) {
 412 |          print("some genes in the train data set are missing from query data")
 413 |          common.gene = intersect(rownames(cl.dat), rownames(test.dat))
 414 |          cl.dat = cl.dat[common.gene,]
 415 |       }
 416 |       train.index.bs = build_train_index_bs(cl.dat, method=method,iter=iter, ...)
 417 |     }
 418 |     library(data.table)
 419 |     map.list <- foreach(i=1:iter, .combine="c") %dopar% {
 420 |       train.index = train.index.bs[[i]]$index
 421 |       cl.dat = train.index.bs[[i]]$cl.dat
 422 |       map.df=map_cells_knn(topk, test.dat, cl.dat, train.index, method = c("Annoy.Cosine","cor"))
 423 |       map.df = list(map.df)
 424 |     }
 425 |     map.df = rbindlist(map.list)
 426 |     map.df = map.df %>% group_by(sample_id, cl) %>% summarize(freq=n(),dist = mean(dist))
 427 |     map.df$freq = map.df$freq/iter
 428 |     best.map.df = map.df %>% group_by(sample_id) %>% summarize(best.cl= cl[which.max(freq)],prob=max(freq), avg.dist = dist[which.max(freq)])
 429 |     if(method=="cor"){
 430 |       best.map.df = best.map.df%>% mutate(avg.cor = 1 - avg.dist^2/2)
 431 |     }
 432 |     return(list(map.freq=map.df, best.map.df = best.map.df))    
 433 |   }
 434 | 
 435 | ## cl.list is cluster membership at different levels, finest at the beginning.
 436 | ## val is a vector associated with sample_id
 437 | ## compute z_score aggregate at different levels of clustering, start with finest level of clustering, and resort to higher level if not enough sample size
 438 | #' INFO -- PLEASE ADD --
 439 | #'
 440 | #' @param in.df to_be_added
 441 | #' @param cl.df to_be_added
 442 | #'
 443 | #' @return ???
 444 | #'
 445 | #' @keywords internal
 446 | z_score <- function(cl.list, val, min.samples =100)
 447 |   {
 448 | 
 449 |     sample_id = names(cl.list[[1]])
 450 |     z.score = c()
 451 |     for(i in 1:length(cl.list)){
 452 |       cl=cl.list[[i]][sample_id]
 453 |       cl.size = table(cl)
 454 |       if(i !=length(cl.list)){
 455 |         select.cl = names(cl.size)[cl.size > min.samples]
 456 |       }
 457 |       else{
 458 |         select.cl = names(cl.size)
 459 |       }
 460 |       df = data.frame(sample_id = names(cl),cl=cl, val=val[names(cl)])      
 461 |       df = df %>% filter(cl %in% select.cl) %>% group_by(cl) %>% mutate(z = (val - mean(val))/sd(val))
 462 |       z.score[df$sample_id] = df$z
 463 |       sample_id = setdiff(sample_id, df$sample_id)      
 464 |     }
 465 |     return(z.score)
 466 |   }
 467 | 
 468 |   get_gene_score_ds <- function(ds, to.add, genes, cl.bin, de=NULL, max.num=1000,mc.cores=20)
 469 |   {
 470 |     require(doMC)
 471 |     registerDoMC(cores=mc.cores)
 472 |     if(!is.null(de)){
 473 |       tmp.de = suppressMessages(de %>% right_join(to.add[,c("P1","P2")]))
 474 |       gene.score = tmp.de %>% group_by(gene) %>% summarize(score = sum(as.numeric(max.num- rank))) %>% filter(gene %in% genes) %>% arrange(-score)   
 475 |     }
 476 |     else{
 477 |       to.add = suppressMessages(to.add %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl")))
 478 |       cl.bin.x = to.add %>% pull(bin.x) %>% unique
 479 |       cl.bin.y = to.add %>% pull(bin.y) %>% unique  
 480 |       tmp=foreach::foreach(bin1=cl.bin.x,.combine="c")%:%
 481 |         foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% {
 482 |           cat(bin1, bin2, '\r')
 483 |           tmp.pairs = to.add %>%  filter(bin.x == bin1 & bin.y ==bin2)
 484 |           tmp.de = ds %>% filter(bin.x == bin1 & bin.y ==bin2 & gene %in% genes & rank < max.num & P1 %in% tmp.pairs$P1 & P2 %in% tmp.pairs$P2) %>% collect
 485 |           tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs,by=c("P1","P2")))
 486 |           gene.score = tmp.de %>% group_by(gene) %>% summarize(rank.sum = sum(max.num- rank))
 487 |           list(gene.score)
 488 |         }
 489 |       gene.score=rbindlist(tmp)
 490 |       gene.score = gene.score %>% group_by(gene) %>% summarize(score=sum(as.numeric(rank.sum))) %>% arrange(-score)
 491 |     }
 492 |     return(gene.score)
 493 |   }
 494 | 
 495 | #' INFO -- PLEASE ADD --
 496 | #'
 497 | #' @param in.df to_be_added
 498 | #' @param cl.df to_be_added
 499 | #'
 500 | #' @return ???
 501 | #'
 502 | #' @keywords internal
 503 | update_gene_score_ds <- function(gene.score, ds, to.remove, cl.bin, de=NULL, max.num=1000,mc.cores=20)
 504 |   {
 505 |     require(doMC)
 506 |     registerDoMC(cores=mc.cores)
 507 |     to.remove = suppressMessages(to.remove %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl")))
 508 |     cl.x = to.remove %>% pull(P1) %>%unique
 509 |     cl.y = to.remove %>% pull(P2) %>%unique
 510 |     
 511 |     cl.bin.x = to.remove %>% pull(bin.x) %>% unique
 512 |     cl.bin.y = to.remove %>% pull(bin.y) %>% unique
 513 |     if(!is.null(de)){
 514 |       tmp.de = suppressMessages(de %>% right_join(to.remove))
 515 |       rm.gene.score = tmp.de %>% group_by(gene) %>% summarize(rm.score = sum(max.num- rank))
 516 |     }
 517 |     else{
 518 |       if(length(cl.bin.x)*length(cl.bin.y) * nrow(gene.score) < 10^6){
 519 |         de=  ds %>% filter(bin.x %in% cl.bin.x & bin.y %in% cl.bin.y & gene %in% gene.score$gene & rank < max.num & P1 %in% cl.x & P2 %in% cl.y) %>% collect
 520 |         tmp.de = suppressMessages(de %>% right_join(to.remove))
 521 |         rm.gene.score = tmp.de %>% group_by(gene) %>% summarize(rm.score = sum(max.num- rank))        
 522 |       }
 523 |       else{
 524 |         tmp=foreach::foreach(bin1=cl.bin.x,.combine="c")%:%
 525 |           foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% {
 526 |             tmp.pairs = to.remove %>%  filter(bin.x == bin1 & bin.y ==bin2)
 527 |             tmp.de = ds %>% filter(bin.x == bin1 & bin.y ==bin2 & gene %in% gene.score$gene & rank < max.num & P1 %in% tmp.pairs$P1 & P2 %in% tmp.pairs$P2) %>% collect
 528 |             tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs))
 529 |             gene.score = tmp.de %>% group_by(gene) %>% summarize(rank.sum = sum(max.num- rank))
 530 |             list(gene.score)
 531 |           }
 532 |         rm.gene.score=rbindlist(tmp)      
 533 |         if(is.null(rm.gene.score)|nrow(rm.gene.score)==0){
 534 |           return(gene.score)
 535 |         }
 536 |         rm.gene.score = rm.gene.score %>% group_by(gene) %>% summarize(rm.score=sum(as.numeric(rank.sum)))
 537 |       }
 538 |     }
 539 |     tmp = suppressMessages(gene.score %>% left_join(rm.gene.score) %>% mutate(rm.score = ifelse(is.na(rm.score), 0, rm.score)) %>% mutate(new.score = score - rm.score))
 540 |     tmp = tmp%>% select(gene, new.score) %>% mutate(score=new.score) %>% filter(score > 0) %>% arrange(-score)
 541 |     return(tmp)
 542 |   }
 543 | 
 544 | #' INFO -- PLEASE ADD --
 545 | #'
 546 | #' @param in.df to_be_added
 547 | #' @param cl.df to_be_added
 548 | #'
 549 | #' @return ???
 550 | #'
 551 | #' @keywords internal
 552 | check_pairs_ds <- function(de.dir, to.add, genes,cl.bin, de=NULL, mc.cores=10,max.num=1000)
 553 |   {
 554 |     require(doMC)
 555 |     registerDoMC(cores=mc.cores)
 556 |     to.add = to.add[,c("P1","P2")]
 557 |     to.add = suppressMessages(to.add %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl")))
 558 |     select.P1 = to.add %>% pull(P1) %>% unique
 559 |     select.P2 = to.add %>% pull(P2) %>% unique    
 560 |     if(!is.null(de)){
 561 |       de.checked = suppressMessages(to.add %>% left_join(de %>% filter(rank < max.num & gene %in% genes)) %>% group_by(P1,P2) %>% summarize(checked=sum(!is.na(gene))))
 562 |     }
 563 |     else{
 564 |       cl.bin.x = to.add %>% pull(bin.x) %>% unique
 565 |       cl.bin.y = to.add %>% pull(bin.y) %>% unique
 566 |       if(length(cl.bin.x)*length(cl.bin.y)*length(genes) < 10^5){
 567 |         ds = open_dataset(de.dir)
 568 |         de = ds %>% filter(bin.x  %in% cl.bin.x & bin.y %in% cl.bin.y & gene %in% genes & P1 %in% select.P1 & P2 %in% select.P2 & rank < max.num ) %>% collect
 569 |         de.checked = suppressMessages(to.add %>% left_join(de %>% filter(rank < max.num & gene %in% genes)) %>% group_by(P1,P2) %>% summarize(checked=sum(!is.na(gene))))
 570 |       }
 571 |       else{
 572 |         de.checked=foreach::foreach(bin1=cl.bin.x,.combine="c")%:%
 573 |           foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% {
 574 |             cat("bin", bin1, bin2, "\r")
 575 |             d = file.path(de.dir, paste0("bin.x=",bin1), paste0("bin.y=",bin2))
 576 |             if (file.exists(file.path(d))) {
 577 |                tmp.pairs = to.add %>%  filter(bin.x == bin1 & bin.y ==bin2)
 578 |                ###
 579 |                #tmp.de =ds %>% filter(bin.x == bin1 & bin.y ==bin.y) %>% collect %>% filter(gene %in% genes & rank < max.num)
 580 |                #Directly read from parquet file is faster
 581 |                fn = dir(d, pattern="parquet")
 582 |                tmp.de = arrow::read_parquet(file.path(d, fn))
 583 |                tmp.de = tmp.de %>% filter(gene %in% genes & rank < max.num)
 584 |                tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs))
 585 |                de.checked = suppressMessages(tmp.de %>% group_by(P1,P2) %>% summarize(checked=n()))
 586 |                list(de.checked)
 587 |             }
 588 |           }      
 589 |         de.checked = rbindlist(de.checked)
 590 |       }
 591 |     }
 592 |     return(de.checked)
 593 |   }
 594 | 
 595 | #' INFO -- PLEASE ADD --
 596 | #'
 597 | #' @param in.df to_be_added
 598 | #' @param cl.df to_be_added
 599 | #'
 600 | #' @return ???
 601 | #'
 602 | #' @keywords internal
 603 | select_markers_ds <- function(de.dir, cl.bin, genes, select.cl=NULL, top.n=20,mc.cores=10)
 604 |   {
 605 |     ds = open_dataset(de.dir)
 606 | 
 607 |     if(!is.null(select.cl)){
 608 |       cl.bin = cl.bin %>% filter(cl %in% select.cl)
 609 |     }
 610 |     select.bin = cl.bin %>% pull(bin) %>% unique
 611 |     mc.cores=min(mc.cores, length(select.bin))
 612 |     library(parallel)    
 613 |     require(doMC)
 614 |     require(foreach)
 615 |     registerDoMC(cores=mc.cores)    
 616 |     tmp=foreach::foreach(bin1=select.bin,.combine="c")%:%
 617 |       foreach::foreach(bin2=select.bin,.combine="c")%dopar% {
 618 |         de = ds %>% filter(bin.x %in% bin1 & bin.y %in% bin2)
 619 |         if(is.null(select.cl)){
 620 |           de = de %>% filter(P1 %in% select.cl & P2 %in% select.cl)            
 621 |         }
 622 |         de %>%  filter(gene %in% genes) %>% filter(rank <= top.n) %>% pull(gene) %>% unique
 623 |       }
 624 |     select.markers=setdiff(intersect(genes,unique(tmp)), NA)
 625 |     return(select.markers)
 626 |   }
 627 | 
 628 | #' INFO -- PLEASE ADD --
 629 | #'
 630 | #' @param in.df to_be_added
 631 | #' @param cl.df to_be_added
 632 | #'
 633 | #' @return ???
 634 | #'
 635 | #' @keywords internal
 636 | select_markers_pair_direction_ds <- function(de.dir, add.num, genes, cl.bin, de=NULL, mc.cores=mc.cores,max.genes=1000,byg=10, ...)
 637 |   {
 638 |     ds = open_dataset(de.dir)
 639 |     select.genes=c()
 640 |     if(!is.null(de)){
 641 |       de = de %>% filter(gene %in% genes)
 642 |     }
 643 |     add.num = suppressMessages(add.num %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl")))
 644 |     gene.score= get_gene_score_ds(ds, to.add=add.num, genes=genes,cl.bin=cl.bin, de=de,mc.cores=mc.cores,...)
 645 |     while(nrow(add.num)>0 & length(genes)>0 & length(select.genes)< max.genes){      
 646 |       if(is.null(gene.score) | nrow(gene.score)==0){
 647 |         break
 648 |       }
 649 |       if (length(gene.score$gene)> byg) g = as.character(gene.score$gene[1:byg])
 650 |       else g = as.character(gene.score$gene)
 651 | 
 652 |       new.checked = check_pairs_ds(de.dir, to.add=add.num %>% select(P1,P2), genes=g,cl.bin=cl.bin,de=de, ...)
 653 |       add.num$checked=NULL
 654 |       add.num = suppressMessages(add.num %>% left_join(new.checked, by=c("P1","P2")))      
 655 |       add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked)
 656 |       to.remove = add.num %>% filter(num <=0) %>% select(P1,P2)
 657 |       add.num = add.num %>% filter(num > 0)
 658 |       genes = setdiff(genes,g)
 659 |       select.genes=c(select.genes,g)
 660 |       cat('gene_score', dim(gene.score), length(select.genes), '\r')
 661 |       gene.score = update_gene_score_ds(gene.score, ds, to.remove, cl.bin, de=de,...)
 662 |       gene.score = gene.score %>% filter(gene %in% genes)
 663 |       if(!is.null(de)){
 664 |         de = de %>% filter(!(gene %in% g))
 665 |         if(is.null(de)| nrow(de)==0){
 666 |           break
 667 |         }
 668 |       }
 669 |     }
 670 |     return(list(select.genes=select.genes, de=de))
 671 |   }
 672 | 
 673 | #' INFO -- PLEASE ADD --
 674 | #'
 675 | #' @param in.df to_be_added
 676 | #' @param cl.df to_be_added
 677 | #'
 678 | #' @return ???
 679 | #'
 680 | #' @keywords internal
 681 | select_markers_pair_group_top_ds <- function(g1,g2,ds, genes, cl.bin, select.sign="up", n.markers=20,mc.cores=1, ...)
 682 | {
 683 |   require(matrixStats)
 684 |   require(data.table)
 685 |   require(arrow)
 686 |   require(dplyr)
 687 | 
 688 |   require(doMC)
 689 |   registerDoMC(cores=mc.cores)
 690 |   up.to.add = down.to.add=NULL
 691 |   up.genes=down.genes=NULL
 692 |   if("up" %in% select.sign){
 693 |     up.to.add = as.data.frame(create_pairs(g1, g2, direction="unidirectional"))
 694 |     up.gene.score  = get_gene_score_ds(ds, to.add=up.to.add, genes=genes, cl.bin=cl.bin,...)
 695 |     up.genes = head(up.gene.score$gene, n.markers)
 696 |   }
 697 |   if("down" %in% select.sign){
 698 |     down.to.add = as.data.frame(create_pairs(g2, g1,direction="unidirectional"))
 699 |     down.gene.score  = get_gene_score_ds(ds, to.add=down.to.add, genes=genes, cl.bin=cl.bin, ...)
 700 |     down.genes = head(down.gene.score$gene, n.markers)
 701 |   }
 702 |   to.add=rbind(up.to.add, down.to.add)
 703 |   return(list(up.genes=up.genes, down.genes=down.genes,to.add=to.add))
 704 | }
 705 | 
 706 | #' INFO -- PLEASE ADD --
 707 | #'
 708 | #' @param in.df to_be_added
 709 | #' @param cl.df to_be_added
 710 | #'
 711 | #' @return ???
 712 | #'
 713 | #' @keywords internal
 714 | select_markers_pair_group_ds <- function(g1,g2,de.dir, genes, cl.bin, n.markers=20,select.sign=c("up","down"),max.genes=50,...)
 715 |   {
 716 |     ds = open_dataset(de.dir)
 717 |     result = select_markers_pair_group_top_ds( g1,g2,ds=ds, genes=genes, cl.bin=cl.bin, n.markers=n.markers,select.sign=select.sign,...)
 718 |     markers=c(result$up.genes, result$down.genes)
 719 |     add.num = result$to.add
 720 |     add.num$num = n.markers
 721 |     new.checked = check_pairs_ds(de.dir, to.add=add.num %>% select(P1,P2),genes=markers,cl.bin=cl.bin,...)
 722 |     add.num = suppressMessages(add.num %>% left_join(new.checked, by=c("P1","P2")))      
 723 |     add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked)            
 724 |     add.num = add.num %>% filter(num > 0)
 725 |     max.genes= max.genes - length(markers)
 726 |     if(nrow(add.num)>0 & max.genes > 0){
 727 |       add.num$checked=NULL
 728 |       genes = setdiff(genes,markers)
 729 |       more.markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num, genes=genes,cl.bin=cl.bin,max.genes=max.genes,...)
 730 |       more.markers <- more.markers$select.genes
 731 |       markers= c(markers, more.markers)
 732 |     }
 733 |     return(markers)
 734 |   }
 735 | 
 736 | #' INFO -- PLEASE ADD --
 737 | #'
 738 | #' @param in.df to_be_added
 739 | #' @param cl.df to_be_added
 740 | #'
 741 | #' @return ???
 742 | #'
 743 | #' @keywords internal
 744 | select_N_markers_ds<- function(de.dir, select.cl=NULL,pair.num=1, add.num=NULL, genes, cl.bin, default.markers=NULL,...)
 745 |   {
 746 |     if(is.null(add.num)){
 747 |       add.num = as.data.frame(create_pairs(select.cl, direction="directional"))
 748 |       add.num$num = pair.num
 749 |     }
 750 |     if(!is.null(default.markers)){      
 751 |       de.checked.num = check_pairs_ds(de.dir, add.num, genes=default.markers, cl.bin=cl.bin,...)
 752 |       add.num = suppressMessages(add.num %>% left_join(de.checked.num,by=c("P1","P2")))
 753 |       add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked)
 754 |       add.add = add.num %>% filter(num > 0)
 755 |       genes = setdiff(genes, default.markers)
 756 |       add.num$checked=NULL
 757 |     }
 758 |     markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num, genes=genes,cl.bin=cl.bin,...)$select.genes    
 759 |   }
 760 | 
 761 | #' INFO -- PLEASE ADD --
 762 | #'
 763 | #' @param in.df to_be_added
 764 | #' @param cl.df to_be_added
 765 | #'
 766 | #' @return ???
 767 | #'
 768 | #' @keywords internal
 769 | select_pos_markers_ds <- function(de.dir, cl, select.cl, genes, cl.bin, n.markers=1,  mc.cores=1,out.dir="cl.markers",...)
 770 |   {
 771 |     library(parallel)    
 772 |     require(doMC)
 773 |     require(foreach)
 774 |     registerDoMC(cores=mc.cores)
 775 |     ds = open_dataset(de.dir)
 776 |     if (!dir.exists(out.dir)) {
 777 |         dir.create(out.dir)
 778 |     }
 779 |     
 780 |     ###for each cluster, find markers that discriminate it from other types
 781 |     cl.markers <- foreach(x=select.cl, .combine="c") %dopar% {
 782 |       #print(x)
 783 |       g1=x
 784 |       g2 = setdiff(cl, x)
 785 |       cl.bin.x = cl.bin %>% filter(cl==g1) %>% pull(bin)      
 786 |       select.de = ds %>% filter(bin.x==cl.bin.x & P1==g1 & P2 %in% g2) %>% collect()
 787 |       markers <- select_markers_pair_group_ds(g1,g2, de.dir=de.dir, de=select.de, genes=genes, cl.bin=cl.bin, n.markers=n.markers,select.sign="up",...)
 788 |       save(markers, file=file.path(out.dir, paste0(x, ".markers.rda")))
 789 |       tmp=list(markers)
 790 |       names(tmp)=x
 791 |       tmp
 792 |     }
 793 |     return(cl.markers)
 794 |   }
 795 | 
 796 | #' INFO -- PLEASE ADD --
 797 | #'
 798 | #' @param in.df to_be_added
 799 | #' @param cl.df to_be_added
 800 | #'
 801 | #' @return ???
 802 | #'
 803 | #' @keywords internal
 804 | select_top_pos_markers_ds<- function(ds, cl, select.cl, genes, cl.bin, n.markers=3, mc.cores=10,...)
 805 |   {
 806 |     library(parallel)    
 807 |     require(doMC)
 808 |     require(foreach)
 809 |     registerDoMC(cores=mc.cores)
 810 |     
 811 |     ###for each cluster, find markers that discriminate it from other types
 812 | 
 813 |     cl.markers <- foreach(x=select.cl, .combine="c") %dopar% {
 814 |       #print(x)
 815 |       g1=x
 816 |       g2 = setdiff(cl, x)
 817 |       cl.bin.x = cl.bin %>% filter(cl==g1) %>% pull(bin)      
 818 |       select.de = ds %>% filter(bin.x==cl.bin.x & P1==g1 & P2 %in% g2) %>% collect()
 819 |       markers= select_markers_pair_group_top_ds(g1,g2,ds, genes=genes,cl.bin=cl.bin, de=select.de,n.markers=n.markers,select.sign="up",...)$up.genes
 820 |       tmp=list(markers)
 821 |       names(tmp)=x
 822 |       tmp
 823 |     }
 824 |     return(cl.markers)
 825 |   }
 826 | 
 827 | #' INFO -- PLEASE ADD --
 828 | #'
 829 | #' @param in.df to_be_added
 830 | #' @param cl.df to_be_added
 831 | #'
 832 | #' @return ???
 833 | #'
 834 | #' @keywords internal
 835 | select_markers_groups_top_ds <- function(ds, cl.group, select.groups=names(cl.group), n.markers=3,mc.cores=1,...)
 836 |   {
 837 | 
 838 |     library(parallel)
 839 |     library(parallel)    
 840 |     require(doMC)
 841 |     require(foreach)
 842 |     registerDoMC(cores=mc.cores)
 843 |     
 844 |     all.cl = unlist(cl.group)
 845 |     group.markers <- foreach(x=select.groups, .combine="c") %dopar% {
 846 |       #print(x)
 847 |       g1 = cl.group[[x]]
 848 |       g2 = setdiff(all.cl, g1)              
 849 |       markers=select_markers_pair_group_top_ds(g1,g2,ds=ds, select.sign="up",n.markers=n.markers, ...)$up.genes
 850 |       list(markers)
 851 |     }
 852 |     names(group.markers) = select.groups
 853 |     return(group.markers)
 854 |   }
 855 | 
 856 | #' INFO -- PLEASE ADD --
 857 | #'
 858 | #' @param in.df to_be_added
 859 | #' @param cl.df to_be_added
 860 | #'
 861 | #' @return ???
 862 | #'
 863 | #' @keywords internal
 864 | select_markers_groups <- function(de.dir, cl.group, genes, cl.bin, select.groups= unique(cl.group$group), n.markers=20,mc.cores=1,byg=10, ...)
 865 |   {
 866 |     ds = open_dataset(de.dir)
 867 |     cl.group$cl = as.character(cl.group$cl)
 868 |     group_pair=create_pairs(unique(cl.group$group))
 869 |     library(parallel)
 870 |     require(doMC)
 871 |     require(foreach)
 872 |     registerDoMC(cores=mc.cores)
 873 | 
 874 |     group.markers <- foreach(i=1:nrow(group_pair), .combine="c") %dopar% {
 875 |       x= group_pair[i,1]
 876 |       y= group_pair[i,2]
 877 |       cat(i, nrow(group_pair), x,' vs ', y,"\r")
 878 |       g1 = cl.group %>% filter(group==x) %>% pull(cl)
 879 |       g2 = cl.group %>% filter(group==y) %>% pull(cl)
 880 |       result=select_markers_pair_group_top_ds(g1,g2,ds=ds, n.markers=n.markers, genes=genes,cl.bin=cl.bin,select.sign=c("up","down"),...)
 881 |       list(c(result$up.genes, result$down.genes))
 882 |     }
 883 |     group.markers=unique(unlist(group.markers))   
 884 |     pairs = as.data.frame(create_pairs(cl.group$cl,direction="directional"))
 885 |     pairs$pair = row.names(pairs)
 886 |     pairs = suppressMessages(pairs %>% left_join(cl.group, by=c("P1"="cl")))
 887 |     pairs = suppressMessages(pairs %>% left_join(cl.group, by=c("P2"="cl")))
 888 |     pairs = pairs %>% filter(group.x!=group.y) 
 889 |    
 890 |     registerDoMC(cores=10)
 891 |     de.checked.num = check_pairs_ds(de.dir, pairs[,c("P1","P2")], genes=group.markers,cl.bin=cl.bin, mc.cores=10, ...)    
 892 |     add.num = suppressMessages(pairs %>% left_join(de.checked.num) %>% mutate(num=n.markers - checked))
 893 |     
 894 |     select.markers=group.markers
 895 |     genes = setdiff(genes, select.markers)
 896 |     #save(group.markers, pairs, add.num, de.dir, genes, cl.bin, file="Debug.rda")
 897 |     more.markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num,  genes=genes, cl.bin=cl.bin, mc.cores=10, byg=byg, ...)    
 898 |     select.markers = c(select.markers, more.markers$select.genes)
 899 |     if (any(is.na(select.markers))) {print("markers include NA")}
 900 |     select.markers = setdiff(select.markers, NA) 
 901 |     if (any(is.na(select.markers))) {print("markers include NA") ; browser() }
 902 |     return(select.markers)
 903 |   }
 904 | 
 905 | #' INFO -- PLEASE ADD --
 906 | #'
 907 | #' @param in.df to_be_added
 908 | #' @param cl.df to_be_added
 909 | #'
 910 | #' @return ???
 911 | #'
 912 | #' @keywords internal
 913 | get_de_genes <- function(ds, cl.bin, cl1, cl2)
 914 |   {
 915 |     bin1 = cl.bin %>% filter(cl==cl1) %>% pull(bin)
 916 |     bin2 = cl.bin %>% filter(cl==cl2) %>% pull(bin)
 917 |     select.genes = ds %>% filter(bin.x==bin1 & P1==cl1 & bin.y==bin2 & P2==cl2 | bin.x==bin2 & P2==cl1 & bin.y==bin1 & P1==cl2) %>% collect()
 918 |     return(select.genes)
 919 |   }
 920 | 
 921 | 
 922 | 
 923 | #' INFO -- PLEASE ADD --
 924 | #'
 925 | #' @param in.df to_be_added
 926 | #' @param cl.df to_be_added
 927 | #'
 928 | #' @return ???
 929 | #'
 930 | #' @keywords internal
 931 | 
 932 | prep_parquet_de_all_pairs <- function (norm.dat, cl, cl.bin = NULL, cl.bin.size = 100, de.param = de_param(),
 933 |     method = "fast_limma", mc.cores = 1, pairs.fn = "pairs.parquet", cl.bin.fn="cl.bin.rda", ...)
 934 | {
 935 |     cn <- as.character(sort(unique(cl)))
 936 |     pairs = create_pairs(cn)
 937 |     pairs = as.data.frame(pairs)
 938 | 
 939 |     pairs$pair = row.names(pairs)
 940 |     pairs$pair_id = 1:nrow(pairs)
 941 |     if (is.null(cl.bin)) {
 942 |        cl.bin.size = min(100, length(cn)/mc.cores)
 943 |        cl.bin = data.frame(cl = cn, bin = ceiling((1:length(cn)/cl.bin.size)))
 944 |     }
 945 |     library(arrow)
 946 |     write_parquet(pairs, sink = pairs.fn)
 947 |     save(cl.bin, file=cl.bin.fn) 
 948 |     de.result = prep_parquet_de_selected_pairs(norm.dat, cl = cl, pairs = pairs,
 949 |         cl.bin = cl.bin, de.param = de.param, method = method,
 950 |         mc.cores = mc.cores, ...)
 951 |     #de.result = de_selected_pairs(norm.dat, cl = cl, pairs = pairs,
 952 |     #    de.param = de.param, method = method, mc.cores = mc.cores, ...)
 953 |     return(de.result$de.genes)
 954 | }
 955 | 
 956 | #' INFO -- PLEASE ADD --
 957 | #'
 958 | #' @param in.df to_be_added
 959 | #' @param cl.df to_be_added
 960 | #'
 961 | #' @return ???
 962 | #'
 963 | #' @keywords internal
 964 | prep_parquet_de_selected_pairs <- function (norm.dat, cl, pairs, cl.bin = NULL, cl.size = NULL,
 965 |     de.param = de_parm(), method = "fast_limma", cl.means = NULL,
 966 |     cl.present = NULL, cl.sqr.means = NULL, use.voom = FALSE,
 967 |     counts = NULL, mc.cores = 1, out.dir = NULL, summary.dir = NULL,
 968 |     top.n = 500, overwrite = FALSE, return.df = FALSE, return.summary = !is.null(summary.dir))
 969 | {
 970 |     library(arrow)
 971 |     method <- match.arg(method, choices = c("fast_limma", "limma",
 972 |         "chisq", "t.test"))
 973 |     require(parallel)
 974 |     if (use.voom & is.null(counts)) {
 975 |         stop("The use.voom = TRUE parameter requires a raw count matrix via the counts parameter.")
 976 |     }
 977 |     if (is.null(cl.size)) {
 978 |         cl.size <- table(cl)
 979 |         cl.size = setNames(as.integer(cl.size), names(cl.size))
 980 |     }
 981 |     pairs.fn = NULL
 982 |     if (length(pairs) == 1) {
 983 |         pairs.fn = pairs
 984 |         pairs = open_dataset(pairs.fn)
 985 |     }
 986 |     else {
 987 |         pairs = as.data.frame(pairs)
 988 |         if (is.null(pairs$pair)) {
 989 |             pairs$pair = row.names(pairs)
 990 |         }
 991 |         if (is.null(pairs$pair_id)) {
 992 |             pairs$pair_id = 1:nrow(pairs)
 993 |         }
 994 |     }
 995 | 
 996 |     select.cl <- unique(c(pairs %>% pull(P1), pairs %>% pull(P2)))
 997 |     select.cl <- intersect(select.cl, names(cl.size)[cl.size >=
 998 |         de.param$min.cells])
 999 |     cl <- cl[cl %in% select.cl]
1000 |     if (is.factor(cl)) {
1001 |         cl = droplevels(cl)
1002 |     }
1003 |     if (is.null(pairs$bin.x)) {
1004 |         if (is.null(cl.bin)) {
1005 |             cl.bin.size = min(100, length(select.cl)/mc.cores)
1006 |             cl.bin = data.frame(cl = select.cl, bin = ceiling((1:length(select.cl)/cl.bin.size)))
1007 |         }
1008 |         pairs = pairs %>% left_join(cl.bin, by = c(P1 = "cl")) %>%
1009 |             left_join(cl.bin, by = c(P2 = "cl"))
1010 |     }
1011 |     pairs = pairs %>% filter(P1 %in% select.cl & P2 %in% select.cl) %>%
1012 |         collect()
1013 |     cl.size = cl.size[select.cl]
1014 |     if (is.null(cl.means)) {
1015 |         cl.means <- as.data.frame(get_cl_means(norm.dat, cl))
1016 |     }
1017 |     else {
1018 |         cl.means <- as.data.frame(cl.means)
1019 |     }
1020 |     if (is.null(cl.present)) {
1021 |         cl.present <- as.data.frame(get_cl_present(norm.dat,
1022 |             cl, de.param$low.th))
1023 |     }
1024 |     else {
1025 |         cl.present <- as.data.frame(cl.present)
1026 |     }
1027 |     if (is.null(cl.sqr.means)) {
1028 |         cl.sqr.means <- as.data.frame(get_cl_sqr_means(norm.dat,
1029 |             cl))
1030 |     }
1031 |     else {
1032 |         cl.sqr.means <- as.data.frame(cl.sqr.means)
1033 |     }
1034 |     if (method == "limma") {
1035 |         require("limma")
1036 |         norm.dat <- as.matrix(norm.dat[, names(cl)])
1037 |         cl <- setNames(as.factor(paste0("cl", cl)), names(cl))
1038 |         design <- model.matrix(~0 + cl)
1039 |         colnames(design) <- levels(as.factor(cl))
1040 |         if (use.voom & !is.null(counts)) {
1041 |             v <- limma::voom(counts = as.matrix(counts[row.names(norm.dat),
1042 |                 names(cl)]), design = design)
1043 |             fit <- limma::lmFit(object = v, design = design)
1044 |         }
1045 |         else {
1046 |             fit <- limma::lmFit(object = norm.dat[, names(cl)],
1047 |                 design = design)
1048 |         }
1049 |     }
1050 |     else if (method == "fast_limma") {
1051 |         fit = simple_lmFit(norm.dat, cl = cl, cl.means = cl.means,
1052 |             cl.sqr.means = cl.sqr.means)
1053 |     }
1054 |     else if (method == "t.test") {
1055 |         cl.vars <- as.data.frame(get_cl_vars(norm.dat, cl, cl.means = cl.means))
1056 |     }
1057 |     require(doMC)
1058 |     require(foreach)
1059 |     mc.cores = min(mc.cores, ceiling(nrow(pairs)/5000))
1060 |     registerDoMC(cores = mc.cores)
1061 |     de_combine <- function(result.1, result.2) {
1062 |         library(data.table)
1063 |         de.genes = c(result.1$de.genes, result.2$de.genes)
1064 |         if (!is.null(result.1$de.summary)) {
1065 |             de.summary = rbindlist(result.1$de.summary, result.2$de.summary)
1066 |             return(list(de.genes = de.genes, de.summary = de.summary))
1067 |         }
1068 |         else {
1069 |             return(list(de.genes = de.genes))
1070 |         }
1071 |     }
1072 |     if (!is.null(out.dir)) {
1073 |         if (!dir.exists(out.dir)) {
1074 |             dir.create(out.dir)
1075 |         }
1076 |     }
1077 |     if (!is.null(summary.dir)) {
1078 |         if (!dir.exists(summary.dir)) {
1079 |             dir.create(summary.dir)
1080 |         }
1081 |     }
1082 |     all.bins = sort(unique(c(pairs$bin.x, pairs$bin.y)))
1083 |     #print("check. all.bins") ; browser()
1084 |     de_list = foreach(bin1 = 1:length(all.bins), .combine = "c") %:%
1085 |         foreach(bin2 = bin1:length(all.bins), .combine = "c") %dopar%
1086 |        {
1087 |             x = all.bins[bin1]
1088 |             y = all.bins[bin2]
1089 |             if (!overwrite & !is.null(out.dir)) {
1090 |                 if (file.exists(file.path(out.dir, paste0("bin.x=",
1091 |                   x), paste0("bin.y=", y)))) {
1092 |                   return(list(de.genes = NULL, de.summary = NULL))
1093 |                 }
1094 |             }
1095 |             library(dplyr)
1096 |             library(arrow)
1097 |             library(data.table)
1098 |             tmp.pairs = pairs %>% filter(bin.x == x & bin.y ==
1099 |                 y | bin.x == y & bin.y == x)
1100 |             if (is.null(tmp.pairs) | nrow(tmp.pairs) == 0) {
1101 |                 return(list(de.genes = NULL, de.summary = NULL))
1102 |             }
1103 |             de.genes = sapply(1:nrow(tmp.pairs), function(i) {
1104 |                 pair = unlist(tmp.pairs[i, c("P1", "P2")])
1105 |                 if (method == "limma") {
1106 |                   require("limma")
1107 |                   df = de_pair_limma(pair = pair, cl.present = cl.present,
1108 |                     cl.means = cl.means, design = design, fit = fit)
1109 |                 }
1110 |                 else if (method == "fast_limma") {
1111 |                   df = de_pair_fast_limma(pair = pair, cl.present = cl.present,
1112 |                     cl.means = cl.means, fit = fit)
1113 |                 }
1114 |                 else if (method == "t.test") {
1115 |                   df = de_pair_t.test(pair = pair, cl.present = cl.present,
1116 |                     cl.means = cl.means, cl.vars = cl.vars, cl.size = cl.size)
1117 |                 }
1118 |                 else if (method == "chisq") {
1119 |                   df = de_pair_chisq(pair = pair, cl.present = cl.present,
1120 |                     cl.means = cl.means, cl.size = cl.size)
1121 |                 }
1122 |                 if (!is.null(de.param$min.cells)) {
1123 |                   cl.size1 <- cl.size[as.character(pair[1])]
1124 |                   cl.size2 <- cl.size[as.character(pair[2])]
1125 |                 }
1126 |                 else {
1127 |                  cl.size1 <- NULL
1128 |                  cl.size2 <- NULL
1129 |                 }
1130 |                 stats = de_stats_pair(df, de.param = de.param,
1131 |                   cl.size1, cl.size2, return.df = return.df)
1132 |             }, simplify = F)
1133 |             pair = tmp.pairs %>% pull(pair)
1134 |             names(de.genes) = pair
1135 |             if (return.summary) {
1136 |                 de.summary = scrattch.bigcat::de_pair_summary(de.genes, cl.bin=cl.bin, out.dir = summary.dir,
1137 |                   pairs = tmp.pairs, return.df = is.null(summary.dir))
1138 |             }
1139 |             if (!is.null(out.dir)) {
1140 |                 cat("Export", bin1, bin2, "\n")
1141 |                 result = scrattch.bigcat::export_de_genes(de.genes, cl.means,
1142 |                   out.dir = out.dir, pairs = tmp.pairs,
1143 |                   mc.cores = 1, top.n = top.n)
1144 |                 cat("Finish Export", x, y, "\n")
1145 |             }
1146 |             out = list(de.genes = de.genes, de.summary = de.summary)
1147 |           
1148 |             de.summary = NULL
1149 |             de.genes = NULL
1150 |             out
1151 |         }
1152 | 
1153 |     de.genes = do.call("c", de_list[names(de_list) == "de.genes"])
1154 |     names(de.genes) = gsub("de.genes.", "", names(de.genes))
1155 |     de.summary = do.call("c", de_list[names(de_list) == "de.summary"])
1156 |     if (is.null(de.summary)) {
1157 |         return(de.genes)
1158 |     }
1159 |     return(list(de.genes, de.summary))
1160 | }
1161 | 
1162 | 
1163 | 


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/broad.roi.key.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/broad.roi.key.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/col.age.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/col.age.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/col.broi.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/col.broi.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/genotype.genes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/genotype.genes.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/sex.genes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/sex.genes.rda


--------------------------------------------------------------------------------
/analysis_scripts/r_objects/supertype.exclude.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/supertype.exclude.rda


--------------------------------------------------------------------------------