├── README.md └── analysis_scripts ├── 01_prelim.filters.R ├── 02_prelim.cluster.R ├── 03_filter_from_prelim_cluster.R ├── 04_cluster.R ├── 05_label_transfer_of_aged_cells.R ├── 06_annotate_and_filter_from_second_cluster.R ├── 07_UMAP.R ├── 08a_subcluster_immune.R ├── 08b_subcluster_tanycytes_ependymal.R ├── 09a_degenes_age_mast_subclass.R ├── 09b_degenes_age_mast_supertype_nn.R ├── 10a_gsea_gprofiler_subclass.R ├── 10b_gsea_gprofiler_nnsupertype.R ├── 11_GO_matrix_gprofiler.R ├── 12_augur.R ├── 13_pb_edgeR.R ├── functions ├── degenes_age_mast_simplified_qcscore.R ├── run_umap.py └── scrattch.mapping │ ├── HANN.R │ ├── HANN_build.R │ ├── HANN_prepareTaxonomy.R │ └── HANN_utils.R └── r_objects ├── ABC_MWB_taxonomy ├── AIT13.0_mouse │ └── cl.clean.rda └── AIT21.0_mouse │ └── cl.clean.rda ├── broad.roi.key.rda ├── col.age.rda ├── col.broi.rda ├── genotype.genes.rda ├── sex.genes.rda └── supertype.exclude.rda /README.md: -------------------------------------------------------------------------------- 1 | # mouse_aging_scRNAseq 2 | Analysis scripts related to Jin et al manuscript on scRNA seq in aging mouse brain 3 | -------------------------------------------------------------------------------- /analysis_scripts/01_prelim.filters.R: -------------------------------------------------------------------------------- 1 | library(scrattch.hicat) 2 | library(dplyr) 3 | 4 | ## Description/Goal: 5 | ## Filter data at the single-cell level using loose filter criteria 6 | 7 | ################################################# 8 | ## Load data files 9 | ################################################# 10 | load("samp.dat_bothages_qcscore_20220705.rda") ## loading in cell metadata file 11 | 12 | ## Filter by very loose criteria 13 | qc.cut = 50 ## qc score cutoff 14 | gc.cut = 1000 ## gene detection cutoff 15 | doub.cut = 0.3 ## doublet score cutoff 16 | 17 | samp.dat.filtered = samp.dat[samp.dat$gene.counts.0 > gc.cut & 18 | samp.dat$qc.score > qc.cut & 19 | samp.dat$doublet_score < 0.3,] 20 | 21 | save(samp.dat.filtered, file = "samp.dat.filtered_prelim.20220705.rda") ## save new filtered meta-data object for later -------------------------------------------------------------------------------- /analysis_scripts/02_prelim.cluster.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Perform first round of clustering 3 | 4 | 5 | library(dplyr) 6 | library(scrattch.hicat) 7 | library(scrattch.bigcat) 8 | library(bigstatsr) 9 | library(Matrix) 10 | library(RcppParallel) 11 | library(matrixStats) 12 | 13 | 14 | ################################################# 15 | ## Load data files 16 | ################################################# 17 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## cell by gene file-backed matrix (see github.com/alleninstitute/scrattch.bigcat for examples for assembly) 18 | load("samp.dat.filtered_prelim.20220705.rda") ## filtered metadata file from previous step 19 | 20 | select.cells = samp.dat.filtered$sample_id 21 | 22 | 23 | ################################################# 24 | ## First clustering round 25 | ################################################# 26 | ## Run clustering 27 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50) ## Looser settings 28 | 29 | set.seed(123) 30 | resulti = iter_clust_big(big.dat = big.dat, select.cells = select.cells, de.param =de.param, prefix="./prelim_cluster/cluster_0220705", 31 | max.cl.size=200, split.size = 50, verbose=1, sampleSize=10000) 32 | 33 | save(resulti, file="./prelim_cluster/resulti_cluster_0220705.rda") 34 | 35 | cl = resulti$cl ## de novo clusters 36 | markers = resulti$markers ## marker genes for all clusters 37 | 38 | ## Subsample results and merge 39 | sampled.cells = sample_cells(cl, 100) 40 | sampled.cells = sample(sampled.cells, 200000, replace = F) 41 | norm.dat = get_logNormal(big.dat, sampled.cells) 42 | select.markers = intersect(markers, big.dat$row_id) 43 | 44 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50) 45 | merge.result = merge_cl(norm.dat=norm.dat, cl=cl, rd.dat.t=norm.dat[select.markers,], de.param=de.param, verbose=TRUE) 46 | 47 | 48 | 49 | ################################################# 50 | ## Save objects for future use 51 | ################################################# 52 | save(merge.result, file="merge.result_th300_cluster_0220705.rda") 53 | save(norm.dat, file = "norm.dat_cl100_ss200Kcells.rda") 54 | -------------------------------------------------------------------------------- /analysis_scripts/03_filter_from_prelim_cluster.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Use preliminary clustering results to perform additional round of filtering of clusters 3 | ## before second and final round of clustering 4 | 5 | ## Note: 6 | ## This script uses an older version of ABC-MWB annotations (AIT13.0) - we recommend using 7 | ## the latest version that has been published in Yao, 2023 Nature, but are including the older 8 | ## version here for transparancy 9 | 10 | 11 | library(dplyr) 12 | library(scrattch.hicat) 13 | options(stringsAsFactors = F) 14 | 15 | 16 | ################################################# 17 | ## Load data files 18 | ################################################# 19 | ## Load objects from prior scripts 20 | load("./prelim_cluster_objects/merge.result_th300_cluster_0220705.rda") 21 | load("samp.dat.filtered_prelim.20220705.rda") 22 | 23 | ## Load annotations that already exist for adult cells (these were an older version of ABC-MWB annotations) 24 | load("r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda") ## loads objects cl.clean and cl.df.clean 25 | 26 | 27 | ################################################# 28 | ## Combine adult annotations with preliminary cluster results 29 | ################################################# 30 | ## Subset ABC-MWB results and label missing cells (assumed to be junk) 31 | roi.keep = unique(samp.dat.filtered$roi) 32 | 33 | cells.anno = intersect(samp.dat.filtered$sample_id, names(cl.clean)) 34 | cells.adult = samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "adult"] 35 | cells.junk = setdiff(cells.adult, cells.anno) 36 | 37 | anno.df.u19 = data.frame(sample_id = c(cells.anno, cells.junk)) 38 | anno.df.u19$cl.u19 = cl.clean[anno.df.u19$sample_id] 39 | anno.df.u19$cl.u19[is.na(anno.df.u19$cl.u19)] = "missing.or.junk" 40 | 41 | head(anno.df.u19) 42 | tail(anno.df.u19) 43 | 44 | anno.df.u19 = left_join(anno.df.u19, cl.df.clean[,c("cl", "cluster_label", "Level1_label", "Level2_label", "subclass_label", "class_label")], by = c("cl.u19" = "cl")) 45 | anno.df.u19$cluster_label = as.character(anno.df.u19$cluster_label) 46 | anno.df.u19[is.na(anno.df.u19)] = "missing.or.junk" 47 | 48 | head(anno.df.u19) 49 | tail(anno.df.u19) 50 | 51 | 52 | ## Add cl to my data 53 | row.names(samp.dat.filtered) = samp.dat.filtered$sample_id 54 | cl = merge.result$cl 55 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id] 56 | 57 | ## Add U19 labels 58 | samp.dat.filtered = left_join(samp.dat.filtered, anno.df.u19) 59 | 60 | 61 | cl.df = samp.dat.filtered %>% group_by(cl) %>% summarize(ncell = n(), ## this object summarizes many QC stats for each de novo cluster and will be used to visualize and determine filter criteria 62 | med.gc = median(gene.counts.0), 63 | med.qc = median(qc.score), 64 | med.doub = median(doublet_score), 65 | age_prop = sum(age_cat == "aged") / n(), 66 | adult_prop = sum(age_cat == "adult") / n(), 67 | U19_id_prop = 1 - (sum(is.na(cluster_label)) / n()), 68 | max.cluster.U19 = names(which.max(table(cluster_label))), 69 | max.cluster.U19.prop = (max(table(cluster_label)) / n()), 70 | max.subclass.U19 = names(which.max(table(subclass_label))), 71 | max.subclass.U19.prop = (max(table(subclass_label)) / n()), 72 | max.class.U19 = names(which.max(table(class_label))), 73 | max.class.U19.prop = (max(table(class_label)) / n()), 74 | max.level1.U19 = names(which.max(table(Level1_label))), 75 | max.level1.U19.prop = (max(table(Level1_label)) / n()), 76 | max.level2.U19 = names(which.max(table(Level2_label))), 77 | max.level2.U19.prop = (max(table(Level2_label)) / n()), 78 | max.donor.prop = (max(table(library_prep)) / n())) 79 | 80 | 81 | ################################################# 82 | ## Vis QC metrics 83 | ################################################# 84 | library(gridExtra) 85 | library(ggplot2) 86 | 87 | ## Gene counts 88 | p2 = cl.df %>% 89 | ggplot(aes(x = med.gc))+ 90 | geom_histogram(bins = 50)+ 91 | scale_x_log10()+ 92 | geom_vline(xintercept = 2000)+ 93 | geom_vline(xintercept = 3000, color = "red")+ 94 | geom_vline(xintercept = 5500, color = "blue")+ 95 | ggtitle("Median Gene Counts")+ 96 | facet_wrap(~max.class.U19, ncol = 2, scales = "free_y")+ 97 | theme_bw() 98 | 99 | p1 = cl.df %>% 100 | ggplot(aes(x = med.gc))+ 101 | geom_histogram(bins = 50)+ 102 | scale_x_log10()+ 103 | geom_vline(xintercept = 2000)+ 104 | geom_vline(xintercept = 3000, color = "red")+ 105 | geom_vline(xintercept = 5500, color = "blue")+ 106 | ggtitle("Median Gene Counts")+ 107 | facet_wrap(~max.class.U19, ncol = 2)+ 108 | theme_bw() 109 | 110 | grid.arrange(p1, p2, nrow = 1) 111 | 112 | ## QC Score 113 | p2 = cl.df %>% 114 | ggplot(aes(x = med.qc))+ 115 | geom_histogram(bins = 50)+ 116 | # scale_x_log10()+ 117 | geom_vline(xintercept = 150)+ 118 | geom_vline(xintercept = 300, color = "red")+ 119 | ggtitle("Median QC Score")+ 120 | facet_wrap(~max.class.U19, ncol = 2, scales = "free_y")+ 121 | theme_bw() 122 | 123 | p1 = cl.df %>% 124 | ggplot(aes(x = med.qc))+ 125 | geom_histogram(bins = 50)+ 126 | # scale_x_log10()+ 127 | geom_vline(xintercept = 150)+ 128 | geom_vline(xintercept = 300, color = "red")+ 129 | ggtitle("Median QC Score")+ 130 | facet_wrap(~max.class.U19, ncol = 2)+ 131 | theme_bw() 132 | 133 | grid.arrange(p1, p2, nrow = 1) 134 | 135 | 136 | ################################################# 137 | ## Label LQ clusters 138 | ################################################# 139 | ## Cluster-level QC and GC cutoffs for neurons versus non-neurons 140 | tmp1 = cl.df$cl[cl.df$max.class.U19 %in% c("NN", "IMN") & (cl.df$med.gc < 2000 | cl.df$med.qc < 100)] 141 | tmp.neurons = c("Gaba", "Glut", "Hybrid", "Sero", "Dopa") 142 | tmp2 = cl.df$cl[cl.df$max.class.U19 %in% tmp.neurons & (cl.df$med.gc < 3000 | cl.df$med.qc < 250)] 143 | cl.lq.qc = unique(c(tmp1, tmp2)) 144 | 145 | ## Clusters that are mostly mapping to "junk" 146 | cl.lq.junk = cl.df$cl[cl.df$max.cluster.U19 == "missing.or.junk" & cl.df$U19_id_prop > 0.05] 147 | 148 | ## Clusters that are biased towards a single donor 149 | cl.lq.donor = cl.df$cl[cl.df$max.donor.prop > 0.8] 150 | 151 | ## Other manually annotated LQ clusters: 152 | cl.lq.manual = c( 153 | 7051, ## Oligo cluster with endo markers 154 | 7191, 7193, ## Astro + Microglia doublets 155 | 7214, 7215, ## Astro + Endo doublets 156 | 595 ## Astro + Gluta doublet (Slc17a7) 157 | ) 158 | 159 | 160 | cl.lq = unique(c(cl.lq.qc, cl.lq.junk, cl.lq.donor, cl.lq.manual)) 161 | 162 | cl.df$keep = T 163 | cl.df$keep[cl.df$cl %in% cl.lq] = F 164 | 165 | sum(cl.df$keep) ## number of remaining clusters 166 | sum(cl.df$ncell) ## number of starting cells 167 | sum(cl.df$ncell[cl.df$keep == T]) ## number of cells remaining after new filters 168 | 169 | save(cl.df, file = "./prelim_cluster_objects/cl.df_U19.max.2022-09-15.rda") ## save cl.df object for later 170 | 171 | 172 | ################################################# 173 | ## Filter LQ clusters 174 | ################################################# 175 | ## Filter 176 | cl = merge.result$cl 177 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id] 178 | samp.dat.filtered = left_join(samp.dat.filtered, cl.df) 179 | 180 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$keep == T,] 181 | 182 | save(anno.df.clean, file = "anno.df.clean_20220915.rda") 183 | 184 | 185 | 186 | 187 | ## Create anno.df (remove weird geno/donors) 188 | weird.donor = "483935" 189 | weird.geno = c("Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt", "CX3CR1-GFP/CX3CR1-GFP") 190 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$external_donor_name != weird.donor,] 191 | anno.df.clean = anno.df.clean[!anno.df.clean$full_genotype %in% weird.geno,] 192 | row.names(anno.df.clean) = anno.df.clean$sample_id 193 | save(anno.df.clean, file = "anno.df.clean_20220915.rda") 194 | -------------------------------------------------------------------------------- /analysis_scripts/04_cluster.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Perform final round of clustering 3 | 4 | 5 | library(dplyr) 6 | library(scrattch.hicat) 7 | library(scrattch.bigcat) 8 | library(bigstatsr) 9 | library(Matrix) 10 | library(RcppParallel) 11 | library(matrixStats) 12 | 13 | 14 | ################################################# 15 | ## Load data 16 | ################################################# 17 | load("big.dat_Aging.16ROI.V3.20220701.rda") 18 | load("anno.df.clean_20220915.rda") ## most recent metadata object with filtered cells 19 | 20 | select.cells = anno.df.clean$sample_id 21 | 22 | 23 | ################################################# 24 | ## Second clustering round 25 | ################################################# 26 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50) 27 | 28 | set.seed(123) 29 | resulti = iter_clust_big(big.dat = big.dat, select.cells = select.cells, de.param =de.param, prefix="./cluster_0916/cluster_intermed_files", 30 | max.cl.size=200, split.size = 50, verbose=1, sampleSize=10000) 31 | 32 | save(resulti, file=paste0("./cluster_0916/resulti_cluster.rda")) 33 | 34 | 35 | ## Merge clusters 36 | load("./cluster_0916/resulti_cluster.rda") 37 | cl = resulti$cl 38 | markers = resulti$markers 39 | 40 | sampled.cells = sample_cells(cl, 100) 41 | # sampled.cells = sample(sampled.cells, 200000, replace = F) 42 | norm.dat = get_logNormal(big.dat, sampled.cells) 43 | select.markers = intersect(markers, big.dat$row_id) 44 | 45 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=100) 46 | merge.result = merge_cl(norm.dat=norm.dat, cl=cl, rd.dat.t=norm.dat[select.markers,], de.param=de.param, verbose=TRUE) 47 | 48 | 49 | 50 | ################################################# 51 | ## Save objects for future use 52 | ################################################# 53 | save(merge.result, file= "./cluster_0916/merge.result_th300_mincells100.rda") ## clustering results 54 | save(norm.dat, file = "./cluster_0916/norm.dat_cluster0916_cl100.rda") ## subsampled cell-by-gene matrix 55 | -------------------------------------------------------------------------------- /analysis_scripts/05_label_transfer_of_aged_cells.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Performing label tansfer for all aged cells from ABC-WMB taxonomy (AIT21.0) 3 | 4 | ## Note: 5 | ## This script was run on an HPC job scheduler, and prior to the publishing of scrattch.mapping package 6 | ## We recomment installing github.com/alleninstitute/scrattch.mapping package to perform 7 | ## label transfer, but are including original scripts here for the sake of transparency 8 | 9 | #################################################################### 10 | ## These arguments were used for indexing on an HPC job scheduler ## 11 | args = commandArgs(TRUE) 12 | array.index = args[1] 13 | array.index = as.integer(array.index) 14 | #################################################################### 15 | 16 | 17 | ## These functions have since been implemented in github.com/alleninstitute/scrattch-mapping 18 | ## but are also included in github repo for this project 19 | source("functions/scrattch.mapping/HANN.R") 20 | source("functions/scrattch.mapping/HANN_build.R") 21 | source("functions/scrattch.mapping/HANN_utils.R") 22 | source("functions/scrattch.mapping/HANN_prepareTaxonomy.R") 23 | 24 | 25 | ################################################# 26 | ## Load data files 27 | ################################################# 28 | library(scrattch.bigcat) 29 | library(openblasctl) 30 | openblas_set_num_threads(1) 31 | 32 | load("big.dat_Aging.16ROI.V3.20231208.rda") 33 | load("samp.dat_bothages_qcscore_20231208.rda") 34 | 35 | ## Select cells to keep 36 | keep.cells = samp.dat$sample_id[samp.dat$age_cat == "aged"] ## Only select aged cells 37 | 38 | 39 | ################################################# 40 | ## Run mapping 41 | ################################################# 42 | library(dplyr) 43 | library(scrattch.bigcat) 44 | library(Matrix) 45 | library(data.table) 46 | library(arrow) 47 | 48 | ## Breaking the data into parts for parallel processing on HPC 49 | chunksize = 50000 50 | start = seq(1,length(keep.cells), chunksize) 51 | end = c(start[-1] - 1, length(keep.cells)) 52 | print(c(array.index, start[array.index], end[array.index])) 53 | 54 | 55 | ## Perform label transfer 56 | select.cells = keep.cells[c(start[array.index]:end[array.index])] 57 | 58 | print("Get norm.dat...") 59 | qdat = get_cols(big.dat, select.cells) 60 | 61 | print("Start mapping...") 62 | mapped = run_mapping_on_taxonomy( 63 | qdat, # qdat : log normalized count matrix (gene x cell) 64 | Taxonomy='AIT21.0_mouse', # taxonomy 65 | TaxHome='/allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/Taxonomies/', 66 | prefix='10X_cells_v3', # prefix for your data platform (SS, 10X_cells_v2, 10X_cells_v3, 10X_nuclei_v3,MFISH) 67 | prebuild=FALSE, # if the taxonomy is available for your platform, use it. 68 | newbuild=FALSE, # 69 | mapping.method='flat', # 'flat','hierarchy' (AIT12.0_mouse: only flat) 70 | nlevel=2, 71 | mc.cores=10, # lower this to 5 if the run fails due to the memory 72 | iter=100, 73 | blocksize=5000) 74 | 75 | fn.tmp = paste0("mapping_aged_fmap_AIT21.0_allcells/map.results_part", array.index, "of", length(start), ".rds") 76 | print(paste0("Saving as ", fn.tmp, "...")) 77 | saveRDS(mapped, file = fn.tmp) 78 | 79 | 80 | ################################################# 81 | ## Assemble mapping results 82 | ################################################# 83 | fmap.files = list.files("./mapping_aged_fmap_AIT21.0/", pattern = "*.rds", full.names = T) 84 | fmap.result = c() 85 | 86 | for(i in fmap.files){ 87 | 88 | tmp.fmap = readRDS(i) 89 | fmap.result = rbind(fmap.result, tmp.fmap$best.map.df) 90 | 91 | } 92 | 93 | head(fmap.result) 94 | names(fmap.result)[-1] = paste0(names(fmap.result)[-1], "_AIT21.0") 95 | ref.cl.df = tmp.fmap$cl.df 96 | 97 | saveRDS(fmap.result, file = "mapping_aged_fmap_AIT21.0/map.result_bestmap.allcells.20230915.rds") ## Mapping results 98 | save(ref.cl.df, file = "mapping_aged_fmap_AIT21.0/ref.cl.df.rda") ## Saves the annotation table for the reference that was used (ABC-WMB taxonomy) 99 | 100 | -------------------------------------------------------------------------------- /analysis_scripts/06_annotate_and_filter_from_second_cluster.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Annotate and filter based on latest clustering 3 | 4 | library(plyr) 5 | library(dplyr) 6 | library(scrattch.hicat) 7 | options(stringsAsFactors = F) 8 | 9 | 10 | ################################################# 11 | ## Load data 12 | ################################################# 13 | ## Load annotations that already exist from ABC-MWB 14 | load("r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda") 15 | remove(cl.df.clean) 16 | load("mapping_aged_fmap_AIT21.0_allcells/ref.cl.df.rda") 17 | 18 | ## Load clustering results 19 | load("./cluster_0916/merge.result_th300_mincells100.rda") 20 | load("samp.dat_bothages_qcscore_20231208.rda") 21 | 22 | cl = merge.result$cl 23 | samp.dat.filtered = samp.dat[samp.dat$sample_id %in% names(cl),] 24 | 25 | 26 | 27 | ################################################# 28 | ## Combine adult annotations with cluster results 29 | ################################################# 30 | ## Subset ABC-MWB results and label missing cells (assumed to be junk) 31 | roi.keep = unique(samp.dat.filtered$roi) 32 | 33 | cells.anno = intersect(samp.dat.filtered$sample_id, names(cl.clean)) 34 | cells.adult = samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "adult"] 35 | cells.junk = setdiff(cells.adult, cells.anno) 36 | 37 | anno.df.u19 = data.frame(sample_id = c(cells.anno, cells.junk)) 38 | anno.df.u19$cl.u19 = cl.clean[anno.df.u19$sample_id] 39 | anno.df.u19$cl.u19[is.na(anno.df.u19$cl.u19)] = "missing.or.junk" 40 | 41 | head(anno.df.u19) 42 | tail(anno.df.u19) 43 | 44 | ref.cl.df$cl = as.character(ref.cl.df$cl) 45 | anno.df.u19 = left_join(anno.df.u19, ref.cl.df[,c("cl", "cluster_id_label", "supertype_label", "subclass_label", "class_label")], by = c("cl.u19" = "cl")) 46 | anno.df.u19[is.na(anno.df.u19)] = "missing.or.junk" 47 | 48 | head(anno.df.u19) 49 | tail(anno.df.u19) 50 | dim(anno.df.u19) 51 | 52 | 53 | ## Add mapped aged labels 54 | load("mapping_aged_fmap_AIT21.0_allcells//ref.cl.df.rda") 55 | map.result = readRDS("mapping_aged_fmap_AIT21.0_allcells/map.result_bestmap.allcells.20231219.rds") 56 | map.result = left_join(map.result, ref.cl.df[,c("cl", "cluster_id_label", "supertype_label", "subclass_label", "class_label")], by = c("best.cl_AIT21.0" = "cl")) 57 | names(map.result)[names(map.result) == "best.cl._AIT21.0"] = "cl.u19" 58 | common = intersect(names(map.result), names(anno.df.u19)) 59 | map.result = map.result[map.result$sample_id %in% samp.dat.filtered$sample_id[samp.dat.filtered$age_cat == "aged"],] 60 | 61 | anno.df.u19 = rbind(anno.df.u19[,common], map.result[,common]) 62 | 63 | 64 | ## Add cl to my data 65 | row.names(samp.dat.filtered) = samp.dat.filtered$sample_id 66 | samp.dat.filtered$cl = cl[samp.dat.filtered$sample_id] 67 | samp.dat.filtered = samp.dat.filtered[!is.na(samp.dat.filtered$cl),] 68 | 69 | ## Add U19 labels 70 | samp.dat.filtered = left_join(samp.dat.filtered, anno.df.u19) 71 | 72 | cl.df = samp.dat.filtered %>% dplyr::group_by(cl) %>% dplyr::summarize(ncell = n(), 73 | med.gc = median(gene.counts.0), 74 | med.qc = median(qc.score), 75 | med.doub = median(doublet_score), 76 | age_prop = sum(age_cat == "aged") / n(), 77 | adult_prop = sum(age_cat == "adult") / n(), 78 | 79 | max.roi = names(which.max(table(roi))), 80 | max.roi.prop = (max(table(roi)) / n()), 81 | 82 | max.class = names(which.max(table(class_label))), 83 | max.class_2 = names(sort(table(class_label), decreasing = T))[2], 84 | max.class.prop = (max(table(class_label)) / n()), 85 | max.class.prop_2 = sort((table(class_label)) / n(), decreasing = T)[2], 86 | 87 | max.subclass = names(which.max(table(subclass_label))), 88 | max.subclass_2 = names(sort(table(subclass_label), decreasing = T))[2], 89 | max.subclass.prop = (max(table(subclass_label)) / n()), 90 | max.subclass.prop_2 = sort((table(subclass_label)) / n(), decreasing = T)[2], 91 | 92 | max.supertype = names(which.max(table(supertype_label))), 93 | max.supertype_2 = names(sort(table(supertype_label), decreasing = T))[2], 94 | max.supertype.prop = (max(table(supertype_label)) / n()), 95 | max.supertype.prop_2 = sort((table(supertype_label)) / n(), decreasing = T)[2], 96 | 97 | max.cluster = names(which.max(table(cluster_id_label))), 98 | max.cluster_2 = names(sort(table(cluster_id_label), decreasing = T))[2], 99 | max.cluster.prop = (max(table(cluster_id_label)) / n()), 100 | max.cluster.prop_2 = sort((table(cluster_id_label)) / n(), decreasing = T)[2], 101 | 102 | max.lib.prop = (max(table(library_prep)) / n())) 103 | 104 | 105 | 106 | hist(cl.df$max.class.prop) 107 | hist(cl.df$max.subclass.prop) 108 | 109 | 110 | ################################################# 111 | ## Vis QC metrics 112 | ################################################# 113 | library(ggplot2) 114 | library(gridExtra) 115 | 116 | 117 | ## Gene counts 118 | p2 = cl.df %>% 119 | ggplot(aes(x = med.gc))+ 120 | geom_histogram(bins = 50)+ 121 | scale_x_log10()+ 122 | geom_vline(xintercept = 2000)+ 123 | geom_vline(xintercept = 3000, color = "red")+ 124 | geom_vline(xintercept = 5000, color = "blue")+ 125 | geom_vline(xintercept = 5500, color = "dark green")+ 126 | ggtitle("Median Gene Counts")+ 127 | facet_wrap(~max.class, ncol = 2, scales = "free_y")+ 128 | theme_bw() 129 | 130 | p1 = cl.df %>% 131 | ggplot(aes(x = med.gc))+ 132 | geom_histogram(bins = 50)+ 133 | scale_x_log10()+ 134 | geom_vline(xintercept = 2000)+ 135 | geom_vline(xintercept = 3000, color = "red")+ 136 | geom_vline(xintercept = 5000, color = "blue")+ 137 | geom_vline(xintercept = 5500, color = "dark green")+ 138 | ggtitle("Median Gene Counts")+ 139 | facet_wrap(~max.class, ncol = 2)+ 140 | theme_bw() 141 | 142 | grid.arrange(p1, p2, nrow = 1) 143 | 144 | 145 | 146 | ## QC Score 147 | p2 = cl.df %>% 148 | ggplot(aes(x = med.qc))+ 149 | geom_histogram(bins = 50)+ 150 | # scale_x_log10()+ 151 | geom_vline(xintercept = 150)+ 152 | geom_vline(xintercept = 300, color = "red")+ 153 | ggtitle("Median QC Score")+ 154 | facet_wrap(~max.class, ncol = 2, scales = "free_y")+ 155 | theme_bw() 156 | 157 | p1 = cl.df %>% 158 | ggplot(aes(x = med.qc))+ 159 | geom_histogram(bins = 50)+ 160 | # scale_x_log10()+ 161 | geom_vline(xintercept = 150)+ 162 | geom_vline(xintercept = 300, color = "red")+ 163 | ggtitle("Median QC Score")+ 164 | facet_wrap(~max.class, ncol = 2)+ 165 | theme_bw() 166 | 167 | grid.arrange(p1, p2, nrow = 1) 168 | 169 | 170 | 171 | 172 | ################################################# 173 | ## Label LQ clusters 174 | ################################################# 175 | ## Cluster-level QC and GC cutoffs for different classes 176 | tmp = c("Immune", "Vascular", "Astro-Epen") 177 | tmp.nn = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 2000 | cl.df$med.qc < 150)] 178 | tmp = c("OPC-Oligo") 179 | tmp.oligo = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 3000 | cl.df$med.qc < 150)] 180 | tmp = c("OB-CR Glut", "DG-IMN Glut", "OB-IMN GABA") 181 | tmp.imn = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 3000 | cl.df$med.qc < 150)] 182 | tmp = setdiff(unique(ref.cl.df$class_label), c("Immune", "Vascular", "OPC-Oligo", "Astro-Epen", "OB-CR Glut", "DG-IMN Glut", "OB-IMN GABA")) 183 | tmp.neuron = cl.df$cl[cl.df$max.class %in% tmp & (cl.df$med.gc < 5500 | cl.df$med.qc < 300)] 184 | 185 | cl.lq.qc = unique(c(tmp.nn, tmp.imn, tmp.neuron, tmp.oligo)) 186 | 187 | ## Clusters that are mostly mapping to "junk" 188 | cl.lq.junk = cl.df$cl[cl.df$max.subclass == "missing.or.junk" | cl.df$max.supertype == "missing.or.junk"] 189 | 190 | ## Clusters that are biased towards a single donor 191 | cl.lq.donor = cl.df$cl[cl.df$max.donor.prop > 0.8] 192 | 193 | ## Other manually annotated LQ clusters: 194 | cl.lq.manual = c("2108" #mapped to half neurons and half microglia 195 | ) 196 | 197 | ## Exclude off target supertypes 198 | load("r_objects/supertype.exclude.rda") 199 | cl.offtarget = cl.df$cl[cl.df$max.supertype %in% supertype.exclude] 200 | tmp = cl.df[cl.df$cl %in% cl.offtarget,] 201 | 202 | ## Ambiguous mapping 203 | hist(cl.df$max.class.prop) 204 | tmp.cl.df = cl.df[cl.df$max.class.prop < 0.7,] 205 | cl.ambmappingjunk = tmp.cl.df$cl[tmp.cl.df$max.class_2 == "missing.or.junk" & tmp.cl.df$max.class.prop_2 > 0.3] 206 | tmp = cl.df[cl.df$cl %in% unique(c(cl.ambmappingjunk)),] 207 | 208 | cl.lq = unique(c(cl.lq.qc, cl.lq.junk, cl.lq.donor, cl.lq.manual, cl.offtarget, cl.ambmappingjunk)) 209 | 210 | 211 | ################################################# 212 | ## Final filter 213 | ################################################# 214 | cl.df$keep = T 215 | cl.df$keep[cl.df$cl %in% cl.lq] = F 216 | 217 | sum(cl.df$keep) ## number of remaining clusters 218 | sum(cl.df$ncell) ## number of starting cells 219 | sum(cl.df$ncell[cl.df$keep == T]) ## number of cells remaining after new filters 220 | 221 | cl.df.clean = cl.df[cl.df$keep == T,] 222 | 223 | save(cl.df, file = "./cluster_0916/cl.df_v231221.rda") 224 | 225 | ## Assign class 226 | cl.df.clean$class_label = cl.df.clean$max.class 227 | table(cl.df.clean$class_label) 228 | 229 | ## Assign subclass 230 | cl.df.clean$subclass_label = cl.df.clean$max.subclass 231 | table(cl.df.clean$subclass_label) 232 | 233 | ## Assign supertype 234 | cl.df.clean$supertype_label = cl.df.clean$max.supertype 235 | table(cl.df.clean$supertype_label) 236 | 237 | 238 | ################################################# 239 | ## Finding hierarchy discrepencies - correct each manually 240 | ################################################# 241 | load("mapping_aged_fmap_AIT21.0_allcells//ref.cl.df.rda") 242 | tmp = unique(cl.df.clean[,c("class_label", "subclass_label", "supertype_label")]) 243 | tmp.ref = unique(ref.cl.df[,c("class_label", "subclass_label", "supertype_label")]) 244 | setdiff(tmp, tmp.ref) 245 | 246 | ##--> cl292 247 | cl.df.clean$class_label[cl.df.clean$cl == 292] = "MB Glut" #changing to MB Glut because its similar in prop to former (P Glut) and max ROI is also MB 248 | ##--> cl502 249 | cl.df.clean$supertype_label[cl.df.clean$cl == 502] = "RHP-COA Ndnf Gaba_6" #changing to top supertype of the matching subclass 250 | ##--> cl629 251 | cl.df.clean$supertype_label[cl.df.clean$cl == 629] = "STN-PSTN Pitx2 Glut_2" #changing to top supertype of the matching subclass 252 | ##--> cl3678 253 | cl.df.clean$supertype_label[cl.df.clean$cl == 3678] = "L4/5 IT CTX Glut_1" #changing to top supertype of the matching subclass 254 | ##--> cl3415 255 | cl.df.clean$supertype_label[cl.df.clean$cl == 3415] = "OB-STR-CTX Inh IMN_1" #changing to top supertype of the matching subclass; all aged cells in this cluster are also mapping to OB rather than DIR 256 | 257 | tmp = unique(cl.df.clean[,c("class_label", "subclass_label", "supertype_label")]) 258 | tmp.ref = unique(ref.cl.df[,c("class_label", "subclass_label", "supertype_label")]) 259 | setdiff(tmp, tmp.ref) #should be empty now 260 | 261 | ## Correcting order 262 | class_label = intersect(unique(ref.cl.df$class_label), unique(cl.df.clean$class_label)) 263 | class_id = 1:length(class_label) 264 | class.df = data.frame(class_label = class_label, class_id = class_id) 265 | 266 | subclass_label = intersect(unique(ref.cl.df$subclass_label), unique(cl.df.clean$subclass_label)) 267 | subclass_id = 1:length(subclass_label) 268 | subclass.df = data.frame(subclass_label = subclass_label, subclass_id = subclass_id) 269 | 270 | supertype_label = intersect(unique(ref.cl.df$supertype_label), unique(cl.df.clean$supertype_label)) 271 | supertype_id = 1:length(supertype_label) 272 | supertype.df = data.frame(supertype_label = supertype_label, supertype_id = supertype_id) 273 | 274 | cl.df.clean = left_join(cl.df.clean, class.df) 275 | cl.df.clean = left_join(cl.df.clean, subclass.df) 276 | cl.df.clean = left_join(cl.df.clean, supertype.df) 277 | 278 | 279 | ## Add colors 280 | class.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_class_colors.csv") 281 | subclass.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_subclass_colors.csv") 282 | supertype.colors = read.csv("mapping_aged_fmap_AIT21.0/ait21_supertype_colors.csv") 283 | 284 | cl.df.clean = left_join(cl.df.clean, class.colors) 285 | cl.df.clean = left_join(cl.df.clean, subclass.colors) 286 | cl.df.clean = left_join(cl.df.clean, supertype.colors) 287 | 288 | 289 | ## Update cl numbers 290 | cl.df.clean = cl.df.clean[order(cl.df.clean$supertype_id, cl.df.clean$subclass_id, cl.df.clean$class_id),] 291 | cl.df.clean$cl_0916 = cl.df.clean$cl 292 | cl.df.clean$cl = 1:nrow(cl.df.clean) 293 | cl.df.clean$cluster_label = paste0(cl.df.clean$cl, "_", cl.df.clean$supertype_label) 294 | 295 | save(cl.df.clean, file = "cluster_0916/cl.df.clean_v231221.rda") 296 | 297 | 298 | ################################################# 299 | ## Make new annotated metadata object (anno.df.clean) 300 | ################################################# 301 | ## Load clustering results and new cl.df.clean 302 | load("./cluster_0916/merge.result_th300_mincells100.rda") 303 | load("./cluster_0916/cl.df.clean_v231221.rda") 304 | 305 | ## Load original cell metadata file and key for broad rois 306 | load("samp.dat_bothages_qcscore_20231208.rda") 307 | load("r_objects/broad.roi.key.rda") 308 | 309 | ## Add newest cluster label to metadata file 310 | cl = merge.result$cl 311 | select.cells = names(cl) 312 | row.names(samp.dat) = samp.dat$sample_id 313 | samp.dat.filtered = samp.dat[select.cells,] 314 | samp.dat.filtered$cl_0916 = cl[row.names(samp.dat.filtered)] 315 | 316 | anno.df.clean = samp.dat.filtered[samp.dat.filtered$cl_0916 %in% cl.df.clean$cl_0916,] 317 | anno.df.clean = left_join(anno.df.clean, cl.df.clean) 318 | anno.df.clean = left_join(anno.df.clean, broad.key) 319 | 320 | save(anno.df.clean, file = "anno.df.clean_cluster0916_v231221_20231221.rda") 321 | 322 | -------------------------------------------------------------------------------- /analysis_scripts/07_UMAP.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Compute UMAP 3 | 4 | library(scrattch.bigcat) 5 | library(scrattch.hicat) 6 | library(MatrixGenerics) 7 | 8 | ################################################# 9 | ## Load data 10 | ################################################# 11 | load("cluster_0916/merge.result_th300_mincells100.rda") ## clustering results 12 | load("cluster_0916/norm.dat_cluster0916_cl100.rda") ## subsampled cell by gene matrix 13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat 14 | load("anno.df.clean_cluster0916_v1020_20221020.rda") ## metadata 15 | 16 | ## Markers 17 | cl = merge.result$cl 18 | markers = merge.result$markers 19 | select.markers = intersect(markers, row.names(norm.dat)) 20 | load("genotype.genes.rda") 21 | load("/allen/programs/celltypes/workgroups/rnaseqanalysis/yzizhen/10X_analysis/sex.genes.rda") 22 | select.markers = setdiff(select.markers, sex.genes) 23 | select.markers = setdiff(select.markers, genotype.genes) 24 | 25 | 26 | ## Rm.eigen 27 | rm.eigen = as.matrix(setNames(log2(anno.df.clean$gene.counts.0), anno.df.clean$sample_id), ncol=1) 28 | rownames(rm.eigen) = anno.df.clean$sample_id 29 | colnames(rm.eigen) = "log2Genes" 30 | save(rm.eigen, file = "rm.eigen.rda") ## save for later use 31 | 32 | ################################################# 33 | ## PCA 34 | ################################################# 35 | set.seed(123) 36 | 37 | ## PCA all 38 | select.cells = anno.df.clean$sample_id 39 | rd.result = rd_PCA_big(big.dat, norm.dat[select.markers, ], select.cells = select.cells, max.dim=150, method="elbow", mc.cores=20) 40 | rd.dat = filter_RD(rd.result$rd.dat, rm.eigen, 0.7) 41 | save(rd.dat, file = "./cluster_0916//rd.result_refdat_2022-10-21.rda") ## save PCA results as R object 42 | write.csv(rd.dat, file = "./cluster_0916/rd.result_refdat_2022-10-21.csv") ## save PCA results as csv which will be used for UMAP computation in Python 43 | 44 | 45 | ##################################### 46 | ## UMAP 47 | ##################################### 48 | ## These lines of script call run_umap.py script for UMAP computation 49 | tmp.in = "./cluster_0916/rd.result_refdat_2022-10-21.csv" 50 | tmp.out = "./cluster_0916/umap.2d_global_k10_d0.4_2022-10-21.csv" 51 | tmp.system = paste0("/functions/run_umap.py -i ", tmp.in, " -o ", tmp.out, " -k 10 -c 2 -d 0.4") 52 | system(tmp.system) -------------------------------------------------------------------------------- /analysis_scripts/08a_subcluster_immune.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Extract only immune cells and cluster them separately 3 | 4 | 5 | library(dplyr) 6 | library(scrattch.hicat) 7 | library(scrattch.bigcat) 8 | 9 | ################################################# 10 | ## Load data 11 | ################################################# 12 | load("anno.df.clean_cluster0916_v1020_20221020.rda") 13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat 14 | 15 | ## Create smaller cell-by-gene matrix with only immune cells 16 | select.cells = anno.df.clean$sample_id[anno.df.clean$class_label == "Immune"] 17 | norm.dat = get_logNormal(big.dat, select.cells) 18 | 19 | 20 | 21 | ################################################# 22 | ## Clustering 23 | ################################################# 24 | set.seed(123) 25 | 26 | ## Cluster 27 | resulti = iter_clust(norm.dat = norm.dat[,select.cells], select.cells = select.cells, "./data_immune/", 28 | max.cl.size = 200, split.size = 50, verbose = 1) 29 | 30 | ## Merge 31 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50, padj.th = 0.01, min.genes = 3) ## Looser settings 32 | merge.result = merge_cl(norm.dat=norm.dat, cl=resulti$cl, rd.dat.t=norm.dat[resulti$markers,select.cells], de.param=de.param, verbose=TRUE) ## 10 33 | 34 | ## Save for later 35 | save(resulti, file = "./data_immune/resulti.rda") 36 | save(merge.result, file = "./data_immune/merge.result.rda") 37 | 38 | 39 | -------------------------------------------------------------------------------- /analysis_scripts/08b_subcluster_tanycytes_ependymal.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Extract only tanycytes and ependymal cells and cluster them separately 3 | 4 | 5 | library(dplyr) 6 | library(scrattch.hicat) 7 | library(scrattch.bigcat) 8 | 9 | ################################################# 10 | ## Load data 11 | ################################################# 12 | load("anno.df.clean_cluster0916_v1020_20221020.rda") 13 | load("big.dat_Aging.16ROI.V3.20220701.rda") ## fbm big.dat 14 | 15 | ## Create smaller cell-by-gene matrix with only tanycytes and ependymal cells 16 | select.cells = anno.df.clean$sample_id[anno.df.clean$subclass_label %in% c("Tanycyte", "Epenedymal")] 17 | norm.dat = get_logNormal(big.dat, select.cells) 18 | 19 | 20 | ################################################# 21 | ## Clustering 22 | ################################################# 23 | set.seed(123) 24 | 25 | ## Cluster 26 | resulti = iter_clust(norm.dat = norm.dat[,select.cells], select.cells = select.cells, "./data_tanepen/", 27 | max.cl.size = 200, split.size = 50, verbose = 1) 28 | 29 | ## Merge 30 | de.param = de_param(q1.th=0.4, q.diff.th = 0.7, de.score.th=300, min.cells=50, padj.th = 0.01, min.genes = 3) ## Looser settings 31 | merge.result = merge_cl(norm.dat=norm.dat, cl=resulti$cl, rd.dat.t=norm.dat[resulti$markers,select.cells], de.param=de.param, verbose=TRUE) ## 10 32 | 33 | ## Save for later 34 | save(resulti, file = "./data_tanepen/resulti.rda") 35 | save(merge.result, file = "./data_tanepen/merge.result.rda") -------------------------------------------------------------------------------- /analysis_scripts/09a_degenes_age_mast_subclass.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Running MAST at the subclass level to find age-DE genes 3 | 4 | library(MAST) 5 | library(scrattch.hicat) 6 | library(dplyr) 7 | library(ggplot2) 8 | library(data.table) 9 | library(bigstatsr) 10 | library(scrattch.bigcat) 11 | 12 | 13 | ################################################################################ 14 | ## Load data 15 | ################################################################################ 16 | ## Cleaned metadata data file 17 | load("anno.df.clean_cluster0916_v231221_20240206.rda") 18 | 19 | ## Scrattch.bigcat compatible file-backed-matrix 20 | load("big.dat_Aging.16ROI.V3.20231208.rda") 21 | 22 | 23 | 24 | ################################################################################ 25 | ## Deciding which subclasses to include for analysis 26 | # ################################################################################ 27 | # -- Criteria: 28 | # -- ncell > 100 for each age_cat 29 | 30 | tmp.age.count = c(table(anno.df.clean$subclass_label[anno.df.clean$age_cat == "aged"])) 31 | tmp.adult.count = c(table(anno.df.clean$subclass_label[anno.df.clean$age_cat == "adult"])) 32 | tmp.age.count2 = tmp.age.count[tmp.age.count > 100] 33 | tmp.adult.count2 = tmp.adult.count[tmp.adult.count > 100] 34 | 35 | tmp.keep = intersect(names(tmp.age.count2), names(tmp.adult.count2)) 36 | 37 | check.remove = setdiff(unique(anno.df.clean$subclass_label), tmp.keep) 38 | 39 | subclass.keep = tmp.keep 40 | 41 | save(subclass.keep, file = "subclass.keep_cluster0916_v20231221.rda") 42 | 43 | 44 | 45 | ################################################################################ 46 | ## MAST per subclass (using parallel function) 47 | ################################################################################ 48 | source("functions/degenes_age_mast_simplified_qcscore.R") ## custom function for running MAST in parallel 49 | setwd("MAST_simplified/") 50 | 51 | 52 | de.gene.results = get_age_degenes_mast_big_p(anno.df.clean, big.dat, groups = subclass.keep, split.by = "subclass_label", save.as.tmp = F, 53 | maxcells = 2000, freq_expressed = 0.1, mc.cores = 15) 54 | 55 | saveRDS(de.gene.results, file = "de.gene.result_cluster0916_v231221_subclass.rds") 56 | 57 | 58 | 59 | ################################################################################ 60 | ## Assemble MAST results 61 | ################################################################################ 62 | ## Load results 63 | load("anno.df.clean_cluster0916_v231221_20240206.rda") 64 | results = readRDS("MAST_simplified/de.gene.result_cluster0916_v231221_subclass.rds") 65 | 66 | ## Exclude genes 67 | load("r_objects/sex.genes.rda") #sex-biased genes 68 | load("r_objects/genotype.genes.rda") #genotype-biased genes 69 | load("r_objects/iegenes.rda") #immediate early genes 70 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes) 71 | 72 | ## Cutoffs for age-DE genes 73 | fcthreshold = 1 74 | pthreshold = 0.01 75 | 76 | 77 | ## Create summarized objects with results 78 | de.gene.results = lapply(results, function(x) x[["result"]]) 79 | 80 | for(i in names(de.gene.results)){ 81 | de.gene.results[[i]]$subclass = i 82 | } 83 | de.results.long0 = do.call(rbind, de.gene.results) 84 | saveRDS(de.results.long0, file = paste0(opath, "de.results.long0_allresults.rds")) 85 | 86 | 87 | de.gene.results = lapply(de.gene.results, function(x) x[x$padjust < pthreshold & abs(x$coef) > fcthreshold,]) 88 | de.gene.results = lapply(de.gene.results, function(x) x[order(x$coef, decreasing = T),]) 89 | de.gene.results = lapply(de.gene.results, function(x) x[!x$primerid %in% genes.exclude,]) ## excluding sex, genotype, iegenes 90 | sort(unlist(lapply(de.gene.results, nrow))) 91 | subclass.results = as.data.frame(sort(unlist(lapply(de.gene.results, nrow)))) 92 | names(subclass.results) = "de.gene.count" 93 | subclass.results$subclass = row.names(subclass.results) 94 | 95 | tmp = unlist(lapply(results, function(x) length(x$select.cells))) 96 | names(tmp) = names(results) 97 | subclass.results$ncell.tested = tmp[subclass.results$subclass] 98 | 99 | saveRDS(subclass.results, file = paste0(opath, "subclasswide_degene_summary_pcut", pthreshold, "_fc", fcthreshold, "_excludedgenes.rds")) 100 | 101 | de.results.long = de.results.long0[de.results.long0$padjust < pthreshold & abs(de.results.long0$coef) > fcthreshold,] 102 | saveRDS(de.results.long, file = paste0(opath, "de.results.long_pcut", pthreshold, "_fc", fcthreshold, ".rds")) 103 | 104 | 105 | ################################################################################ 106 | ## Visualize results 107 | ################################################################################ 108 | opath = "MAST_simplified" 109 | 110 | tmp.level1 = unique(anno.df.clean[,c("class_label", "class_id")]) 111 | tmp.level1 = tmp.level1[order(tmp.level1$class_id),] 112 | level1.order = tmp.level1$class_label 113 | tmp = unique(anno.df.clean[,c("subclass_label", "class_color", "class_label")]) 114 | level1.color = tmp$class_color 115 | names(level1.color) = tmp$subclass_label 116 | 117 | subclass.results1 = left_join(subclass.results, tmp, by = c("subclass" = "subclass_label")) 118 | row.names(subclass.results1) = subclass.results1$subclass 119 | # subclass.results1$subclass_color = level1.color[subclass.results1$subclass] 120 | subclass.results1 = subclass.results1[order(subclass.results1$subclass),] 121 | subclass.results1$class_label = factor(subclass.results1$class_label, levels = level1.order) 122 | subclass.results1 = subclass.results1[order(subclass.results1$class_label),] 123 | subclass.results1$subclass = factor(subclass.results1$subclass, levels = subclass.results1$subclass) 124 | 125 | ## Barplot of nDEgene count 126 | p1 = subclass.results1 %>% 127 | ggplot(aes(y = de.gene.count, x = subclass, fill = subclass))+ 128 | geom_bar(stat = "identity", width = 0.8)+ 129 | scale_fill_manual(values = level1.color)+ 130 | theme_classic()+ 131 | theme(legend.position = "none")+ 132 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5)) 133 | 134 | 135 | de.results.long$subclass = factor(de.results.long$subclass, levels = subclass.results1$subclass) 136 | de.results.long$direction = "positive" 137 | de.results.long$direction[de.results.long$coef < 0] = "negative" 138 | de.results.long$abs.coef = abs(de.results.long$coef) 139 | 140 | 141 | ## Dot plot of coeficients 142 | p2 = de.results.long %>% 143 | ggplot(aes(x = subclass, y = abs.coef, color = subclass, shape = direction))+ 144 | geom_jitter(size = 0.8, width = 0.05, alpha = 0.8)+ 145 | scale_color_manual(values = level1.color)+ 146 | scale_shape_manual(values = c(4, 1))+ 147 | theme_classic()+ 148 | theme(legend.position = "none")+ 149 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5)) 150 | 151 | p4 = de.results.long %>% 152 | ggplot(aes(x = subclass, y = -log10(padjust), color = subclass, size = 5))+ 153 | geom_jitter(size = 0.5, width = 0.1)+ 154 | scale_color_manual(values = level1.color)+ 155 | theme_bw()+ 156 | theme(legend.position = "none")+ 157 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5)) 158 | 159 | 160 | p3 = subclass.results1 %>% 161 | ggplot(aes(y = de.gene.count, x = subclass, fill = subclass))+ 162 | geom_bar(stat = "identity", width = 0.8)+ 163 | scale_fill_manual(values = level1.color)+ 164 | geom_jitter(data = de.results.long, aes(x = subclass, y = -30*abs.coef, color = subclass, shape = direction), size = 0.8, width = 0.05, alpha = 0.8)+ 165 | scale_shape_manual(values = c(4, 1))+ 166 | scale_color_manual(values = level1.color)+ 167 | theme_classic()+ 168 | theme(legend.position = "none")+ 169 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5)) 170 | 171 | 172 | ggsave(p1, filename = paste0(opath, "barplot_ndegene_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5) 173 | ggsave(p2, filename = paste0(opath, "dotplot_signifcoef_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5) 174 | ggsave(p4, filename = paste0(opath, "dotplot_signifpval_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5) 175 | ggsave(p3, filename = paste0(opath, "combdotbar_level1color_pcut", pthreshold, "_fc", fcthreshold, ".pdf"), useDingbats = F, width = 14, height = 5) 176 | -------------------------------------------------------------------------------- /analysis_scripts/09b_degenes_age_mast_supertype_nn.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Running MAST at the supertype level in non-neuronals to find age-DE genes 3 | 4 | library(MAST) 5 | library(scrattch.hicat) 6 | library(dplyr) 7 | library(ggplot2) 8 | library(data.table) 9 | library(bigstatsr) 10 | library(scrattch.bigcat) 11 | 12 | 13 | ################################################################################ 14 | ## Load data 15 | ################################################################################ 16 | ## Cleaned metadata data file 17 | load("anno.df.clean_cluster0916_v231221_20240206.rda") 18 | 19 | ## Loading cell by gene matrix in dgC sparse matrix format (not included in github) 20 | load("norm.dat_objects/nonneuronal-imn/norm.dat.rda") 21 | 22 | 23 | ################################################################################ 24 | ## Deciding which subclasses to include for analysis 25 | # ################################################################################ 26 | anno.df.select = anno.df.clean[grepl("NN", anno.df.clean$supertype_label),] 27 | tmp = as.data.frame(table(anno.df.select[,c("supertype_label", "age_cat")])) 28 | tmp = tmp[tmp$Freq > 100,] 29 | supertype.keep = names(table(tmp$supertype_label)[table(tmp$supertype_label) == 2]) 30 | common = intersect(colnames(norm.dat), anno.df.clean$sample_id) 31 | anno.df.select = anno.df.clean[anno.df.clean$sample_id %in% common,] 32 | 33 | 34 | ################################################################################ 35 | ## MAST per supertype (using parallel function) 36 | ################################################################################ 37 | source("functions/degenes_age_mast_simplified_qcscore.R") ## custom function for running MAST in parallel 38 | setwd("MAST_nnsupertype/") 39 | 40 | de.gene.results = get_age_degenes_mast_p(anno.df.select, norm.dat, groups = supertype.keep, split.by = "supertype_label", save.as.tmp = F, 41 | maxcells = 2000, freq_expressed = 0.1, mc.cores = 10) 42 | 43 | saveRDS(de.gene.results, file = "de.gene.result_cluster0916_v231221_nnsupertype.rds") 44 | 45 | 46 | ################################################################################ 47 | ## Assemble MAST results 48 | ################################################################################ 49 | opath = "MAST_nnsupertype" 50 | 51 | ## Load results 52 | load("anno.df.clean_cluster0916_v231221_20240206.rda") 53 | results = readRDS("MAST_nnsupertype/de.gene.result_cluster0916_v231221_nnsupertype.rds") 54 | 55 | ## Exclude genes 56 | load("r_objects/sex.genes.rda") #sex-biased genes 57 | load("r_objects/genotype.genes.rda") #genotype-biased genes 58 | load("r_objects/iegenes.rda") #immediate early genes 59 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes) 60 | 61 | ## Cutoffs for age-DE genes 62 | fcthreshold = 1 63 | pthreshold = 0.01 64 | 65 | ## Assemble results 66 | de.gene.results = lapply(results, function(x) x[["result"]]) 67 | 68 | for(i in names(de.gene.results)){ 69 | de.gene.results[[i]]$subclass = i 70 | } 71 | de.results.long0 = do.call(rbind, de.gene.results) 72 | saveRDS(de.results.long0, file = paste0(opath, "de.results.long0_allresults.rds")) 73 | 74 | de.gene.results = lapply(de.gene.results, function(x) x[x$padjust < pthreshold & abs(x$coef) > fcthreshold,]) 75 | de.gene.results = lapply(de.gene.results, function(x) x[order(x$coef, decreasing = T),]) 76 | de.gene.results = lapply(de.gene.results, function(x) x[!x$primerid %in% genes.exclude,]) ## excluding sex, genotype, iegenes 77 | sort(unlist(lapply(de.gene.results, nrow))) 78 | subclass.results = as.data.frame(sort(unlist(lapply(de.gene.results, nrow)))) 79 | names(subclass.results) = "de.gene.count" 80 | subclass.results$subclass = row.names(subclass.results) 81 | 82 | tmp = unlist(lapply(results, function(x) length(x$select.cells))) 83 | names(tmp) = names(results) 84 | subclass.results$ncell.tested = tmp[subclass.results$subclass] 85 | 86 | saveRDS(subclass.results, file = paste0(opath, "supertype_degene_summary_pcut", pthreshold, "_fc", fcthreshold, "_excludegenes.rds")) 87 | 88 | de.results.long = de.results.long0[de.results.long0$padjust < pthreshold & abs(de.results.long0$coef) > fcthreshold,] 89 | saveRDS(de.results.long, file = paste0(opath, "de.results.long_pcut", pthreshold, "_fc", fcthreshold, ".rds")) 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /analysis_scripts/10a_gsea_gprofiler_subclass.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Gene set enrichment analysis with gprofiler2 at the subclass level 3 | 4 | library(gprofiler2) 5 | library(dplyr) 6 | 7 | ################################################# 8 | ## Run gprofiler 9 | ################################################# 10 | ## Load in age-DE genes from MAST 11 | de.results = readRDS("MAST_simplified/de.gene.result_cluster0916_v231221_subclass.rds") 12 | de.results = lapply(de.results, function(x) x[["result"]]) 13 | 14 | ## Remove certain genes 15 | load("sex.genes.rda") 16 | load("genotype.genes.rda") 17 | load("iegenes.rda") 18 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes) 19 | 20 | de.results = lapply(de.results, function(x) x[!x$primerid %in% genes.exclude,]) 21 | 22 | ## Signif cutoffs 23 | pval.cut = 0.01 24 | logfc.cut = 1 25 | 26 | ## Positive only 27 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef > 0,]) 28 | tmp = unlist(lapply(de.genes, nrow)) 29 | de.genes = de.genes[names(tmp[tmp >= 3])] 30 | gp.result.pos = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01)) 31 | 32 | 33 | ## Negative only 34 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef < 0,]) 35 | tmp = unlist(lapply(de.genes, nrow)) 36 | de.genes = de.genes[names(tmp[tmp >= 3])] 37 | gp.result.neg = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01)) 38 | 39 | 40 | save(gp.result.pos, gp.result.neg, file = "gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda") 41 | 42 | 43 | 44 | ################################################# 45 | ## Consolidate results 46 | ################################################# 47 | load("gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda") 48 | 49 | for(i in names(gp.result.pos)){ 50 | print(i) 51 | tmp = gp.result.pos[[i]]$result 52 | if(is.null(tmp)){ 53 | gp.result.pos[[i]] = NULL 54 | next 55 | } 56 | tmp$subclass = i 57 | tmp$direction = "pos" 58 | gp.result.pos[[i]] = tmp 59 | } 60 | 61 | for(i in names(gp.result.neg)){ 62 | print(i) 63 | tmp = gp.result.neg[[i]]$result 64 | if(is.null(tmp)){ 65 | gp.result.neg[[i]] = NULL 66 | next 67 | } 68 | tmp$subclass = i 69 | tmp$direction = "neg" 70 | gp.result.neg[[i]] = tmp 71 | } 72 | 73 | tmp.pos = do.call(rbind, gp.result.pos) 74 | tmp.neg = do.call(rbind, gp.result.neg) 75 | 76 | go.results.all = rbind(data.frame(tmp.pos), data.frame(tmp.neg)) 77 | go.results.all = apply(go.results.all,2,as.character) 78 | 79 | write.csv(go.results.all, file = "gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv") -------------------------------------------------------------------------------- /analysis_scripts/10b_gsea_gprofiler_nnsupertype.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Gene set enrichment analysis with gprofiler2 at the subclass level 3 | 4 | library(gprofiler2) 5 | library(dplyr) 6 | 7 | 8 | ################################################# 9 | ## Run gprofiler 10 | ################################################# 11 | ## Load in age-DE genes from MAST 12 | de.results = readRDS("MAST_nnsupertype/de.gene.result_cluster0916_v231221_nnsupertype.rds") 13 | de.results = lapply(de.results, function(x) x[["result"]]) 14 | 15 | ## Remove certain genes 16 | load("sex.genes.rda") 17 | load("genotype.genes.rda") 18 | load("iegenes.rda") 19 | genes.exclude = c(iegenes.rapidprg, iegenes.delayedprg, sex.genes, genotype.genes) 20 | test = lapply(de.results, function(x) x[!x$primerid %in% genes.exclude,]) 21 | unlist(lapply(de.results, nrow)) 22 | unlist(lapply(test, nrow)) 23 | de.results = test 24 | 25 | ## Signif cutoffs 26 | pval.cut = 0.01 27 | logfc.cut = 1 28 | 29 | ## Positive only 30 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef > 0,]) 31 | tmp = unlist(lapply(de.genes, nrow)) 32 | de.genes = de.genes[names(tmp[tmp >= 3])] 33 | gp.result.pos = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01)) 34 | 35 | 36 | ## Negative only 37 | de.genes = lapply(de.results, function(x) x[abs(x$coef) > logfc.cut & x$padjust < pval.cut & x$coef < 0,]) 38 | tmp = unlist(lapply(de.genes, nrow)) 39 | de.genes = de.genes[names(tmp[tmp >= 3])] 40 | gp.result.neg = lapply(de.genes, function(x) gost(query = x$primerid, organism = "mmusculus", ordered_query = T, evcodes = T, user_threshold = 0.01)) 41 | 42 | 43 | save(gp.result.pos, gp.result.neg, file = "gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda") 44 | 45 | 46 | 47 | ################################################# 48 | ## Consolidate results 49 | ################################################# 50 | ## Check and add columns 51 | load("gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.rda") 52 | 53 | for(i in names(gp.result.pos)){ 54 | print(i) 55 | tmp = gp.result.pos[[i]]$result 56 | if(is.null(tmp)){ 57 | gp.result.pos[[i]] = NULL 58 | next 59 | } 60 | tmp$subclass = i 61 | tmp$direction = "pos" 62 | gp.result.pos[[i]] = tmp 63 | } 64 | 65 | 66 | 67 | for(i in names(gp.result.neg)){ 68 | print(i) 69 | tmp = gp.result.neg[[i]]$result 70 | if(is.null(tmp)){ 71 | gp.result.neg[[i]] = NULL 72 | next 73 | } 74 | tmp$subclass = i 75 | tmp$direction = "neg" 76 | gp.result.neg[[i]] = tmp 77 | } 78 | 79 | tmp.pos = do.call(rbind, gp.result.pos) 80 | tmp.neg = do.call(rbind, gp.result.neg) 81 | 82 | go.results.all = rbind(data.frame(tmp.pos), data.frame(tmp.neg)) 83 | go.results.all = apply(go.results.all,2,as.character) 84 | 85 | write.csv(go.results.all, file = "gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv") 86 | -------------------------------------------------------------------------------- /analysis_scripts/11_GO_matrix_gprofiler.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Compute and plot gene enrichment matrix 3 | 4 | library(pheatmap) 5 | library(RColorBrewer) 6 | 7 | ################################################# 8 | ## Load data and combine 9 | ################################################# 10 | go.results.all = read.csv("gprofiler_results_subclass_all_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv") 11 | go.results.nn = read.csv("gprofiler_results_nnsupertype_AIT21_logFC1_pval0.01cut_posneg_separate_excludedgenes_20240109.csv") 12 | go.results.all = go.results.all[!grepl(" NN", go.results.all$subclass),] 13 | go.results.all0 = rbind(go.results.all, go.results.nn) 14 | 15 | 16 | ################################################# 17 | ## Filter 18 | ################################################# 19 | ## Which data sources to use 20 | source.keep = c("GO:MF", "GO:BP", "GO:CC") 21 | go.results.all = go.results.all0[go.results.all0$source %in% source.keep,] 22 | 23 | ## Which terms/subclasses/supertypes to keep 24 | termfreq = 2 #filters terms with frequency less than this 25 | subclassfreq = 6 #filters celltypes with frequency less than this 26 | goterm.keep = names(table(go.results.all$term_name)[table(go.results.all$term_name) >= termfreq]) 27 | go.results.all = go.results.all[go.results.all$term_name %in% goterm.keep,] 28 | subclass.keep = names(table(go.results.all$subclass)[table(go.results.all$subclass) >= subclassfreq]) 29 | go.results.all = go.results.all[go.results.all$subclass %in% subclass.keep,] 30 | setdiff(unique(go.results.all0$subclass), subclass.keep) # check subclasses that got filtered out 31 | 32 | ## Filter out broad terms 33 | nterm_cutoff = 2000 #filters terms with greater than this number of gene members (to filter out terms that are too broad) 34 | terms.tmp = unique(go.results.all[,c("term_name", "term_size")]) 35 | unique(go.results.all$term_name[go.results.all$term_size > nterm_cutoff]) #which terms would be thrown out 36 | go.results.all = go.results.all[go.results.all$term_size <= nterm_cutoff,] 37 | 38 | ## Final counts of terms/subclasses 39 | length(unique(go.results.all$term_name)) 40 | length(unique(go.results.all$subclass)) 41 | 42 | go.mat = matrix(0, nrow = length(unique(go.results.all$subclass)), ncol = length(unique(go.results.all$term_name))) 43 | row.names(go.mat) = unique(go.results.all$subclass) 44 | colnames(go.mat) = unique(go.results.all$term_name) 45 | 46 | for(n in 1:nrow(go.results.all)){ 47 | x = go.results.all$subclass[n] 48 | y = go.results.all$term_name[n] 49 | tmp = -log10(go.results.all$p_value[n]) 50 | if(go.results.all$direction[n] == "pos"){ 51 | go.mat[x,y] = tmp 52 | } 53 | if(go.results.all$direction[n] == "neg"){ 54 | go.mat[x,y] = -tmp 55 | } 56 | } 57 | 58 | 59 | ################################################# 60 | ## Plot 61 | ################################################# 62 | ## Make objects for colors 63 | load("col.broi.rda") 64 | tmp = "#808080" 65 | names(tmp) = "NN" 66 | col.broi = c(col.broi, tmp) 67 | subclass.roi.summary = anno.df.clean %>% group_by(subclass_label) %>% summarize(max.broad.roi = names(which.max(table(broad_roi)))) 68 | subclass.roi.summary = subclass.roi.summary[!grepl("NN", subclass.roi.summary$subclass_label),] 69 | tmp = data.frame(subclass_label = unique(go.results.nn$subclass), max.broad.roi = "NN") 70 | subclass.roi.summary = rbind(subclass.roi.summary, tmp) 71 | subclass.roi.summary = subclass.roi.summary[subclass.roi.summary$subclass_label %in% subclass.keep,] 72 | 73 | anno.df = data.frame(max.broad.roi = subclass.roi.summary$max.broad.roi) 74 | row.names(anno.df) = subclass.roi.summary$subclass_label 75 | 76 | anno.list = list(max.broad.roi = col.broi) 77 | 78 | 79 | 80 | ## Setting limit on max score to make colors easier to interpret 81 | max = 10 82 | go.mat[go.mat > max] = max 83 | go.mat[go.mat < (-1*max)] = (-1*max) 84 | 85 | 86 | ## Plot heatmap 87 | pheatmap(go.mat, fontsize_row = 6, fontsize_col = 6, #scale = "row", 88 | annotation_row = anno.df, annotation_colors = anno.list, 89 | filename = "figures_go/gprofiler_gomat.pdf", width = 15, height = 10, show_colnames = T, 90 | color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), 91 | clustering_method = "ward.D", ##complete, ward.D, and ward.D2 are usually the best ones 92 | border_color = NA) -------------------------------------------------------------------------------- /analysis_scripts/12_augur.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Run augur on subclasses 3 | 4 | library(Augur) 5 | 6 | ################################################# 7 | ## Modify select_variance function in Augur 8 | ################################################# 9 | library(godmode) ## this package used for running modified version of select_variance() 10 | source("cust_functions/select_variance_MOD.R") 11 | godmode:::assignAnywhere("select_variance", select_variance2) 12 | 13 | ################################################# 14 | ## Load data and combine 15 | ################################################# 16 | load("anno.df.clean_cluster0916_v231221_20231221.rda") 17 | load("subclass.keep_cluster0916_v20231221.rda") ## subclasses included in MAST analysis 18 | load("big.dat_Aging.16ROI.V3.20231208.rda") 19 | load("cluster_0916/norm.dat_cluster0916_v20231221_subclass_ss1200.rda") 20 | 21 | ################################################# 22 | ## Run Augur 23 | ################################################# 24 | anno.df.select = anno.df.clean[anno.df.clean$sample_id %in% colnames(norm.dat),] 25 | anno.df.select = anno.df.select[anno.df.select$subclass_label %in% subclass.keep,] 26 | anno.df.select$subclass_label = as.character(anno.df.select$subclass_label) 27 | min(table(anno.df.select$subclass_label, anno.df.select$age_cat)) 28 | 29 | meta.df = anno.df.select[,c("subclass_label", "age_cat")] 30 | row.names(meta.df) = anno.df.select$sample_id 31 | 32 | norm.dat.select = norm.dat[,row.names(meta.df)] 33 | 34 | augur = calculate_auc(norm.dat.select, meta.df, cell_type_col = "subclass_label", label_col = "age_cat", 35 | feature_perc = 0.8, var_quantile = 0.9, subsample_size = 200, 36 | n_threads = 25) 37 | 38 | saveRDS(augur, file = "augur/augur.result_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds") 39 | 40 | ################################################# 41 | ## Check results and save smaller objects for later 42 | ################################################# 43 | augur = readRDS("augur/augur.result_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds") 44 | rank.all = augur$AUC 45 | aucresults.all = augur$results 46 | save(rank.all, aucresults.all, file = "augur/augur.objects_allcells.rda") 47 | 48 | load("augur/augur.objects_allcells.rda") 49 | 50 | rank.all = rank.all[order(rank.all$cell_type),] 51 | names(rank.all) = c("cell_type", "auc.all") 52 | rank.summary = rank.summary[order(rank.summary$auc.all, decreasing = T),] 53 | saveRDS(rank.summary, file = "augur/rank.summary_all.m.f_ait21_dispersioncorrected_var0.9_fperc0.8_subsampsize200_20240102.rds") 54 | -------------------------------------------------------------------------------- /analysis_scripts/13_pb_edgeR.R: -------------------------------------------------------------------------------- 1 | ## Description/Goal: 2 | ## Run pseudobulk analysis on all subclasses 3 | 4 | 5 | ################### 6 | ## Load data 7 | ################### 8 | library(scrattch.hicat) 9 | library(scrattch.bigcat) 10 | library(dplyr) 11 | 12 | load("anno.df.clean_cluster0916_v230915_20230915.rda") 13 | load("cluster_0916/norm.dat_cluster0916_v230915_subclass_ss1500.rda") ## subsampled cell by gene matrix 14 | load("big.dat_Aging.16ROI.V3.20220701.rda") 15 | load("subclass.keep_cluster0916_v230915.rda") 16 | 17 | ################### 18 | ## Compute pseudobulk means (means per library per subclass) 19 | ################### 20 | anno.df.select = anno.df.clean[anno.df.clean$subclass_label %in% subclass.keep,] 21 | cl = paste(anno.df.select$subclass_label, anno.df.select$age_cat, anno.df.select$library_prep, sep = "__") 22 | length(unique(cl)) 23 | names(cl) = anno.df.select$sample_id 24 | 25 | tmp.summary = anno.df.select %>% dplyr::group_by(subclass_label, age_cat, library_prep) %>% dplyr::summarize(n = n()) 26 | tmp.summary.keep = tmp.summary[tmp.summary$n >= 10,] 27 | tmp.summary.keep$cl = paste(tmp.summary.keep$subclass_label, tmp.summary.keep$age_cat, tmp.summary.keep$library_prep, sep = "__") 28 | 29 | cl.clean = cl[cl %in% tmp.summary.keep$cl] 30 | 31 | cl.means = get_cl_stats_big(big.dat, cl.clean, max.cl.size = 200, stats = "means", mc.cores = 5) 32 | 33 | save(cl.means, file = "pb_edgeR/cl.means") 34 | 35 | 36 | 37 | ################### 38 | ## Pseudobulk analysis with EdgeR 39 | ################### 40 | library(edgeR) 41 | library(statmod) 42 | 43 | load("pb_edgeR/cl.means") 44 | load("subclass.keep_cluster0916_v230915.rda") 45 | 46 | cl.means = cl.means$means 47 | tmp.split = strsplit(colnames(cl.means), "__") 48 | subclass_label = unlist(lapply(tmp.split, function(x) x[[1]])) 49 | age_cat = unlist(lapply(tmp.split, function(x) x[[2]])) 50 | library_prep = unlist(lapply(tmp.split, function(x) x[[3]])) 51 | meta.df = data.frame(sample = colnames(cl.means), subclass_label = subclass_label, age_cat = age_cat, library_prep = library_prep) 52 | 53 | qlf.result = c() 54 | lrt.result = c() 55 | 56 | for(i in subclass.keep){ 57 | 58 | print(i) 59 | 60 | sample.keep = grep(i, colnames(cl.means)) 61 | meta.select = meta.df[sample.keep,] 62 | 63 | age = meta.select$age_cat 64 | # y.select = DGEList(counts = cl.means[,sample.keep], group = age) 65 | 66 | tmp.counts = 2^cl.means[,sample.keep] - 1 67 | y.select.counts = DGEList(counts = tmp.counts, group = age) 68 | 69 | design = model.matrix(~age) 70 | row.names(design) = colnames(y.select.counts) 71 | 72 | y.select.counts = estimateDisp(y.select.counts, design, robust = T) 73 | # plotBCV(y.select.counts) 74 | 75 | print("Running quasi-likelihood...") 76 | fit = glmQLFit(y.select.counts, design) 77 | qlf = glmQLFTest(fit, coef = 2) 78 | # topTags(qlf, n = 20) 79 | 80 | print("Running LRT...") 81 | fit = glmFit(y.select.counts, design) 82 | lrt = glmLRT(fit, coef = 2) 83 | # topTags(lrt, n = 20) 84 | 85 | print("Saving results...") 86 | tmp.qlf = list(result = qlf, dgelist = y.select.counts) 87 | tmp.lrt = list(result = lrt, dgelist = y.select.counts) 88 | qlf.result[i] = list(tmp.qlf) 89 | lrt.result[i] = list(tmp.lrt) 90 | 91 | } 92 | 93 | 94 | saveRDS(qlf.result, file = "pb_edgeR/qlf.result_subclass_cluster0916_v230915.rds") 95 | saveRDS(lrt.result, file = "pb_edgeR/lrt.result_subclass_cluster0916_v230915.rds") 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /analysis_scripts/functions/degenes_age_mast_simplified_qcscore.R: -------------------------------------------------------------------------------- 1 | get_age_degenes_mast = function(anno.df, norm.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 2 | 3 | set.seed(2010) 4 | 5 | require(MAST) 6 | require(dplyr) 7 | require(scrattch.hicat) 8 | require(data.table) 9 | 10 | ## Rename objects 11 | anno.df.select = anno.df 12 | row.names(anno.df.select) = anno.df.select$sample_id 13 | group.tmp = id 14 | dat.select = norm.dat 15 | print(paste0("Starting ", group.tmp, "...")) 16 | 17 | 18 | ## Exclude or include 19 | if(exclude){ 20 | anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,] 21 | } else { 22 | anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,] 23 | 24 | } 25 | 26 | ## Subsample to the same size per age group, up to maxcells 27 | if(is.null(select.cells)){ 28 | tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"] 29 | tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"] 30 | tmp.max.age = min(length(tmp.age), maxcells) 31 | tmp.max.adult = min(length(tmp.adult), maxcells) 32 | select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult)) 33 | anno.tmp = anno.tmp[select.cells,] 34 | anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters 35 | 36 | ## Escape for error 37 | tmp = c(table(anno.tmp[,c("sex", "age_cat")])) 38 | if(sum(tmp == 0) > 0){ 39 | stop("Error: Missing data for one of the sexes and/or ages.") 40 | # break 41 | } 42 | } 43 | 44 | 45 | ## Subset data 46 | dat.tmp = dat.select[,select.cells] 47 | anno.tmp = anno.tmp[select.cells,] 48 | 49 | 50 | ## Make MAST object 51 | row.names(anno.tmp) = anno.tmp$sample_id 52 | sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp) 53 | 54 | # ## Thresholding ##SKIPPING FOR NOW 55 | # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30) 56 | # par(mfrow=c(5,4)) 57 | # plot(thres) 58 | 59 | ## Freq cutoff 60 | if(is.null(select.genes)){ 61 | select.genes = freq(sca) > freq_expressed 62 | 63 | } 64 | print(paste0("Ncells: ", length(select.cells))) 65 | print(paste0("Ngenes: ", sum(select.genes))) 66 | sca = sca[select.genes,] 67 | 68 | ## Model 69 | print("Run model") 70 | colData(sca)$age_cat<-factor(colData(sca)$age_cat) 71 | colData(sca)$sex<-factor(colData(sca)$sex) 72 | colData(sca)$roi<-factor(colData(sca)$roi) 73 | colData(sca)$broad_roi<-factor(colData(sca)$broad_roi) 74 | colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan) 75 | colData(sca)$full_genotype<-factor(colData(sca)$full_genotype) 76 | colData(sca)$log.gene.counts.0 = scale(log(colData(sca)$gene.counts.0)) 77 | colData(sca)$z.qc.score = scale(colData(sca)$qc.score) 78 | 79 | zlm <- zlm(~age_cat + sex + log.gene.counts.0 + z.qc.score, sca) 80 | 81 | 82 | ## ----> LTR 83 | summaryAge<- summary(zlm, doLRT='age_cataged') 84 | summaryDt.age <- summaryAge$datatable 85 | fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values 86 | summaryDt.age[contrast=='age_cataged' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients 87 | fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')] 88 | fcHurdle.age[split.by] = group.tmp 89 | 90 | ## ----> Save results 91 | result = list(select.cells = select.cells, result = fcHurdle.age) 92 | if(save.as.tmp){ 93 | save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda")) 94 | } 95 | return(result) 96 | 97 | } 98 | 99 | 100 | 101 | 102 | 103 | get_age_degenes_mast_big = function(anno.df, big.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 104 | 105 | set.seed(2010) 106 | 107 | require(MAST) 108 | require(dplyr) 109 | require(scrattch.hicat) 110 | require(data.table) 111 | 112 | ## Rename objects 113 | anno.df.select = anno.df 114 | row.names(anno.df.select) = anno.df.select$sample_id 115 | group.tmp = id 116 | print(paste0("Starting ", group.tmp, "...")) 117 | 118 | 119 | ## Exclude or include 120 | if(exclude){ 121 | anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,] 122 | } else { 123 | anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,] 124 | 125 | } 126 | 127 | ## Subsample to the same size per age group, up to maxcells (if no preselected cells) 128 | if(is.null(select.cells)){ 129 | tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"] 130 | tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"] 131 | tmp.max.age = min(length(tmp.age), maxcells) 132 | tmp.max.adult = min(length(tmp.adult), maxcells) 133 | select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult)) 134 | anno.tmp = anno.tmp[select.cells,] 135 | anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters 136 | 137 | ## Escape for error 138 | tmp = c(table(anno.tmp[,c("sex", "age_cat")])) 139 | if(sum(tmp == 0) > 0){ 140 | stop("Error: Missing data for one of the sexes and/or ages.") 141 | # break 142 | } 143 | } 144 | 145 | 146 | ## Subsample data 147 | dat.tmp = get_logNormal(big.dat, select.cells) 148 | 149 | ## Make MAST object 150 | row.names(anno.tmp) = anno.tmp$sample_id 151 | sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp) 152 | 153 | # ## Thresholding ##SKIPPING FOR NOW 154 | # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30) 155 | # par(mfrow=c(5,4)) 156 | # plot(thres) 157 | 158 | ## Freq cutoff 159 | if(is.null(select.genes)){ 160 | select.genes = freq(sca) > freq_expressed 161 | } 162 | print(paste0("Ncells: ", length(select.cells))) 163 | print(paste0("Ngenes: ", sum(select.genes))) 164 | sca = sca[select.genes,] 165 | 166 | ## Model 167 | print("Run model") 168 | colData(sca)$age_cat<-factor(colData(sca)$age_cat) 169 | colData(sca)$sex<-factor(colData(sca)$sex) 170 | colData(sca)$roi<-factor(colData(sca)$roi) 171 | colData(sca)$broad_roi<-factor(colData(sca)$broad_roi) 172 | colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan) 173 | colData(sca)$full_genotype<-factor(colData(sca)$full_genotype) 174 | colData(sca)$log.gene.counts.0 = scale(log(colData(sca)$gene.counts.0)) 175 | colData(sca)$z.qc.score = scale(colData(sca)$qc.score) 176 | 177 | zlm <- zlm(~age_cat + sex + log.gene.counts.0 + z.qc.score, sca) 178 | 179 | 180 | 181 | ## ----> LTR 182 | summaryAge<- summary(zlm, doLRT='age_cataged') 183 | summaryDt.age <- summaryAge$datatable 184 | fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values 185 | summaryDt.age[contrast=='age_cataged' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients 186 | fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')] 187 | fcHurdle.age[split.by] = group.tmp 188 | 189 | ## ----> Save results 190 | result = list(select.cells = select.cells, result = fcHurdle.age) 191 | if(save.as.tmp){ 192 | save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda")) 193 | } 194 | return(result) 195 | 196 | } 197 | 198 | 199 | # 200 | # 201 | # 202 | # 203 | # 204 | # get_sex_intergenes_mast_big = function(anno.df, big.dat, id, split.by, maxcells = 500, freq_expressed = 0.1, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 205 | # 206 | # set.seed(2010) 207 | # 208 | # require(MAST) 209 | # require(dplyr) 210 | # require(scrattch.hicat) 211 | # require(data.table) 212 | # 213 | # ## Rename objects 214 | # anno.df.select = anno.df 215 | # row.names(anno.df.select) = anno.df.select$sample_id 216 | # group.tmp = id 217 | # print(paste0("Starting ", group.tmp, "...")) 218 | # 219 | # 220 | # ## Exclude or include 221 | # if(exclude){ 222 | # anno.tmp = anno.df.select[anno.df.select[,split.by] != group.tmp,] 223 | # } else { 224 | # anno.tmp = anno.df.select[anno.df.select[,split.by] == group.tmp,] 225 | # 226 | # } 227 | # 228 | # ## Subsample to the same size per age group, up to maxcells (if no preselected cells) 229 | # if(is.null(select.cells)){ 230 | # tmp.age = anno.tmp$sample_id[anno.tmp$age_cat == "aged"] 231 | # tmp.adult = anno.tmp$sample_id[anno.tmp$age_cat == "adult"] 232 | # tmp.max.age = min(length(tmp.age), maxcells) 233 | # tmp.max.adult = min(length(tmp.adult), maxcells) 234 | # select.cells = c(sample(tmp.age, tmp.max.age), sample(tmp.adult, tmp.max.adult)) 235 | # anno.tmp = anno.tmp[select.cells,] 236 | # anno.tmp$sex = as.character(anno.tmp$sex) ## correcting weird error where some entries were boolean rather than characters 237 | # 238 | # ## Escape for error 239 | # tmp = c(table(anno.tmp[,c("sex", "age_cat")])) 240 | # if(sum(tmp == 0) > 0){ 241 | # stop("Error: Missing data for one of the sexes and/or ages.") 242 | # # break 243 | # } 244 | # } 245 | # 246 | # 247 | # ## Subsample data 248 | # dat.tmp = get_logNormal(big.dat, select.cells) 249 | # 250 | # ## Make MAST object 251 | # row.names(anno.tmp) = anno.tmp$sample_id 252 | # sca = FromMatrix(as.matrix(dat.tmp), cData = anno.tmp) 253 | # 254 | # # ## Thresholding ##SKIPPING FOR NOW 255 | # # thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 30, min_per_bin = 30) 256 | # # par(mfrow=c(5,4)) 257 | # # plot(thres) 258 | # 259 | # ## Freq cutoff 260 | # if(is.null(select.genes)){ 261 | # select.genes = freq(sca) > freq_expressed 262 | # } 263 | # print(paste0("Ncells: ", length(select.cells))) 264 | # print(paste0("Ngenes: ", sum(select.genes))) 265 | # sca = sca[select.genes,] 266 | # 267 | # ## Model 268 | # print("Run model") 269 | # colData(sca)$age_cat<-factor(colData(sca)$age_cat) 270 | # colData(sca)$sex<-factor(colData(sca)$sex) 271 | # colData(sca)$roi<-factor(colData(sca)$roi) 272 | # colData(sca)$facs_population_plan<-factor(colData(sca)$facs_population_plan) 273 | # colData(sca)$full_genotype<-factor(colData(sca)$full_genotype) 274 | # 275 | # 276 | # 277 | # 278 | # 279 | # if(length(unique(anno.tmp$full_genotype)) == 1){ 280 | # if(length(unique(anno.tmp$facs_population_plan)) == 1){ 281 | # if(length(unique(anno.tmp$roi)) == 1){ 282 | # zlm <- zlm(~age_cat + sex + age_cat:sex, sca) 283 | # } else{ 284 | # zlm <- zlm(~age_cat + sex + roi + age_cat:sex, sca) 285 | # } 286 | # } else{ 287 | # if(length(unique(anno.tmp$roi)) == 1){ 288 | # zlm <- zlm(~age_cat + sex + facs_population_plan + age_cat:sex, sca) 289 | # } else{ 290 | # zlm <- zlm(~age_cat + sex + roi + facs_population_plan + age_cat:sex, sca) 291 | # } 292 | # } 293 | # } else{ 294 | # if(length(unique(anno.tmp$facs_population_plan)) == 1){ 295 | # if(length(unique(anno.tmp$roi)) == 1){ 296 | # zlm <- zlm(~age_cat + sex + full_genotype + age_cat:sex, sca) 297 | # } else{ 298 | # zlm <- zlm(~age_cat + sex + roi + full_genotype + age_cat:sex, sca) 299 | # } 300 | # } else{ 301 | # if(length(unique(anno.tmp$roi)) == 1){ 302 | # zlm <- zlm(~age_cat + sex + facs_population_plan + full_genotype + age_cat:sex, sca) 303 | # } else{ 304 | # zlm <- zlm(~age_cat + sex + roi + facs_population_plan + full_genotype + age_cat:sex, sca) 305 | # } 306 | # } 307 | # } 308 | # 309 | # 310 | # 311 | # 312 | # ## ----> LTR 313 | # summaryAge<- summary(zlm, doLRT='age_cataged:sexM') 314 | # summaryDt.age <- summaryAge$datatable 315 | # fcHurdle.age <- merge(summaryDt.age[contrast=='age_cataged:sexM' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values 316 | # summaryDt.age[contrast=='age_cataged:sexM' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients 317 | # fcHurdle.age[,padjust:=p.adjust(`Pr(>Chisq)`, 'bonferroni')] 318 | # fcHurdle.age[split.by] = group.tmp 319 | # 320 | # ## ----> Save results 321 | # result = list(select.cells = select.cells, result = fcHurdle.age) 322 | # if(save.as.tmp){ 323 | # save(result, file = paste0("tmp.degene.result.", make.names(id), ".rda")) 324 | # } 325 | # return(result) 326 | # 327 | # } 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | get_age_degenes_mast_p = function(anno.df, norm.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 337 | 338 | if (mc.cores == 1) { 339 | 340 | de.list = list() 341 | 342 | for (n in 1:length(groups)) { 343 | 344 | id = groups[n] 345 | de.list[id] = list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 346 | 347 | } 348 | 349 | }else { 350 | 351 | library(doMC) 352 | registerDoMC(cores = mc.cores) 353 | de.list = foreach::foreach(i = groups, .combine='c') %dopar% { 354 | list(get_age_degenes_mast(anno.df, norm.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 355 | } 356 | names(de.list) = groups 357 | 358 | 359 | } 360 | 361 | return(de.list) 362 | 363 | } 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | get_age_degenes_mast_big_p = function(anno.df, big.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 372 | 373 | if (mc.cores == 1) { 374 | 375 | de.list = list() 376 | 377 | for (n in 1:length(groups)) { 378 | 379 | id = groups[n] 380 | de.list[id] = list(get_age_degenes_mast_big(anno.df, big.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 381 | 382 | } 383 | 384 | }else { 385 | 386 | library(doMC) 387 | registerDoMC(cores = mc.cores) 388 | de.list = foreach::foreach(i = groups, .combine='c') %dopar% { 389 | list(get_age_degenes_mast_big(anno.df, big.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 390 | } 391 | names(de.list) = groups 392 | 393 | 394 | } 395 | 396 | return(de.list) 397 | 398 | } 399 | 400 | 401 | 402 | # 403 | # 404 | # get_sex_intergenes_mast_big_p = function(anno.df, big.dat, groups, split.by, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL, select.cells = NULL){ 405 | # 406 | # if (mc.cores == 1) { 407 | # 408 | # de.list = list() 409 | # 410 | # for (n in 1:length(groups)) { 411 | # 412 | # id = groups[n] 413 | # de.list[id] = list(get_sex_intergenes_mast_big(anno.df, big.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 414 | # 415 | # } 416 | # 417 | # }else { 418 | # 419 | # library(doMC) 420 | # registerDoMC(cores = mc.cores) 421 | # de.list = foreach::foreach(i = groups, .combine='c') %dopar% { 422 | # list(get_sex_intergenes_mast_big(anno.df, big.dat, id = i, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = select.cells)) 423 | # } 424 | # names(de.list) = groups 425 | # 426 | # 427 | # } 428 | # 429 | # return(de.list) 430 | # 431 | # } 432 | # 433 | # 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | get_age_degenes_perm = function(anno.df, norm.dat, id, split.by, nperm = 50, maxcells = 500, freq_expressed = 0.1, mc.cores = 10, exclude = F, save.as.tmp = F, select.genes = NULL){ 442 | 443 | select.cells.age = anno.df$sample_id[anno.df$age_cat == "aged"] 444 | select.cells.adult = anno.df$sample_id[anno.df$age_cat == "adult"] 445 | 446 | set.seed(2010) 447 | 448 | perm.mat.age = replicate(nperm, sample(select.cells.age, maxcells, replace = F)) 449 | perm.mat.adult = replicate(nperm, sample(select.cells.adult, maxcells, replace = F)) 450 | perm.mat = rbind(perm.mat.age, perm.mat.adult) 451 | 452 | if (mc.cores == 1) { 453 | 454 | de.list = list() 455 | 456 | for (n in 1:nperm) { 457 | 458 | pname = paste0("p",n) 459 | de.list[pname] = list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = perm.mat[,n])) 460 | 461 | } 462 | 463 | }else { 464 | 465 | library(doMC) 466 | registerDoMC(cores = mc.cores) 467 | de.list = foreach::foreach(i = 1:nperm, .combine='c') %dopar% { 468 | list(get_age_degenes_mast(anno.df, norm.dat, id = id, split.by = split.by, maxcells = maxcells, freq_expressed = freq_expressed, exclude = exclude, save.as.tmp = save.as.tmp, select.genes = select.genes, select.cells = perm.mat[,i])) 469 | } 470 | names(de.list) = paste0("p",1:nperm) 471 | 472 | 473 | } 474 | 475 | return(de.list) 476 | 477 | } 478 | -------------------------------------------------------------------------------- /analysis_scripts/functions/run_umap.py: -------------------------------------------------------------------------------- 1 | #!/home/user/miniconda3/bin/python3 2 | 3 | import sys, getopt 4 | 5 | from pathlib import Path 6 | import umap 7 | import pandas as pd 8 | import re 9 | 10 | def run_umap(fin, fout, nc = 2, nn=25, md=0.4, niter=1): 11 | df=pd.read_csv(fin, index_col=0) 12 | for i in range(niter): 13 | embedding2d = umap.UMAP(n_components=nc, metric='correlation', n_neighbors=nn, min_dist=md, verbose=True).fit_transform(df.values) 14 | rd = pd.DataFrame(embedding2d) 15 | rd.index = df.index 16 | if niter ==1: 17 | rd.to_csv(fout) 18 | else: 19 | rd.to_csv(Path(re.sub("csv", str(i)+".csv", str(fout)))) 20 | 21 | def run_umap_init(fin, finit, fout, nc = 2, nn=25, md=0.3, niter=1): 22 | df1=pd.read_csv(fin, index_col=0) 23 | df2=pd.read_csv(finit, index_col=0) 24 | for i in range(niter): 25 | embedding2d = umap.UMAP(n_components=nc, metric='correlation', n_neighbors=nn, min_dist=md, verbose=True, init = df2.values).fit_transform(df1.values) 26 | rd = pd.DataFrame(embedding2d) 27 | rd.index = df1.index 28 | if niter ==1: 29 | rd.to_csv(fout) 30 | else: 31 | rd.to_csv(fout+i) 32 | 33 | 34 | def main(argv): 35 | fin = '' 36 | fout = '' 37 | finit = '' 38 | nc=2 39 | nn = 25 40 | md=0.4 41 | niter=1 42 | try: 43 | opts, args = getopt.getopt(argv, "hc:d:n:k:i:o:t:",["infile=", "outfile=", "init_file="]) 44 | except getopt.GetoptError: 45 | print('run_umap.py -i -o -t -c -d -k -n ') 46 | sys.exit(2) 47 | for opt, arg in opts: 48 | if opt == 'h': 49 | print('run_umap.py -i -o -t -c -d -k -n ') 50 | if opt in ("-i", "--infile"): 51 | fin = arg 52 | if opt in ("-o", "--outfile"): 53 | fout = arg 54 | if opt in ("-t", "--init_outfile"): 55 | finit = arg 56 | if opt == "-c": 57 | nc = int(arg) 58 | if opt == "-k": 59 | nn = int(arg) 60 | if opt == "-d": 61 | md = float(arg) 62 | if opt in ("-n"): 63 | niter = int(arg) 64 | print("fin:",fin, "finit:", finit,"fout:", fout,"niter:", niter,"\n") 65 | if finit =='': 66 | run_umap(fin,fout, nc = nc, nn=nn, md=md, niter=niter) 67 | else: 68 | run_umap_init(fin,finit,fout, nc = nc, nn=nn, md=md, niter=niter) 69 | 70 | 71 | if __name__ == "__main__": 72 | main(sys.argv[1:]) 73 | 74 | -------------------------------------------------------------------------------- /analysis_scripts/functions/scrattch.mapping/HANN_prepareTaxonomy.R: -------------------------------------------------------------------------------- 1 | #' run_prepareTaxonomy : call prepareTaxonomy() from h5ad file 2 | #' 3 | #' @param h5adFN filename of anndata with uns[['HANN']] 4 | #' @export 5 | run_prepareTaxonomy <- function( h5adFN ) { 6 | library(anndata) 7 | library(reticulate) 8 | library(Matrix) 9 | adata = read_h5ad(h5adFN) 10 | norm.dat = t(adata$X) 11 | rownames(norm.dat) = adata$var_names 12 | colnames(norm.dat) = adata$obs_names 13 | if (sum(names(cl) != adata$obs_names) > 0) { 14 | print("cl and adata$obs_names do not match") 15 | } 16 | cl = adata$uns[['HANN']]$cl 17 | names(cl) = adata$obs_names 18 | cl.df = adata$uns[['HANN']]$cl.df 19 | cl.df = py_to_r(cl.df) 20 | 21 | cl.hierarchy = adata$uns[['HANN']]$cl.hierarchy 22 | AIT.str = adata$uns[['HANN']]$taxonomyName 23 | taxonomy.dir = adata$uns[['HANN']]$taxonomyDir 24 | 25 | AIT.dir = prepareTaxonomy ( count = norm.dat, 26 | cl = cl, 27 | cl.df = cl.df, 28 | cl.hierarchy = cl.hierarchy, 29 | AIT.str = AIT.str, 30 | taxonomy.dir = taxonomy.dir ) 31 | print(paste0("Taxonomy is ready in", AIT.dir)) 32 | } 33 | #' Prepare taxonomy for optimized tree mapping 34 | #' 35 | #' This function writes an anndata object in the correct format for all downstream mapping algoriths. 36 | #' 37 | #' @param counts count[gene x cell] 38 | #' @param cl assigned cluster 39 | #' @param cl.df.users cluster annotation 40 | #' @param cl.hierarchy : hierarhcy in clusters : cluster(cl)/subclass_label/neighborhood/root 41 | #' @param AIT.str taxonomy id 42 | #' @param taxonomy.dir Output directly to write h5ad file 43 | #' 44 | #' @import patchseqtools 45 | #' @import scrattch.hicat 46 | #' 47 | #' @return A copy of the anndata object that is also written to disk 48 | #' 49 | #' @export 50 | prepareTaxonomy <- function ( 51 | count, 52 | cl, cl.df.users, cl.hierarchy, AIT.str, 53 | taxonomy.dir = "/allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/Taxonomies/", 54 | h5ad_out = FALSE ) { 55 | library(scrattch.bigcat) 56 | library(dplyr) 57 | 58 | ## Define reference-taxonomy folder and filenames 59 | AIT.dir = file.path(taxonomy.dir, AIT.str) 60 | if (!file.exists(AIT.dir)) dir.create(AIT.dir) 61 | de.dir = file.path(AIT.dir, "de_parquet") 62 | sum.dir = file.path(AIT.dir, "de_summary") 63 | dat.dir = file.path(AIT.dir, "data_parquet") 64 | h5ad.FN = file.path(AIT.dir, paste0(AIT.str, ".h5ad")) 65 | pairs.FN = file.path(AIT.dir, "pairs.parquet") 66 | cl.bin.FN = file.path(AIT.dir, "cl.bin.rda") 67 | 68 | 69 | ## convert norm.data matrix to big.dat in parquet, if big.dat is not available (default) 70 | count = count[, sample_cells(cl,200)] 71 | cl = cl[colnames(count)] 72 | count.sparse = as(count, 'sparseMatrix') 73 | library(data.table) 74 | big.dat=convert_big.dat_parquet(count.sparse, dir=dat.dir) 75 | 76 | print("## cluster stat") 77 | cl.stats = get_cl_stats_big(big.dat, cl=cl, stats=c("means","present","sqr_means"), mc.cores=15) 78 | save(cl.stats, file=file.path(AIT.dir, "cl.stats.big.rda")) 79 | cl.means = cl.stats$means 80 | save(cl.means, file=file.path(AIT.dir, "cl.means.rda")) 81 | 82 | cl.df = rearrange_cl.df (cl.df.users, cl.hierarchy) 83 | save(cl.df, file=file.path(AIT.dir, "cl.df.rda")) 84 | 85 | print("## calculate all-pairwise DE genes ") 86 | if (file.exists(de.dir)) system(paste0("rm -r ", de.dir)) 87 | if (file.exists(sum.dir)) system(paste0("rm -r ", sum.dir)) 88 | 89 | de.result = prep_parquet_de_all_pairs(norm.dat=NULL, cl=cl, cl.bin=NULL, mc.cores=30, 90 | pairs.fn=pairs.FN, cl.bin.fn=cl.bin.FN, cl.means=cl.stats$means, 91 | cl.present=cl.stats$present, cl.sqr.means=cl.stats$sqr_means, 92 | out.dir=de.dir, summary.dir=sum.dir, de.param=de_param()) 93 | return(AIT.dir) 94 | } 95 | # de_param default values 96 | # de_param( 97 | # low.th = 1, 98 | # padj.th = 0.01, 99 | # lfc.th = 1, 100 | # q1.th = 0.5, 101 | # q2.th = NULL, 102 | # q.diff.th = 0.7, 103 | # de.score.th = 150, 104 | # min.cells = 4, 105 | # min.genes = 5 106 | # ) 107 | 108 | #' Rearrange cl.df to internal format labels 109 | #' 110 | #' This function writes an anndata object in the correct format for all downstream mapping algoriths. 111 | #' 112 | #' @param counts count[gene x cell] 113 | #' @param cl assigned cluster 114 | #' @return A copy of the anndata object that is also written to disk 115 | #' 116 | #' @export 117 | rearrange_cl.df <- function (cl.df, cl.hierarchy) { 118 | 119 | nlevel=length(cl.hierarchy) 120 | if (nlevel==2) { 121 | hier_label = c('root', 'cl') 122 | cl = cl.df[, cl.hierarchy[2]] 123 | cluster_id = cl 124 | df = data.frame(cl, cluster_id) 125 | } 126 | if (nlevel==3) { 127 | hier_label = c('root', 'subclass_label', 'cl') 128 | subclass_label = cl.df[, cl.hierarchy[2]] 129 | cl = cl.df[, cl.hierarchy[3]] 130 | cluster_id = cl 131 | df = data.frame(cl, cluster_id, subclass_label) 132 | } 133 | if (nlevel==4) { 134 | hier_label = c('root', 'class_label', 'subclass_label', 'cl') 135 | class_label = cl.df[, cl.hierarchy[2]] 136 | subclass_label = cl.df[, cl.hierarchy[3]] 137 | cl = cl.df[, cl.hierarchy[4]] 138 | cluster_id = cl 139 | df = data.frame(cl, cluster_id, subclass_label, class_label) 140 | } 141 | library(dplyr) 142 | rownames(df) = df$cl 143 | return(df) 144 | } 145 | 146 | -------------------------------------------------------------------------------- /analysis_scripts/functions/scrattch.mapping/HANN_utils.R: -------------------------------------------------------------------------------- 1 | #' INFO -- PLEASE ADD -- 2 | #' 3 | #' @param in.df to_be_added 4 | #' @param cl.df to_be_added 5 | #' 6 | #' @return ??? 7 | #' 8 | #' @keywords internal 9 | l2norm <- function(X, by="column") 10 | { 11 | if (by=="column") { 12 | l2norm <- sqrt(Matrix::colSums(X^2)) 13 | if (!any(l2norm==0)) { 14 | X=sweep(X, 2, l2norm, "/", check.margin=FALSE) 15 | } 16 | else{ 17 | warning("L2 norms of zero detected for distance='Cosine, no transformation") 18 | } 19 | X = X 20 | } else { 21 | l2norm <- sqrt(Matrix::rowSums(X^2)) 22 | if (!any(l2norm==0)) { 23 | X= X/l2norm 24 | } 25 | else{ 26 | warning("L2 norms of zero detected for distance='Cosine'") 27 | X = X/ pmax(l2norm,1) 28 | } 29 | } 30 | } 31 | 32 | #' Title 33 | #' 34 | #' @param cn 35 | #' @param direction 36 | #' @param include.self 37 | #' 38 | #' @return 39 | #' @export 40 | #' 41 | #' @examples 42 | create_pairs <- function(cn1, cn2=cn1,direction="nondirectional", include.self = FALSE) 43 | { 44 | cn1=as.character(cn1) 45 | cn2=as.character(cn2) 46 | cl.n1 = length(cn1) 47 | cl.n2 = length(cn2) 48 | pairs = cbind(rep(cn1, rep(cl.n2,cl.n1)), rep(cn2, cl.n1)) 49 | if(!identical(cn1,cn2) & direction!="unidirectional"){ 50 | down.pairs = cbind(rep(cn2, rep(cl.n1,cl.n2)), rep(cn1, cl.n2)) 51 | pairs = rbind(pairs, down.pairs) 52 | } 53 | if(direction=="nondirectional"){ 54 | pairs = pairs[pairs[,1]<=pairs[,2],,drop=F] 55 | } 56 | if(!include.self){ 57 | pairs = pairs[pairs[,1]!=pairs[,2],,drop=F] 58 | } 59 | colnames(pairs)=c("P1","P2") 60 | row.names(pairs) = paste0(pairs[,1],"_",pairs[,2]) 61 | return(pairs) 62 | } 63 | 64 | 65 | #' Compute differential expression summary statistics based on a differential results data.frame and de_param(). 66 | #' 67 | #' @param df A data.frame of pairwise differential expression results (i.e. from \code{score_selected_pairs()}). 68 | #' @param de.param A list of differential gene expression parameters from \code{de_param()} 69 | #' @param cl.size1 Optional: The number of samples in the first/high cluster 70 | #' @param cl.size2 Optional: The number of samples in the second/low cluster 71 | #' 72 | #' @results A list of filtered differential expression results containing: 73 | #' \itemize{ 74 | #' \item{score} The deScore value, equal to the sum of the -log10(p-values) of differentially expressed genes, with a cap of 20 per gene. 75 | #' \item{up.score} The deScore value for up-regulated genes. 76 | #' \item{down.score} The deScore value for down-regulated genes. 77 | #' \item{num} The number of differentially expressed genes 78 | #' \item{up.num} The number of up-regulated genes 79 | #' \item{down.num} The number of down-regulated genes 80 | #' \item{genes} Gene symbols for differentially expressed genes. 81 | #' \item{up.genes} Gene symbols for up-regulated genes. 82 | #' \item{down.genes} Gene symbols for down-regulated genes. 83 | #' \item{de.df} The df used as input, filtered for differentially expressed genes. 84 | #' } 85 | #' 86 | de_stats_pair <- function(df, 87 | de.param = de_param(), 88 | cl.size1 = NULL, 89 | cl.size2 = NULL, 90 | select.genes = NULL, 91 | return.df = FALSE) { 92 | df <- df[order(df$pval, -abs(df$lfc)), ] 93 | 94 | select <- with(df, which(padj < de.param$padj.th & abs(lfc) > de.param$lfc.th)) 95 | select <- row.names(df)[select] 96 | 97 | if(!is.null(select.genes)){ 98 | select <- select[select %in% select.genes] 99 | } 100 | 101 | if(is.null(select) | length(select) == 0){ 102 | return(null_de()) 103 | } 104 | 105 | up <- select[df[select, "lfc"] > 0] 106 | down <- select[df[select, "lfc"] < 0] 107 | df <- df[select, ] 108 | 109 | if(!is.null(de.param$q.diff.th) & is.null(df$q.diff)) { 110 | 111 | df$q.diff <- with(df, abs(q1 - q2) / pmax(q1, q2)) 112 | df$q.diff[is.na(df$q.diff)] <- 0 113 | 114 | } 115 | 116 | if(!is.null(de.param$q1.th)) { 117 | up <- with(df[up, , drop = FALSE], up[q1 > de.param$q1.th]) 118 | if(!is.null(cl.size1)){ 119 | up <- with(df[up, , drop = FALSE], up[q1 * cl.size1 >= de.param$min.cells]) 120 | } 121 | 122 | down <- with(df[down, , drop = FALSE], down[q2 > de.param$q1.th]) 123 | if(!is.null(cl.size2)) { 124 | down <- with(df[down, , drop = FALSE], down[q2 * cl.size2 >= de.param$min.cells]) 125 | } 126 | } 127 | 128 | if(!is.null(de.param$q2.th)) { 129 | up <- with(df[up, , drop = FALSE], up[q2 < de.param$q2.th]) 130 | down <- with(df[down, , drop = FALSE], down[q1 < de.param$q2.th]) 131 | } 132 | 133 | if(!is.null(de.param$q.diff.th)){ 134 | up <- with(df[up, , drop = FALSE], up[abs(q.diff) > de.param$q.diff.th]) 135 | down <- with(df[down, , drop = FALSE], down[abs(q.diff) > de.param$q.diff.th]) 136 | } 137 | 138 | select <- c(up, down) 139 | 140 | if(length(select) == 0){ 141 | return(null_de()) 142 | } else { 143 | 144 | up.genes = setNames(-log10(df[up,"padj"]), up) 145 | down.genes = setNames(-log10(df[down,"padj"]), down) 146 | 147 | tmp = up.genes 148 | tmp[tmp > 20] = 20 149 | up.score <- sum(tmp) 150 | tmp = down.genes 151 | tmp[tmp > 20] = 20 152 | down.score <- sum(tmp) 153 | 154 | result=list( 155 | up.genes=up.genes, 156 | down.genes=down.genes, 157 | up.score = up.score, 158 | down.score = down.score, 159 | score = up.score + down.score, 160 | up.num = length(up.genes), 161 | down.num = length(down.genes), 162 | num = length(up.genes) + length(down.genes) 163 | ) 164 | 165 | 166 | if(return.df){ 167 | result$de.df = df[select,] 168 | } 169 | return(result) 170 | } 171 | } 172 | 173 | 174 | #' Get KNN 175 | #' 176 | #' @param dat to_be_added 177 | #' @param ref.dat to_be_added 178 | #' @param k to_be_added 179 | #' @param method to_be_added 180 | #' @param dim to_be_added 181 | #' 182 | #' @return to_be_added 183 | #' 184 | #' @keywords internal 185 | get_knn <- function(dat, ref.dat, k, method ="cor", dim=NULL,index=NULL, build.index=FALSE, transposed=TRUE, return.distance=FALSE) 186 | { 187 | if(transposed){ 188 | cell.id = colnames(dat) 189 | } 190 | else{ 191 | cell.id= row.names(dat) 192 | } 193 | if(transposed){ 194 | if(is.null(index)){ 195 | ref.dat = Matrix::t(ref.dat) 196 | } 197 | dat = Matrix::t(dat) 198 | } 199 | if(method=="RANN"){ 200 | library(RANN) 201 | knn.result = RANN::nn2(ref.dat, dat, k=k) 202 | } 203 | else if(method %in% c("Annoy.Euclidean", "Annoy.Cosine","cor")){ 204 | library(BiocNeighbors) 205 | if(is.null(index)){ 206 | if(method=="cor"){ 207 | ref.dat = ref.dat - rowMeans(ref.dat) 208 | ref.dat = l2norm(ref.dat,by = "row") 209 | } 210 | if (method=="Annoy.Cosine"){ 211 | ref.dat = l2norm(ref.dat,by = "row") 212 | } 213 | if(build.index){ 214 | index= buildAnnoy(ref.dat) 215 | } 216 | } 217 | if (method=="Annoy.Cosine"){ 218 | dat = l2norm(dat,by="row") 219 | } 220 | if (method=="cor"){ 221 | dat = dat - rowMeans(dat) 222 | dat = l2norm(dat,by = "row") 223 | } 224 | knn.result = queryAnnoy(X= ref.dat, query=dat, k=k, precomputed = index) 225 | } 226 | else{ 227 | stop(paste(method, "method unknown")) 228 | } 229 | knn.index= knn.result[[1]] 230 | knn.distance = knn.result[[2]] 231 | row.names(knn.index) = row.names(knn.distance)=cell.id 232 | if(!return.distance){ 233 | return(knn.index) 234 | } 235 | else{ 236 | list(index=knn.index, distance=knn.distance) 237 | } 238 | } 239 | 240 | 241 | #' get knn batch 242 | #' 243 | #' @param dat to_be_added 244 | #' @param ref.dat to_be_added 245 | #' @param k to_be_added 246 | #' @param method to_be_added 247 | #' @param dim to_be_added 248 | #' @param batch.size to_be_added 249 | #' @param mc.cores to_be_added 250 | #' 251 | #' @return to_be_added 252 | #' 253 | #' @keywords internal 254 | get_knn_batch <- function(dat, ref.dat, k=1, method="cor", dim=NULL, batch.size, mc.cores=1, return.distance=FALSE,...) 255 | { 256 | if(return.distance){ 257 | fun = "knn_combine" 258 | } 259 | else{ 260 | fun = "rbind" 261 | } 262 | results <- batch_process(x=1:ncol(dat), batch.size=batch.size, mc.cores=mc.cores, .combine=fun, FUN=function(bin){ 263 | get_knn(dat=dat[row.names(ref.dat),bin,drop=F], ref.dat=ref.dat, k=k, method=method, dim=dim,return.distance=return.distance, ...) 264 | }) 265 | return(results) 266 | } 267 | 268 | knn_combine <- function(result.1, result.2) 269 | { 270 | knn.index = rbind(result.1[[1]], result.2[[1]]) 271 | knn.distance = rbind(result.1[[2]], result.2[[2]]) 272 | return(list(knn.index, knn.distance)) 273 | } 274 | 275 | #' Batch process 276 | #' 277 | #' @param x to_be_added 278 | #' @param batch.size to_be_added 279 | #' @param FUN to_be_added 280 | #' @param mc.cores to_be_added 281 | #' @param .combine to_be_added 282 | #' @param ... 283 | #' 284 | #' @return to_be_added 285 | #' 286 | #' @keywords internal 287 | batch_process <- function(x, batch.size, FUN, mc.cores=1, .combine="c",...) 288 | { 289 | require(foreach) 290 | require(doMC) 291 | if (mc.cores == 1) { 292 | registerDoSEQ() 293 | } 294 | else { 295 | registerDoMC(cores=mc.cores) 296 | #on.exit(parallel::stopCluster(), add = TRUE) 297 | } 298 | bins = split(x, floor((1:length(x))/batch.size)) 299 | results= foreach(i=1:length(bins), .combine=.combine) %dopar% FUN(bins[[i]],...) 300 | return(results) 301 | } 302 | 303 | #' INFO -- PLEASE ADD -- 304 | #' 305 | #' @param in.df to_be_added 306 | #' @param cl.df to_be_added 307 | #' 308 | #' @return ??? 309 | #' 310 | #' @keywords internal 311 | build_train_index <- function(cl.dat, method= c("Annoy.Cosine","cor","Annoy.Euclidean"),fn=tempfile(fileext=".idx")) 312 | { 313 | library(BiocNeighbors) 314 | method = method[1] 315 | ref.dat = Matrix::t(cl.dat) 316 | if(method=="cor"){ 317 | ref.dat = ref.dat - rowMeans(ref.dat) 318 | ref.dat = l2norm(ref.dat,by = "row") 319 | } 320 | if (method=="Annoy.Cosine"){ 321 | ref.dat = l2norm(ref.dat,by = "row") 322 | } 323 | index= buildAnnoy(ref.dat, fname=fn) 324 | return(index) 325 | } 326 | 327 | #' INFO -- PLEASE ADD -- 328 | #' 329 | #' @param in.df to_be_added 330 | #' @param cl.df to_be_added 331 | #' 332 | #' @return ??? 333 | #' 334 | #' @keywords internal 335 | build_train_index_bs <- function(cl.dat, method= c("Annoy.Cosine","cor","Annoy.Euclidean"),sample.markers.prop=0.8, iter=100, mc.cores=10,fn=tempfile(fileext=".idx")) 336 | { 337 | library(BiocNeighbors) 338 | require(doMC) 339 | require(foreach) 340 | registerDoMC(cores=mc.cores) 341 | 342 | if (is.na(sample.markers.prop)) { 343 | index = build_train_index(cl.dat = cl.dat, method=method, fn=paste0(fn, ".",1)) 344 | return(list(list(cl.dat=cl.dat, index=index))) 345 | } else { 346 | ###for each cluster, find markers that discriminate it from other types 347 | train.dat <- foreach(i=1:iter, .combine="c") %dopar% { 348 | cat(i, '\r') 349 | train.markers = sample(row.names(cl.dat), round(nrow(cl.dat) * sample.markers.prop)) 350 | train.cl.dat = cl.dat[train.markers,] 351 | index = build_train_index(cl.dat = train.cl.dat, method=method, fn = paste0(fn, ".",i)) 352 | return(list(list(cl.dat=train.cl.dat, index=index))) 353 | } 354 | } 355 | } 356 | 357 | #' INFO -- PLEASE ADD -- 358 | #' 359 | #' @param in.df to_be_added 360 | #' @param cl.df to_be_added 361 | #' 362 | #' @return ??? 363 | #' 364 | #' @keywords internal 365 | map_cells_knn <- function(topk=1,test.dat, cl.dat, train.index=NULL, method = c("Annoy.Cosine","cor"), batch.size=5000, mc.cores=1) 366 | { 367 | 368 | cl.knn = get_knn_batch(test.dat, cl.dat, k=topk, index=train.index, method=method, transposed=TRUE, batch.size=batch.size, mc.cores=mc.cores,return.distance=TRUE) 369 | knn.index = cl.knn[[1]] 370 | knn.dist = cl.knn[[2]] 371 | map.df = data.frame(sample_id=colnames(test.dat), cl = colnames(cl.dat)[knn.index], dist = knn.dist) 372 | return(map.df) 373 | } 374 | 375 | #' INFO -- PLEASE ADD -- 376 | #' 377 | #' @param in.df to_be_added 378 | #' @param cl.df to_be_added 379 | #' 380 | #' @return ??? 381 | #' 382 | #' @keywords internal 383 | map_cells_knn_big <- function(big.dat, cl.dat, select.cells, train.index=NULL, method = c("Annoy.Cosine","cor"), batch.size=10000, mc.cores=10) 384 | { 385 | cl.knn = get_knn_batch_big(big.dat, cl.dat, select.cells=select.cells, k=1, index=train.index, method=method, transposed=TRUE, batch.size=batch.size, mc.cores=mc.cores,return.distance=TRUE) 386 | knn.index = cl.knn[[1]] 387 | knn.dist = cl.knn[[2]] 388 | map.df = data.frame(sample_id=select.cells, cl = colnames(cl.dat)[knn.index], dist = knn.dist) 389 | return(map.df) 390 | } 391 | 392 | #' INFO -- PLEASE ADD -- 393 | #' 394 | #' @param in.df to_be_added 395 | #' @param cl.df to_be_added 396 | #' 397 | #' @return ??? 398 | #' 399 | #' @keywords internal 400 | map_cells_knn_bs <- function(topk=1, test.dat, iter=100,cl.dat=NULL,train.index.bs=NULL, method = c("Annoy.Cosine","cor"), mc.cores=20, ...) 401 | { 402 | require(doMC) 403 | require(foreach) 404 | mc.cores = min(mc.cores, length(train.index.bs)) 405 | registerDoMC(cores=mc.cores) 406 | ###for each cluster, find markers that discriminate it from other types 407 | if(!is.null(train.index.bs)){ 408 | iter = length(train.index.bs) 409 | } else{ 410 | idx = match(rownames(cl.dat), rownames(test.dat)) 411 | if (any(is.na(idx))) { 412 | print("some genes in the train data set are missing from query data") 413 | common.gene = intersect(rownames(cl.dat), rownames(test.dat)) 414 | cl.dat = cl.dat[common.gene,] 415 | } 416 | train.index.bs = build_train_index_bs(cl.dat, method=method,iter=iter, ...) 417 | } 418 | library(data.table) 419 | map.list <- foreach(i=1:iter, .combine="c") %dopar% { 420 | train.index = train.index.bs[[i]]$index 421 | cl.dat = train.index.bs[[i]]$cl.dat 422 | map.df=map_cells_knn(topk, test.dat, cl.dat, train.index, method = c("Annoy.Cosine","cor")) 423 | map.df = list(map.df) 424 | } 425 | map.df = rbindlist(map.list) 426 | map.df = map.df %>% group_by(sample_id, cl) %>% summarize(freq=n(),dist = mean(dist)) 427 | map.df$freq = map.df$freq/iter 428 | best.map.df = map.df %>% group_by(sample_id) %>% summarize(best.cl= cl[which.max(freq)],prob=max(freq), avg.dist = dist[which.max(freq)]) 429 | if(method=="cor"){ 430 | best.map.df = best.map.df%>% mutate(avg.cor = 1 - avg.dist^2/2) 431 | } 432 | return(list(map.freq=map.df, best.map.df = best.map.df)) 433 | } 434 | 435 | ## cl.list is cluster membership at different levels, finest at the beginning. 436 | ## val is a vector associated with sample_id 437 | ## compute z_score aggregate at different levels of clustering, start with finest level of clustering, and resort to higher level if not enough sample size 438 | #' INFO -- PLEASE ADD -- 439 | #' 440 | #' @param in.df to_be_added 441 | #' @param cl.df to_be_added 442 | #' 443 | #' @return ??? 444 | #' 445 | #' @keywords internal 446 | z_score <- function(cl.list, val, min.samples =100) 447 | { 448 | 449 | sample_id = names(cl.list[[1]]) 450 | z.score = c() 451 | for(i in 1:length(cl.list)){ 452 | cl=cl.list[[i]][sample_id] 453 | cl.size = table(cl) 454 | if(i !=length(cl.list)){ 455 | select.cl = names(cl.size)[cl.size > min.samples] 456 | } 457 | else{ 458 | select.cl = names(cl.size) 459 | } 460 | df = data.frame(sample_id = names(cl),cl=cl, val=val[names(cl)]) 461 | df = df %>% filter(cl %in% select.cl) %>% group_by(cl) %>% mutate(z = (val - mean(val))/sd(val)) 462 | z.score[df$sample_id] = df$z 463 | sample_id = setdiff(sample_id, df$sample_id) 464 | } 465 | return(z.score) 466 | } 467 | 468 | get_gene_score_ds <- function(ds, to.add, genes, cl.bin, de=NULL, max.num=1000,mc.cores=20) 469 | { 470 | require(doMC) 471 | registerDoMC(cores=mc.cores) 472 | if(!is.null(de)){ 473 | tmp.de = suppressMessages(de %>% right_join(to.add[,c("P1","P2")])) 474 | gene.score = tmp.de %>% group_by(gene) %>% summarize(score = sum(as.numeric(max.num- rank))) %>% filter(gene %in% genes) %>% arrange(-score) 475 | } 476 | else{ 477 | to.add = suppressMessages(to.add %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl"))) 478 | cl.bin.x = to.add %>% pull(bin.x) %>% unique 479 | cl.bin.y = to.add %>% pull(bin.y) %>% unique 480 | tmp=foreach::foreach(bin1=cl.bin.x,.combine="c")%:% 481 | foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% { 482 | cat(bin1, bin2, '\r') 483 | tmp.pairs = to.add %>% filter(bin.x == bin1 & bin.y ==bin2) 484 | tmp.de = ds %>% filter(bin.x == bin1 & bin.y ==bin2 & gene %in% genes & rank < max.num & P1 %in% tmp.pairs$P1 & P2 %in% tmp.pairs$P2) %>% collect 485 | tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs,by=c("P1","P2"))) 486 | gene.score = tmp.de %>% group_by(gene) %>% summarize(rank.sum = sum(max.num- rank)) 487 | list(gene.score) 488 | } 489 | gene.score=rbindlist(tmp) 490 | gene.score = gene.score %>% group_by(gene) %>% summarize(score=sum(as.numeric(rank.sum))) %>% arrange(-score) 491 | } 492 | return(gene.score) 493 | } 494 | 495 | #' INFO -- PLEASE ADD -- 496 | #' 497 | #' @param in.df to_be_added 498 | #' @param cl.df to_be_added 499 | #' 500 | #' @return ??? 501 | #' 502 | #' @keywords internal 503 | update_gene_score_ds <- function(gene.score, ds, to.remove, cl.bin, de=NULL, max.num=1000,mc.cores=20) 504 | { 505 | require(doMC) 506 | registerDoMC(cores=mc.cores) 507 | to.remove = suppressMessages(to.remove %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl"))) 508 | cl.x = to.remove %>% pull(P1) %>%unique 509 | cl.y = to.remove %>% pull(P2) %>%unique 510 | 511 | cl.bin.x = to.remove %>% pull(bin.x) %>% unique 512 | cl.bin.y = to.remove %>% pull(bin.y) %>% unique 513 | if(!is.null(de)){ 514 | tmp.de = suppressMessages(de %>% right_join(to.remove)) 515 | rm.gene.score = tmp.de %>% group_by(gene) %>% summarize(rm.score = sum(max.num- rank)) 516 | } 517 | else{ 518 | if(length(cl.bin.x)*length(cl.bin.y) * nrow(gene.score) < 10^6){ 519 | de= ds %>% filter(bin.x %in% cl.bin.x & bin.y %in% cl.bin.y & gene %in% gene.score$gene & rank < max.num & P1 %in% cl.x & P2 %in% cl.y) %>% collect 520 | tmp.de = suppressMessages(de %>% right_join(to.remove)) 521 | rm.gene.score = tmp.de %>% group_by(gene) %>% summarize(rm.score = sum(max.num- rank)) 522 | } 523 | else{ 524 | tmp=foreach::foreach(bin1=cl.bin.x,.combine="c")%:% 525 | foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% { 526 | tmp.pairs = to.remove %>% filter(bin.x == bin1 & bin.y ==bin2) 527 | tmp.de = ds %>% filter(bin.x == bin1 & bin.y ==bin2 & gene %in% gene.score$gene & rank < max.num & P1 %in% tmp.pairs$P1 & P2 %in% tmp.pairs$P2) %>% collect 528 | tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs)) 529 | gene.score = tmp.de %>% group_by(gene) %>% summarize(rank.sum = sum(max.num- rank)) 530 | list(gene.score) 531 | } 532 | rm.gene.score=rbindlist(tmp) 533 | if(is.null(rm.gene.score)|nrow(rm.gene.score)==0){ 534 | return(gene.score) 535 | } 536 | rm.gene.score = rm.gene.score %>% group_by(gene) %>% summarize(rm.score=sum(as.numeric(rank.sum))) 537 | } 538 | } 539 | tmp = suppressMessages(gene.score %>% left_join(rm.gene.score) %>% mutate(rm.score = ifelse(is.na(rm.score), 0, rm.score)) %>% mutate(new.score = score - rm.score)) 540 | tmp = tmp%>% select(gene, new.score) %>% mutate(score=new.score) %>% filter(score > 0) %>% arrange(-score) 541 | return(tmp) 542 | } 543 | 544 | #' INFO -- PLEASE ADD -- 545 | #' 546 | #' @param in.df to_be_added 547 | #' @param cl.df to_be_added 548 | #' 549 | #' @return ??? 550 | #' 551 | #' @keywords internal 552 | check_pairs_ds <- function(de.dir, to.add, genes,cl.bin, de=NULL, mc.cores=10,max.num=1000) 553 | { 554 | require(doMC) 555 | registerDoMC(cores=mc.cores) 556 | to.add = to.add[,c("P1","P2")] 557 | to.add = suppressMessages(to.add %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl"))) 558 | select.P1 = to.add %>% pull(P1) %>% unique 559 | select.P2 = to.add %>% pull(P2) %>% unique 560 | if(!is.null(de)){ 561 | de.checked = suppressMessages(to.add %>% left_join(de %>% filter(rank < max.num & gene %in% genes)) %>% group_by(P1,P2) %>% summarize(checked=sum(!is.na(gene)))) 562 | } 563 | else{ 564 | cl.bin.x = to.add %>% pull(bin.x) %>% unique 565 | cl.bin.y = to.add %>% pull(bin.y) %>% unique 566 | if(length(cl.bin.x)*length(cl.bin.y)*length(genes) < 10^5){ 567 | ds = open_dataset(de.dir) 568 | de = ds %>% filter(bin.x %in% cl.bin.x & bin.y %in% cl.bin.y & gene %in% genes & P1 %in% select.P1 & P2 %in% select.P2 & rank < max.num ) %>% collect 569 | de.checked = suppressMessages(to.add %>% left_join(de %>% filter(rank < max.num & gene %in% genes)) %>% group_by(P1,P2) %>% summarize(checked=sum(!is.na(gene)))) 570 | } 571 | else{ 572 | de.checked=foreach::foreach(bin1=cl.bin.x,.combine="c")%:% 573 | foreach::foreach(bin2=cl.bin.y,.combine="c")%dopar% { 574 | cat("bin", bin1, bin2, "\r") 575 | d = file.path(de.dir, paste0("bin.x=",bin1), paste0("bin.y=",bin2)) 576 | if (file.exists(file.path(d))) { 577 | tmp.pairs = to.add %>% filter(bin.x == bin1 & bin.y ==bin2) 578 | ### 579 | #tmp.de =ds %>% filter(bin.x == bin1 & bin.y ==bin.y) %>% collect %>% filter(gene %in% genes & rank < max.num) 580 | #Directly read from parquet file is faster 581 | fn = dir(d, pattern="parquet") 582 | tmp.de = arrow::read_parquet(file.path(d, fn)) 583 | tmp.de = tmp.de %>% filter(gene %in% genes & rank < max.num) 584 | tmp.de = suppressMessages(tmp.de %>% right_join(tmp.pairs)) 585 | de.checked = suppressMessages(tmp.de %>% group_by(P1,P2) %>% summarize(checked=n())) 586 | list(de.checked) 587 | } 588 | } 589 | de.checked = rbindlist(de.checked) 590 | } 591 | } 592 | return(de.checked) 593 | } 594 | 595 | #' INFO -- PLEASE ADD -- 596 | #' 597 | #' @param in.df to_be_added 598 | #' @param cl.df to_be_added 599 | #' 600 | #' @return ??? 601 | #' 602 | #' @keywords internal 603 | select_markers_ds <- function(de.dir, cl.bin, genes, select.cl=NULL, top.n=20,mc.cores=10) 604 | { 605 | ds = open_dataset(de.dir) 606 | 607 | if(!is.null(select.cl)){ 608 | cl.bin = cl.bin %>% filter(cl %in% select.cl) 609 | } 610 | select.bin = cl.bin %>% pull(bin) %>% unique 611 | mc.cores=min(mc.cores, length(select.bin)) 612 | library(parallel) 613 | require(doMC) 614 | require(foreach) 615 | registerDoMC(cores=mc.cores) 616 | tmp=foreach::foreach(bin1=select.bin,.combine="c")%:% 617 | foreach::foreach(bin2=select.bin,.combine="c")%dopar% { 618 | de = ds %>% filter(bin.x %in% bin1 & bin.y %in% bin2) 619 | if(is.null(select.cl)){ 620 | de = de %>% filter(P1 %in% select.cl & P2 %in% select.cl) 621 | } 622 | de %>% filter(gene %in% genes) %>% filter(rank <= top.n) %>% pull(gene) %>% unique 623 | } 624 | select.markers=setdiff(intersect(genes,unique(tmp)), NA) 625 | return(select.markers) 626 | } 627 | 628 | #' INFO -- PLEASE ADD -- 629 | #' 630 | #' @param in.df to_be_added 631 | #' @param cl.df to_be_added 632 | #' 633 | #' @return ??? 634 | #' 635 | #' @keywords internal 636 | select_markers_pair_direction_ds <- function(de.dir, add.num, genes, cl.bin, de=NULL, mc.cores=mc.cores,max.genes=1000,byg=10, ...) 637 | { 638 | ds = open_dataset(de.dir) 639 | select.genes=c() 640 | if(!is.null(de)){ 641 | de = de %>% filter(gene %in% genes) 642 | } 643 | add.num = suppressMessages(add.num %>% left_join(cl.bin,by=c("P1"="cl")) %>% left_join(cl.bin,by=c("P2"="cl"))) 644 | gene.score= get_gene_score_ds(ds, to.add=add.num, genes=genes,cl.bin=cl.bin, de=de,mc.cores=mc.cores,...) 645 | while(nrow(add.num)>0 & length(genes)>0 & length(select.genes)< max.genes){ 646 | if(is.null(gene.score) | nrow(gene.score)==0){ 647 | break 648 | } 649 | if (length(gene.score$gene)> byg) g = as.character(gene.score$gene[1:byg]) 650 | else g = as.character(gene.score$gene) 651 | 652 | new.checked = check_pairs_ds(de.dir, to.add=add.num %>% select(P1,P2), genes=g,cl.bin=cl.bin,de=de, ...) 653 | add.num$checked=NULL 654 | add.num = suppressMessages(add.num %>% left_join(new.checked, by=c("P1","P2"))) 655 | add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked) 656 | to.remove = add.num %>% filter(num <=0) %>% select(P1,P2) 657 | add.num = add.num %>% filter(num > 0) 658 | genes = setdiff(genes,g) 659 | select.genes=c(select.genes,g) 660 | cat('gene_score', dim(gene.score), length(select.genes), '\r') 661 | gene.score = update_gene_score_ds(gene.score, ds, to.remove, cl.bin, de=de,...) 662 | gene.score = gene.score %>% filter(gene %in% genes) 663 | if(!is.null(de)){ 664 | de = de %>% filter(!(gene %in% g)) 665 | if(is.null(de)| nrow(de)==0){ 666 | break 667 | } 668 | } 669 | } 670 | return(list(select.genes=select.genes, de=de)) 671 | } 672 | 673 | #' INFO -- PLEASE ADD -- 674 | #' 675 | #' @param in.df to_be_added 676 | #' @param cl.df to_be_added 677 | #' 678 | #' @return ??? 679 | #' 680 | #' @keywords internal 681 | select_markers_pair_group_top_ds <- function(g1,g2,ds, genes, cl.bin, select.sign="up", n.markers=20,mc.cores=1, ...) 682 | { 683 | require(matrixStats) 684 | require(data.table) 685 | require(arrow) 686 | require(dplyr) 687 | 688 | require(doMC) 689 | registerDoMC(cores=mc.cores) 690 | up.to.add = down.to.add=NULL 691 | up.genes=down.genes=NULL 692 | if("up" %in% select.sign){ 693 | up.to.add = as.data.frame(create_pairs(g1, g2, direction="unidirectional")) 694 | up.gene.score = get_gene_score_ds(ds, to.add=up.to.add, genes=genes, cl.bin=cl.bin,...) 695 | up.genes = head(up.gene.score$gene, n.markers) 696 | } 697 | if("down" %in% select.sign){ 698 | down.to.add = as.data.frame(create_pairs(g2, g1,direction="unidirectional")) 699 | down.gene.score = get_gene_score_ds(ds, to.add=down.to.add, genes=genes, cl.bin=cl.bin, ...) 700 | down.genes = head(down.gene.score$gene, n.markers) 701 | } 702 | to.add=rbind(up.to.add, down.to.add) 703 | return(list(up.genes=up.genes, down.genes=down.genes,to.add=to.add)) 704 | } 705 | 706 | #' INFO -- PLEASE ADD -- 707 | #' 708 | #' @param in.df to_be_added 709 | #' @param cl.df to_be_added 710 | #' 711 | #' @return ??? 712 | #' 713 | #' @keywords internal 714 | select_markers_pair_group_ds <- function(g1,g2,de.dir, genes, cl.bin, n.markers=20,select.sign=c("up","down"),max.genes=50,...) 715 | { 716 | ds = open_dataset(de.dir) 717 | result = select_markers_pair_group_top_ds( g1,g2,ds=ds, genes=genes, cl.bin=cl.bin, n.markers=n.markers,select.sign=select.sign,...) 718 | markers=c(result$up.genes, result$down.genes) 719 | add.num = result$to.add 720 | add.num$num = n.markers 721 | new.checked = check_pairs_ds(de.dir, to.add=add.num %>% select(P1,P2),genes=markers,cl.bin=cl.bin,...) 722 | add.num = suppressMessages(add.num %>% left_join(new.checked, by=c("P1","P2"))) 723 | add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked) 724 | add.num = add.num %>% filter(num > 0) 725 | max.genes= max.genes - length(markers) 726 | if(nrow(add.num)>0 & max.genes > 0){ 727 | add.num$checked=NULL 728 | genes = setdiff(genes,markers) 729 | more.markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num, genes=genes,cl.bin=cl.bin,max.genes=max.genes,...) 730 | more.markers <- more.markers$select.genes 731 | markers= c(markers, more.markers) 732 | } 733 | return(markers) 734 | } 735 | 736 | #' INFO -- PLEASE ADD -- 737 | #' 738 | #' @param in.df to_be_added 739 | #' @param cl.df to_be_added 740 | #' 741 | #' @return ??? 742 | #' 743 | #' @keywords internal 744 | select_N_markers_ds<- function(de.dir, select.cl=NULL,pair.num=1, add.num=NULL, genes, cl.bin, default.markers=NULL,...) 745 | { 746 | if(is.null(add.num)){ 747 | add.num = as.data.frame(create_pairs(select.cl, direction="directional")) 748 | add.num$num = pair.num 749 | } 750 | if(!is.null(default.markers)){ 751 | de.checked.num = check_pairs_ds(de.dir, add.num, genes=default.markers, cl.bin=cl.bin,...) 752 | add.num = suppressMessages(add.num %>% left_join(de.checked.num,by=c("P1","P2"))) 753 | add.num = add.num %>% mutate(checked=ifelse(is.na(checked),0,checked)) %>% mutate(num = num-checked) 754 | add.add = add.num %>% filter(num > 0) 755 | genes = setdiff(genes, default.markers) 756 | add.num$checked=NULL 757 | } 758 | markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num, genes=genes,cl.bin=cl.bin,...)$select.genes 759 | } 760 | 761 | #' INFO -- PLEASE ADD -- 762 | #' 763 | #' @param in.df to_be_added 764 | #' @param cl.df to_be_added 765 | #' 766 | #' @return ??? 767 | #' 768 | #' @keywords internal 769 | select_pos_markers_ds <- function(de.dir, cl, select.cl, genes, cl.bin, n.markers=1, mc.cores=1,out.dir="cl.markers",...) 770 | { 771 | library(parallel) 772 | require(doMC) 773 | require(foreach) 774 | registerDoMC(cores=mc.cores) 775 | ds = open_dataset(de.dir) 776 | if (!dir.exists(out.dir)) { 777 | dir.create(out.dir) 778 | } 779 | 780 | ###for each cluster, find markers that discriminate it from other types 781 | cl.markers <- foreach(x=select.cl, .combine="c") %dopar% { 782 | #print(x) 783 | g1=x 784 | g2 = setdiff(cl, x) 785 | cl.bin.x = cl.bin %>% filter(cl==g1) %>% pull(bin) 786 | select.de = ds %>% filter(bin.x==cl.bin.x & P1==g1 & P2 %in% g2) %>% collect() 787 | markers <- select_markers_pair_group_ds(g1,g2, de.dir=de.dir, de=select.de, genes=genes, cl.bin=cl.bin, n.markers=n.markers,select.sign="up",...) 788 | save(markers, file=file.path(out.dir, paste0(x, ".markers.rda"))) 789 | tmp=list(markers) 790 | names(tmp)=x 791 | tmp 792 | } 793 | return(cl.markers) 794 | } 795 | 796 | #' INFO -- PLEASE ADD -- 797 | #' 798 | #' @param in.df to_be_added 799 | #' @param cl.df to_be_added 800 | #' 801 | #' @return ??? 802 | #' 803 | #' @keywords internal 804 | select_top_pos_markers_ds<- function(ds, cl, select.cl, genes, cl.bin, n.markers=3, mc.cores=10,...) 805 | { 806 | library(parallel) 807 | require(doMC) 808 | require(foreach) 809 | registerDoMC(cores=mc.cores) 810 | 811 | ###for each cluster, find markers that discriminate it from other types 812 | 813 | cl.markers <- foreach(x=select.cl, .combine="c") %dopar% { 814 | #print(x) 815 | g1=x 816 | g2 = setdiff(cl, x) 817 | cl.bin.x = cl.bin %>% filter(cl==g1) %>% pull(bin) 818 | select.de = ds %>% filter(bin.x==cl.bin.x & P1==g1 & P2 %in% g2) %>% collect() 819 | markers= select_markers_pair_group_top_ds(g1,g2,ds, genes=genes,cl.bin=cl.bin, de=select.de,n.markers=n.markers,select.sign="up",...)$up.genes 820 | tmp=list(markers) 821 | names(tmp)=x 822 | tmp 823 | } 824 | return(cl.markers) 825 | } 826 | 827 | #' INFO -- PLEASE ADD -- 828 | #' 829 | #' @param in.df to_be_added 830 | #' @param cl.df to_be_added 831 | #' 832 | #' @return ??? 833 | #' 834 | #' @keywords internal 835 | select_markers_groups_top_ds <- function(ds, cl.group, select.groups=names(cl.group), n.markers=3,mc.cores=1,...) 836 | { 837 | 838 | library(parallel) 839 | library(parallel) 840 | require(doMC) 841 | require(foreach) 842 | registerDoMC(cores=mc.cores) 843 | 844 | all.cl = unlist(cl.group) 845 | group.markers <- foreach(x=select.groups, .combine="c") %dopar% { 846 | #print(x) 847 | g1 = cl.group[[x]] 848 | g2 = setdiff(all.cl, g1) 849 | markers=select_markers_pair_group_top_ds(g1,g2,ds=ds, select.sign="up",n.markers=n.markers, ...)$up.genes 850 | list(markers) 851 | } 852 | names(group.markers) = select.groups 853 | return(group.markers) 854 | } 855 | 856 | #' INFO -- PLEASE ADD -- 857 | #' 858 | #' @param in.df to_be_added 859 | #' @param cl.df to_be_added 860 | #' 861 | #' @return ??? 862 | #' 863 | #' @keywords internal 864 | select_markers_groups <- function(de.dir, cl.group, genes, cl.bin, select.groups= unique(cl.group$group), n.markers=20,mc.cores=1,byg=10, ...) 865 | { 866 | ds = open_dataset(de.dir) 867 | cl.group$cl = as.character(cl.group$cl) 868 | group_pair=create_pairs(unique(cl.group$group)) 869 | library(parallel) 870 | require(doMC) 871 | require(foreach) 872 | registerDoMC(cores=mc.cores) 873 | 874 | group.markers <- foreach(i=1:nrow(group_pair), .combine="c") %dopar% { 875 | x= group_pair[i,1] 876 | y= group_pair[i,2] 877 | cat(i, nrow(group_pair), x,' vs ', y,"\r") 878 | g1 = cl.group %>% filter(group==x) %>% pull(cl) 879 | g2 = cl.group %>% filter(group==y) %>% pull(cl) 880 | result=select_markers_pair_group_top_ds(g1,g2,ds=ds, n.markers=n.markers, genes=genes,cl.bin=cl.bin,select.sign=c("up","down"),...) 881 | list(c(result$up.genes, result$down.genes)) 882 | } 883 | group.markers=unique(unlist(group.markers)) 884 | pairs = as.data.frame(create_pairs(cl.group$cl,direction="directional")) 885 | pairs$pair = row.names(pairs) 886 | pairs = suppressMessages(pairs %>% left_join(cl.group, by=c("P1"="cl"))) 887 | pairs = suppressMessages(pairs %>% left_join(cl.group, by=c("P2"="cl"))) 888 | pairs = pairs %>% filter(group.x!=group.y) 889 | 890 | registerDoMC(cores=10) 891 | de.checked.num = check_pairs_ds(de.dir, pairs[,c("P1","P2")], genes=group.markers,cl.bin=cl.bin, mc.cores=10, ...) 892 | add.num = suppressMessages(pairs %>% left_join(de.checked.num) %>% mutate(num=n.markers - checked)) 893 | 894 | select.markers=group.markers 895 | genes = setdiff(genes, select.markers) 896 | #save(group.markers, pairs, add.num, de.dir, genes, cl.bin, file="Debug.rda") 897 | more.markers <- select_markers_pair_direction_ds(de.dir, add.num=add.num, genes=genes, cl.bin=cl.bin, mc.cores=10, byg=byg, ...) 898 | select.markers = c(select.markers, more.markers$select.genes) 899 | if (any(is.na(select.markers))) {print("markers include NA")} 900 | select.markers = setdiff(select.markers, NA) 901 | if (any(is.na(select.markers))) {print("markers include NA") ; browser() } 902 | return(select.markers) 903 | } 904 | 905 | #' INFO -- PLEASE ADD -- 906 | #' 907 | #' @param in.df to_be_added 908 | #' @param cl.df to_be_added 909 | #' 910 | #' @return ??? 911 | #' 912 | #' @keywords internal 913 | get_de_genes <- function(ds, cl.bin, cl1, cl2) 914 | { 915 | bin1 = cl.bin %>% filter(cl==cl1) %>% pull(bin) 916 | bin2 = cl.bin %>% filter(cl==cl2) %>% pull(bin) 917 | select.genes = ds %>% filter(bin.x==bin1 & P1==cl1 & bin.y==bin2 & P2==cl2 | bin.x==bin2 & P2==cl1 & bin.y==bin1 & P1==cl2) %>% collect() 918 | return(select.genes) 919 | } 920 | 921 | 922 | 923 | #' INFO -- PLEASE ADD -- 924 | #' 925 | #' @param in.df to_be_added 926 | #' @param cl.df to_be_added 927 | #' 928 | #' @return ??? 929 | #' 930 | #' @keywords internal 931 | 932 | prep_parquet_de_all_pairs <- function (norm.dat, cl, cl.bin = NULL, cl.bin.size = 100, de.param = de_param(), 933 | method = "fast_limma", mc.cores = 1, pairs.fn = "pairs.parquet", cl.bin.fn="cl.bin.rda", ...) 934 | { 935 | cn <- as.character(sort(unique(cl))) 936 | pairs = create_pairs(cn) 937 | pairs = as.data.frame(pairs) 938 | 939 | pairs$pair = row.names(pairs) 940 | pairs$pair_id = 1:nrow(pairs) 941 | if (is.null(cl.bin)) { 942 | cl.bin.size = min(100, length(cn)/mc.cores) 943 | cl.bin = data.frame(cl = cn, bin = ceiling((1:length(cn)/cl.bin.size))) 944 | } 945 | library(arrow) 946 | write_parquet(pairs, sink = pairs.fn) 947 | save(cl.bin, file=cl.bin.fn) 948 | de.result = prep_parquet_de_selected_pairs(norm.dat, cl = cl, pairs = pairs, 949 | cl.bin = cl.bin, de.param = de.param, method = method, 950 | mc.cores = mc.cores, ...) 951 | #de.result = de_selected_pairs(norm.dat, cl = cl, pairs = pairs, 952 | # de.param = de.param, method = method, mc.cores = mc.cores, ...) 953 | return(de.result$de.genes) 954 | } 955 | 956 | #' INFO -- PLEASE ADD -- 957 | #' 958 | #' @param in.df to_be_added 959 | #' @param cl.df to_be_added 960 | #' 961 | #' @return ??? 962 | #' 963 | #' @keywords internal 964 | prep_parquet_de_selected_pairs <- function (norm.dat, cl, pairs, cl.bin = NULL, cl.size = NULL, 965 | de.param = de_parm(), method = "fast_limma", cl.means = NULL, 966 | cl.present = NULL, cl.sqr.means = NULL, use.voom = FALSE, 967 | counts = NULL, mc.cores = 1, out.dir = NULL, summary.dir = NULL, 968 | top.n = 500, overwrite = FALSE, return.df = FALSE, return.summary = !is.null(summary.dir)) 969 | { 970 | library(arrow) 971 | method <- match.arg(method, choices = c("fast_limma", "limma", 972 | "chisq", "t.test")) 973 | require(parallel) 974 | if (use.voom & is.null(counts)) { 975 | stop("The use.voom = TRUE parameter requires a raw count matrix via the counts parameter.") 976 | } 977 | if (is.null(cl.size)) { 978 | cl.size <- table(cl) 979 | cl.size = setNames(as.integer(cl.size), names(cl.size)) 980 | } 981 | pairs.fn = NULL 982 | if (length(pairs) == 1) { 983 | pairs.fn = pairs 984 | pairs = open_dataset(pairs.fn) 985 | } 986 | else { 987 | pairs = as.data.frame(pairs) 988 | if (is.null(pairs$pair)) { 989 | pairs$pair = row.names(pairs) 990 | } 991 | if (is.null(pairs$pair_id)) { 992 | pairs$pair_id = 1:nrow(pairs) 993 | } 994 | } 995 | 996 | select.cl <- unique(c(pairs %>% pull(P1), pairs %>% pull(P2))) 997 | select.cl <- intersect(select.cl, names(cl.size)[cl.size >= 998 | de.param$min.cells]) 999 | cl <- cl[cl %in% select.cl] 1000 | if (is.factor(cl)) { 1001 | cl = droplevels(cl) 1002 | } 1003 | if (is.null(pairs$bin.x)) { 1004 | if (is.null(cl.bin)) { 1005 | cl.bin.size = min(100, length(select.cl)/mc.cores) 1006 | cl.bin = data.frame(cl = select.cl, bin = ceiling((1:length(select.cl)/cl.bin.size))) 1007 | } 1008 | pairs = pairs %>% left_join(cl.bin, by = c(P1 = "cl")) %>% 1009 | left_join(cl.bin, by = c(P2 = "cl")) 1010 | } 1011 | pairs = pairs %>% filter(P1 %in% select.cl & P2 %in% select.cl) %>% 1012 | collect() 1013 | cl.size = cl.size[select.cl] 1014 | if (is.null(cl.means)) { 1015 | cl.means <- as.data.frame(get_cl_means(norm.dat, cl)) 1016 | } 1017 | else { 1018 | cl.means <- as.data.frame(cl.means) 1019 | } 1020 | if (is.null(cl.present)) { 1021 | cl.present <- as.data.frame(get_cl_present(norm.dat, 1022 | cl, de.param$low.th)) 1023 | } 1024 | else { 1025 | cl.present <- as.data.frame(cl.present) 1026 | } 1027 | if (is.null(cl.sqr.means)) { 1028 | cl.sqr.means <- as.data.frame(get_cl_sqr_means(norm.dat, 1029 | cl)) 1030 | } 1031 | else { 1032 | cl.sqr.means <- as.data.frame(cl.sqr.means) 1033 | } 1034 | if (method == "limma") { 1035 | require("limma") 1036 | norm.dat <- as.matrix(norm.dat[, names(cl)]) 1037 | cl <- setNames(as.factor(paste0("cl", cl)), names(cl)) 1038 | design <- model.matrix(~0 + cl) 1039 | colnames(design) <- levels(as.factor(cl)) 1040 | if (use.voom & !is.null(counts)) { 1041 | v <- limma::voom(counts = as.matrix(counts[row.names(norm.dat), 1042 | names(cl)]), design = design) 1043 | fit <- limma::lmFit(object = v, design = design) 1044 | } 1045 | else { 1046 | fit <- limma::lmFit(object = norm.dat[, names(cl)], 1047 | design = design) 1048 | } 1049 | } 1050 | else if (method == "fast_limma") { 1051 | fit = simple_lmFit(norm.dat, cl = cl, cl.means = cl.means, 1052 | cl.sqr.means = cl.sqr.means) 1053 | } 1054 | else if (method == "t.test") { 1055 | cl.vars <- as.data.frame(get_cl_vars(norm.dat, cl, cl.means = cl.means)) 1056 | } 1057 | require(doMC) 1058 | require(foreach) 1059 | mc.cores = min(mc.cores, ceiling(nrow(pairs)/5000)) 1060 | registerDoMC(cores = mc.cores) 1061 | de_combine <- function(result.1, result.2) { 1062 | library(data.table) 1063 | de.genes = c(result.1$de.genes, result.2$de.genes) 1064 | if (!is.null(result.1$de.summary)) { 1065 | de.summary = rbindlist(result.1$de.summary, result.2$de.summary) 1066 | return(list(de.genes = de.genes, de.summary = de.summary)) 1067 | } 1068 | else { 1069 | return(list(de.genes = de.genes)) 1070 | } 1071 | } 1072 | if (!is.null(out.dir)) { 1073 | if (!dir.exists(out.dir)) { 1074 | dir.create(out.dir) 1075 | } 1076 | } 1077 | if (!is.null(summary.dir)) { 1078 | if (!dir.exists(summary.dir)) { 1079 | dir.create(summary.dir) 1080 | } 1081 | } 1082 | all.bins = sort(unique(c(pairs$bin.x, pairs$bin.y))) 1083 | #print("check. all.bins") ; browser() 1084 | de_list = foreach(bin1 = 1:length(all.bins), .combine = "c") %:% 1085 | foreach(bin2 = bin1:length(all.bins), .combine = "c") %dopar% 1086 | { 1087 | x = all.bins[bin1] 1088 | y = all.bins[bin2] 1089 | if (!overwrite & !is.null(out.dir)) { 1090 | if (file.exists(file.path(out.dir, paste0("bin.x=", 1091 | x), paste0("bin.y=", y)))) { 1092 | return(list(de.genes = NULL, de.summary = NULL)) 1093 | } 1094 | } 1095 | library(dplyr) 1096 | library(arrow) 1097 | library(data.table) 1098 | tmp.pairs = pairs %>% filter(bin.x == x & bin.y == 1099 | y | bin.x == y & bin.y == x) 1100 | if (is.null(tmp.pairs) | nrow(tmp.pairs) == 0) { 1101 | return(list(de.genes = NULL, de.summary = NULL)) 1102 | } 1103 | de.genes = sapply(1:nrow(tmp.pairs), function(i) { 1104 | pair = unlist(tmp.pairs[i, c("P1", "P2")]) 1105 | if (method == "limma") { 1106 | require("limma") 1107 | df = de_pair_limma(pair = pair, cl.present = cl.present, 1108 | cl.means = cl.means, design = design, fit = fit) 1109 | } 1110 | else if (method == "fast_limma") { 1111 | df = de_pair_fast_limma(pair = pair, cl.present = cl.present, 1112 | cl.means = cl.means, fit = fit) 1113 | } 1114 | else if (method == "t.test") { 1115 | df = de_pair_t.test(pair = pair, cl.present = cl.present, 1116 | cl.means = cl.means, cl.vars = cl.vars, cl.size = cl.size) 1117 | } 1118 | else if (method == "chisq") { 1119 | df = de_pair_chisq(pair = pair, cl.present = cl.present, 1120 | cl.means = cl.means, cl.size = cl.size) 1121 | } 1122 | if (!is.null(de.param$min.cells)) { 1123 | cl.size1 <- cl.size[as.character(pair[1])] 1124 | cl.size2 <- cl.size[as.character(pair[2])] 1125 | } 1126 | else { 1127 | cl.size1 <- NULL 1128 | cl.size2 <- NULL 1129 | } 1130 | stats = de_stats_pair(df, de.param = de.param, 1131 | cl.size1, cl.size2, return.df = return.df) 1132 | }, simplify = F) 1133 | pair = tmp.pairs %>% pull(pair) 1134 | names(de.genes) = pair 1135 | if (return.summary) { 1136 | de.summary = scrattch.bigcat::de_pair_summary(de.genes, cl.bin=cl.bin, out.dir = summary.dir, 1137 | pairs = tmp.pairs, return.df = is.null(summary.dir)) 1138 | } 1139 | if (!is.null(out.dir)) { 1140 | cat("Export", bin1, bin2, "\n") 1141 | result = scrattch.bigcat::export_de_genes(de.genes, cl.means, 1142 | out.dir = out.dir, pairs = tmp.pairs, 1143 | mc.cores = 1, top.n = top.n) 1144 | cat("Finish Export", x, y, "\n") 1145 | } 1146 | out = list(de.genes = de.genes, de.summary = de.summary) 1147 | 1148 | de.summary = NULL 1149 | de.genes = NULL 1150 | out 1151 | } 1152 | 1153 | de.genes = do.call("c", de_list[names(de_list) == "de.genes"]) 1154 | names(de.genes) = gsub("de.genes.", "", names(de.genes)) 1155 | de.summary = do.call("c", de_list[names(de_list) == "de.summary"]) 1156 | if (is.null(de.summary)) { 1157 | return(de.genes) 1158 | } 1159 | return(list(de.genes, de.summary)) 1160 | } 1161 | 1162 | 1163 | -------------------------------------------------------------------------------- /analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT13.0_mouse/cl.clean.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/ABC_MWB_taxonomy/AIT21.0_mouse/cl.clean.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/broad.roi.key.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/broad.roi.key.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/col.age.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/col.age.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/col.broi.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/col.broi.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/genotype.genes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/genotype.genes.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/sex.genes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/sex.genes.rda -------------------------------------------------------------------------------- /analysis_scripts/r_objects/supertype.exclude.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenInstitute/mouse_aging_scRNAseq/eabe640fe109ffacc8582716ba8e337273b4b85e/analysis_scripts/r_objects/supertype.exclude.rda --------------------------------------------------------------------------------