├── .DS_Store ├── data_analysis ├── .DS_Store ├── drug_discovery │ ├── .DS_Store │ ├── DGE │ │ ├── .DS_Store │ │ ├── deconvoluted │ │ │ ├── .DS_Store │ │ │ ├── DGE-analysis-dec.sh │ │ │ └── DGE-analysis-dec.R │ │ ├── annotated │ │ │ ├── DGE_annotated.sh │ │ │ └── DGE-annotated.R │ │ ├── svg_deg_filter_quest.sh │ │ └── svg_deg_filter_quest.py │ ├── PPI_Drug_Enrichment_Perturbation │ │ ├── ppi_quest.sh │ │ ├── drug_screen_perturb_quest.sh │ │ ├── ppi_quest.py │ │ └── drug_screen_perturb_quest.py │ └── README.md ├── spatial_clustering │ ├── quest_stagate_to_seurat_updated_jobarray.R │ ├── Archive │ │ ├── quest_stagate_to_seurat_jobarray.R │ │ ├── quest_step02_stagate_jobarray.R │ │ └── quest_step01_stagate_jobarray.py │ ├── README.md │ └── quest_stagate_updated_jobarray.py ├── cell_cell_interaction │ ├── distance-based │ │ ├── cci-analysis-COMMOT-DGE-step3.R │ │ ├── cci-analysis-COMMOT-DGE-step2.py │ │ ├── cci-analysis-COMMOT-DGE-step1.R │ │ ├── cci-analysis-COMMOT-DGE-step4.py │ │ ├── cci-analysis-COMMOT-DGE.sh │ │ ├── cci-analysis-COMMOT.py │ │ ├── cci-analysis-COMMOT-pull-scores.py │ │ └── cci-analysis-COMMOT-DGE-step0.py │ └── neighborhood-based │ │ └── adj-analysis.R ├── cell_typing │ ├── deconvolution │ │ ├── create_input_files.R │ │ ├── quest_deconvolution_jobarray.R │ │ └── process_reference_example.R │ ├── reference │ │ ├── geo-download-scRNA-seq.py │ │ └── ref_data_processing_example.R │ └── annotation │ │ ├── annotation_example.R │ │ └── runBrainCellTypeAnnotation-CluHeu.R └── spatial_variability │ ├── quest_SpatialDE_ct_specific.py │ └── quest_SpatialDE_jobarray.py ├── .gitignore ├── data_curation └── geo-query.py ├── README.md └── data_processing ├── process_visium_standard.R └── process_non_visium_standard.R /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/.DS_Store -------------------------------------------------------------------------------- 
/data_analysis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/.DS_Store -------------------------------------------------------------------------------- /data_analysis/drug_discovery/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/.DS_Store -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/DGE/.DS_Store -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/deconvoluted/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/DGE/deconvoluted/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | data_analysis/drug_discovery/.DS_Store 3 | data_analysis/drug_discovery/DGE/.DS_Store 4 | data_analysis/.DS_Store 5 | data_analysis/.DS_Store 6 | data_analysis/drug_discovery/.DS_Store 7 | data_analysis/drug_discovery/DGE/.DS_Store 8 | data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/.DS_Store 9 | data_analysis/drug_discovery/.DS_Store 10 | data_analysis/drug_discovery/.DS_Store 11 | data_analysis/.DS_Store 12 | data_analysis/.DS_Store 13 | data_analysis/drug_discovery/.DS_Store 14 | data_analysis/.DS_Store 15 | data_analysis/cell_cell_interaction/.DS_Store 16 | data_analysis/cell_typing/.DS_Store 17 | data_analysis/drug_discovery/.DS_Store 18 | 
-------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/deconvoluted/DGE-analysis-dec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A p31931 3 | #SBATCH -p normal 4 | #SBATCH -t 01:00:00 5 | #SBATCH -N 1 6 | #SBATCH -n 1 7 | #SBATCH --array=2-6 8 | #SBATCH --mem=30G 9 | #SBATCH --mail-user=yiming.li@northwestern.edu 10 | #SBATCH --mail-type=BEGIN,END,FAIL 11 | #SBATCH --job-name="DGE2%a" 12 | #SBATCH --output=/projects/b1131/ylz8811/pbs-cmds/DGE2_b4_%a.out 13 | 14 | module purge all 15 | module load R/4.1.1 16 | module load geos/3.8.1 17 | 18 | cd /projects/b1131/SpatialT/ 19 | 20 | IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/deconvoluted_samples.txt 21 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 22 | sample_dir=${split_dirs[0]} 23 | 24 | echo "Sample directory: ${sample_dir}" 25 | 26 | Rscript --vanilla /projects/b1131/SpatialT/ref_scripts/DGE-analysis-dec.R $sample_dir 27 | -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/annotated/DGE_annotated.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A b1042 3 | #SBATCH -p genomics 4 | #SBATCH -t 1:00:00 5 | #SBATCH -N 1 6 | #SBATCH -n 1 7 | #SBATCH --array=0-8 8 | #SBATCH --mem=30G 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu 10 | #SBATCH --mail-type=END,FAIL 11 | #SBATCH --job-name="DGE%a" 12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/deg/out/DGE_%a.out 13 | 14 | module purge all 15 | module load R/4.4.0 16 | 17 | cd /projects/b1131/SpatialT/ 18 | 19 | IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/cosmx_colon_case_revision_samples.txt 20 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 21 | 
sample_dir=${split_dirs[0]} 22 | 23 | echo "Sample directory: ${sample_dir}" 24 | 25 | Rscript --vanilla /projects/b1131/SpatialT/ref_scripts/DGE-annotated.R $sample_dir -------------------------------------------------------------------------------- /data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A b1042 3 | #SBATCH -p genomics 4 | #SBATCH -t 2:00:00 5 | #SBATCH -N 1 6 | #SBATCH -n 1 7 | #SBATCH --array=0-4 8 | #SBATCH --mem=10G 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu 10 | #SBATCH --mail-type=END,FAIL 11 | #SBATCH --job-name="ppi%a" 12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/drug/ppi%a.out 13 | 14 | module purge all 15 | module load python-miniconda3/4.12.0 16 | source activate myenv 17 | 18 | IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/cosmx_patho_samples.tsv 19 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 20 | dsid=${split_dirs[0]} 21 | sampleid=${split_dirs[1]} 22 | 23 | echo "DSID: ${dsid}" 24 | echo "SampleiD: ${sampleid}" 25 | python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/drug/ppi_quest.py $dsid $sampleid -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/quest_stagate_to_seurat_updated_jobarray.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(data.table) 3 | library(stringr) 4 | 5 | # Usage: Rscript quest_stagate_to_seurat_updated_jobarray.R <sample_dir>/ (the trailing "/" is required: most paths below are pasted directly onto it) 6 | sample_dir <- commandArgs(trailingOnly=TRUE)[1] 7 | 8 | coord <- read.csv(paste0(sample_dir, "/processed/coordinates.csv")) 9 | so <- readRDS(paste0(sample_dir, "processed/Seurat.RDS")) 10 | 11 | clustering_results <- fread(paste0(sample_dir, "analysis/clustering/STAGATE_clusters.csv")) 12 | colnames(clustering_results) <- c("new_spot_id", 
"STAGATE_cluster") 13 | 14 | # keep only seurat obs where they are present in the coordinates column, this is the obs in which clustering are done on 15 | so <- so[, so@meta.data$new_spot_id %in% coord$barcode] 16 | # attach cluster labels by matching new_spot_id instead of by row position, so a label can never land on the wrong spot; spots without a clustering result get NA 17 | so[["STAGATE_cluster"]] <- clustering_results$STAGATE_cluster[match(so@meta.data$new_spot_id, clustering_results$new_spot_id)] 18 | saveRDS(so, file = paste0(sample_dir, "processed/Seurat.RDS")) 19 | -------------------------------------------------------------------------------- /data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A b1042 3 | #SBATCH -p genomics 4 | #SBATCH -t 8:00:00 5 | #SBATCH -N 1 6 | #SBATCH -n 20 7 | #SBATCH --array=0-4 8 | #SBATCH --mem=188G 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu 10 | #SBATCH --mail-type=END,FAIL 11 | #SBATCH --job-name="drug%a" 12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/drug/out/drug_rerun%a.out 13 | 14 | module purge all 15 | module load python-miniconda3/4.12.0 16 | source activate myenv 17 | 18 | IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/patho_samples_drug_cosmx.tsv 19 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 20 | dsid=${split_dirs[0]} 21 | sampleid=${split_dirs[1]} 22 | 23 | echo "DSID: ${dsid}" 24 | echo "SampleiD: ${sampleid}" 25 | python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/drug/drug_screen_perturb_quest.py $dsid $sampleid -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/svg_deg_filter_quest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A b1042 3 | #SBATCH -p genomics 4 | #SBATCH -t 1:00:00 5 | #SBATCH -N 1 6 | #SBATCH -n 1 7 | #SBATCH 
--array=0-4 8 | #SBATCH --mem=24G 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu 10 | #SBATCH --mail-type=END,FAIL 11 | #SBATCH --job-name="deg_svg_%a" 12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/svg/out/deg_svg_%a.out 13 | 14 | module purge all 15 | module load python-miniconda3/4.12.0 16 | source activate myenv 17 | 18 | IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/cosmx_patho_samples.tsv 19 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 20 | dsid=${split_dirs[0]} 21 | sampleid=${split_dirs[1]} 22 | tech=${split_dirs[2]} 23 | 24 | echo "DSID: ${dsid}" 25 | echo "SampleID: ${sampleid}" 26 | echo "Tech: ${tech}" 27 | 28 | python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/SVG/svg_deg_filter_cosmx_case_quest.py $dsid $sampleid $tech -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/Archive/quest_stagate_to_seurat_jobarray.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(data.table) 3 | library(stringr) 4 | 5 | sample_dir <- commandArgs(trailingOnly=TRUE) 6 | 7 | coord = read.csv(paste0(sample_dir, "/processed/coordinates.csv")) 8 | optimal_n <- dir(paste0(sample_dir, "analysis/clustering"), pattern = "obsm_STAGATE_cluster_") 9 | optimal_n <- str_split(str_split(optimal_n, "_cluster_")[[1]][2], '.csv')[[1]][1] 10 | 11 | so <- readRDS(paste0(sample_dir, "processed/Seurat.RDS")) 12 | meta <- so@meta.data 13 | meta$row <- 1:nrow(meta) 14 | 15 | clustering_results <- fread(paste0(sample_dir, "analysis/clustering/obsm_STAGATE_cluster_", optimal_n, ".csv")) 16 | colnames(clustering_results) <- c("new_spot_id", "STAGATE_cluster") 17 | 18 | meta <- merge(meta, clustering_results, by = "new_spot_id") 19 | meta <- meta[order(meta$row),] 20 | 21 | # keep only seurat obs where they are present in the coordinates column, this is the obs in which 
clustering are done on 22 | so <- so[, so@meta.data$new_spot_id %in% coord$barcode] 23 | # attach cluster labels by matching new_spot_id instead of by row position, so a label can never land on the wrong spot; spots without a clustering result get NA 24 | so[["STAGATE_cluster"]] <- meta$STAGATE_cluster[match(so@meta.data$new_spot_id, meta$new_spot_id)] 25 | saveRDS(so, file = paste0(sample_dir, "processed/Seurat.RDS")) 26 | 27 | -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/Archive/quest_step02_stagate_jobarray.R: -------------------------------------------------------------------------------- 1 | library(mclust) 2 | 3 | #### Calculating BIC for each number of clusters #### 4 | # The single argument is the sample directory; it is pasted directly in front of "analysis/...", so it must end with "/" 5 | args <- commandArgs(trailingOnly=TRUE) 6 | 7 | data <- read.csv(paste0(args, "analysis/clustering/obsm_STAGATE.csv")) 8 | spot_ids <- data[,1] 9 | 10 | print(paste0(">>> ", args, " started calculating BIC <<<")) 11 | max_cluster <- 30 12 | if (nrow(data) < 60) { 13 | # floor() keeps the upper bound a whole number when the spot count is odd 14 | max_cluster <- floor(nrow(data)/2) 15 | print(paste0('MAX CLUSTER: ', max_cluster)) 16 | } 17 | if (max_cluster < 2) { 18 | stop("Too few spots in ", args, " to fit a mixture with at least 2 clusters") 19 | } 20 | 21 | # Fit an "EEE" mixture with n components; when that returns an empty result, retry with mclust's default set of models 22 | fit_mixture <- function(n) { 23 | res <- Mclust(data[,-1], n, "EEE") 24 | if(length(res) == 0) { 25 | res <- Mclust(data[,-1], n) 26 | } 27 | res 28 | } 29 | 30 | candidates <- 2:max_cluster 31 | # one slot per candidate: a fit that fails stays NA instead of shifting the positions of the later candidates 32 | bic <- rep(NA_real_, length(candidates)) 33 | for (k in seq_along(candidates)) { 34 | res <- fit_mixture(candidates[k]) 35 | if (length(res) > 0) { 36 | bic[k] <- res$bic 37 | } 38 | } 39 | if (all(is.na(bic))) { 40 | stop("Mclust could not fit any candidate number of clusters for ", args) 41 | } 42 | 43 | #### Visualize using the optimal number of clusters #### 44 | # mclust defines BIC as 2*logLik - npar*log(n), so the preferred model is the one with the LARGEST value 45 | n_clust <- candidates[which.max(bic)] 46 | print(paste0(">>> Optimal number of cluster for", args, ": ", n_clust, " <<<")) 47 | res <- fit_mixture(n_clust) 48 | write.table(data.frame(res$classification), file=paste0(args, "analysis/clustering/obsm_STAGATE_cluster_", n_clust, ".csv"), quote=FALSE, sep=",", row.names=spot_ids, col.names=c("cluster")) 49 | -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/svg_deg_filter_quest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | dsid = sys.argv[1] 6 | sampleid = sys.argv[2] 7 | tech = sys.argv[3] 8 | pid = 
'PID'+dsid.split('DS')[1] 9 | 10 | dge_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_dec/' 11 | svg_dir = '/projects/b1131/SpatialT/'+tech+'/'+pid+'/'+dsid+'/'+sampleid+'/analysis/SVG/' 12 | sample_dge_dir = dge_dir.split('DGE_dec/')[0] 13 | 14 | svg = pd.read_csv(svg_dir+'SpatialDE_results.tsv', sep='\t') 15 | svg_filter = svg[svg['qval']<=0.1] 16 | print('svg read in') 17 | 18 | if not os.path.exists(sample_dge_dir+'DGE_dec_SVG'): 19 | os.makedirs(sample_dge_dir+'DGE_dec_SVG') 20 | 21 | for file in os.listdir(dge_dir): 22 | cell_type = file.split('.txt')[0] 23 | print(cell_type) 24 | deg = pd.read_csv(dge_dir+file, sep='\t') 25 | deg=deg[deg['gene'].isin(svg_filter['g'].tolist())] 26 | print('SVG filtered') 27 | 28 | df_up = deg[(deg['stat']>0.5) & (deg['qval']<0.05) ] 29 | df_down = deg[(deg['stat']<-0.5) & (deg['qval']<0.05) ] 30 | print('DEG up and down shape:', df_up.shape [0],df_down.shape[0]) 31 | 32 | deg.to_csv(sample_dge_dir+'DGE_dec_SVG'+'/'+cell_type+'.csv',index=False) 33 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step3.R: -------------------------------------------------------------------------------- 1 | library(stringr) 2 | library(data.table) 3 | library(tradeSeq) 4 | library(clusterExperiment) 5 | 6 | args <- commandArgs(trailingOnly=TRUE) 7 | 8 | sample_dir <- args[1] 9 | thr_type <- args[2] 10 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1" 11 | # thr_type <- "short" 12 | 13 | out_dir <- paste0(sample_dir, "/analysis/Distance/COMMOT_dec/", thr_type) 14 | pathways <- fread(paste0(out_dir, "/pathways_step1_success.txt"), header = FALSE)$V1 15 | pathways_success <- character(0) 16 | 17 | for (pathway in pathways) { 18 | # pathway = "CXCL" 19 | pathway_dir <- paste0(out_dir, "/", pathway) 20 | 21 | possibleError <- tryCatch( { 22 | sce <- readRDS(paste0(pathway_dir, "/step1_sce.RDS")) 23 | 
assoRes <- fread(paste0(pathway_dir, "/assoRes.txt"), sep = "\t") 24 | genes <- assoRes$V1 25 | assoRes$V1 <- NULL 26 | assoRes <- data.frame(assoRes) 27 | rownames(assoRes) <- genes 28 | 29 | source(paste0(pathway_dir, "/step3.R")) 30 | fwrite(yhatScaled, paste0(pathway_dir, "/yhatScaled.txt"), sep = "\t", row.names = TRUE) 31 | }, 32 | error=function(e) { message("[", pathway, "] step 3 failed: ", conditionMessage(e)); e } # report why the pathway failed, then return the condition so the pathway is still recorded as failed 33 | ) 34 | if(!inherits(possibleError, "error")) { 35 | pathways_success <- c(pathways_success, pathway) 36 | } 37 | } 38 | 39 | pathways_failed <- pathways[!pathways %in% pathways_success] 40 | fwrite(data.frame(pathways_success), paste0(out_dir, "/pathways_step3_success.txt"), col.names = FALSE) 41 | fwrite(data.frame(pathways_failed), paste0(out_dir, "/pathways_step3_failed.txt"), col.names = FALSE) 42 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step2.py: -------------------------------------------------------------------------------- 1 | import os, sys, pickle, datetime, anndata 2 | import commot as ct 3 | import scanpy as sc 4 | import pandas as pd 5 | import numpy as np 6 | import scipy 7 | from collections import Counter 8 | 9 | thr_type = sys.argv[2] 10 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec/" + thr_type 11 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short' 12 | 13 | with open(out_dir + "/pathways_step1_success.txt") as f: 14 | pathways = [line.rstrip('\n') for line in f] 15 | 16 | for pathway in pathways: 17 | # pathway = "MIF" 18 | pathway_dir = out_dir + "/" + pathway 19 | 20 | df_assoRes = pd.read_csv(pathway_dir + "/assoRes.txt", sep = "\t", index_col = 0) 21 | n_deg_genes = df_assoRes.shape[0] 22 | 23 | n_points = 50 24 | deg_pvalue_cutoff = 0.05 25 | 26 | string_step3 = 'assoRes <- assoRes[which(assoRes$pvalue_1 <= %f),]' % deg_pvalue_cutoff 27 | string_step3 = string_step3 + '\noAsso <- order(assoRes[,"waldStat_1"], 
decreasing=TRUE)' 28 | string_cluster = 'clusPat <- clusterExpressionPatterns(sce, nPoints = %d,' % n_points\ 29 | + 'verbose=TRUE, genes = rownames(assoRes)[oAsso][1:min(%d,length(oAsso))],' % n_deg_genes \ 30 | + ' k0s=4:5, alphas=c(0.1))' 31 | 32 | string_step3 = string_step3 + '\n' + string_cluster 33 | string_step3 = string_step3 + '\nyhatScaled <- data.frame(clusPat$yhatScaled)\n' 34 | 35 | with open(pathway_dir + "/step3.R", "w") as text_file: 36 | _ = text_file.write(string_step3) 37 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step1.R: -------------------------------------------------------------------------------- 1 | library(stringr) 2 | library(data.table) 3 | library(tradeSeq) 4 | library(clusterExperiment) 5 | 6 | args <- commandArgs(trailingOnly=TRUE) 7 | 8 | sample_dir <- args[1] 9 | thr_type <- args[2] 10 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1" 11 | # thr_type <- "short" 12 | 13 | out_dir <- paste0(sample_dir, "/analysis/Distance/COMMOT_dec/", thr_type) 14 | pathways <- fread(paste0(out_dir, "/pathways.txt"), header = FALSE)$V1 15 | pathways_success <- character(0) 16 | 17 | for (pathway in pathways) { 18 | # pathway = "MIF" 19 | pathway_dir <- paste0(out_dir, "/", pathway) 20 | 21 | possibleError <- tryCatch( { 22 | X <- fread(paste0(pathway_dir, "/step1_X.csv"), header = TRUE) 23 | pseudoTime <- fread(paste0(pathway_dir, "/step1_pseudoTime.csv"), header = FALSE)$V1 24 | cellWeight <- fread(paste0(pathway_dir, "/step1_cellWeight.csv"), header = FALSE)$V1 25 | spot_ids <- X$V1 26 | X$V1 <- NULL 27 | X <- t(as.matrix(X)) 28 | colnames(X) <- spot_ids 29 | 30 | source(paste0(pathway_dir, "/step1.R")) 31 | fwrite(assoRes, paste0(pathway_dir, "/assoRes.txt"), sep = "\t", row.names = TRUE) 32 | saveRDS(sce, paste0(pathway_dir, "/step1_sce.RDS")) 33 | }, 34 | error=function(e) { message("[", pathway, "] step 1 failed: ", conditionMessage(e)); e } # report why the pathway failed, then return the condition so the pathway is still recorded as failed 35 | ) 36 | if(!inherits(possibleError, 
"error")) { 37 | pathways_success <- c(pathways_success, pathway) 38 | } 39 | } 40 | 41 | pathways_failed <- pathways[!pathways %in% pathways_success] 42 | fwrite(data.frame(pathways_success), paste0(out_dir, "/pathways_step1_success.txt"), col.names = FALSE) 43 | fwrite(data.frame(pathways_failed), paste0(out_dir, "/pathways_step1_failed.txt"), col.names = FALSE) 44 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step4.py: -------------------------------------------------------------------------------- 1 | import os, sys, pickle, datetime, anndata 2 | import commot as ct 3 | import scanpy as sc 4 | import pandas as pd 5 | import numpy as np 6 | import scipy 7 | from collections import Counter 8 | 9 | thr_type = sys.argv[2] 10 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec/" + thr_type 11 | # thr_type = "short" 12 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short' 13 | 14 | with open(out_dir + "/pathways_step3_success.txt") as f: 15 | pathways = [line.rstrip('\n') for line in f] 16 | 17 | for pathway in pathways: 18 | # pathway = "MIF" 19 | pathway_dir = out_dir + "/" + pathway 20 | 21 | yhat_scaled = pd.read_csv(pathway_dir + "/yhatScaled.txt", sep = "\t", index_col = 0) 22 | df_assoRes = pd.read_csv(pathway_dir + "/assoRes.txt", sep = "\t", index_col = 0) 23 | 24 | df_deg = df_assoRes.rename(columns={'waldStat_1':'waldStat', 'df_1':'df', 'pvalue_1':'pvalue'}) 25 | df_deg = df_deg[['waldStat', 'df', 'pvalue']] 26 | idx = np.argsort(-df_deg['waldStat'].values) 27 | df_deg = df_deg.iloc[idx] 28 | df_yhat = yhat_scaled 29 | 30 | deg_result = {"df_deg": df_deg, "df_yhat": df_yhat} 31 | with open(pathway_dir + '/DEG_pt.pkl', 'wb') as handle: 32 | pickle.dump(deg_result, handle, protocol = pickle.HIGHEST_PROTOCOL) 33 | 34 | df_deg_clus, df_yhat_clus = ct.tl.communication_deg_clustering(df_deg, df_yhat, 
deg_clustering_res=0.4) 35 | top_de_genes = ct.pl.plot_communication_dependent_genes(df_deg_clus, df_yhat_clus, top_ngene_per_cluster=5, filename = pathway_dir + '/DEG.pdf', font_scale=1.2, return_genes = True) 36 | 37 | deg_result = {"df_deg": df_deg, "df_yhat": df_yhat, "df_deg_clus": df_deg_clus, "df_yhat_clus": df_yhat_clus, "top_de_genes": top_de_genes} 38 | with open(pathway_dir + '/DEG_full.pkl', 'wb') as handle: 39 | pickle.dump(deg_result, handle, protocol = pickle.HIGHEST_PROTOCOL) 40 | -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/README.md: -------------------------------------------------------------------------------- 1 | # Spatial Clustering 2 | Implementation of spatial clustering using gene expression and spatial location via the [STAGATE package](https://stagate.readthedocs.io/en/latest/index.html) 3 | - Requires `counts.csv` and `coordinates.csv` in the processed folder and a list of sample directories for running the shell scripts. 4 | - Steps of running: 5 | - Run [quest_stagate_updated_jobarray.py](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/spatial_clustering/quest_stagate_updated_jobarray.py) followed by [quest_stagate_to_seurat_updated_jobarray.R](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/spatial_clustering/quest_stagate_to_seurat_updated_jobarray.R) 6 | - quest_stagate_updated_jobarray.py: 7 | - **Please note that your provided argument (read into `sample_dir`) should have a "/" at the end of the path.** 8 | - Reads counts and coordinates into AnnData object and keeps only spots that have coordinates. 9 | - If the AnnData contains only integer counts and the max count is greater than 20, it is likely not log transformed during processing, so log1p is done. 10 | - Conducts normalization then finds the top 3000 highly variable genes by the CellRanger approach (expects normalized and transformed counts). 
11 | - Initial radius cutoff (`rad_cur`) was set to **2** for STAGATE to find neighbors for each spot. If the initial number of neighbors is less than **5**, step-wise addition of radius cutoff (`rad_add`) is done to reach at least 5 neighbors per spot. 12 | - For non-Visium technologies, `rad_add` might need to be increased (if reaching the optimal number is too slow) or decreased (if there are too many neighbors) so that the number of neighbors falls between **5 and 15**. 13 | - STAGATE spatial net is then trained. Neighbors are found on the STAGATE spatial+expression-reduced dimensions. 14 | - Louvain clustering is done using a resolution determined by the number of cells in a sample. 15 | - Reduced-dimension data from STAGATE is saved in `STAGATE_30dim.csv` and the cluster assignment is saved in `STAGATE_clusters.csv` in the clustering subfolder. 16 | - quest_stagate_to_seurat_updated_jobarray.R: 17 | - Adds the cluster assignment of each spot to the metadata of `Seurat.RDS` stored in the processed folder. 18 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IFS=$'\n' read -d '' -r -a input_args < sample_list.txt 4 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]} 5 | sample_dir=${split_dirs[0]} 6 | ref_dir=${split_dirs[1]} 7 | distance_type="short" # Supported arguments: short, medium, long, or xlong 8 | 9 | echo "### Sample directory: ${sample_dir}" 10 | echo "### Reference directory: ${ref_dir}" 11 | echo "### Distance type: ${distance_type}" 12 | 13 | module purge all 14 | module load python-miniconda3/4.12.0 15 | source activate SpatialT 16 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT.py $sample_dir $ref_dir $distance_type 17 | then 18 | echo "COMMOT main analysis completed" 19 | else 20 | echo "COMMOT main analysis failed" 21 
| exit 1 22 | fi 23 | 24 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step0.py $sample_dir $ref_dir $distance_type 25 | then 26 | echo "DGE - Step 0 (Python) completed" 27 | else 28 | echo "DGE - Step 0 (Python) failed" 29 | exit 1 30 | fi 31 | 32 | source deactivate 33 | module purge all 34 | module load R/4.1.1 35 | module load geos/3.8.1 36 | if Rscript --vanilla cci-analysis-COMMOT-DGE-step1.R $sample_dir $distance_type 37 | then 38 | echo "DGE - Step 1 (R) completed" 39 | else 40 | echo "DGE - Step 1 (R) failed" 41 | exit 1 42 | fi 43 | 44 | module purge all 45 | module load python-miniconda3/4.12.0 46 | source activate SpatialT 47 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step2.py $sample_dir $distance_type 48 | then 49 | echo "DGE - Step 2 (Python) completed" 50 | else 51 | echo "DGE - Step 2 (Python) failed" 52 | exit 1 53 | fi 54 | 55 | source deactivate 56 | module purge all 57 | module load R/4.1.1 58 | module load geos/3.8.1 59 | if Rscript --vanilla cci-analysis-COMMOT-DGE-step3.R $sample_dir $distance_type 60 | then 61 | echo "DGE - Step 3 (R) completed" 62 | else 63 | echo "DGE - Step 3 (R) failed" 64 | exit 1 65 | fi 66 | 67 | module purge all 68 | module load python-miniconda3/4.12.0 69 | source activate SpatialT 70 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step4.py $sample_dir $distance_type 71 | then 72 | echo "DGE - Step 4 (Python) completed" 73 | else 74 | echo "DGE - Step 4 (Python) failed" 75 | exit 1 76 | fi 77 | 78 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-pull-scores.py $sample_dir $ref_dir $distance_type 79 | then 80 | echo "Pulling scores completed" 81 | else 82 | echo "Pulling scores failed" 83 | exit 1 84 | fi 85 | -------------------------------------------------------------------------------- /data_analysis/cell_typing/deconvolution/create_input_files.R: -------------------------------------------------------------------------------- 
1 | ### Author: Yiming Li 2 | ### Example usage: 3 | ### create_input_files.R $sample_dir 4 | 5 | library(Seurat) 6 | library(BayesPrism) 7 | library(data.table) 8 | library(dplyr) 9 | library(stringr) 10 | 11 | ### Read the sample directory from the command line 12 | args <- commandArgs(trailingOnly=TRUE) 13 | 14 | st_dir <- "/projects/b1131/SpatialT" 15 | 16 | sample_dir <- args[1] 17 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID5/DS5A/DS5A.12_151676" 18 | 19 | ### Read Seurat objects and deconvolution results 20 | seurat_object <- readRDS(paste0(sample_dir, "/processed/Seurat.RDS")) 21 | location <- fread(paste0(sample_dir, "/processed/coordinates.csv")) 22 | output_dir <- paste0(sample_dir, "/analysis/deconvolution") 23 | theta <- readRDS(paste0(output_dir, "/BayesPrism_theta.RDS")) 24 | bp.res <- readRDS(paste0(output_dir, "/BayesPrism_results.RDS")) 25 | meta <- seurat_object@meta.data 26 | spot_id_mapping <- meta$new_spot_id 27 | names(spot_id_mapping) <- rownames(meta) 28 | 29 | gc() 30 | 31 | ### Hard-assigned labels 32 | hard_labels <- apply(theta, 1, function(x) names(x)[which.max(x)]) # which.max keeps exactly one label per spot (the first maximum), so tied proportions no longer turn the result into a list 33 | hard_labels <- data.frame(cell_type_dec_max = as.character(hard_labels), spot_id = names(hard_labels)) 34 | meta$spot_id <- rownames(meta) 35 | meta$row_id <- seq_len(nrow(meta)) 36 | meta <- merge(meta, hard_labels, by = "spot_id") 37 | meta <- meta[order(meta$row_id),] 38 | seurat_object[["cell_type_dec_max"]] <- meta$cell_type_dec_max[match(colnames(seurat_object), meta$spot_id)] # matched by spot id, not by row position; spots absent from theta get NA 39 | # sum(colnames(seurat_object) == meta$spot_id) 40 | saveRDS(seurat_object, paste0(sample_dir, "/processed/Seurat.RDS")) 41 | 42 | rm(seurat_object) 43 | gc() 44 | 45 | ### Save deconvoluted cell-type-specific expressions 46 | all_cell_types_s <- character(0) 47 | all_cell_types <- character(0) 48 | ct_i <- 1 49 | for (cell_type in colnames(theta)) { 50 | # cell_type <- "CAFs" 51 | cell_type_s <- gsub("/", ".", cell_type) 52 | cell_type_s <- gsub(" ", ".", cell_type_s) 53 | cell_type_s <- gsub("-", ".", cell_type_s) 54 | 
cell_type_s <- gsub("\\*", ".", cell_type_s) 55 | cell_type_s <- gsub("\\+", ".", cell_type_s) 56 | 57 | ct_exp <- get.exp(bp = bp.res, state.or.type = "type", cell.name = cell_type) 58 | counts_df <- data.table(ct_exp) 59 | counts_df$spot <- spot_id_mapping[rownames(ct_exp)] 60 | counts_df <- transpose(counts_df, keep.names = "gene", make.names = "spot") 61 | keep_spots <- intersect(colnames(counts_df), location$barcode) 62 | keep_spots <- c("gene", keep_spots) 63 | counts_df <- counts_df[,..keep_spots] 64 | 65 | fwrite(counts_df, paste0(output_dir, "/counts_", cell_type_s, "_deconv_only.csv"), sep = ",") 66 | all_cell_types[ct_i] <- cell_type 67 | all_cell_types_s[ct_i] <- cell_type_s 68 | ct_i <- ct_i + 1 69 | } 70 | fwrite(data.table(cell_type = all_cell_types, cell_type_s = all_cell_types_s), paste0(output_dir, "/all_cell_types.txt"), sep = "\t") 71 | -------------------------------------------------------------------------------- /data_analysis/cell_typing/reference/geo-download-scRNA-seq.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script queries the GEO using esearch and esummary for scRNA-seq datasets related to a certain organ/species 3 | * Note that this script does NOT check the validity of the arguments 4 | * Two intermediate files will be created during runtime: esearch.xml and esummary.xml (cleaned up at the end) 5 | * Please change any space in the keyword to a plus sign, e.g. spinal+cord instead of spinal cord 6 | 7 | Usage: python geo-download-scRNA-seq.py 8 | e.g. 
python geo-download-scRNA-seq.py lymph+node mouse 9 | 10 | Author: 11 | Yiming Li 12 | """ 13 | 14 | import pandas as pd 15 | import requests 16 | import xml.etree.ElementTree as ET 17 | import time 18 | import sys 19 | import os 20 | 21 | def loadRSS_esearch(organ, species, keyword): 22 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=' + keyword + '+' + organ + '+AND+' + species + '[organism]&retmax=100000&usehistory=y' 23 | resp = requests.get(url) 24 | with open('esearch.xml', 'wb') as f: 25 | f.write(resp.content) 26 | 27 | def parseXML_esearch(xmlfile): 28 | tree = ET.parse(xmlfile) 29 | root = tree.getroot() 30 | items = [] 31 | for item in root.findall('./IdList/Id'): 32 | items.append(item.text) 33 | 34 | return(items) 35 | 36 | def loadRSS_esummary(gds_id): 37 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&id=' + str(gds_id) 38 | resp = requests.get(url) 39 | with open('esummary.xml', 'wb') as f: 40 | f.write(resp.content) 41 | 42 | def parseXML_esummary(xmlfile): 43 | tree = ET.parse(xmlfile) 44 | root = tree.getroot() 45 | items = [] 46 | for item in root.findall('./DocSum/Item'): 47 | if (item.attrib['Name'] in ["Accession", "title", "summary", "GPL", "taxon", "gdsType", "FTPLink"]): 48 | items.append(item.text) 49 | 50 | return(items) 51 | 52 | organ = sys.argv[1] # organ = 'spinal+cord' 53 | species = sys.argv[2] # species = "human" 54 | 55 | 56 | 57 | ############ 58 | 59 | 60 | 61 | keywords = ['scRNA-seq', 'single+cell+RNA-seq', 'single+cell+RNA+sequencing', 'single+cell+transcriptomics', 'single+cell+transcriptome'] # Change this to make the search broader/narrower 62 | 63 | ### Step 1. 
Get GDS IDs 64 | gds_ids = [] 65 | for keyword in keywords: 66 | loadRSS_esearch(organ, species, keyword) 67 | time.sleep(1) 68 | gds_ids = gds_ids + parseXML_esearch('esearch.xml') 69 | 70 | gds_ids = list(set(gds_ids)) # Remove duplicates 71 | 72 | print(">>>>> [Organ: " + organ + "; Species: " + species + "] " + str(len(gds_ids)) + " GDS IDs found <<<<<\n") 73 | 74 | ### Step 2. Get meta-information 75 | results = [] 76 | for gds_id in gds_ids: 77 | loadRSS_esummary(gds_id) 78 | time.sleep(1) 79 | results.append(parseXML_esummary('esummary.xml')) 80 | print("[" + gds_id + "] completed") 81 | 82 | results = pd.DataFrame(results, columns=["Accession", "title", "summary", "GPL", "taxon", "gdsType", "FTPLink"]) 83 | results["GDS_ID"] = gds_ids 84 | results["Species"] = species 85 | results["Organ"] = organ 86 | results.to_csv(species + "-" + organ + ".csv", sep='\t', index = False) 87 | 88 | os.remove("esearch.xml") 89 | os.remove("esummary.xml") 90 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT.py: -------------------------------------------------------------------------------- 1 | import os, sys, pickle, datetime 2 | import commot as ct 3 | import scanpy as sc 4 | import pandas as pd 5 | import numpy as np 6 | from collections import Counter 7 | 8 | ### Read in data 9 | data_dir = sys.argv[1] + "/analysis/deconvolution/" 10 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/deconvolution/' 11 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/' 12 | counts_dir = data_dir + 'binded_counts.csv' 13 | coord_dir = data_dir + 'binded_coordinates.csv' 14 | anno_dir = data_dir + 'binded_cell_types.tsv' 15 | counts = pd.read_csv(counts_dir, index_col = 0) 16 | coord = pd.read_csv(coord_dir, index_col = 2) 17 | annotation = pd.read_csv(anno_dir, index_col = 0, sep = "\t") 18 | cell_types = 
annotation['cell_type'].tolist() 19 | 20 | ### Get species 21 | ref_dir = sys.argv[2] 22 | # ref_dir = "/projects/b1131/SpatialT/ref/final/Cancer/Breast/Human" 23 | ref_dir = ref_dir.split("/") 24 | species = [i for i in ref_dir if i][-1].lower() # Last non-empty string in ref path 25 | 26 | ### Get spatial distance type 27 | thr_type = sys.argv[3] 28 | # thr_type = "short" 29 | thr_type_multiplier = { 30 | "short": 500, 31 | "medium": 1000, 32 | "long": 1500, 33 | "xlong": 2500, 34 | } 35 | 36 | ### Spatial distance constraint 37 | center_to_center_dist_techs = { 38 | "10x": 100, 39 | "ST": 200, 40 | "DBiT-seq": 20, 41 | "Slide-seq": 20, 42 | "MERFISH": 0.334, 43 | "osmFISH": 0.13, 44 | "seqFISH": 0.26, 45 | "sci-Space": 222, 46 | } 47 | tech = data_dir.split("/")[4] 48 | dis_thr = thr_type_multiplier[thr_type] / center_to_center_dist_techs[tech] 49 | 50 | ### Set up anndata 51 | adata = sc.AnnData(counts.T) 52 | adata.var_names_make_unique() 53 | adata = adata[coord.index,] 54 | coor_df = coord.loc[adata.obs_names, ["x", "y"]] 55 | adata.obsm["spatial"] = coor_df.to_numpy() 56 | adata.raw = adata 57 | 58 | ### Data processing 59 | sc.pp.normalize_total(adata, inplace=True) 60 | sc.pp.log1p(adata) 61 | adata_disthr = adata.copy() 62 | sc.pp.highly_variable_genes(adata, min_mean = 0.0125, max_mean = 3, min_disp = 0.5) 63 | adata = adata[:, adata.var.highly_variable] 64 | sc.tl.pca(adata, svd_solver = 'arpack') 65 | sc.pp.neighbors(adata, n_neighbors = 10, n_pcs = 40) 66 | sc.tl.umap(adata) 67 | sc.tl.leiden(adata, resolution = 0.4) 68 | 69 | ### Get CellChat ligand-receptors 70 | df_cellchat = ct.pp.ligand_receptor_database(database = 'CellChat', species = species) 71 | df_cellchat_filtered = ct.pp.filter_lr_database(df_cellchat, adata_disthr, min_cell_pct = 0.05) 72 | 73 | if (df_cellchat_filtered.shape == (0, 0)): 74 | raise ValueError("ct.pp.filter_lr_database() returns an empty data frame, too few overlapping genes between reference and data") 75 | 76 | now = 
datetime.datetime.now() 77 | print("Analysis started: ") 78 | print(now) 79 | 80 | ct.tl.spatial_communication(adata_disthr, database_name = 'cellchat', df_ligrec = df_cellchat_filtered, dis_thr = dis_thr, heteromeric = True, pathway_sum = True) 81 | 82 | adata_disthr.write(data_dir + "adata_disthr_" + thr_type + ".h5ad") 83 | 84 | now = datetime.datetime.now() 85 | print("Analysis finished: ") 86 | print(now) 87 | -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/Archive/quest_step01_stagate_jobarray.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | import scanpy as sc 6 | import matplotlib.pyplot as plt 7 | 8 | import tensorflow as tf 9 | from sklearn.mixture import GaussianMixture 10 | from sklearn.metrics.cluster import adjusted_rand_score 11 | 12 | import STAGATE 13 | 14 | 15 | sample_dir = sys.argv[1] 16 | tech = sample_dir.split("/")[4] 17 | print(">>> " + sample_dir + " started first step STAGE clustering<<<") 18 | 19 | # directory of counts and coordinates 20 | counts_file = os.path.join(sample_dir, 'processed/counts.csv') 21 | coor_file = os.path.join(sample_dir, 'processed/coordinates.csv') 22 | 23 | if os.path.isfile(counts_file) and os.path.isfile(coor_file): 24 | # read and format data to anndata 25 | counts = pd.read_csv(counts_file, index_col=0) 26 | coor_df = pd.read_csv(coor_file) 27 | coor_df.set_index('barcode', drop=True, inplace=True) 28 | adata = sc.AnnData(counts.T) 29 | adata.var_names_make_unique() 30 | 31 | # keep only obs that are in coordinatesfile 32 | adata = adata[coor_df.index,] 33 | coor_df = coor_df.loc[adata.obs_names, ['x', 'y']] 34 | adata.obsm["spatial"] = coor_df.to_numpy() 35 | adata.raw = adata 36 | 37 | 38 | # check if need to log1p by finding non-int values across columns 39 | int_only = True 40 | for col in counts.columns.tolist(): 41 | col_int 
= counts[col].astype(str).str.isdigit().all() 42 | if col_int == False: 43 | int_only=False 44 | print('NON-INT COUNTS') 45 | break 46 | 47 | if counts.to_numpy().max() >20 and int_only==True: 48 | sc.pp.log1p(adata) 49 | 50 | # normalization 51 | sc.pp.normalize_total(adata, target_sum=1e4) 52 | sc.pp.filter_genes(adata,min_cells=5) 53 | sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000) 54 | 55 | tf.compat.v1.disable_eager_execution() 56 | rad_cur = 2 57 | STAGATE.Cal_Spatial_Net(adata, rad_cutoff=rad_cur) 58 | neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs 59 | print('TECH:', tech, ' INIT NEIGHBORS:', neighbors) 60 | 61 | # add radius_cutoff based on technology until reach at least 5 neighbors 62 | if neighbors < 5 : 63 | while neighbors < 5: 64 | if tech == 'ST': 65 | rad_add = 1 66 | elif tech == 'DBiT-seq': 67 | rad_add = 1 68 | elif tech == '10x': 69 | rad_add = 2 70 | elif tech == 'seqFISH': 71 | rad_add = 5 72 | elif tech == 'MERFISH': 73 | rad_add = 30 74 | elif tech == 'Slide-seq': 75 | rad_add = 30 76 | elif tech == 'osmFISH': 77 | rad_add = 300 78 | else: 79 | rad_add = 10 80 | 81 | rad_cur = rad_cur + rad_add 82 | STAGATE.Cal_Spatial_Net(adata, rad_cutoff= rad_cur) 83 | neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs 84 | 85 | print(' FINAL RADIUS CUTOFF:', rad_cur, 'FINAL NEIGHBORS:', neighbors) 86 | #print(adata.uns['Spatial_Net']) 87 | 88 | #### Running STAGATE #### 89 | adata = STAGATE.train_STAGATE(adata, alpha=0) 90 | 91 | sc.pp.neighbors(adata, use_rep='STAGATE') 92 | sc.tl.umap(adata) 93 | 94 | # create clustering folder if does not exist 95 | if not os.path.exists(f'{sample_dir}analysis/clustering'): 96 | os.makedirs(f'{sample_dir}analysis/clustering') 97 | 98 | pd.DataFrame(adata.obsm['STAGATE'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/obsm_STAGATE.csv') 99 | print(">>> " + sample_dir + " finished first step STAGATE clustering<<<") 100 | 
-------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/annotated/DGE-annotated.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### Author: Jenny, Yiming 4 | ### 5 | ### Description: This script performs DGE analysis (single-cell level data) 6 | 7 | library(Seurat) 8 | library(data.table) 9 | library(dplyr) 10 | library(stringr) 11 | 12 | ### Define paths and variables 13 | args <- commandArgs(trailingOnly=TRUE) 14 | st_dir <- "/projects/b1131/SpatialT" 15 | dt_dir <- "/projects/b1131/SpatialT/drug-target/" 16 | sample_dir <- args[1] 17 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/" 18 | # sample_dir <- "/projects/b1131/SpatialT/DBiT-seq/PID150/DS150A/DS150A.GSM4096261/" 19 | 20 | ds_name <- str_split(sample_dir, '/')[[1]][7] 21 | tech <- str_split(sample_dir, '/')[[1]][5] 22 | p_name <- str_split(sample_dir, '/')[[1]][6] 23 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/") 24 | sample_name <- str_split(sample_dir, '/')[[1]][8] 25 | 26 | ### Read Seurat object 27 | seurat_object_path <- paste0(sample_dir, "processed/Seurat.RDS") 28 | seurat_object <- readRDS(seurat_object_path) 29 | 30 | ### DGE results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/DGE 31 | output_dir <- paste0(sample_dir, "analysis/DGE") 32 | if (!dir.exists(output_dir)) { 33 | dir.create(output_dir) 34 | } 35 | dir.create(paste0(dt_dir, ds_name)) 36 | dir.create(paste0(dt_dir, ds_name, "/", sample_name)) 37 | dir.create(paste0(dt_dir, ds_name, "/", sample_name, "/DGE_anno")) 38 | 39 | 40 | ### this is to change '/' in cell type names to '.' 
41 | seurat_object@meta.data$cell_type_annotation_class <- gsub( 42 | pattern = "/", 43 | replacement = ".", 44 | x = seurat_object@meta.data$cell_type_annotation_class 45 | ) 46 | annotations <- seurat_object$cell_type_annotation_class 47 | uniq_anno <- unique(annotations) 48 | Idents(seurat_object) <- "cell_type_annotation_class" 49 | saveRDS(seurat_object, paste0(sample_dir,'/processed/Seurat_reanno.RDS')) 50 | 51 | ### Perform DGE analysis on different cell types 52 | if (length(uniq_anno) == 1) { 53 | fwrite(data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0)), paste0(output_dir, "/DGE_cell_types_anno.tsv"), sep = "\t") 54 | cat("\n\n### Only one annotated cell type -- skipping DGE analysis (cell types).") 55 | cat("\n# Writing empty DGE_cell_types_anno.tsv to file.") # message names the file actually written above 56 | } else { 57 | # https://satijalab.org/seurat/archive/v3.1/future_vignette.html 58 | options(future.globals.maxSize = 5000 * 1024^2) 59 | DGE_cell_types <- FindAllMarkers(seurat_object, assay = "SCT", logfc.threshold = 0.2, min.pct = 0.1, verbose = FALSE) 60 | if (nrow(DGE_cell_types) == 0) { 61 | fwrite(data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0)), paste0(output_dir, "/DGE_cell_types_anno.tsv"), sep = "\t") 62 | cat("\n\n### DGE analysis (cell types) cannot be performed due to having too few spots in one/many of the cell types.") 63 | cat("\n# Writing empty DGE_cell_types_anno.tsv to file.") # message names the file actually written above 64 | } else { 65 | DGE_cell_types$cluster <- as.character(DGE_cell_types$cluster) 66 | fwrite(DGE_cell_types, paste0(output_dir, "/DGE_cell_types_anno.tsv"), sep = "\t") 67 | 68 | for (cell_type in sort(unique(DGE_cell_types$cluster))) { 69 | cat(paste0(cell_type,' being anlayzed for DEG \n')) 70 | # cell_type <- "Malignant" 71 | DGE_cell_types_less <- DGE_cell_types[DGE_cell_types$cluster == cell_type,] 72 | 
DGE_cell_types_less <- DGE_cell_types_less[,c("gene", "avg_log2FC", "p_val", "p_val_adj")] 73 | colnames(DGE_cell_types_less) <- c("gene", "stat", "pval", "qval") 74 | fwrite(DGE_cell_types_less, paste0(dt_dir, ds_name, "/", sample_name, "/DGE_anno/", cell_type, ".txt"), sep = "\t") 75 | } 76 | cat("\n\n### DGE analysis (cell types) results written to file.") 77 | } 78 | } 79 | tt4 <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14]) 80 | cat(paste0("\n### Analysis completed; max memory consumed: ", as.character(tt4), "M -- [", Sys.time(), "]\n\n")) 81 | -------------------------------------------------------------------------------- /data_analysis/drug_discovery/README.md: -------------------------------------------------------------------------------- 1 | # Drug Discovery 2 | 3 | Drug discovery analysis aims to identify repurposable and established compounds for targeting cell types of interest in pathological samples. Analysis is conducted on spatially variable and differentially expressed (SV-DE) genes for each deconvoluted cell type. 4 | 5 | ## Drug Enrichment and Perturbation [Script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py) 6 | 7 | ### Input 8 | - `CMAP L1000 perturbation profile` (e.g., cmap2_6hr_pert_z.npy), which can be downloaded from [CMAP](https://clue.io/data/CMap2020#LINCS2020) signature/level 5 data. This file contains the perturbation MODZ score calculated for each compound perturbation on each gene (12,328 total). We included 6hr perturbations (145,491) to reduce redundancy and avoid the confounding factor of treatment duration. More explanation of the MODZ score can be found in this CMAP [article](https://clue.io/connectopedia/replicate_collapse). Additional metadata can also be downloaded from CMAP to filter for cell type, dosage, etc. 
9 | 10 | - `CMAP L1000 rank profile` (e.g., cmap2_6hr_pert_rank.csv), which is created by ranking the perturbation MODZ score of genes for each of the 145,491 perturbations. 11 | 12 | - `Gene name mappings` (e.g., geneinfo_beta.txt and HGNC091923.txt) for mapping between gene IDs and symbols. 13 | 14 | - `Spatially variable (tissue level) cell type-specific differentially expressed genes` (e.g., dsid/sampleid/DGE_dec_SVG/). SVGs can be generated with the [script](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability), and DGE results can be generated with the [script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/DGE). A filter is applied to select tissue-level spatially variable (q-value < 0.1) up and down DE gene sets with |log2fc| > 0.5 and q-value < 0.05. For cell types that don't have enough deconvoluted spots for SVG analysis, DGE results are used directly. 15 | 16 | ### CMAP2 Enrichment Calculation 17 | - Gene set enrichment analysis (GSEA) is conducted on the resulting up and down DEG sets (10 <= gene set size <= 2000) for each CMAP compound perturbation. The difference in GSEA score between the up and down DEG sets is taken as the enrichment score for the compound. The p-value is the proportion of enrichment scores calculated from random gene rankings that are greater/less than the compound's enrichment score (depending on the sign). More information can be found in the Methods section of the [manuscript](https://www.biorxiv.org/content/10.1101/2022.04.17.488596v3). 18 | - Compounds with the 500 highest (inverse enrichment, suppress DEGs) and lowest (positive enrichment, promote DEGs) enrichment scores are saved for every cell type as output (e.g., dsid/sampleid/Enrichment/). 
19 | 20 | ### CMAP2 Perturbation Network 21 | - For each of the 500 positively and 500 inversely enriched compound perturbation, CMAP perturbation MODZ score of the compound on the SV-DE genes DEGs represents the effect a compound has on the gene target. 22 | - Top and bottom 30 SV-DE genes with the highest absolute value of perturbation MODZ score are saved along with the log2fc of the SV-DE genes for plotting the perturbation network (e.g., dsid/sampleid/Perturbation/). 23 | 24 | 25 | ## Protein-protein Interaction [Script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py) 26 | 27 | ### Input 28 | - `Human protein interactome` (e.g., Interactome.tsv'), which contains 350k pairs of PPIs. Maping of NCBI gene IDs and symbols can be done using HGNC reference (e.g., 'HGNC.tsv'). 29 | 30 | - `Spatially variable (tissue level) cell type-specific differnetially expressed genes` similar to that required for drug screen. 31 | 32 | ### PPI Network 33 | - Top 300 SV-DE genes with the highest absolute values is used to find matching PPIs in the interactome (both receiver and sender need to be from the top 300 DEG list). 34 | - Results along with log2fc of the SV-DE genes are saved for plotting PPI network. 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /data_analysis/cell_typing/annotation/annotation_example.R: -------------------------------------------------------------------------------- 1 | # 2 | # Cell Type Annotation Example Script 3 | # 4 | # Author: Saya Dennis 5 | # 6 | # Usage: Rscript --no-save annotation_example.R 7 | # 8 | # Requirements: 9 | # 1. You need to first process your reference scRNA-seq dataset. 10 | # 2. In the code below, edit the directory/file names of your reference dataset (saved into variable ref_data_sce) 11 | # 3.
You will need to create an annotation directory under your ST sample directories 12 | # - This should look like this: /share/fsmresfiles/SpatialT/{tech}/PID{pid}/{dsid}/{sampleid}/analysis/annotation/ 13 | # - To automate generating this directory, refer to script ST-dataset/analysis/cell_type_annotation/01_create_anno_directory.py 14 | # 4. Back up your un-annotated Seurat object 15 | # - Back up to a file named Seurat.RDS.bk under the same directory. 16 | # - Refer to ST-dataset/analysis/cell_type_annotation/saya_cell_type_annotation_examples/02_backup_seurat_before_annotation.py 17 | # 5. Below, edit the PID, DSID, and technology directory (e.g. /share/fsmresfiles/SpatialT/DBiT-seq) 18 | # 19 | # 20 | 21 | library(data.table) 22 | library(SingleCellExperiment) 23 | library(scuttle) 24 | library(Seurat) 25 | library(SingleR) 26 | 27 | ################################################ 28 | #### Cell type annotation on a dataset DS2O #### 29 | ################################################ 30 | 31 | dref <- '/share/fsmresfiles/SpatialT/ref/Heart/Adult/heart-cell-atlas/processed/' # reference directory 32 | ref_data_sce <- readRDS(paste0(dref, 'sce_heart.RDS')) 33 | 34 | target_p_name <- "PID70" 35 | target_ds_name <- "DS70B" 36 | target_ds_dir <- paste(c("/share/fsmresfiles/SpatialT/DBiT-seq", target_p_name, target_ds_name), collapse = "/") 37 | target_ds_metatable <- read.table(paste0(target_ds_dir, "/metatable.tsv"), header = TRUE, stringsAsFactors = FALSE) 38 | 39 | ### Loop through samples and annotate 40 | for (target_sample_name in target_ds_metatable$SampleID) { 41 | cat(paste0("Starting annotation for sample ", target_sample_name, " -- ", Sys.time(), "\n")) 42 | # create annotation directory 43 | # if (!dir.exists(paste0(dds, "/", sampleid, "/analysis/annotation"))) { 44 | # dir.create(paste0(dds, "/", sampleid, "/analysis/annotation")) 45 | # } 46 | 47 | # load processed Seurat object 48 | target_sample_dir <- paste(c(target_ds_dir, "/", target_sample_name), collapse
= "") 49 | seurat_object_tn_path <- paste0(target_sample_dir, "/processed/Seurat.RDS.bk") 50 | seurat_object_tn <- readRDS(seurat_object_tn_path) 51 | 52 | ### Perform spot-based cell type annotation and save to Seurat 53 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, labels = ref_data_sce$label, de.method="wilcox") 54 | seurat_object_tn[["cell_type_annotation"]] <- annotation$labels 55 | 56 | ### Perform cluster-based cell type annotation and save to Seurat 57 | cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters 58 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, clusters = cluster_results, labels = ref_data_sce$label, de.method="wilcox") 59 | seurat_object_tn[["cell_type_annotation_clusters"]] <- annotation$labels[cluster_results] 60 | 61 | ### Overwrite the previously saved Seurat object with cell type annotated Seurat object 62 | saveRDS(seurat_object_tn, file = paste0(target_sample_dir, "/processed/Seurat.RDS")) 63 | 64 | ### Visualize annotated cell types 65 | pdf(paste0(target_sample_dir, "/analysis/annotation/cell_type_annotation.pdf")) ### Change to your own save directory/name 66 | # pdf("analysis/annotation/cell_type_annotation.pdf") 67 | print(SpatialDimPlot(seurat_object_tn)) 68 | print(DimPlot(seurat_object_tn, reduction = "umap")) 69 | print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation")) 70 | print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation")) 71 | dev.off() 72 | cat(paste0("Finished annotation for sample ", target_sample_name, " -- ", Sys.time(), "\n\n")) 73 | } 74 | -------------------------------------------------------------------------------- /data_analysis/spatial_clustering/quest_stagate_updated_jobarray.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | 
import scanpy as sc 6 | import matplotlib.pyplot as plt 7 | 8 | import tensorflow as tf 9 | from sklearn.mixture import GaussianMixture 10 | from sklearn.metrics.cluster import adjusted_rand_score 11 | 12 | import STAGATE 13 | 14 | 15 | sample_dir = sys.argv[1] 16 | tech = sample_dir.split("/")[4] 17 | print(">>> " + sample_dir + " started first step STAGE clustering<<<") 18 | 19 | # directory of counts and coordinates 20 | counts_file = os.path.join(sample_dir, 'processed/counts.csv') 21 | coor_file = os.path.join(sample_dir, 'processed/coordinates.csv') 22 | 23 | if os.path.isfile(counts_file) and os.path.isfile(coor_file): 24 | # read and format data to anndata 25 | counts = pd.read_csv(counts_file, index_col=0) 26 | coor_df = pd.read_csv(coor_file) 27 | coor_df.set_index('barcode', drop=True, inplace=True) 28 | adata = sc.AnnData(counts.T) 29 | adata.var_names_make_unique() 30 | 31 | # keep only obs that are in coordinatesfile 32 | adata = adata[coor_df.index,] 33 | coor_df = coor_df.loc[adata.obs_names, ['x', 'y']] 34 | adata.obsm["spatial"] = coor_df.to_numpy() 35 | adata.raw = adata 36 | 37 | 38 | # check if need to log1p by finding non-int values across columns 39 | int_only = True 40 | for col in counts.columns.tolist(): 41 | col_int = counts[col].astype(str).str.isdigit().all() 42 | if col_int == False: 43 | int_only=False 44 | print('NON-INT COUNTS') 45 | break 46 | 47 | if counts.to_numpy().max() >20 and int_only==True: 48 | sc.pp.log1p(adata) 49 | 50 | # normalization 51 | sc.pp.normalize_total(adata, target_sum=1e4) 52 | sc.pp.filter_genes(adata,min_cells=5) 53 | sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000) 54 | 55 | tf.compat.v1.disable_eager_execution() 56 | rad_cur = 2 57 | STAGATE.Cal_Spatial_Net(adata, rad_cutoff=rad_cur) 58 | neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs 59 | print('TECH:', tech, ' INIT NEIGHBORS:', neighbors) 60 | 61 | # add radius_cutoff based on technology until reach at least 5 
neighbors 62 | if neighbors < 5 : 63 | while neighbors < 5: 64 | if tech == 'ST': 65 | rad_add = 1 66 | elif tech == 'DBiT-seq': 67 | rad_add = 1 68 | elif tech == '10x': 69 | rad_add = 2 70 | elif tech == 'seqFISH': 71 | rad_add = 5 72 | elif tech == 'MERFISH': 73 | rad_add = 30 74 | elif tech == 'Slide-seq': 75 | rad_add = 30 76 | elif tech == 'osmFISH': 77 | rad_add = 300 78 | else: 79 | rad_add = 10 80 | 81 | rad_cur = rad_cur + rad_add 82 | STAGATE.Cal_Spatial_Net(adata, rad_cutoff= rad_cur) 83 | neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs 84 | 85 | print(' FINAL RADIUS CUTOFF:', rad_cur, 'FINAL NEIGHBORS:', neighbors) 86 | #print(adata.uns['Spatial_Net']) 87 | 88 | #### Running STAGATE #### 89 | adata = STAGATE.train_STAGATE(adata, alpha=0) 90 | 91 | sc.pp.neighbors(adata, use_rep='STAGATE') 92 | sc.tl.umap(adata) 93 | 94 | # determine cluster resolution based on cell size 95 | if adata.shape[0] < 100: 96 | res=1.2 97 | elif adata.shape[0] >=100 and adata.shape[0] <500: 98 | res = 0.7 99 | elif adata.shape[0] >=500 and adata.shape[0] <5000: 100 | res = 0.5 101 | elif adata.shape[0] >=5000 and adata.shape[0] <20000: 102 | res = 0.3 103 | elif adata.shape[0] >=20000: 104 | res = 0.1 105 | 106 | # clustering 107 | sc.tl.louvain(adata, resolution=res) 108 | sc.pl.embedding(adata, basis="spatial", color="louvain",s=6, show=False, title='STAGATE') 109 | 110 | # create clustering folder if does not exist 111 | if not os.path.exists(f'{sample_dir}analysis/clustering'): 112 | os.makedirs(f'{sample_dir}analysis/clustering') 113 | 114 | pd.DataFrame(adata.obsm['STAGATE'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/STAGATE_30dim.csv') 115 | pd.DataFrame(adata.obs['louvain'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/STAGATE_clusters.csv') 116 | print(">>> " + sample_dir + " finished first step STAGATE clustering<<<") 117 | -------------------------------------------------------------------------------- 
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-pull-scores.py: -------------------------------------------------------------------------------- 1 | import os, sys, pickle, datetime, anndata 2 | import commot as ct 3 | import scanpy as sc 4 | import pandas as pd 5 | import numpy as np 6 | import scipy 7 | from collections import Counter 8 | 9 | ### Read in data 10 | data_dir = sys.argv[1] + "/analysis/deconvolution/" 11 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/deconvolution/' 12 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/' 13 | counts_dir = data_dir + 'binded_counts.csv' 14 | counts = pd.read_csv(counts_dir, index_col = 0) 15 | 16 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec" 17 | # out_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/Distance/COMMOT_dec' 18 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec' 19 | if not os.path.exists(out_dir): 20 | os.makedirs(out_dir) 21 | 22 | ### Get spatial distance type 23 | thr_type = sys.argv[3] 24 | out_dir = out_dir + "/" + thr_type 25 | # thr_type = "short" 26 | # out_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/Distance/COMMOT_dec/thr_type' 27 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec/short' 28 | if not os.path.exists(out_dir): 29 | os.makedirs(out_dir) 30 | 31 | ### Get COMMOT object and pathways 32 | adata_disthr = sc.read_h5ad(data_dir + "adata_disthr_" + thr_type + ".h5ad") 33 | with open(out_dir + "/pathways.txt") as f: 34 | pathways = [line.rstrip('\n') for line in f] 35 | 36 | ### Get cell types 37 | anno_dir = data_dir + 'binded_cell_types.tsv' 38 | annotation = pd.read_csv(anno_dir, index_col = 0, sep = "\t") 39 | cell_types = annotation['cell_type'].tolist() 40 | 41 | ### Cell-type-level scores 42 | adata_disthr.obs['cell_type'] = cell_types 43 | for pathway in pathways: 44 | 
ct.tl.cluster_communication(adata_disthr, database_name = 'cellchat', pathway_name = pathway, clustering = 'cell_type', n_permutations = 100) 45 | 46 | ### Pull LR pairs 47 | lrpairs = [str(i).replace("commot-cellchat-", "") for i in adata_disthr.obsp] 48 | lrpairs = [i for i in lrpairs if "-" in i] 49 | lrpairs.sort() 50 | lrpairs.remove("total-total") 51 | for lrpair in lrpairs: 52 | ct.tl.cluster_communication(adata_disthr, database_name = 'cellchat', pathway_name = lrpair, clustering = 'cell_type', n_permutations = 100) 53 | 54 | ### Pathway-level 55 | rows_list = [] 56 | for pathway in pathways: 57 | # https://github.com/zcang/COMMOT/issues/10 58 | # rows (first index) represent senders and the columns (second index) represent receivers 59 | tmp_mat = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + pathway]["communication_matrix"] 60 | tmp_p = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + pathway]["communication_pvalue"] 61 | for i in tmp_mat.index: 62 | for j in tmp_mat.columns: 63 | rows_list.append({"pathway": pathway, "cell_type1": i, "cell_type2": j, "score": tmp_mat.loc[i,j], "p_val": tmp_p.loc[i,j]}) 64 | 65 | df = pd.DataFrame(rows_list) 66 | df.to_csv(out_dir + "/communication_scores_pathway.txt", index = False) 67 | 68 | ### LR-pair-level 69 | ccdb = adata_disthr.uns['commot-cellchat-info']["df_ligrec"] 70 | ccdb["lrpair"] = ccdb["ligand"] + "-" + ccdb["receptor"] 71 | ccdb_dict = dict(zip(ccdb.lrpair, ccdb.pathway)) 72 | rows_list = [] 73 | lrpairs2 = lrpairs 74 | for lrpair in lrpairs: 75 | if lrpair in ccdb_dict: 76 | tmp_mat = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + lrpair]["communication_matrix"] 77 | tmp_p = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + lrpair]["communication_pvalue"] 78 | for i in tmp_mat.index: 79 | for j in tmp_mat.columns: 80 | rows_list.append({"pathway": ccdb_dict[lrpair], "lrpair": lrpair, "cell_type1": i, "cell_type2": j, "score": tmp_mat.loc[i,j], "p_val": tmp_p.loc[i,j]}) 81 
| else: 82 | lrpairs2.remove(lrpair) 83 | 84 | with open(out_dir + '/lrpairs.txt', 'w') as f: 85 | for lrpair in lrpairs2: 86 | _ = f.write(f"{lrpair}\n") 87 | 88 | df = pd.DataFrame(rows_list) 89 | df.to_csv(out_dir + "/communication_scores_lrpair.txt", index = False) 90 | 91 | ### Overwrite result file 92 | adata_disthr.write(data_dir + "adata_disthr_" + thr_type + "_cs.h5ad") 93 | -------------------------------------------------------------------------------- /data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step0.py: -------------------------------------------------------------------------------- 1 | import os, sys, pickle, datetime, anndata, shutil 2 | import commot as ct 3 | import scanpy as sc 4 | import pandas as pd 5 | import numpy as np 6 | import scipy 7 | from collections import Counter 8 | 9 | ### Read in data 10 | data_dir = sys.argv[1] + "/analysis/deconvolution/" 11 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/' 12 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/' 13 | counts_dir = data_dir + 'binded_counts.csv' 14 | counts = pd.read_csv(counts_dir, index_col = 0) 15 | 16 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec" 17 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec' 18 | if not os.path.exists(out_dir): 19 | os.makedirs(out_dir) 20 | 21 | ### Get spatial distance type 22 | thr_type = sys.argv[3] 23 | out_dir = out_dir + "/" + thr_type 24 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec/short' 25 | if not os.path.exists(out_dir): 26 | os.makedirs(out_dir) 27 | # else: 28 | # shutil.rmtree(out_dir) 29 | # os.makedirs(out_dir) 30 | 31 | adata_disthr = sc.read_h5ad(data_dir + "adata_disthr_" + thr_type + ".h5ad") 32 | adata_disthr.layers['counts'] = scipy.sparse.csr_matrix(counts.values.T) 33 | 34 | pathways = [str(i).replace("commot-cellchat-", "") for 
i in adata_disthr.obsp] 35 | pathways = [i for i in pathways if "-" not in i] 36 | pathways.sort() 37 | with open(out_dir + '/pathways.txt', 'w') as f: 38 | for pathway in pathways: 39 | _ = f.write(f"{pathway}\n") 40 | 41 | for pathway in pathways: 42 | # pathway = "MIF" 43 | 44 | ### rpy2 does not fully work on Quest; want to achieve the following: 45 | ### df_deg, df_yhat = ct.tl.communication_deg_detection(adata_disthr, database_name = 'cellchat', pathway = pathway, summary = 'receiver') 46 | summary = 'receiver' 47 | database_name = "cellchat" 48 | pathway_dir = out_dir + "/" + pathway 49 | # pathway_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short/MIF' 50 | if not os.path.exists(pathway_dir): 51 | os.makedirs(pathway_dir) 52 | 53 | # prepare input adata for R 54 | adata_deg = anndata.AnnData(X = adata_disthr.layers['counts'], var = pd.DataFrame(index=list(adata_disthr.var_names)), obs = pd.DataFrame(index=list(adata_disthr.obs_names))) 55 | adata_deg_var = adata_deg.copy() 56 | sc.pp.filter_genes(adata_deg_var, min_cells=3) 57 | sc.pp.filter_genes(adata_deg, min_cells=3) 58 | sc.pp.normalize_total(adata_deg_var, target_sum=1e4) 59 | sc.pp.log1p(adata_deg_var) 60 | sc.pp.highly_variable_genes(adata_deg_var, min_mean=0.0125, max_mean=3, min_disp=0.5) 61 | adata_deg = adata_deg[:, adata_deg_var.var.highly_variable] 62 | del adata_deg_var 63 | 64 | summary_name = 'commot-'+database_name+'-sum-'+summary 65 | if summary == 'sender': 66 | summary_abrv = 's' 67 | else: 68 | summary_abrv = 'r' 69 | 70 | comm_sum = adata_disthr.obsm[summary_name][summary_abrv+'-'+pathway].values.reshape(-1,1) 71 | cell_weight = np.ones_like(comm_sum).reshape(-1,1) 72 | 73 | ### Save data for R 74 | Xmat = pd.DataFrame(adata_deg.X.toarray()) 75 | Xmat.index = adata_deg.obs.index 76 | Xmat.columns = adata_deg.var.index 77 | Xmat.to_csv(pathway_dir + "/step1_X.csv", header = True, index = True) 78 | pseudoTime = pd.DataFrame(comm_sum) 79 | 
pseudoTime.to_csv(pathway_dir + "/step1_pseudoTime.csv", header = False, index = False) 80 | cellWeight = pd.DataFrame(cell_weight) 81 | cellWeight.to_csv(pathway_dir + "/step1_cellWeight.csv", header = False, index = False) 82 | 83 | nknots = 6 84 | 85 | string_fitGAM = 'sce <- fitGAM(counts=X, pseudotime=pseudoTime, cellWeights=cellWeight, nknots=%d, verbose=TRUE)' % nknots 86 | string_fitGAM = string_fitGAM + '\nassoRes <- data.frame( associationTest(sce, global=FALSE, lineage=TRUE) )' 87 | string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"waldStat_1"]),"waldStat_1"] <- 0.0' 88 | string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"df_1"]),"df_1"] <- 0.0' 89 | string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"pvalue_1"]),"pvalue_1"] <- 1.0\n' 90 | 91 | with open(pathway_dir + "/step1.R", "w") as text_file: 92 | _ = text_file.write(string_fitGAM) 93 | -------------------------------------------------------------------------------- /data_analysis/spatial_variability/quest_SpatialDE_ct_specific.py: -------------------------------------------------------------------------------- 1 | import SpatialDE 2 | import NaiveDE 3 | import numpy as np 4 | import pandas as pd 5 | from pandas.api.types import is_numeric_dtype 6 | import os 7 | import sys 8 | 9 | st_dir = "/projects/b1131/SpatialT/" # On Quest 10 | 11 | sample_dir = sys.argv[1] 12 | cell_type = sys.argv[2] 13 | print("\n\n>>> " + sample_dir + "[" + cell_type + "] started <<<") 14 | 15 | ### Read counts and coordinates 16 | counts = pd.read_csv(sample_dir + 'analysis/deconvolution/counts_' + cell_type + '_deconv_only.csv') 17 | counts_num = counts._get_numeric_data() 18 | min_count = counts_num.min().min() 19 | if (min_count < 0): 20 | counts_num[counts_num < 0] = 0 21 | 22 | coordinates = pd.read_csv(sample_dir + 'processed/coordinates.csv') 23 | counts.loc['Total',:]= counts.sum(axis=0) 24 | 25 | ### Align counts and coordinates index 26 | error_count = 0 27 | for i,j in zip 
(counts.columns.tolist()[1:], coordinates['barcode'].tolist()): 28 | if i != j: 29 | error_count = error_count + 1 30 | 31 | if error_count > 0: 32 | print("[ERROR] " + sample_dir + " has not matching spot IDs.") 33 | sys.exit() 34 | 35 | ### Get total counts 36 | total_counts = counts.iloc[-1][1:].tolist() 37 | 38 | ### Process data 39 | sample_info = pd.DataFrame() 40 | if 'x' in coordinates.columns: 41 | sample_info['y'] = coordinates['y'] 42 | sample_info['x'] = coordinates['x'] 43 | else: 44 | print("[ERROR] " + sample_dir + " has problematic coordinates column names.") 45 | sys.exit(1) 46 | 47 | sample_info['total_counts'] = total_counts 48 | sample_info.index = coordinates['barcode'] 49 | # sample_info 50 | reshaped_counts = counts.set_index('gene').iloc[:-1].transpose() 51 | reshaped_counts.index = coordinates['barcode'] 52 | reshaped_counts = reshaped_counts.T[reshaped_counts.sum(0) >= 3].T 53 | # reshaped_counts 54 | 55 | ### Run SpatialDE 56 | try: 57 | norm_expr = NaiveDE.stabilize(reshaped_counts.T).T 58 | except: 59 | norm_expr = np.log(reshaped_counts.T).T 60 | 61 | resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T, 'np.log(total_counts)').T 62 | # sample_resid_expr = resid_expr.sample(n=15202, axis=1, random_state=1) 63 | X = sample_info[['x', 'y']] 64 | 65 | try: 66 | results = SpatialDE.run(X.to_numpy(), resid_expr) 67 | except: 68 | print("[ERROR] " + sample_dir + " SpatialDE failed, probably because the data is too sparse / contains too few spots for this cell type.") 69 | sys.exit(1) 70 | 71 | if not os.path.exists(f'{sample_dir}analysis/'): 72 | os.makedirs(f'{sample_dir}analysis/') 73 | 74 | if not os.path.exists(f'{sample_dir}analysis/SVG/'): 75 | os.makedirs(f'{sample_dir}analysis/SVG/') 76 | 77 | ### Write results to file 78 | results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_results_' + cell_type + '.tsv', sep = '\t', index = False) 79 | # g - The name of the gene 80 | # pval - The P-value for spatial differential expression 81 
| # qval - Significance after correcting for multiple testing 82 | # l - A parameter indicating the distance scale a gene changes expression over 83 | print("# SVG analysis [" + cell_type + "] finished.") 84 | 85 | sign_results = results.query('qval < 0.05') 86 | n_patterns = 5 # Default, hard-wired for now 87 | 88 | if sign_results.shape[0]>0: 89 | ### Get average l 90 | l = pd.DataFrame(sign_results['l'].value_counts()).index.tolist() 91 | count = pd.DataFrame(sign_results['l'].value_counts())['count'].tolist() 92 | total_count = sum(count) 93 | total = 0 94 | for i,j in zip(l, count): 95 | ij = i*j 96 | total += ij 97 | 98 | L = round(total/total_count) 99 | histology_results, patterns = SpatialDE.aeh.spatial_patterns(X.to_numpy(), resid_expr, sign_results, C = n_patterns, l = L, verbosity = 1, delta_elbo_threshold = 1) 100 | print("# Pattern analysis [" + cell_type + "] finished.") 101 | else: 102 | patterns = pd.DataFrame(columns=['0', '1']) 103 | histology_results = pd.DataFrame(columns=['g', 'pattern', 'membership']) 104 | print("# [WARNING] Cannot perform pattern analysis [" + cell_type + "], no sig genes.") 105 | 106 | ### Write results to file 107 | histology_results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_histology_results_' + cell_type + '.tsv', sep = '\t', index = False) 108 | patterns.to_csv(sample_dir + 'analysis/SVG/SpatialDE_patterns_' + cell_type + '.tsv', sep = '\t', index = False) 109 | print(">>> " + sample_dir + " finished <<<") 110 | -------------------------------------------------------------------------------- /data_analysis/spatial_variability/quest_SpatialDE_jobarray.py: -------------------------------------------------------------------------------- 1 | import SpatialDE 2 | import NaiveDE 3 | import pandas as pd 4 | from pandas.api.types import is_numeric_dtype 5 | import os 6 | import sys 7 | 8 | # st_dir = "/share/fsmresfiles/SpatialT/" # On FSM servers 9 | st_dir = "/projects/b1131/SpatialT/" # On Quest 10 | all_samples = 
pd.read_csv(st_dir + "master_table_new.txt", sep = "\t") # 1735 samples 11 | 12 | sample_dir = sys.argv[1] 13 | print(">>> " + sample_dir + " started <<<") 14 | 15 | ### Read counts and coordinates 16 | counts = pd.read_csv(sample_dir + 'processed/counts.csv') 17 | counts_num = counts._get_numeric_data() 18 | min_count = counts_num.min().min() 19 | if (min_count < 0): 20 | counts_num[counts_num < 0] = 0 21 | 22 | coordinates = pd.read_csv(sample_dir + 'processed/coordinates.csv') 23 | counts.loc['Total',:]= counts.sum(axis=0) 24 | 25 | ### Align counts and coordinates index 26 | error_count = 0 27 | for i,j in zip (counts.columns.tolist()[1:], coordinates['barcode'].tolist()): 28 | if i != j: 29 | error_count = error_count + 1 30 | 31 | if error_count > 0: 32 | print("[ERROR] " + sample_dir + " has not matching spot IDs.") 33 | sys.exit() 34 | 35 | ### Get total counts 36 | total_counts = counts.iloc[-1][1:].tolist() 37 | 38 | ### Process data 39 | sample_info = pd.DataFrame() 40 | if 'x' in coordinates.columns: 41 | sample_info['y'] = coordinates['y'] 42 | sample_info['x'] = coordinates['x'] 43 | else: 44 | print("[ERROR] " + sample_dir + " has problematic coordinates column names.") 45 | sys.exit() 46 | 47 | sample_info['total_counts'] = total_counts 48 | sample_info.index = coordinates['barcode'] 49 | # sample_info 50 | reshaped_counts = counts.set_index('gene').iloc[:-1].transpose() 51 | reshaped_counts.index = coordinates['barcode'] 52 | reshaped_counts = reshaped_counts.T[reshaped_counts.sum(0) >= 3].T 53 | # reshaped_counts 54 | 55 | ### Run SpatialDE 56 | try: 57 | norm_expr = NaiveDE.stabilize(reshaped_counts.T).T 58 | except: 59 | norm_expr = np.log(reshaped_counts.T).T 60 | 61 | resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T, 'np.log(total_counts)').T 62 | # sample_resid_expr = resid_expr.sample(n=15202, axis=1, random_state=1) 63 | X = sample_info[['x', 'y']] 64 | 65 | try: 66 | results = SpatialDE.run(X, resid_expr) 67 | except: 68 | 
print("[ERROR] " + sample_dir + " SpatialDE failed, probably because the data is too sparse / contains too few spots for this cell type.") 69 | sys.exit() 70 | 71 | if not os.path.exists(f'{sample_dir}analysis/'): 72 | os.makedirs(f'{sample_dir}analysis/') 73 | 74 | if not os.path.exists(f'{sample_dir}analysis/SVG/'): 75 | os.makedirs(f'{sample_dir}analysis/SVG/') 76 | 77 | ### Write results to file 78 | results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_results.tsv', sep = '\t', index = False) 79 | # g - The name of the gene 80 | # pval - The P-value for spatial differential expression 81 | # qval - Significance after correcting for multiple testing 82 | # l - A parameter indicating the distance scale a gene changes expression over 83 | 84 | print("# Whole tissue SVG analysis finished.") 85 | 86 | sign_results = results.query('qval < 0.05') 87 | n_patterns = 5 # Default, hard-wired for now 88 | 89 | if sign_results.shape[0]>0: 90 | ### Get average l 91 | l = pd.DataFrame(sign_results['l'].value_counts()).index.tolist() 92 | count = pd.DataFrame(sign_results['l'].value_counts())['l'].tolist() 93 | total_count = sum(count) 94 | total = 0 95 | for i,j in zip(l, count): 96 | ij = i*j 97 | total += ij 98 | 99 | L = round(total/total_count) 100 | histology_results, patterns = SpatialDE.aeh.spatial_patterns(X, resid_expr, sign_results, C = n_patterns, l = L, verbosity = 1, delta_elbo_threshold = 1) 101 | print("# Pattern analysis finished.") 102 | else: 103 | patterns = pd.DataFrame(columns=['0', '1']) 104 | histology_results = pd.DataFrame(columns=['g', 'pattern', 'membership']) 105 | print("# [WARNING] Cannot perform pattern analysis, no sig genes.") 106 | 107 | ### Write results to file 108 | if not os.path.exists(f'{sample_dir}analysis/SVG'): 109 | os.makedirs(f'{sample_dir}analysis/SVG') 110 | histology_results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_histology_results.tsv', sep = '\t', index = False) 111 | patterns.to_csv(sample_dir + 
'analysis/SVG/SpatialDE_patterns.tsv', sep = '\t', index = False)

print(">>> " + sample_dir + " finished <<<")

--------------------------------------------------------------------------------
/data_analysis/cell_typing/deconvolution/quest_deconvolution_jobarray.R:
--------------------------------------------------------------------------------
### Author: Yiming Li
### Example usage:
### quest_deconvolution_jobarray.R $sample_dir $ref_dir
###
### Runs BayesPrism on one sample and writes, under <sample_dir>/analysis/deconvolution:
### * BayesPrism_results.RDS  - the full run.prism() result
### * BayesPrism_theta.RDS    - masked, row-normalised cell type fractions
### * counts_<cell_type>.csv  - one cell-type-specific expression table per cell type
### * all_cell_types.txt      - sanitised cell type names, one per line

library(stringr)
library(Seurat)
library(BayesPrism)
library(data.table)

# cd /share/fsmresfiles/SpatialT/10x/PID27/DS27A/DS27A_1160920F/processed
# conda activate R4
args <- commandArgs(trailingOnly = TRUE)

### Change the below parameters if needed
if (length(args) > 2) {
  n_cores <- as.integer(args[3])
} else {
  n_cores <- 20
}
chain.length <- 1000
burn.in <- 500
maxit <- 10000

st_dir <- "/projects/b1131/SpatialT"
sample_dir <- args[1]
# The path components are taken by position, so sample_dir must look like
# /projects/b1131/SpatialT/<tech>/<project>/<dataset>/<sample>
path_parts <- str_split(sample_dir, "/")[[1]]
ds_name <- path_parts[7]
tech <- path_parts[5]
p_name <- path_parts[6]
ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")
sample_name <- path_parts[8]

### Assumes two files are present under this path:
### * sc.dat.filtered.pc.sig.RDS
### * cell_types.txt
ref_dir <- args[2]

##### Validate if reference files exist

if (!file.exists(paste0(ref_dir, "/cell_types.txt"))) {
  stop(paste0("[", ref_dir, "/cell_types.txt] does not exist\n"))
} else if (!file.exists(paste0(ref_dir, "/sc.dat.filtered.pc.sig.RDS"))) {
  stop(paste0("[", ref_dir, "/sc.dat.filtered.pc.sig.RDS] does not exist\n"))
}
final_ref <- readRDS(paste0(ref_dir, "/sc.dat.filtered.pc.sig.RDS"))
cell_types <- fread(paste0(ref_dir, "/cell_types.txt"))
# The same labels are used as cell types and as cell states
cell.type.labels <- cell_types$label
cell.state.labels <- cell_types$label

##### Validate if transposed count matrix exist

if (!file.exists(paste0(sample_dir, "/processed/bk.dat.RDS"))) {
  stop(paste0("[", sample_dir, "/processed/bk.dat.RDS] does not exist\n"))
} else {
  bk.dat <- readRDS(paste0(sample_dir, "/processed/bk.dat.RDS"))
  cat(paste0("\n[", sample_name, "] - ST count matrix read from file\n"))
}



#####



### Create output dir
# recursive = TRUE also creates <sample_dir>/analysis; showWarnings = FALSE
# keeps re-runs quiet when the directory already exists
output_dir <- paste0(sample_dir, "/analysis/deconvolution")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

### Run BayesPrism
myPrism <- new.prism(
  reference = final_ref,
  mixture = bk.dat,
  input.type = "count.matrix",
  cell.type.labels = cell.type.labels,
  cell.state.labels = cell.state.labels,
  # key="tumor",
  key = NULL,
  outlier.cut = 0.01,
  outlier.fraction = 0.1
)
cat(paste0("\n[", sample_name, "] - deconvolution started [", Sys.time(), "]\n"))
bp.res <- run.prism(
  prism = myPrism,
  n.cores = n_cores,
  gibbs.control = list(burn.in = burn.in, chain.length = chain.length),
  opt.control = list(maxit = maxit)
)
saveRDS(bp.res, paste0(output_dir, "/BayesPrism_results.RDS"))
gc()
cat(paste0("\n[", sample_name, "] - deconvolution done [", Sys.time(), "]\n"))



#####



### Save thetas
theta.cv <- bp.res@posterior.theta_f@theta.cv
theta <- get.fraction(bp = bp.res, which.theta = "final", state.or.type = "type")
# BayesPrism advises to mask theta with CV above 0.5 (Visium)
theta[theta.cv > 0.5] <- 0
# Re-normalise every spot (row) to sum to 1. A spot whose fractions were all
# masked has row sum 0 and becomes NaN, exactly as with the row-wise apply().
theta <- theta / rowSums(theta)
# drop = FALSE keeps a matrix when there is a single cell type
theta <- theta[, sort(colnames(theta)), drop = FALSE]
saveRDS(theta, paste0(output_dir, "/BayesPrism_theta.RDS"))

### Save deconvoluted cell-type-specific expressions
seurat_object <- readRDS(paste0(sample_dir, "/processed/Seurat.RDS"))
location <- fread(paste0(sample_dir, "/processed/coordinates.csv"))
meta <- seurat_object@meta.data
# Maps the Seurat cell names to the spot IDs stored in meta$new_spot_id
spot_id_mapping <- meta$new_spot_id
names(spot_id_mapping) <- rownames(meta)

# Genes of the input matrix that are not in the BayesPrism mixture are split
# evenly across the cell types
deconv_genes <- colnames(bp.res@prism@mixture)
not_deconv_genes <- colnames(bk.dat)
not_deconv_genes <- not_deconv_genes[!not_deconv_genes %in% deconv_genes]
# drop = FALSE: with exactly one such gene the subset would otherwise collapse
# to a vector and lose its gene name in the cbind() below
not_deconv_exp <- bk.dat[, not_deconv_genes, drop = FALSE]
not_deconv_exp <- not_deconv_exp / ncol(theta)

cell_type_names <- colnames(theta)
all_cell_types <- character(length(cell_type_names))
for (ct_i in seq_along(cell_type_names)) {
  cell_type <- cell_type_names[ct_i]
  # cell_type <- "CAFs"
  # Replace "/", " ", "-", "*" and "+" by "." so the name is safe in file names
  cell_type_s <- gsub("[/ *+-]", ".", cell_type)

  ct_exp <- get.exp(bp = bp.res, state.or.type = "type", cell.name = cell_type)
  ct_exp <- cbind(ct_exp, not_deconv_exp)

  # Transpose to genes x spots, with spots named by their mapped spot IDs
  counts_df <- data.table(ct_exp)
  counts_df$spot <- spot_id_mapping[rownames(ct_exp)]
  counts_df <- transpose(counts_df, keep.names = "gene", make.names = "spot")
  # Keep only spots that are present in coordinates.csv
  keep_spots <- intersect(colnames(counts_df), location$barcode)
  keep_spots <- c("gene", keep_spots)
  counts_df <- counts_df[, ..keep_spots]

  #### Write counts, coordinates, and meta_spots to file
  fwrite(counts_df, paste0(output_dir, "/counts_", cell_type_s, ".csv"), sep = ",")
  all_cell_types[ct_i] <- cell_type_s
}
# NOTE(review): written without a header; DGE-analysis-dec.R reads columns named
# cell_type_s and cell_type from a file of this name -- confirm which step
# produces that version
write.table(
  all_cell_types,
  paste0(output_dir, "/all_cell_types.txt"),
  col.names = FALSE, row.names = FALSE, quote = FALSE
)
cat(paste0("\n[", sample_name, "] - cell-type-specific expression matrices saved [", Sys.time(), "]\n"))

--------------------------------------------------------------------------------
/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import numpy as np 4 | from math import isnan 5 | import sys 6 | 7 | ds = sys.argv[1] 8 | sample = sys.argv[2] 9 | print('PPI analysis for: ', ds, sample) 10 | ppi_out_dir = '/projects/b1131/SpatialT/drug-target-cmap2-svg/'+ds+'/'+sample+'/PPI/' 11 | if not os.path.exists(ppi_out_dir): 12 | os.makedirs(ppi_out_dir) 13 | # for single cellular annotated samples 14 | #deg_dir = '/projects/b1131/SpatialT/drug-target/'+ds+'/'+sample+'/DGE_anno_SVG/' 15 | # for deconvoluted samples 16 | deg_dir = '/projects/b1131/SpatialT/drug-target/'+ds+'/'+sample+'/DGE_dec_SVG/' 17 | 18 | # read in gene reference and remove unformatted genes 19 | gene = pd.read_table('/projects/b1131/SpatialT/cmap_ppi_database/HGNC.tsv') 20 | gene = gene[['NCBI Gene ID(supplied by NCBI)','Approved symbol']] 21 | gene.columns = ['num','name'] 22 | lst = [] 23 | for i in gene['num'].tolist(): 24 | if pd.isna(i) != True: 25 | lst.append(int(i)) 26 | else: 27 | lst.append(i) 28 | gene['num'] = lst 29 | gene_dict = gene.set_index('num').to_dict()['name'] 30 | 31 | clean_hgnc_dict = filter(lambda k: not isnan(k), gene_dict) 32 | clean_hgnc_dict = {k: gene_dict[k] for k in gene_dict if not isnan(k)} 33 | clean_hgnc_dict = {int(k):v for k,v in clean_hgnc_dict.items()} 34 | 35 | # interactome for ppi network 36 | interactome = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/Interactome.tsv', sep='\t') 37 | 38 | # mapping genes to interactome 39 | unmappable_pro = [] 40 | Protein_A_Name = [] 41 | Protein_B_Name = [] 42 | 43 | for i in interactome['#Protein A'].tolist(): 44 | if i in clean_hgnc_dict.keys(): 45 | Protein_A_Name.append(clean_hgnc_dict[i] ) 46 | else: 47 | if i not in unmappable_pro: 48 | unmappable_pro.append(i) 49 | print(i) 50 | Protein_A_Name.append('drop') 51 | 52 | for i in 
interactome['Protein B'].tolist(): 53 | if i in clean_hgnc_dict.keys(): 54 | Protein_B_Name.append(clean_hgnc_dict[i] ) 55 | else: 56 | if i not in unmappable_pro: 57 | unmappable_pro.append(i) 58 | print(i) 59 | Protein_B_Name.append('drop') 60 | 61 | # drop unmappable interactome interactions 62 | interactome['Protein_A_Name'] = Protein_A_Name 63 | interactome['Protein_B_Name'] = Protein_B_Name 64 | dropa_index = interactome[interactome['Protein_A_Name'] == 'drop'].index.tolist() 65 | dropb_index = interactome[interactome['Protein_B_Name'] == 'drop'].index.tolist() 66 | print(len(dropa_index), len(dropb_index)) 67 | # 538 interactions removed due to unmappable gene number 68 | for i in dropa_index: 69 | if i not in dropb_index: 70 | dropb_index.append(i) 71 | len(dropb_index) 72 | clean_interactome = interactome.drop(dropb_index) 73 | print('interactome cleaned') 74 | 75 | # generate ppi network using SV-DGEs 76 | for deg_file in os.listdir(deg_dir): 77 | # read in deg-svg file and filter (no svg fitler placed on cell types that don't have SVG analysis ) 78 | cell_type = deg_file.split('.csv')[0] 79 | log2fc = pd.read_csv(deg_dir+deg_file) 80 | log2fc['abs_stat'] = [abs(i) for i in log2fc['stat'].tolist()] 81 | log2fc_top300 = log2fc.sort_values('abs_stat', ascending=False)[:300] 82 | log2fc_top300_sig = log2fc_top300[log2fc_top300['qval']<0.05] 83 | print('sv-deg file read in') 84 | 85 | # map degs to inteactome 86 | ppi_index_interactome_top300 = [] 87 | for i in clean_interactome.iterrows(): 88 | row = i[1] 89 | if row['Protein_A_Name'] in log2fc_top300_sig['gene'].tolist() and row['Protein_B_Name'] in log2fc_top300_sig['gene'].tolist(): 90 | ppi_index_interactome_top300.append(i[0]) 91 | ppi_top300 = clean_interactome[clean_interactome.index.isin(ppi_index_interactome_top300)][['#Protein A','Protein B', 'Protein_A_Name','Protein_B_Name']] 92 | print('sv-deg mapped to interactome') 93 | 94 | # drop ppis where both proteins are the same 95 | 
same_pro_index_top300 = [] 96 | for i in ppi_top300.iterrows(): 97 | row = i[1] 98 | if row['#Protein A'] == row['Protein B']: 99 | same_pro_index_top300.append(i[0]) 100 | ppi_top300_nored = ppi_top300.drop(same_pro_index_top300) 101 | 102 | # format edges list for output 103 | ppi_top300_nored = ppi_top300_nored[['Protein_A_Name','Protein_B_Name']] 104 | ppi_top300_nored.columns = ['Source','Target'] 105 | 106 | # format nodes list for output 107 | ppi_uniq_pro = list(set(ppi_top300_nored['Source'].tolist()) ) 108 | for i in ppi_top300_nored['Target'].tolist(): 109 | if i not in ppi_top300_nored['Source'].tolist() and i not in ppi_uniq_pro : 110 | ppi_uniq_pro.append(i) 111 | 112 | ppi_nodes_top300 = pd.DataFrame() 113 | ppi_nodes_top300['ID'] = ppi_uniq_pro 114 | ppi_nodes_top300['Label'] = ppi_uniq_pro 115 | 116 | ppi_nodes_top300 = pd.merge(ppi_nodes_top300, log2fc[['gene','stat']], how = 'left', right_on = 'gene', left_on = 'Label') 117 | ppi_nodes_top300 = ppi_nodes_top300.drop('gene',axis = 1) 118 | 119 | ppi_nodes_top300['sign'] = [np.sign(i) for i in ppi_nodes_top300['stat'].tolist()] 120 | ppi_nodes_top300['abs_stat'] = [abs(i) for i in ppi_nodes_top300['stat'].tolist()] 121 | 122 | # save 123 | ppi_nodes_top300.to_csv(ppi_out_dir+cell_type+'_ppi_nodes.csv', index=False) 124 | ppi_top300_nored.to_csv(ppi_out_dir+cell_type+'_ppi.csv', index=False) 125 | print(cell_type,'ppi saved') 126 | -------------------------------------------------------------------------------- /data_analysis/drug_discovery/DGE/deconvoluted/DGE-analysis-dec.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### Author: Yiming 4 | ### 5 | ### Description: This script performs DGE analysis (using pseudo-cell data) 6 | 7 | library(Seurat) 8 | library(data.table) 9 | library(dplyr) 10 | library(stringr) 11 | 12 | ### Define paths and variables 13 | args <- commandArgs(trailingOnly=TRUE) 14 | st_dir <- 
"/projects/b1131/SpatialT" 15 | dt_dir <- "/projects/b1131/SpatialT/drug-target/" 16 | sample_dir <- args[1] 17 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/" 18 | # sample_dir <- "/projects/b1131/SpatialT/DBiT-seq/PID150/DS150A/DS150A.GSM4096261/" 19 | 20 | ds_name <- str_split(sample_dir, '/')[[1]][7] 21 | tech <- str_split(sample_dir, '/')[[1]][5] 22 | p_name <- str_split(sample_dir, '/')[[1]][6] 23 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/") 24 | sample_name <- str_split(sample_dir, '/')[[1]][8] 25 | 26 | ### Read all possible cell types 27 | all_cell_types_df <- fread(paste0(sample_dir, "/analysis/deconvolution/all_cell_types.txt")) 28 | all_cell_types <- all_cell_types_df$cell_type_s 29 | ct_mapping <- all_cell_types_df$cell_type 30 | names(ct_mapping) <- all_cell_types 31 | 32 | ### Read Seurat object 33 | seurat_object_tn_path <- paste0(sample_dir, "processed/Seurat.RDS") 34 | seurat_object_tn <- readRDS(seurat_object_tn_path) 35 | spot_id_mapping <- seurat_object_tn@meta.data$new_spot_id 36 | names(spot_id_mapping) <- as.character(rownames(seurat_object_tn@meta.data)) 37 | 38 | ### Read cell type fractions 39 | theta <- readRDS(paste0(sample_dir, "/analysis/deconvolution/BayesPrism_theta.RDS")) 40 | rownames(theta) <- as.character(spot_id_mapping[rownames(theta)]) 41 | 42 | ### DGE results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/DGE 43 | output_dir <- paste0(sample_dir, "analysis/DGE") 44 | if (!dir.exists(output_dir)) { 45 | dir.create(output_dir) 46 | } 47 | dir.create(paste0(dt_dir, ds_name)) 48 | dir.create(paste0(dt_dir, ds_name, "/", sample_name)) 49 | dir.create(paste0(dt_dir, ds_name, "/", sample_name, "/DGE_dec")) 50 | 51 | ### Create pseudo-cell-level object 52 | if (file.exists(paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS"))) { 53 | # if (FALSE) { 54 | binded_so <- readRDS(paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS")) 55 | } else { 56 | ct_exp <- 
list() 57 | total_cells_each_ct <- list() 58 | for (cell_type in all_cell_types) { 59 | # cell_type <- "Malignant" 60 | # cell_type <- all_cell_types[6] 61 | exp_mat <- fread(paste0(sample_dir, "/analysis/deconvolution/counts_", cell_type, "_deconv_only.csv")) 62 | genes <- exp_mat$gene 63 | exp_mat$gene <- NULL 64 | exp_mat <- as.matrix(exp_mat) 65 | rownames(exp_mat) <- genes 66 | 67 | # Divide by fraction and leave out the spots with zero fraction of this cell type 68 | fractions <- theta[,as.character(ct_mapping[cell_type])] 69 | non_zero_fraction_spots <- names(which(fractions != 0)) 70 | exp_mat <- t(apply(exp_mat, 1, function(x) x / fractions)) 71 | exp_mat[is.na(exp_mat)] <- 0 72 | exp_mat <- exp_mat[, non_zero_fraction_spots, drop = FALSE] 73 | 74 | ct_exp[[cell_type]] <- exp_mat 75 | total_cells_each_ct[[cell_type]] <- ncol(exp_mat) 76 | } 77 | ct_exp <- do.call(cbind, ct_exp) 78 | colnames(ct_exp) <- paste0("sp", 1:ncol(ct_exp)) 79 | saveRDS(ct_exp, paste0(sample_dir, "analysis/deconvolution/binded_exp.RDS")) 80 | 81 | # Remove "pseudo-cells" with zero deconvoluted expression in all the genes 82 | # Initially this was done because CellChat does not accept objects with zero total expression cells 83 | col_sums <- colSums(ct_exp) 84 | kept_pseudo_cells <- names(which(col_sums > 0)) 85 | all_cell_types <- all_cell_types[total_cells_each_ct != 0] 86 | total_cells_each_ct <- total_cells_each_ct[total_cells_each_ct != 0] 87 | meta <- data.frame(labels = rep(all_cell_types, times = total_cells_each_ct)) 88 | meta$cell_id <- paste0("sp", 1:ncol(ct_exp)) 89 | meta <- meta[meta$cell_id %in% kept_pseudo_cells,] 90 | row.names(meta) <- meta$cell_id 91 | meta$cell_id <- NULL 92 | ct_exp_less <- ct_exp[,kept_pseudo_cells] 93 | 94 | rm(seurat_object_tn) 95 | binded_so <- CreateSeuratObject(ct_exp_less) 96 | binded_so[["cell_type"]] <- meta$labels 97 | binded_so <- SCTransform(binded_so, verbose = FALSE, return.only.var.genes = FALSE) 98 | saveRDS(binded_so, 
paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS")) 99 | } 100 | 101 | ### Perform DGE analysis on different cell types 102 | annotations <- binded_so[["cell_type"]]$cell_type 103 | names(annotations) <- rownames(binded_so[["cell_type"]]) 104 | Idents(binded_so) <- annotations 105 | 106 | if (length(table(binded_so[["cell_type"]])) == 1) { 107 | fwrite(data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0)), paste0(output_dir, "/DGE_cell_types_dec.tsv"), sep = "\t") 108 | cat("\n\n### Only one annotated cell type -- skipping DGE analysis (cell types).") 109 | cat("\n# Writing empty DGE_cell_types_dec.tsv to file.") 110 | } else { 111 | # https://satijalab.org/seurat/archive/v3.1/future_vignette.html 112 | options(future.globals.maxSize = 5000 * 1024^2) 113 | DGE_cell_types <- FindAllMarkers(binded_so, assay = "SCT", logfc.threshold = 0.1, min.pct = 0.1, verbose = FALSE) 114 | if (nrow(DGE_cell_types) == 0) { 115 | fwrite(data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0)), paste0(output_dir, "/DGE_cell_types_dec.tsv"), sep = "\t") 116 | cat("\n\n### DGE analysis (cell types) cannot be performed due to having too few spots in one/many of the cell types.") 117 | cat("\n# Writing empty DGE_cell_types_dec.tsv to file.") 118 | } else { 119 | DGE_cell_types$cluster <- as.character(DGE_cell_types$cluster) 120 | fwrite(DGE_cell_types, paste0(output_dir, "/DGE_cell_types_dec.tsv"), sep = "\t") 121 | 122 | for (cell_type in sort(unique(DGE_cell_types$cluster))) { 123 | # cell_type <- "Malignant" 124 | DGE_cell_types_less <- DGE_cell_types[DGE_cell_types$cluster == cell_type,] 125 | DGE_cell_types_less <- DGE_cell_types_less[,c("gene", "avg_log2FC", "p_val", "p_val_adj")] 126 | colnames(DGE_cell_types_less) <- c("gene", "stat", "pval", "qval") 127 | fwrite(DGE_cell_types_less, 
paste0(dt_dir, ds_name, "/", sample_name, "/DGE_dec/", cell_type, ".txt"), sep = "\t") 128 | } 129 | cat("\n\n### DGE analysis (cell types) results written to file.") 130 | } 131 | } 132 | tt4 <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14]) 133 | cat(paste0("\n### Analysis completed; max memory consumed: ", as.character(tt4), "M -- [", Sys.time(), "]\n\n")) 134 | -------------------------------------------------------------------------------- /data_curation/geo-query.py: -------------------------------------------------------------------------------- 1 | ### Usage: python3 geo-query.py 2 | ### Author: Yiming Li 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import requests 7 | import xml.etree.ElementTree as ET 8 | import time, os, shutil, sys 9 | 10 | def fetch_species_GDS(species): 11 | # species = "mouse" 12 | urls = ["http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+transcriptomics+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+transcriptome+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+RNA-seq+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+RNA+sequencing+AND+" + species + "[organism]&retmax=100000&usehistory=y"] 13 | items = [] 14 | query_keys = [] 15 | WebEnvs = [] 16 | for url in urls: 17 | resp = requests.get(url) 18 | with open('tmp2.xml', 'wb') as f: 19 | f.write(resp.content) 20 | 21 | with open('tmp2.xml', 'r') as file: 22 | xml_text = file.read() 23 | 24 | if "API rate limit exceeded" in xml_text: 25 | sys.exit('[ERROR] E-utils API rate limit exceeded') 26 | 27 | tree = ET.parse('tmp2.xml') 28 | root = tree.getroot() 29 | query_keys.append(root.findall('./QueryKey')[0].text) 30 | WebEnvs.append(root.findall('./WebEnv')[0].text) 31 | for item in 
root.findall('./IdList/Id'): 32 | items.append(item.text) 33 | 34 | return(items, query_keys, WebEnvs) 35 | 36 | def fetch_PMIDs(query_keys, WebEnvs): 37 | pmids = [] 38 | items = [] 39 | for query_key, WebEnv in zip(query_keys, WebEnvs): 40 | url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=gds&db=pubmed&query_key=' + str(query_key) + '&WebEnv=' + str(WebEnv) 41 | resp = requests.get(url) 42 | with open('tmp3.xml', 'wb') as f: 43 | f.write(resp.content) 44 | 45 | with open('tmp3.xml', 'r') as file: 46 | xml_text = file.read() 47 | 48 | if "API rate limit exceeded" in xml_text: 49 | sys.exit('[ERROR] E-utils API rate limit exceeded') 50 | 51 | tree = ET.parse('tmp3.xml') 52 | root = tree.getroot() 53 | 54 | for item in root.findall('./LinkSet/IdList/Id'): 55 | items.append(item.text) 56 | 57 | for pmid in root.findall('./LinkSet/LinkSetDb/Link/Id'): 58 | pmids.append(pmid.text) 59 | 60 | return(items, pmids) 61 | 62 | def loadRSS(gds_id): 63 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&id=' + str(gds_id) 64 | resp = requests.get(url) 65 | with open('tmp.xml', 'wb') as f: 66 | f.write(resp.content) 67 | 68 | def parseXML(xmlfile): 69 | with open(xmlfile, 'r') as file: 70 | xml_text = file.read() 71 | 72 | if "API rate limit exceeded" in xml_text: 73 | sys.exit('[ERROR] E-utils API rate limit exceeded') 74 | 75 | tree = ET.parse(xmlfile) 76 | root = tree.getroot() 77 | items = [] 78 | for item in root.findall('./DocSum/Item'): 79 | if (item.attrib['Name'] in ["Accession", "title", "summary", "GPL", "GSE", "taxon", "gdsType", "FTPLink"]): 80 | items.append(item.text) 81 | 82 | return(items) 83 | 84 | def fetch_species_meta(species, save_dir): 85 | print("### Starting initial query for [" + species + "]") 86 | species_ids, query_keys, WebEnvs = fetch_species_GDS(species) 87 | query_keys_df = pd.DataFrame({"query_key": query_keys, "WebEnv": WebEnvs}) 88 | query_keys_df.to_csv(save_dir + "/" + species + 
"_query_keys.tsv", sep='\t', index = False) 89 | 90 | current_species_ids = [line.strip() for line in open(species + "_GDS_current.txt", 'r')] 91 | species_ids = list(set(species_ids) - set(current_species_ids)) 92 | species_ids_concat = list(set(species_ids) | set(current_species_ids)) 93 | print("### Initial query for [" + species + "] completed\n### Number of species GDS IDs: " + str(len(current_species_ids)) + " (previous), " + str(len(species_ids)) + " (this query), " + str(len(species_ids_concat)) + " (concatenated)") 94 | 95 | ### Get meta-information 96 | print("### Starting query for [" + species + "] meta-information") 97 | results = [] 98 | for gds_id in species_ids: 99 | loadRSS(gds_id) 100 | time.sleep(1) 101 | results.append(parseXML('tmp.xml')) 102 | 103 | results = pd.DataFrame(results, columns=["Accession", "title", "summary", "GPL", "GSE", "taxon", "gdsType", "FTPLink"]) 104 | results["GDS_ID"] = species_ids 105 | results["Organism"] = species 106 | results.to_csv(save_dir + "/" + species + ".tsv", sep='\t', index = False) 107 | print("### Meta-information of [" + species + "] GDS IDs saved to " + save_dir + "/" + species + ".tsv") 108 | 109 | ### Save GDS lists 110 | with open(save_dir + "/" + species + "_GDS_" + save_dir + ".txt", 'w') as f: 111 | for line in species_ids: 112 | f.write(f"{line}\n") 113 | 114 | shutil.copyfile(species + "_GDS_current.txt", species + "_GDS_current.txt.bk") 115 | 116 | with open(species + "_GDS_current.txt", 'w') as f: 117 | for line in species_ids_concat: 118 | f.write(f"{line}\n") 119 | 120 | print("### " + species + "_GDS_current.txt overwritten") 121 | 122 | print("### Starting query for [" + species + "] PMIDs") 123 | gds_ids, pmids = fetch_PMIDs(query_keys, WebEnvs) 124 | pmids_df = pd.DataFrame({"PMID": pmids}) 125 | pmids_df.to_csv(save_dir + "/" + species + "_PMIDs.tsv", sep='\t', index = False) 126 | print("### PMIDs associated with [" + species + "] saved to " + save_dir + "/" + species + "_PMIDs.tsv") 
127 | 128 | print("### [" + species + "] completed\n") 129 | return(results) 130 | 131 | ########################################################################################## 132 | 133 | owd = os.getcwd() 134 | os.chdir("/share/fsmresfiles/SpatialT/GEO_query") 135 | save_dir = time.strftime("%Y%m%d") 136 | os.makedirs(save_dir, exist_ok = True) 137 | 138 | ### Get mouse GDS IDs 139 | results_mouse = fetch_species_meta("mouse", save_dir) 140 | 141 | ### Get human GDS IDs 142 | results_human = fetch_species_meta("human", save_dir) 143 | 144 | ### Combine results 145 | results_mouse = pd.read_csv(save_dir + "/mouse.tsv", sep = '\t') 146 | results_human = pd.read_csv(save_dir + "/human.tsv", sep = '\t') 147 | results_all = pd.concat([results_mouse, results_human]) 148 | results_all = results_all.sort_values(by=['GDS_ID', 'Organism']) 149 | 150 | results_all['Accession_is_GSM'] = results_all['Accession'].str.startswith('GSM', na=False) 151 | results_all['GSM'] = np.where(results_all['Accession_is_GSM'], results_all['Accession'], "") 152 | results_all['Accession'] = np.where(results_all['Accession_is_GSM'], "", results_all['Accession']) 153 | results_all['Is ST data'] = "" 154 | results_all['Technology'] = "" 155 | results_all['Platform'] = "" 156 | results_all['Add to SOAR'] = "" 157 | results_all['PMID'] = "" 158 | results_all['GSE'] = "GSE" + results_all['GSE'].apply(str) 159 | 160 | reordered_columns = ['GDS_ID', 'Organism', 'Accession', "GSE", "GSM", 'Is ST data', 'Technology', 'Platform', 'PMID', 'Add to SOAR', 'title', 'summary', 'GPL', 'taxon', 'gdsType', 'FTPLink'] 161 | results_all = results_all[reordered_columns] 162 | results_all = results_all.sort_values(by=['GSE']) 163 | results_all.to_csv(save_dir + "/all.tsv", sep='\t', index = False) 164 | 165 | ### Clean up 166 | os.remove("tmp.xml") 167 | os.remove("tmp2.xml") 168 | os.remove("tmp3.xml") 169 | os.chdir(owd) 170 | -------------------------------------------------------------------------------- 
/data_analysis/cell_typing/deconvolution/process_reference_example.R:
--------------------------------------------------------------------------------
### conda activate R4
library(data.table)
library(splitstackshape)
library(Seurat)
library(BayesPrism)
library(SingleCellExperiment)
library(scuttle)

### This script uses Thymus, Human as an example
### This script assumes that:
### * A Seurat object of the identified scRNA-seq reference data has already been created
### * The annotated cell types are stored in seurat.object[["label"]]
organ <- "Thymus"
species <- "Human"
seurat.object <- readRDS("/share/fsmresfiles/SpatialT/ref/Tabula_Sapiens/Thymus/Seurat.RDS")
qc_plot_output_dir <- "~/stbase" # Change this to your own directory

### Maximum number of reference cells kept by the stratified sampling step
max_ref_cells <- 30000

### Draw the nCount_RNA / nFeature_RNA violin plots of a Seurat object into
### <qc_plot_output_dir>/sc_ref_<organ>_<species>_<stage>.pdf
save_qc_violins <- function(object, stage) {
  pdf(paste0(qc_plot_output_dir, "/sc_ref_", organ, "_", species, "_", stage, ".pdf"))
  on.exit(dev.off(), add = TRUE) # Close the device even if plotting fails
  print(VlnPlot(object, features = "nCount_RNA"))
  print(VlnPlot(object, features = "nFeature_RNA"))
  invisible(NULL)
}

#### Create save path (<root>/<organ>/<species>)
save_path <- paste0("/share/fsmresfiles/SpatialT/ref/final/", organ, "/", species)
if (!dir.exists(save_path)) {
  dir.create(save_path, recursive = TRUE)
}

#### Get # cells and # genes before QC
n_cells <- ncol(seurat.object)
n_genes <- nrow(seurat.object)
gc()

#### Get metadata
### Brain references are stratified by "subclass_label", so that column has to
### be kept in the metatable alongside "label"
strat_col <- if (organ == "Brain") "subclass_label" else "label"
meta <- seurat.object@meta.data
meta$cell_id <- rownames(meta)
meta <- meta[, unique(c("cell_id", "label", strat_col))]
meta <- meta[!is.na(meta$label), ]

### Perform stratified sampling (by cell type labels) on the reference if the total number of cells is larger than 30000 to avoid out-of-memory error
### You may change max_ref_cells to a slightly larger number or skip this step if QC drops a lot of cells
### The sampling fraction is relative to the labelled cells, i.e. the rows of the table being sampled
if (nrow(meta) > max_ref_cells) {
  meta <- stratified(meta, strat_col, size = max_ref_cells / nrow(meta))
}
seurat.object[["cell_id"]] <- colnames(seurat.object)
seurat.object <- subset(seurat.object, subset = cell_id %in% meta$cell_id)
n_cells_strat <- ncol(seurat.object)

### Get cell type proportions, formatted as "type (count, percentage), ..."
cell_type_string <- table(meta$label)
percs <- paste0(as.character(round(as.numeric(cell_type_string / sum(cell_type_string) * 100), 2)), "%")
cell_type_string2 <- paste0(names(cell_type_string), " (", as.character(round(as.numeric(cell_type_string), 2)), ", ", percs, ")")
cell_type_string2 <- paste(cell_type_string2, collapse = ", ")

### Sort metadata by Seurat object's cell order and make sure both agree
meta <- meta[match(colnames(seurat.object), meta$cell_id), ]
stopifnot(all(meta$cell_id == colnames(seurat.object)))

### Cell QC
### !!! Please perform cell QC case-by-case instead of using uniform thresholds
### Mitochondrial genes are prefixed "MT-" in human and "mt-" in mouse gene symbols
mt_pattern <- if (species == "Human") "^MT-" else "^mt-"
seurat.object[["percent_mt"]] <- PercentageFeatureSet(seurat.object, pattern = mt_pattern)
seurat.object.bk <- seurat.object
save_qc_violins(seurat.object, "beforeQC")

### * Change the nCount_RNA and nFeature_RNA thresholds based on the violin plots
### (restoring from the backup allows re-running this step with other thresholds)
seurat.object <- seurat.object.bk
seurat.object <- seurat.object[, seurat.object$nCount_RNA > 500 & seurat.object$nFeature_RNA > 250 & seurat.object$percent_mt < 20]
(n_cells_qc1 <- ncol(seurat.object))
save_qc_violins(seurat.object, "QC1")

### * Change the two thresholds based on the violin plots
seurat.object2 <- seurat.object[, seurat.object$nCount_RNA < 30000 & seurat.object$nFeature_RNA < 6000]
n_cells_qc2 <- ncol(seurat.object2)
save_qc_violins(seurat.object2, "QC2")

### Overwrite the Seurat object if the second two violin plots look okay
# seurat.object
# seurat.object2
seurat.object <- seurat.object2
# seurat.object

### Write the QC-ed Seurat object and the metadata to file
meta <- seurat.object@meta.data
meta$cell_id <- rownames(meta)
meta <- meta[, c("cell_id", "label")]
fwrite(meta, paste0(save_path, "/cell_types.txt"), sep = "\t")
saveRDS(seurat.object, paste0(save_path, "/Seurat.RDS"))

### Generate SingleCellExperiment object for cell type annotation
### NOTE(review): GetAssayData() is called without an explicit slot/layer, so it
### returns the assay's default data; the steps below treat the result as raw
### counts -- confirm that the reference object has not been normalized
counts <- GetAssayData(seurat.object, assay = "RNA")
meta$cell_id <- NULL
ref_data_sce <- SingleCellExperiment(list(counts = counts), colData = meta)
ref_data_sce <- logNormCounts(ref_data_sce)
saveRDS(ref_data_sce, file = paste0(save_path, "/SCE.RDS"))

### Generate data for BayesPrism (cell type deconvolution)
### * Counts data as a dense cell-by-gene matrix (rows = cells, columns = genes)
counts <- t(as.matrix(GetAssayData(seurat.object, assay = "RNA")))
min_count <- min(counts)
max_count <- max(counts)
saveRDS(counts, paste0(save_path, "/mat_transposed.RDS"))
gc()

### * BayesPrism gene filtering step 1
if (species == "Human") {
  sc.dat.filtered <- cleanup.genes(input=counts, input.type="count.matrix", species="hs", gene.group = c("Rb", "Mrp", "other_Rb", "chrM", "MALAT1","chrX","chrY"), exp.cells=5)
  sc.dat.filtered.pc <- select.gene.type(sc.dat.filtered, gene.type = "protein_coding") # only works for human
} else {
  sc.dat.filtered.pc <- cleanup.genes(input=counts, input.type="count.matrix", species="mm", gene.group = c("Rb", "Mrp", "other_Rb", "chrM","chrX","chrY"), exp.cells=5)
}

### * BayesPrism gene filtering step 2
### ("psuedo.count" is spelled as in the original call and is kept unchanged)
cell.type.labels <- meta$label
cell.state.labels <- meta$label
diff.exp.stat <- get.exp.stat(sc.dat=counts[,colSums(counts>0)>3], cell.type.labels=cell.type.labels, cell.state.labels=cell.state.labels, psuedo.count=0.1, cell.count.cutoff=50, n.cores=1)

### !!! Check that all cell types have > 50 marker genes
### This threshold can be more lenient for sparser cell types
### Change pval.max and lfc.min to get more genes
sc.dat.filtered.pc.sig <- select.marker(sc.dat=sc.dat.filtered.pc, stat=diff.exp.stat, pval.max=0.01, lfc.min=0.1)

### You may check if ngenes_filt2 is around 5000
### If the number is too small (e.g. < 2000), we may need to use ngenes_filt1 for deconvolution instead
(ngenes_filt1 <- ncol(sc.dat.filtered.pc))
(ngenes_filt2 <- ncol(sc.dat.filtered.pc.sig))

### Save the BayesPrism-filtered references to file
saveRDS(sc.dat.filtered.pc, paste0(save_path, "/sc.dat.filtered.pc.RDS"))
saveRDS(sc.dat.filtered.pc.sig, paste0(save_path, "/sc.dat.filtered.pc.sig.RDS"))

results <- data.frame(organ = organ, species = species, save_path = save_path, ncells = n_cells, ngenes = n_genes, ncells_qc1 = n_cells_qc1, ncells_qc2 = n_cells_qc2, n_cells_strat = n_cells_strat, cell_types = cell_type_string2, min_count = min_count, max_count = max_count, ngenes_filt1 = ngenes_filt1, ngenes_filt2 = ngenes_filt2)
fwrite(results, paste0(save_path, "/summary_stats.txt"), sep = "\t")
gc()
results
cat(paste0("\n>>> ", save_path, " finished\n"))
--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/neighborhood-based/adj-analysis.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

### Author: Yiming Li
###
### Description: This script performs neighborhood-based analysis
### Usage: adj-analysis.R $sample_dir

library(Seurat)
library(plyr)
library(data.table)
library(stringr)

### Read the sample directory from the command line
args <- commandArgs(trailingOnly=TRUE)
if (length(args) < 1) {
  stop("Usage: adj-analysis.R $sample_dir", call. = FALSE)
}

st_dir <- "/projects/b1131/SpatialT"
# st_dir <- "/share/fsmresfiles"

sample_dir <- args[1]
# sample_dir <- "/projects/b1131/SpatialT/10x/PID5/DS5A/DS5A.06_151670"
# sample_dir <- "/share/fsmresfiles/SpatialT/10x/PID5/DS5A/DS5A.12_151676"
### The path components are addressed by fixed position, i.e. sample_dir is
### expected to look like /<a>/<b>/<c>/<tech>/<project>/<dataset>/<sample>
path_parts <- str_split(sample_dir, '/')[[1]]
tech <- path_parts[5]
p_name <- path_parts[6]
ds_name <- path_parts[7]
sample_name <- path_parts[8]
ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")

### Functions

### Euclidean distance between (row1, col1) and (row2, col2); vectorized over
### the second point
get_dist <- function(row1, col1, row2, col2) {
  sqrt((row1-row2)^2 + (col1-col2)^2)
}

### Number of second points that lie within one row AND one column of
### (row1, col1)
get_adjacency <- function(row1, col1, row2, col2) {
  sum(abs(row1-row2) <= 1 & abs(col1-col2) <= 1)
}

### For one spot at (row, col) annotated as cell_type, compute against every
### other cell type in one_cell_type_dfs (a named list of data frames with
### `row` and `col` columns): the minimum distance, the median distance and the
### number of adjacent spots. Returns a named numeric vector
### (<type>_min ..., <type>_median ..., <type>_adjacent ...); the entries of the
### spot's own cell type stay NA.
get_dist_from_cell_type <- function(row, col, cell_type, one_cell_type_dfs) {
  cell_types <- names(one_cell_type_dfs)
  distances <- rep(NA_real_, length(cell_types) * 3)
  names(distances) <- c(paste0(cell_types, "_min"), paste0(cell_types, "_median"), paste0(cell_types, "_adjacent"))
  for (compared_cell_type in cell_types) {
    ### is.na() guard: a spot without annotation is compared against all types
    if (!is.na(cell_type) && cell_type == compared_cell_type) {
      next
    }
    compared_cell_type_df <- one_cell_type_dfs[[compared_cell_type]]
    tmp <- get_dist(row, col, compared_cell_type_df$row, compared_cell_type_df$col)
    distances[paste0(compared_cell_type, "_min")] <- min(tmp)
    distances[paste0(compared_cell_type, "_median")] <- median(tmp)
    distances[paste0(compared_cell_type, "_adjacent")] <- get_adjacency(row, col, compared_cell_type_df$row, compared_cell_type_df$col)
  }
  distances
}

### Start adjacency-based analysis
### Results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/Distance
output_dir <- paste0(sample_dir, "/analysis/Distance")
if (!dir.exists(output_dir)) {
  ### recursive: <sample_dir>/analysis may not exist yet
  dir.create(output_dir, recursive = TRUE)
}
results_path <- paste0(output_dir, "/cci_adj_results.tsv")
stats_path <- paste0(output_dir, "/distance_stats_dec_max.tsv")

### Check if the analysis has already been done
if (file.exists(results_path)) {
  tmpdf <- fread(results_path)
  if ("wilcox_pval_two_sided" %in% colnames(tmpdf)) {
    ### Save old results to a separate file
    file.copy(results_path, paste0(output_dir, "/cci_adj_results_old.tsv"), overwrite = TRUE)
  } else if ("avg_log2FC" %in% colnames(tmpdf)) {
    stop("\n### Adjacency analysis already done.\n")
  }
}

### Read in the QC-ed + transformed + processed Seurat object (from process_visium_standard.R)
seurat_object_tn_path <- paste0(sample_dir, "/processed/Seurat.RDS")
seurat_object_tn <- readRDS(seurat_object_tn_path)
cat("\n\n### Processed Seurat object read.\n\n### Starting adjacency analysis...\n")
### !!! Assuming that the Seurat object has undergone SCT and clustering

### Get the table with calculated distance-based statistics
if (file.exists(stats_path)) {
  stats_message <- "\n### Distance-based statistics read from file.\n"
} else {
  ### Get coordinates and cell type annotations
  annotations <- seurat_object_tn@meta.data$cell_type_dec_max
  if ("slice1" %in% names(seurat_object_tn@images)) {
    coords <- seurat_object_tn@images$slice1@coordinates ### Visium
    coords <- coords[, c("row", "col")] ### Use the spot coordinates
  } else {
    coords <- seurat_object_tn@images$image@coordinates ### Others
    if ("x" %in% colnames(coords)) {
      coords <- coords[, c("x", "y")] ### Use the spot coordinates
    } else {
      ### Some prepared MERFISH datasets did not follow the naming standard
      coords <- coords[, c("xcoord", "ycoord")] ### Use the spot coordinates
    }
  }
  colnames(coords) <- c("row", "col")
  coords <- tibble::rownames_to_column(coords, "spot")
  coords$annotation <- annotations

  ### Get distance metrics, currently supporting:
  ### * Minimum distance from another cell type (the same cell type -- marked as NA)
  ### * Median distance from another cell type (the same cell type -- marked as NA)
  ### * Number of adjacent spots of another cell type (0 = not adjacent; the same cell type -- marked as NA)
  cell_types <- sort(unique(annotations))
  one_cell_type_dfs <- list()
  for (cell_type in cell_types) {
    ### which() drops spots whose annotation is NA
    one_cell_type_dfs[[cell_type]] <- coords[which(coords$annotation == cell_type),]
  }
  df_colnames <- c("spot", paste0(cell_types, "_min"), paste0(cell_types, "_median"), paste0(cell_types, "_adjacent"))

  ### One column of statistics per spot, computed in a single pass instead of
  ### growing a data frame row by row
  distance_mat <- vapply(
    seq_len(nrow(coords)),
    function(i) {
      get_dist_from_cell_type(coords$row[i], coords$col[i], coords$annotation[i], one_cell_type_dfs)
    },
    numeric(length(df_colnames) - 1)
  )
  distance_stats_df <- data.frame(spot = coords$spot, t(distance_mat), check.names = FALSE, stringsAsFactors = FALSE)
  colnames(distance_stats_df) <- df_colnames

  coords <- join(coords, distance_stats_df, by = "spot")
  coords$row <- NULL
  coords$col <- NULL
  fwrite(coords, stats_path, sep = "\t")
  stats_message <- "\n### Distance-based statistics calculated.\n"
}

### Always (re-)read the table so that coords is a data table in both cases
coords <- fread(stats_path, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
coords$spot <- as.character(coords$spot) ### In case the spot IDs are numeric
cell_types <- sort(unique(seurat_object_tn@meta.data$cell_type_dec_max))
cat(stats_message)

### Test if in all cells of cell_type1, the gene expression levels in those adjacent / not adjacent to cell_type2 differ significantly
cci_adj_results_df <- data.frame(gene = character(0), cell_type1 = character(0), cell_type2 = character(0), adjacency = character(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val = numeric(0), p_val_adj = numeric(0))
cat("\n### Adjacency analysis started.\n")

if (length(cell_types) == 1) {
  cat(paste0("\n# Only one cell type present in the sample. Skipping this sample"))
  ### Nothing to compare: end the script cleanly (`next` is only valid inside a loop)
  quit(save = "no", status = 0)
}
cat(paste0("\n# ", as.character(length(cell_types)), " cell types in total."))

### Per-combination result tables are collected here and bound once at the end
dge_results <- list()

for (cell_type in cell_types) {
  # cell_type <- cell_types[1]
  coords_cell_type <- coords[coords$annotation == cell_type,]
  other_cell_types <- setdiff(cell_types, cell_type)
  seurat_cell_type <- seurat_object_tn[, coords_cell_type$spot]
  meta <- seurat_cell_type@meta.data
  meta <- tibble::rownames_to_column(meta, "spot")

  for (other_cell_type in other_cell_types) {
    # other_cell_type <- other_cell_types[1]
    n_adjacent <- coords_cell_type[[paste0(other_cell_type, "_adjacent")]]
    adjacent_cells <- coords_cell_type$spot[n_adjacent > 0]
    not_adjacent_cells <- coords_cell_type$spot[n_adjacent == 0]
    if (length(adjacent_cells) == 0 || length(not_adjacent_cells) == 0) {
      ### All cell_type cells are adjacent to or not adjacent to other_cell_type cells
      ### Skip this combination
      next
    }
    adj_df <- rbind(data.frame(spot = adjacent_cells, adjacency = "Adjacent"), data.frame(spot = not_adjacent_cells, adjacency = "Not adjacent"))
    adj_df <- join(meta, adj_df, by = "spot")
    adjacency <- adj_df$adjacency
    names(adjacency) <- adj_df$spot
    Idents(seurat_cell_type) <- adjacency

    ### Thresholds decided based on: https://www.nature.com/articles/s41467-019-12266-7
    ### Genes with absolute log2 fold change threshold > 0.1 and expressed in at least 10% of the cells are considered
    DGE_adjacency <- FindAllMarkers(seurat_cell_type, assay = "SCT", logfc.threshold = 0.1, min.pct = 0.1, verbose = FALSE)
    if (nrow(DGE_adjacency) == 0) {
      ### No DGE genes found
      next
    }
    DGE_adjacency$cell_type1 <- cell_type
    DGE_adjacency$cell_type2 <- other_cell_type
    DGE_adjacency$adjacency <- as.character(DGE_adjacency$cluster)

    DGE_adjacency <- DGE_adjacency[,c("gene", "cell_type1", "cell_type2", "adjacency", "avg_log2FC", "pct.1", "pct.2", "p_val", "p_val_adj")]
    dge_results[[length(dge_results) + 1]] <- DGE_adjacency
    gc()
  }
  ### Sum of the "max used (Mb)" column of gc(); reset = TRUE restarts the
  ### maximum for the next cell type
  mem_stats <- gc(verbose = FALSE, reset = TRUE, full = TRUE)
  tt <- sum(mem_stats[, ncol(mem_stats)])
  cat(paste0("\n# Cell type [", cell_type, "] done; memory consumed: ", as.character(tt), "M -- [", Sys.time(), "]\n\n"))
}

### The empty template comes first so that the header is written even without results
cci_adj_results_df <- do.call(rbind, c(list(cci_adj_results_df), dge_results))
fwrite(cci_adj_results_df, results_path, sep = "\t")
cat("\n\n### Adjacency analysis completed. Results written to cci_adj_results.tsv.\n")

--------------------------------------------------------------------------------
/data_analysis/cell_typing/reference/ref_data_processing_example.R:
--------------------------------------------------------------------------------
### This is a pseudo-script demonstrating the possible steps of processing the downloaded scRNA-seq datasets
### All the filenames are hard-wired, and this script is for your reference only
###
### Author: Yiming Li

### This example uses a brain scRNA-seq dataset:
### cd /share/fsmresfiles/SpatialT/ref/Brain/Non_Adult/GSE60361



###### Read the count matrix and metadata

library(data.table) ### For fread -- faster than read.table

exprMatrix <- fread("exprMatrix.tsv", sep = "\t") ### row = gene, column = cell
str(exprMatrix)
# Classes ‘data.table’ and 'data.frame': 19972 obs. of 3006 variables:
# $ sample : chr "Tspan12" "Tshz1" "Fnbp1l" "Adamts15" ...
# $ 1772071015-C02: num 0 2 2 0 1 ...
# $ 1772071017-G12: num 0 1 1 0 1 0 0 0 0 0 ...
# $ 1772071017-A05: num 0 0 2.81 0 1 ...
# ......

meta <- fread("meta.tsv", sep = "\t") ### row = cell
str(meta)
# Classes ‘data.table’ and 'data.frame': 3005 obs. of 12 variables:
# $ V1 : chr "1772071015-C02" "1772071017-G12" "1772071017-A05" "1772071014-B06" ...
# $ tissue : chr "sscortex" "sscortex" "sscortex" "sscortex" ...
# $ group : int 1 1 1 1 1 1 1 1 1 1 ...
# $ total mRNA mol : int 21580 21748 31642 32916 21531 24799 31406 20389 23022 24184 ...
# $ well : int 11 95 33 42 48 13 50 66 29 28 ...
# $ sex : int 1 -1 -1 1 1 -1 1 -1 1 1 ...
# $ age : int 21 20 20 21 25 20 25 23 21 21 ...
# $ diameter : num 0 9.56 11.1 11.7 11 11.9 11.3 10.9 12.9 11.2 ...
# $ level1class : chr "interneurons" "interneurons" "interneurons" "interneurons" ...
# $ level2class : chr "Int10" "Int10" "Int6" "Int10" ...
# ......

### Here we see that the columns "level1class" and "level2class" are cell type labels

### We can check if the number of cells in the count matrix and the metatable are the same
### (Assuming that the rows/columns in the metatable / count matrix are unique)
if (ncol(exprMatrix) == nrow(meta) + 1) {
  ### "+ 1" because the first column of exprMatrix is not a cell
  cat("\nDimensions match\n")
  ### All columns except the first one (the gene column) are cells
  matrix_cell_ids <- colnames(exprMatrix)[-1]
  if (identical(sort(matrix_cell_ids), sort(meta$V1))) {
    cat("Cell IDs match\n")
  } else {
    cat("Cell IDs do not match\n")
  }
} else {
  cat("\nDimensions do not match\n")
}

###!!!!!!! Note that you will need to filter exprMatrix and meta if the dimensions and/or cell IDs do not match!

### To perform harmonization (later with another dataset), we can check the unique cell labels in the metatable

table(meta$level1class)
# astrocytes-ependymal endothelial-mural interneurons
# 224 235 290
# microglia oligodendrocytes pyramidal CA1
# 98 820 939
# pyramidal SS
# 399

table(meta$level2class)
# (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt CA2Pyr2 Choroid
# 189 68 61 380 447 49 41 10
# ClauPyr Epend Int1 Int10 Int11 Int12 Int13 Int14
# 5 20 12 21 10 21 15 22
# Int15 Int16 Int2 Int3 Int4 Int5 Int6 Int7
# 18 20 24 10 15 20 22 23
# Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3 Oligo4
# 26 11 17 16 45 98 87 106
# Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL S1PyrL23 S1PyrL4
# 125 359 21 32 33 81 74 26
# S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b SubPyr Vend1 Vend2 Vsmc
# 16 28 39 21 22 32 105 62

### We can also write the above tables to file if you would like to work in a spreadsheet later
write.table(table(meta$level1class), "GSE60361_level1class.txt", quote = FALSE, sep = "\t", row.names = FALSE)
write.table(table(meta$level2class), "GSE60361_level2class.txt", quote = FALSE, sep = "\t", row.names = FALSE)

###!!!!!!! Cell type label harmonization (no generic codes for this)

### You can define a function for replacing labels during harmonization:
### every element of `vector` equal to `old_label` becomes `new_label`
replace_labels <- function(vector, old_label, new_label) {
  replace(vector, vector == old_label, new_label)
}

### For example, if you would like to change the "oligodendrocytes" labels into "OLG" in this dataset
meta$level1class <- replace_labels(meta$level1class, "oligodendrocytes", "OLG")

table(meta$level1class) ### See that the labels have changed
# astrocytes-ependymal endothelial-mural interneurons
# 224 235 290
# microglia OLG pyramidal CA1
# 98 820 939
# pyramidal SS
# 399






###### Save the count matrix and metadata

library(data.table)
library(mltools) ### For the sparsify() function
library(plyr) ### For the join() function

exprMatrix <- fread("exprMatrix.tsv", sep = "\t") ### Read into a data.table
gene_names <- exprMatrix$sample ### Store the gene names
exprMatrix$sample <- NULL ### Remove the gene column

exprMatrix <- sparsify(exprMatrix, sparsifyNAs = TRUE) ### Convert to dgCMatrix format
rownames(exprMatrix) <- gene_names ### The column names should be there, you can check by colnames(exprMatrix)
saveRDS(exprMatrix, "mtx.rds")

### The below assumes that we will use the meta$level2class labels, and they have been harmonized with other datasets
### (note that the metatable is re-read from file here, so label changes made in memory above are not carried over)

### Change the order of cells in the metatable to the column order of the count matrix
meta <- fread("meta.tsv", sep = "\t")
meta <- meta[,c("V1", "level2class")]
colnames(meta) <- c("cell", "label")
meta_sorted <- data.frame(cell = colnames(exprMatrix))
meta_sorted <- join(meta_sorted, meta, by = "cell")

fwrite(meta_sorted, file = "ident.csv")






###### Create a SingleCellExperiment object based on the count matrix and the (harmonized) metatable

library(data.table)
library(SingleCellExperiment)
library(scuttle)

### Read the prepared files
exprMatrix <- readRDS("mtx.rds") ### From the previous step
meta_sorted <- fread("ident.csv", sep = ",") ### From the previous step

meta_sorted$cell <- NULL

### Create SingleCellExperiment object for SingleR and normalize it
ref_data_sce <- SingleCellExperiment(list(counts = exprMatrix), colData = meta_sorted)

ref_data_sce <- logNormCounts(ref_data_sce)
### If your downloaded counts data is already normalized, the above command will fail.
### However, SingleR expects a "logcounts" assay in the input SCE object, so you need to run the following command.
# logcounts(ref_data_sce) <- counts(ref_data_sce)

### Remove cells without a usable label: empty strings and NA (e.g. cells of the count matrix that are missing from the metatable)
has_label <- !is.na(ref_data_sce$label) & ref_data_sce$label != ""
ref_data_sce <- ref_data_sce[, has_label]
### Need to remove the cells with labels like “doublets”, “Not Assigned”, etc., e.g.:
# ref_data_sce <- ref_data_sce[,ref_data_sce$label != "not applicable"]

saveRDS(ref_data_sce, file = "GSE60361.RDS")






######### Test cell type annotation on a dataset (3A)

library(Seurat)
library(SingleR)

ref_data_sce <- readRDS("GSE60361.RDS")
dim(ref_data_sce)
table(ref_data_sce$label)
length(table(ref_data_sce$label))

### Example: human brain Visium dataset
target_p_name <- "PID3"
target_ds_name <- "DS3A"
target_ds_dir <- file.path("/share/fsmresfiles/SpatialT/10x", target_p_name, target_ds_name)
target_ds_metatable <- read.table(paste0(target_ds_dir, "/metatable.tsv"), header = TRUE, stringsAsFactors = FALSE)
target_sample_name <- target_ds_metatable$SampleID[1] # For testing only
target_sample_dir <- file.path(target_ds_dir, target_sample_name)
seurat_object_tn_path <- paste0(target_sample_dir, "/processed/Seurat.RDS") 187 | seurat_object_tn <- readRDS(seurat_object_tn_path) 188 | 189 | ### Perform spot-based cell type annotation and save to Seurat 190 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, labels = ref_data_sce$label, de.method="wilcox") 191 | seurat_object_tn[["cell_type_annotation"]] <- annotation$labels 192 | 193 | ### Perform cluster-based cell type annotation and save to Seurat 194 | cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters 195 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, clusters = cluster_results, labels = ref_data_sce$label, de.method="wilcox") 196 | seurat_object_tn[["cell_type_annotation_clusters"]] <- annotation$labels[cluster_results] 197 | 198 | ### Visualize annotated cell types 199 | pdf("~/stbase/DS3A_test.pdf") ### Change to your own save directory/name 200 | print(SpatialDimPlot(seurat_object_tn)) 201 | print(DimPlot(seurat_object_tn, reduction = "umap")) 202 | print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation")) 203 | print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation")) 204 | print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation_clusters")) 205 | print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation_clusters")) 206 | dev.off() 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial transcriptOmics Analysis Resource 2 | 3 | This repository contains the data curation, processing, and analysis scripts used by the article "SOAR elucidates disease mechanisms and empowers drug discovery through spatial transcriptomics" [[bioRxiv preprint]](https://www.biorxiv.org/content/10.1101/2022.04.17.488596v2) | 
[[Website]](https://soar.fsm.northwestern.edu/). 4 | 5 | ## Data curation 6 | 7 | To query the [Gene Expression Omnibus (GEO)](https://www.ncbi.nlm.nih.gov/geo/) for potential human and mouse spatial transcriptomics datasets, please run [the Python script](https://github.com/luoyuanlab/SOAR/tree/main/data_curation/geo-query.py) using different keywords. 8 | 9 | * `python3 geo-query.py` 10 | 11 | The retrieved GDS list with annotated meta-information will be stored in `./<%Y%m%d>/all.tsv`. 12 | 13 | ## Data processing 14 | 15 | The data processing scripts are available under [`data_processing/`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing). The scripts automatically perform spot and gene quality control, data transformation, normalization, and dimensionality reduction. 16 | 17 | **10x Visium data** in standard format can be processed using [`process_visium_standard.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing/process_visium_standard.R). The script assumes that the directory contains one of the options below: 18 | 19 | 1. (a) `filtered_feature_bc_matrix.h5` or [MEX files](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices) and (b) the image data in a subdirectory called `spatial` 20 | 2. A Visium Seurat object with `data@images` properly added 21 | 22 | Please note that 10x Visium data with only counts and coordinates and no `spatial/` folder data should be processed using the non-Visium scripts. 23 | 24 | **Other types of spatial transcriptomics data** transformed into a standard format (`counts.csv` and `coordinates.csv`, please see below for the guidelines) can be processed using [`process_non_visium_standard.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing/process_non_visium_standard.R). The script can also be used on Visium data with no h5 + spatial data provided for public download.
25 | 26 | `counts.csv` 27 | 28 | * Comma-delimited 29 | * Header: `gene,<barcode_1>,...,<barcode_n>` 30 | * Each row = one gene 31 | * Gene symbols should be used (not Ensembl IDs, etc.) 32 | 33 | `coordinates.csv` 34 | 35 | * Comma-delimited 36 | * Header: `barcode,row,col` 37 | * The example file has more columns but only these three columns are required 38 | * Use the spot coordinates (row, col) instead of pixel coordinates (imagerow, imagecol in the example file) if possible 39 | * Each row = one spot 40 | 41 | The barcode column of `coordinates.csv` should be exactly the same as the `counts.csv` header (after removing "gene"), i.e. the spot IDs should match. 42 | 43 | ## Data analysis 44 | 45 | ### Overview 46 | 47 | The overall flow of data analysis is as below. 48 | 49 | 1. Perform [spatial clustering](#spatial-clustering) 50 | 2. Perform whole-tissue [spatial variability analysis](#spatial-variability-analysis) 51 | 3. Check if the spatial transcriptomics technology is at single-cell level 52 | * If so (e.g. MERFISH), perform [cell type annotation](#cell-type-annotation) 53 | * If not (e.g. 10x Visium), perform [cell type deconvolution](#cell-type-deconvolution) 54 | 4. Perform cell-type-specific [spatial variability analysis](#spatial-variability-analysis) 55 | 5. Perform [cell-cell interaction analysis](#cell-cell-interaction-analysis) 56 | 57 | ### Spatial clustering 58 | 59 | The scripts for performing spatial clustering are stored in [`data_analysis/spatial_clustering/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_clustering). Please refer to the [README](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_clustering/README.md) for the details.
60 | 61 | ### Cell typing 62 | 63 | In the [`data_analysis/cell_typing/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing) folder, scripts are available for performing [cellular deconvolution](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/deconvolution) (spatial transcriptomics technologies with multiple cells per capture location, e.g. 10x Visium) and [cell type annotation](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation) (single-cell-level spatial transcriptomics technologies, e.g. MERFISH). 64 | 65 | #### scRNA-seq reference identification and processing 66 | 67 | To identify scRNA-seq references for cell typing, users may utilize the [GEO query helper script](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/reference/geo-download-scRNA-seq.py). [`ref_data_processing_example.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/reference/ref_data_processing_example.R) is an example script for processing the downloaded scRNA-seq data. Please note that cell quality control needs to be performed case-by-case, i.e. the thresholds should be chosen manually based on the QC plots. 68 | 69 | #### Cell type annotation 70 | 71 | [`annotation_example.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation/annotation_example.R) is an example script for performing cell type annotation on single-cell-level spatial transcriptomics datasets (e.g. MERFISH) using scRNA-seq reference datasets and SingleR. 72 | 73 |
Heuristics-guided cell type annotation for brain datasets (click me) 74 | 75 | [`runBrainCellTypeAnnotation-CluHeu.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation/runBrainCellTypeAnnotation-CluHeu.R) 76 | 77 | * Usage: `./runBrainCellTypeAnnotation-CluHeu.R > runBrainCellTypeAnnotation-CluHeu.log` 78 | * Description 79 | * This script automatically annotates the cell types of brain Visium datasets using a cluster-based approach guided by some heuristics. 80 | * Note that: 81 | * This script requires processed mouse and human scRNA-seq references as the input, and the file paths are currently hard-wired: 82 | * `/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_human_ctx_smart-seq` 83 | * `aibs_human_ctx_smart-seq_neuronal.RDS` 84 | * `aibs_human_ctx_smart-seq_non_neuronal.RDS` 85 | * `supp.RData` 86 | * `/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_mouse_ctx-hpf_10x` 87 | * `aibs_mouse_ctx-hpf_10x_neuronal.RDS` 88 | * `aibs_mouse_ctx-hpf_10x_non_neuronal.RDS` 89 | * `supp.RData` 90 | * This script also reads a table listing the DSID, species, and technology (`brain_DSID_list.txt`) and loops over its rows. Line 52 uses a hard-wired path to this file. 91 | * The annotations follow the [Common Cell Type Nomenclature (CCN)](https://portal.brain-map.org/explore/classes/nomenclature). `seurat_object[["cell_type_annotation"]]` contains the annotated subclasses, and `seurat_object[["cell_type_annotation_class"]]` contains the annotated classes (i.e., glutamatergic, GABAergic, or non_neuronal). 92 |
93 | 94 | #### Cell type deconvolution 95 | 96 | [Analysis scripts](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/deconvolution) are available for the cell type deconvolution of spatial transcriptomics datasets using scRNA-seq reference datasets and BayesPrism. 97 | 98 | Steps for running deconvolution 99 | 100 | 1. Sample script for processing scRNA-seq reference: `process_reference_example.R` 101 | 2. Deconvolution script: `quest_deconvolution_jobarray.R` 102 | 3. Script for preparing deconvolution results for subsequent analysis (only run this after the deconvolution is complete): `create_input_files.R` 103 | 104 | ### Spatial variability analysis 105 | 106 | The scripts for the spatial variability (SV) analysis of spatial transcriptomics data are stored in [`data_analysis/spatial_variability/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability). 107 | 108 | To perform whole-tissue SV analysis, use the script [`quest_SpatialDE_jobarray.py`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability/quest_SpatialDE_jobarray.py). To run the analysis, use the command `python quest_SpatialDE_jobarray.py $sample_directory`. 109 | 110 | To perform cell-type-specific SV analysis, use the script [`quest_SpatialDE_ct_specific.py`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability/quest_SpatialDE_ct_specific.py). To run the analysis, use the command `python quest_SpatialDE_ct_specific.py $sample_directory $cell_type`. 111 | 112 | ### Cell-cell interaction analysis 113 | 114 | To perform neighborhood-based cell-cell interaction analysis, use the script [`adj-analysis.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_cell_interaction/neighborhood-based/adj-analysis.R). To run the analysis, use the command `./adj-analysis.R $sample_directory`. 
115 | 116 | To perform distance-based cell-cell interaction analysis, run the bash script [`cci-analysis-COMMOT-DGE.sh`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE.sh) to call different analysis scripts in the pipeline. 117 | 118 | ### Drug screen 119 | 120 | Scripts for drug discovery analysis are stored under [`data_analysis/drug_discovery`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/drug_discovery). 121 | 122 | The four types of analysis are: 123 | 124 | 1. Differential gene expression analysis. [`Scripts for deconvoluted and annotated samples`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/drug_discovery/DGE). 125 | 126 | 2. Protein-protein interaction (PPI) network for spatially variable, differentially expressed (DE-SV) genes by cell type. [`Script for generating PPI network`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py). 127 | 128 | 3. CMAP L1000 drug enrichment (compounds with top overall positive/negative enrichment score on SV-DE gene sets of a cell type). [`Script for CMAP drug enrichment analysis`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py). 129 | 130 | 4. CMAP L1000 drug perturbation (top gene targets perturbed by the top positively/negatively enriched compounds). [`Script for CMAP drug perturbation analysis`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py) (contained in the same script as above).
131 | 132 | -------------------------------------------------------------------------------- /data_analysis/cell_typing/annotation/runBrainCellTypeAnnotation-CluHeu.R: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

### Author: Yiming Li
###
### Description: This script automatically annotates the cell types of brain spatial transcriptomics datasets using a cluster-based approach guided by some heuristics
###   * It reads a table listing the DSID, technology, and species (brain_DSID_list.txt) and loops over its rows
###   * For every sample, clusters are first classified as neuronal vs non-neuronal (AUCell on marker gene lists),
###     then the subclass is assigned per cluster with SingleR against the matching scRNA-seq reference
###   * The annotated Seurat object overwrites <sample>/processed/Seurat.RDS; plots go to <sample>/analysis/annotation
###
### Usage: ./runBrainCellTypeAnnotation-CluHeu.R
###
### Example: To annotate all the brain datasets in our database:
# chmod 755 runBrainCellTypeAnnotation-CluHeu.R
# ./runBrainCellTypeAnnotation-CluHeu.R > runBrainCellTypeAnnotation-CluHeu.log



library(SingleCellExperiment)
library(Seurat)
library(SingleR)
library(dplyr)
library(plyr)
library(ggplot2)
library(data.table)
library(AUCell)



### Read the processed scRNA-seq references used for brain cell type annotation (paths are hard-wired)
human_ref_dir <- "/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_human_ctx_smart-seq/"
ref_data_sce_neuronal_human <- readRDS(paste0(human_ref_dir, "aibs_human_ctx_smart-seq_neuronal.RDS"))
ref_data_sce_non_neuronal_human <- readRDS(paste0(human_ref_dir, "aibs_human_ctx_smart-seq_non_neuronal.RDS"))

mouse_ref_dir <- "/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_mouse_ctx-hpf_10x/"
ref_data_sce_neuronal_mouse <- readRDS(paste0(mouse_ref_dir, "aibs_mouse_ctx-hpf_10x_neuronal.RDS"))
ref_data_sce_non_neuronal_mouse <- readRDS(paste0(mouse_ref_dir, "aibs_mouse_ctx-hpf_10x_non_neuronal.RDS"))

### Define functions

## Run PCA, neighbour finding, clustering, and UMAP on the "SCT" assay of a Seurat object.
##   data: a Seurat object with an "SCT" assay
## Returns the Seurat object with the "pca"/"umap" reductions and cluster assignments added.
clustering_seurat <- function(data) {
  data <- RunPCA(data, assay = "SCT", verbose = FALSE)
  data <- FindNeighbors(data, reduction = "pca", dims = 1:30, verbose = FALSE)
  ## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells
  data <- FindClusters(data, resolution = 1.2, verbose = FALSE)
  data <- RunUMAP(data, reduction = "pca", dims = 1:30, verbose = FALSE)
  return(data)
}



##################



target_list <- fread("~/stbase/brain_DSID_list.txt", sep = "\t")

for (ds_idx in seq_len(nrow(target_list))) {
  # ds_idx <- 1 # For testing only
  target_ds_name <- as.character(target_list$DSID[ds_idx])
  target_species <- as.character(target_list$Species[ds_idx])
  target_tech <- as.character(target_list$Technology[ds_idx])
  ### Do not differentiate between adult and non-adult datasets
  ### e.g. "DS1D" -> "PID1" (drop the "DS" prefix and the trailing dataset letter)
  target_p_name <- paste0("PID", substr(target_ds_name, start = 3, stop = nchar(target_ds_name) - 1))

  target_ds_dir <- paste(c("/share/fsmresfiles/SpatialT", target_tech, target_p_name, target_ds_name), collapse = "/")
  if (!dir.exists(target_ds_dir)) {
    cat(paste0(target_ds_dir, " does not exist"))
    next
  }
  metatable_path <- paste0(target_ds_dir, "/metatable.tsv")
  if (!file.exists(metatable_path)) {
    cat(paste0(target_ds_dir, "/metatable.tsv does not exist"))
    next
  }
  target_ds_metatable <- read.table(metatable_path, header = TRUE, stringsAsFactors = FALSE, sep = "\t")

  cat(paste(c("\n\n>>>>>>>> ", target_ds_dir, " <<<<<<<<"), collapse = ""))

  ### Select the references matching the species once per dataset
  if (target_species == "Human") {
    ref_dir <- human_ref_dir
    ref_neuronal <- ref_data_sce_neuronal_human
    ref_non_neuronal <- ref_data_sce_non_neuronal_human
  } else {
    ref_dir <- mouse_ref_dir
    ref_neuronal <- ref_data_sce_neuronal_mouse
    ref_non_neuronal <- ref_data_sce_non_neuronal_mouse
  }

  for (target_sample_name in target_ds_metatable$SampleID) {
    # target_sample_name <- target_ds_metatable$SampleID[1] # For testing only
    target_sample_dir <- paste(c(target_ds_dir, "/", target_sample_name), collapse = "")

    ### Needed since some of the datasets were incorrectly prepared
    ### (the sample folder itself must be checked here; the dataset folder was already verified above)
    if (!dir.exists(target_sample_dir)) {
      cat(paste0(target_sample_dir, " does not exist"))
      next
    }

    cat(paste(c("\n\n>>>>>>>> ", target_sample_dir, " <<<<<<<<"), collapse = ""))

    ### Annotation results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/annotation
    output_dir <- paste0(target_sample_dir, "/analysis/annotation")
    if (!dir.exists(output_dir)) {
      dir.create(output_dir, recursive = TRUE)
    }

    ### Read in the QC-ed + transformed + processed Seurat object (from process_visium_standard.R)
    seurat_object_tn_path <- paste0(target_sample_dir, "/processed/Seurat.RDS")
    seurat_object_tn <- readRDS(seurat_object_tn_path)
    cat("\n\n### Processed Seurat object read.")
    meta_columns <- colnames(seurat_object_tn@meta.data)
    if ("cell_type_annotation_class" %in% meta_columns && "cell_type_annotation" %in% meta_columns) {
      cat("\n# Seurat object already has cell type annotation results saved, skipping this sample.")
      next
    }

    ### Check if the Seurat object is after SCT
    if (!"SCT" %in% names(seurat_object_tn)) {
      cat("\n# [Warning] Seurat object does not have an \"SCT\" assay, renaming Spatial to SCT.")
      ### Some MERFISH files prepared by Yawei do not have an SCT assay
      ### However, they are already normalized so we can rename the assay to SCT for ease of subsequent analysis
      ### A backup of the untouched object is kept as Seurat_bk.RDS before overwriting
      file.copy(seurat_object_tn_path, paste0(target_sample_dir, "/processed/Seurat_bk.RDS"), overwrite = TRUE)
      seurat_object_tn <- RenameAssays(object = seurat_object_tn, Spatial = "SCT")
      saveRDS(seurat_object_tn, file = seurat_object_tn_path)
    }

    ### Check if the ST data underwent dimensionality reduction and clustering
    if (!"umap" %in% names(seurat_object_tn)) {
      cat("\n# [Warning] Seurat object does not have UMAP dimensional reduction calculated, skipping this sample.")
      next
    }

    cat("\n\n### Start annotation")
    coords <- seurat_object_tn@meta.data
    coords <- tibble::rownames_to_column(coords, "spot")
    coords$seurat_clusters <- as.character(coords$seurat_clusters)

    ### Read marker gene lists and cell class/subclass mappings
    ### NOTE(review): supp.RData is expected to define `gene_lists` and `cell_type_names`, which are used
    ### below but not created anywhere else in this script - confirm against the reference preparation code.
    ### It is reloaded for every sample because `gene_lists` may be subsetted below.
    load(paste0(ref_dir, "supp.RData"))

    ### Use AUCell to classify clusters into neuronal vs non-neuronal
    cells_rankings <- AUCell_buildRankings(GetAssayData(seurat_object_tn, assay = "SCT"), verbose = FALSE)
    ### The AUC value represents the fraction of genes, within the top 20% genes in the ranking, that are included in the signature
    auc_max_rank <- nrow(cells_rankings) * 0.2
    runAUCell <- tryCatch(
      AUCell_calcAUC(gene_lists, cells_rankings, aucMaxRank = auc_max_rank, verbose = FALSE),
      error = function(e) e
    )
    if (inherits(runAUCell, "error")) {
      cat(paste(c("\n# AUCell failed, ST dataset may have a small number of genes. Creating a subsetted gene list."), collapse = ""))
      ### Retry with only the marker genes present in this dataset, dropping signatures that become empty
      data_genes <- rownames(seurat_object_tn)
      gene_lists <- lapply(gene_lists, function(gene_list) gene_list[gene_list %in% data_genes])
      gene_lists <- gene_lists[lengths(gene_lists) > 0]
      cells_AUC <- AUCell_calcAUC(gene_lists, cells_rankings, aucMaxRank = auc_max_rank, verbose = FALSE)
    } else {
      cells_AUC <- runAUCell
    }

    ### For each cluster, pick the signature with the largest summed AUC over the cluster's spots
    clusters <- sort(unique(coords$seurat_clusters))
    cluster_class <- data.frame(seurat_clusters = clusters, subclass = NA_character_, stringsAsFactors = FALSE)
    for (cluster_idx in seq_along(clusters)) {
      spots <- coords[coords$seurat_clusters == clusters[cluster_idx], "spot"]
      tmpmat <- cells_AUC[, colnames(cells_AUC) %in% spots]
      tmp <- rowSums(tmpmat@assays@data@listData$AUC)
      ### which.max() returns a single winner even when several signatures tie for the maximum
      cluster_class$subclass[cluster_idx] <- names(tmp)[which.max(tmp)]
    }
    cluster_class$class <- cell_type_names[cluster_class$subclass]
    coords <- plyr::join(coords, cluster_class, by = "seurat_clusters")
    ### Both neuronal classes are annotated against the same (neuronal) reference
    coords$class[coords$class == "glutamatergic"] <- "neuronal"
    coords$class[coords$class == "GABAergic"] <- "neuronal"
    coords <- coords[, c("spot", "class")]

    ### Use SingleR to annotate the subclasses (one label per cluster, for both references)
    cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters
    sct_data <- GetAssayData(seurat_object_tn, assay = "SCT")
    annotation_neuronal <- SingleR(test = sct_data, ref = ref_neuronal, clusters = cluster_results, labels = ref_neuronal$label, de.method = "wilcox")
    annotation_non_neuronal <- SingleR(test = sct_data, ref = ref_non_neuronal, clusters = cluster_results, labels = ref_non_neuronal$label, de.method = "wilcox")

    ### Process SingleR annotation results
    ### NOTE(review): `labels[cluster_results]` indexes by the factor's integer codes, i.e. it assumes the
    ### SingleR result rows follow the level order of `cluster_results` - confirm.
    tmpdf <- data.frame(spot = colnames(seurat_object_tn), annot_neu = annotation_neuronal$labels[cluster_results])
    coords <- plyr::join(coords, tmpdf, by = "spot")
    tmpdf2 <- data.frame(spot = colnames(seurat_object_tn), annot_non = annotation_non_neuronal$labels[cluster_results])
    coords <- plyr::join(coords, tmpdf2, by = "spot")
    coords$cell_type_annotation <- ifelse(coords$class == "neuronal", coords$annot_neu, coords$annot_non)
    coords$cell_type_annotation_class <- cell_type_names[coords$cell_type_annotation]

    ### Save annotation results to Seurat object
    seurat_object_tn[["cell_type_annotation"]] <- coords$cell_type_annotation
    seurat_object_tn[["cell_type_annotation_class"]] <- coords$cell_type_annotation_class
    saveRDS(seurat_object_tn, file = seurat_object_tn_path)
    cat("\n\n### Seurat.RDS overwritten with annotation results.")

    ### Visualize annotated cell types
    pdf(paste0(output_dir, "/cell_type_annotation.pdf"))
    print(SpatialDimPlot(seurat_object_tn))
    print(DimPlot(seurat_object_tn, reduction = "umap"))
    print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation_class"))
    print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation_class"))
    print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation"))
    print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation"))
    dev.off()
    cat("\n\n### Visualization of annotated cell types completed.")
  }
}

-------------------------------------------------------------------------------- /data_processing/process_visium_standard.R: -------------------------------------------------------------------------------- 1 | ### The purpose of this script is to process the 10x Visium data stored in standard structure (HDF5 + spatial/). 2 | ### The dataset IDs to be processed are read from a preexisting file DSID_list_10x_standard.txt in the shared folder. 3 | ### This script will loop through the datasets, read, transform, and process the datasets, and save the processed data as well as a meta table for each dataset.
###
### NOTE: the dataset to process is taken from the first command-line argument, given without the "DS" prefix (e.g. 1A for DS1A).
###
### Usage: Rscript --no-save process_visium_standard.R <dataset ID without "DS"> > process_visium_standard.log
###
### Author: Saya Dennis; Yiming Li; Sanaz Ghotbaldini

library(stringr)
library(dplyr)
library(Seurat)
library(patchwork)
library(data.table)
# library(SeuratDisk)

## Turn a gene x spot matrix (dense or sparse) into a dense spot x gene matrix,
## keeping spot IDs as row names and gene names as column names.
to_spot_by_gene <- function(counts) {
  t(as.matrix(counts))
}

dn <- "/projects/b1131/SpatialT/10x"
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1) {
  stop("Usage: Rscript --no-save process_visium_standard.R <dataset ID without \"DS\">", call. = FALSE)
}
dsid <- args[1]

#### Get PID
dsid <- paste0("DS", dsid)
pid <- paste0("PID", substr(dsid, start = 3, stop = nchar(dsid) - 1)) # e.g. "PID1"
#### Create empty data frame for meta table
meta <- data.frame(DSID = character(0), SampleID = character(0), Nspots = integer(0), Nspots_postQC = integer(0), Ngenes = integer(0), Ngenes_postQC = integer(0), Condition = character(0))
#### Set dataset directory and get a list of samples
dsdir <- paste0(dn, "/", pid, "/", dsid) # e.g. "/projects/b1131/SpatialT/10x/PID1/DS1A"
sampleids <- dir(dsdir, pattern = dsid) # e.g. list of elements like "DS1A.1"

cat(paste(c("\n>>>>>>>> Dataset [", dsdir, "] started <<<<<<<<\n"), collapse = ""))
for (sampleid in sampleids) {
  sample_dir <- paste0(dsdir, "/", sampleid)
  orig_dir <- paste0(sample_dir, "/original")
  spatial_dir <- paste0(orig_dir, "/spatial")
  processed_dir <- paste0(sample_dir, "/processed")
  has_lowres <- file.exists(paste0(spatial_dir, "/tissue_lowres_image.png"))

  #### Read data (priority: HDF5 matrix > prepared Seurat object > MEX files)
  if (file.exists(paste0(orig_dir, "/filtered_feature_bc_matrix.h5"))) {
    if (has_lowres) {
      data <- Load10X_Spatial(orig_dir, assay = "Spatial")
    } else {
      cat(paste(c("### Sample: ", sampleid, " [tissue_lowres_image.png] not found, using [tissue_hires_image.png]\n"), collapse = ""))
      img <- Read10X_Image(spatial_dir, image.name = "tissue_hires_image.png")
      data <- Load10X_Spatial(orig_dir, image = img, assay = "Spatial")
      ## The hi-res image stands in for the low-res one, so its scale factor must be used wherever "lowres" is read
      data@images$slice1@scale.factors$lowres <- data@images$slice1@scale.factors$hires
    }
  } else if (file.exists(paste0(orig_dir, "/Seurat.RDS"))) {
    cat(paste(c("### Sample: ", sampleid, " [filtered_feature_bc_matrix.h5] not found, using Seurat object\n"), collapse = ""))
    data <- readRDS(paste0(orig_dir, "/Seurat.RDS"))
    ## Downstream code looks the image up as "slice1"; rename the (first) image accordingly
    if (length(data@images) > 0) {
      names(data@images)[1] <- "slice1"
    }
  } else {
    cat(paste(c("### Sample: ", sampleid, " [filtered_feature_bc_matrix.h5] not found, using MEX files\n"), collapse = ""))
    if (!has_lowres) {
      cat(paste(c("### Sample: ", sampleid, " [tissue_lowres_image.png] not found, using [tissue_hires_image.png]\n"), collapse = ""))
    }
    counts <- Read10X(orig_dir)
    data <- CreateSeuratObject(counts, assay = "Spatial")
    if (has_lowres) {
      img <- Read10X_Image(spatial_dir)
    } else {
      img <- Read10X_Image(spatial_dir, image.name = "tissue_hires_image.png")
    }
    img <- img[Cells(data)]
    DefaultAssay(img) <- DefaultAssay(data)
    data[["slice1"]] <- img
    if (!has_lowres) {
      data@images$slice1@scale.factors$lowres <- data@images$slice1@scale.factors$hires
    }
  }

  nspots <- ncol(data)
  ngenes <- nrow(data)

  #### Spot QC
  ## Step 1. Remove the spots with total UMI count < 500 / the total number of genes < 500 / >= 25% mitochondrial reads
  ## Mitochondrial genes are prefixed "MT-" in human and "mt-" in mouse, so both spellings are matched
  data[["percent_mt"]] <- PercentageFeatureSet(data, pattern = "^[Mm][Tt]-")
  data <- data[, data$nCount_Spatial >= 500 & data$nFeature_Spatial >= 500 & data$percent_mt < 25]
  ## Step 2.
  ## Remove the spots with total UMI count < median(total UMI count) - 3 * SD(total UMI count).
  ## Remove the spots with total number of genes < median(total number of genes) - 3 * SD(total number of genes)
  min_count <- median(data$nCount_Spatial) - 3 * sd(data$nCount_Spatial)
  min_feature <- median(data$nFeature_Spatial) - 3 * sd(data$nFeature_Spatial)
  data <- data[, data$nCount_Spatial >= min_count & data$nFeature_Spatial >= min_feature]

  #### Gene QC: keep the genes detected in at least 5 spots
  ## Count on the matrix as returned by Seurat (typically sparse) instead of converting it to a dense data frame
  raw_counts <- GetAssayData(object = data, assay = "Spatial", slot = "counts")
  n_spots_per_gene <- Matrix::rowSums(raw_counts > 0)
  data <- data[n_spots_per_gene >= 5, ]

  nspots_qc <- ncol(data)
  ngenes_qc <- nrow(data)

  #### Exclude sample if there are < 50 spots after QC
  if (nspots_qc < 50) {
    cat(paste(c("### [", sampleid, "] has < 50 spots after QC, excluded\n"), collapse = ""))
    next
  }

  #### Append to the dataset metatable
  meta <- rbind(meta, data.frame(DSID = dsid, SampleID = sampleid, Nspots = nspots, Nspots_postQC = nspots_qc, Ngenes = ngenes, Ngenes_postQC = ngenes_qc, Condition = NA))

  #### Transform and process data
  data <- SCTransform(data, assay = "Spatial", verbose = FALSE)

  ### Dimensionality reduction and clustering
  data <- RunPCA(data, assay = "SCT", verbose = FALSE)
  data <- FindNeighbors(data, reduction = "pca", dims = 1:30, verbose = FALSE)
  ## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells
  data <- FindClusters(data, resolution = 1.2, verbose = FALSE)
  data <- RunUMAP(data, reduction = "pca", dims = 1:30, verbose = FALSE)

  ## "processed" folder should already exist for all datasets but just in case
  if (!dir.exists(paste0(processed_dir, "/"))) {
    dir.create(paste0(processed_dir, "/"))
  }

  #### Create spot IDs following R's column name requirements ("sp1", "sp2", ...)
  seurat_object <- data
  spotmeta <- seurat_object@meta.data
  spotmeta$new_spot_id <- paste0("sp", seq_len(nrow(spotmeta)))
  seurat_object[["new_spot_id"]] <- spotmeta$new_spot_id

  ### Save processed Seurat object (written once, already carrying new_spot_id)
  saveRDS(seurat_object, paste0(processed_dir, "/Seurat.RDS"))

  ## Original barcode -> new spot ID
  spot_id_mapping <- spotmeta$new_spot_id
  names(spot_id_mapping) <- rownames(spotmeta)

  #### Prepare data for deconvolution (raw counts, spots x genes)
  if ("Spatial" %in% names(seurat_object@assays)) {
    counts <- GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts")
  } else {
    ### MERFISH datasets
    counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
  }
  saveRDS(to_spot_by_gene(counts), paste0(processed_dir, "/bk.dat.RDS"))
  gc()

  #### Retrieve QC-ed counts and coordinates

  ## Counts
  counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
  counts <- as.matrix(counts)
  colnames(counts) <- as.character(spot_id_mapping[colnames(counts)])
  counts_df <- tibble::rownames_to_column(data.frame(counts), "gene")

  ## Coordinates
  if ("slice1" %in% names(seurat_object@images)) {
    location <- seurat_object@images$slice1@coordinates ### Visium
    location <- location[, c("col", "row")] ### Use the spot coordinates
  } else {
    location <- seurat_object@images$image@coordinates ### Others
    if ("x" %in% colnames(location)) {
      location <- location[, c("x", "y")] ### Use the spot coordinates
    } else {
      ### Some prepared MERFISH datasets did not follow the naming standard
      location <- location[, c("xcoord", "ycoord")] ### Use the spot coordinates
      colnames(location) <- c("x", "y")
    }
  }
  location$barcode <- rownames(location)
  colnames(location) <- c("x", "y", "barcode")
  location$barcode <- spot_id_mapping[location$barcode]

  ## Sometimes (e.g. /share/fsmresfiles/SpatialT/10x/PID153/DS153A/DS153A.1), the number of spots in seurat_object@images is different from ncol(seurat_object)
  ## Not sure why this is the case
  ## Keep only the spots present in both tables ("gene" is not a spot ID, so the intersection drops it)
  keep_spots <- intersect(colnames(counts_df), location$barcode)
  counts_df <- counts_df[, c("gene", keep_spots)]
  rownames(location) <- location$barcode
  location <- location[keep_spots, ]

  #### Write counts, coordinates, and meta_spots to file
  write.table(counts_df, file = paste0(processed_dir, "/counts.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
  write.table(location, file = paste0(processed_dir, "/coordinates.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
  saveRDS(list("counts" = counts_df, "coordinates" = location), file = paste0(processed_dir, "/data_frames.RDS"))

  #### Prepare data for deconvolution (relative counts)
  if ("Spatial" %in% names(seurat_object@assays)) {
    DefaultAssay(seurat_object) <- "Spatial"
    seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
    counts <- GetAssayData(object = seurat_object, assay = "Spatial")
  } else {
    ### MERFISH datasets
    seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
    counts <- GetAssayData(object = seurat_object, assay = "SCT")
  }
  saveRDS(to_spot_by_gene(counts), paste0(processed_dir, "/bk.dat.RC.RDS"))
  gc()

  cat(paste(c("### [", sampleid, "] Finished.\n"), collapse = ""))
}

#### Write the automatically generated meta table, then merge in the curated one (if any)
write.table(meta, file = paste0(dsdir, "/metatable_auto.tsv"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
if (file.exists(paste0(dsdir, "/metatable_orig.tsv"))) {
  ## Drop the placeholder Condition column before merging with the original meta table
  meta$Condition <- NULL
  meta_orig <- fread(paste0(dsdir, "/metatable_orig.tsv"))
  meta <- merge(meta, meta_orig, by = c("DSID", "SampleID"))
}
write.table(meta, file = paste0(dsdir, "/metatable.tsv"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

cat(paste(c(">>>>>>>> Dataset: ", dsid, " completed <<<<<<<<\n"), collapse = ""))

-------------------------------------------------------------------------------- /data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py: -------------------------------------------------------------------------------- 1 | import cmapPy 2 | import cmapPy.pandasGEXpress.parse 3 | import json 4 | import numpy as np 5 | from collections import Counter 6 | np.random.seed(1024) 7 | import pandas as pd 8 | import statsmodels 9 | from statsmodels.stats.multitest import fdrcorrection 10 | import os 11 | from sklearn.preprocessing import MinMaxScaler 12 | scaler = MinMaxScaler(feature_range = (-2,2)) 13 | import math 14 | import sys 15 | 16 | dsid = sys.argv[1] 17 | sampleid = sys.argv[2] 18 | # for single cell samples 19 | #deg_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_anno_SVG' 20 | # for deconv samples 21 | deg_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_dec_SVG' 22 | cell_direc = [i for i in os.listdir(deg_dir)] 23 | print('STARTING:', dsid, ' ', sampleid) 24 | 25 | # output dir 26 | out_dir = '/projects/b1131/SpatialT/drug-target-cmap2-svg/' 27 | enrich_dir = out_dir+dsid+'/'+sampleid+'/GSEA/' 28 | top_enrich_dir = out_dir+dsid+'/'+sampleid+'/Enrichment/' 29 | perturb_dir = 
out_dir+dsid+'/'+sampleid+'/Perturbation/'

##### gene processing #####
# hgnc genes: approved (and previous) gene symbol -> NCBI gene ID
hgnc_v2 = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/HGNC091923.txt', sep='\t')
hgnc_dict = dict(zip(hgnc_v2['Approved symbol'], hgnc_v2['NCBI gene ID']))
hgnc_v2_prev = hgnc_v2[hgnc_v2['Previous symbol'].notnull()]
hgnc_dict_prev = dict(zip(hgnc_v2_prev['Previous symbol'], hgnc_v2_prev['NCBI gene ID']))

#cmap genes
gene_info = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/geneinfo_beta.txt', sep='\t')

# mapping gene indices to real names
# .copy() makes this an independent frame, so the column assignment below does not
# write into a slice of gene_info (pandas chained-assignment warning)
gene_cmap2 = gene_info[['gene_id', 'gene_symbol']].copy()
gene_cmap2.columns = ['num', 'name']
# cast every non-missing gene ID to int; missing IDs are passed through unchanged
lst = [i if pd.isna(i) else int(i) for i in gene_cmap2['num'].tolist()]
gene_cmap2['num'] = lst
gene_dict_cmap2 = gene_cmap2.set_index('num').to_dict()['name']   # gene ID -> symbol
gene_dict_cmap2_rev = {v: k for k, v in gene_dict_cmap2.items()}  # symbol -> gene ID
print('Genes cleaned for mapping')


##### cmap2 enrichment setup #####
# rank file of 6hr perturbations (from z scores)- gene x compound effects
cmap2_cmap1_perturb_rank_dur = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/cmap2_6hr_pert_rank.csv', engine='c')
cmap2_cmap1_perturb_rank_dur = cmap2_cmap1_perturb_rank_dur.set_index('Unnamed: 0')
cmap2_cmap1_perturb_rank_dur_arry = np.array(cmap2_cmap1_perturb_rank_dur)

# gene and compound name setup for gsea
# rows are keyed by the gene ID rendered as a string, columns by the compound name
# NOTE(review): if the 'num' column ends up float-typed (e.g. because of missing IDs) these keys
# would look like '123.0' rather than '123' - confirm against geneinfo_beta.txt
ROWS_dur = [str(gene_dict_cmap2_rev[i]) for i in cmap2_cmap1_perturb_rank_dur.index]
COLS_dur = cmap2_cmap1_perturb_rank_dur.columns.tolist()
ROW2IDX2_val_dur = list(range(0, len(ROWS_dur)))
COL2IDX2_val_dur = list(range(0, len(COLS_dur)))
ROW2IDX_key_dur = ROWS_dur
COL2IDX_key_dur = COLS_dur
# name -> positional index into the rank array
ROW2IDX_dur = dict(zip(ROW2IDX_key_dur, ROW2IDX2_val_dur))
COL2IDX_dur = dict(zip(COL2IDX_key_dur, COL2IDX2_val_dur))
N_GENES, N_PERTURBATIONS = cmap2_cmap1_perturb_rank_dur_arry.shape
N_REPEATS = 1000
# argsort of uniform noise along axis 0 makes every column a random permutation
# of 1..N_GENES (gene x repeat); used as the null background for the enrichment score
RANDOM_RANK = np.random.rand(N_GENES, N_REPEATS).argsort(axis=0) + 1


##### cmap2 perturbation setup #####
# cmap2 perturbation zscore, labelled with the same rows/columns as the rank table
z_score = np.load('/projects/b1131/SpatialT/cmap_ppi_database/cmap2_6hr_pert_z.npy')
z_score_df = pd.DataFrame(
    z_score,
    index=cmap2_cmap1_perturb_rank_dur.index,
    columns=cmap2_cmap1_perturb_rank_dur.columns,
)
print('cmap data read in')


##### format dge results ######
def get_up_down_deg(df):
    """Split a DGE table into up- and down-regulated rows.

    A row is kept when ``qval`` < 0.05 and ``stat`` is > 0.5 (up) or < -0.5 (down).
    Returns the pair ``(df_up, df_down)``.
    """
    is_significant = df['qval'] < 0.05
    goes_up = df['stat'] > 0.5
    goes_down = df['stat'] < -0.5
    return df[goes_up & is_significant], df[goes_down & is_significant]


# transform from genename to number
def hgnc_gene_up_down(up_df, down_df):
    """Translate the ``gene`` column of both tables into NCBI gene id strings.

    Each symbol is looked up in ``hgnc_dict`` first and ``hgnc_dict_prev`` second;
    the first hit whose id is not NaN is used. Symbols with no usable id are
    reported with ``SKIP GENE`` and left out. Returns ``(gene_num_up, gene_num_down)``.
    """
    def to_ncbi_ids(symbols):
        ids = []
        for symbol in symbols:
            for table in (hgnc_dict, hgnc_dict_prev):
                if symbol in table and not math.isnan(table[symbol]):
                    ids.append(str(int(table[symbol])))
                    break
            else:
                # neither table had a non-NaN id for this symbol
                print('SKIP GENE', symbol)
        return ids

    gene_num_up = to_ncbi_ids(up_df['gene'].tolist())
    gene_num_down = to_ncbi_ids(down_df['gene'].tolist())
    return gene_num_up, gene_num_down


##### GSEA ####
def cmap2_gsea_setup(gene_num_up, gene_num_down):
    """Map gene id strings to row positions of the rank matrix.

    Ids that are not keys of ``ROW2IDX_dur`` are dropped. Returns the dict
    ``{"U": [...], "D": [...]}`` together with the two list lengths.
    """
    up_rows = [ROW2IDX_dur[gene] for gene in gene_num_up if gene in ROW2IDX_dur]
    down_rows = [ROW2IDX_dur[gene] for gene in gene_num_down if gene in ROW2IDX_dur]
    DATA = {"U": up_rows, "D": down_rows}
    return DATA, len(up_rows), len(down_rows)
# up/down gene set level es
def STEP1(v):
    """Signed enrichment score for one gene set, given its ranks under a perturbation.

    With the ranks sorted ascending and ``t = len(v)``:
      a = max_i((i + 1) / t - v[i] / N_GENES)
      b = max_i(v[i] / N_GENES - i / t)
    Returns ``a`` when a > b, ``-b`` when b > a and 0 on a tie.
    ``v`` must be non-empty (``max`` of an empty sequence raises); STEP2 guards this.
    """
    v = sorted(v)
    t = len(v)
    a = max((i + 1) / t - v[i] / N_GENES for i in range(t))
    b = max(v[i] / N_GENES - i / t for i in range(t))
    if a > b:
        return a
    if b > a:
        return -b
    return 0


# diff of up/down gene set es
def STEP2(rank_u, rank_d):
    """Combine the up-set and down-set scores for one perturbation.

    An empty rank array scores 0. Returns 0 when both scores have the same sign
    (including both being 0), otherwise ``es_u - es_d``.
    """
    es_u = STEP1(rank_u) if rank_u.size else 0
    es_d = STEP1(rank_d) if rank_d.size else 0
    if np.sign(es_u) == np.sign(es_d):
        return 0
    return es_u - es_d


def run_gsea(DATA, N_UP, N_DOWN):
    """Score every perturbation column and attach an empirical p-value.

    The background is built from ``N_REPEATS`` columns of ``RANDOM_RANK``: the
    first ``N_UP`` entries stand in for the up set, the next ``N_DOWN`` for the
    down set. For a positive score, p is the fraction of background values above
    it; for a negative score, the fraction below it; a score of 0 gets p = 1.0.
    Returns a list of ``(es, p)`` tuples, one per perturbation.
    """
    background = np.array([
        STEP2(RANDOM_RANK[:N_UP, c], RANDOM_RANK[N_UP:N_UP + N_DOWN, c])
        for c in range(N_REPEATS)
    ])
    result = []
    # for each perturbation, calculate the es score difference of up genes and down genes
    for col in range(N_PERTURBATIONS):
        es = STEP2(cmap2_cmap1_perturb_rank_dur_arry[DATA["U"], col],
                   cmap2_cmap1_perturb_rank_dur_arry[DATA["D"], col])
        p = 1.0
        if es > 0:
            p = np.mean(background > es)
        elif es < 0:
            p = np.mean(background < es)
        result.append((es, p))
    return result


##### format gsea #####
# all drugs enrichment score
def format_gsea_res(result, cell):
    """Tabulate the ``(es, p)`` results, add FDR-adjusted p-values and save them.

    The table is indexed by ``COLS_dur``; ``drug`` is the part of each label
    before the first ``--``. Written to ``enrich_dir + cell + '.csv'`` and returned.
    """
    res_formatted = pd.DataFrame(result)
    res_formatted.columns = ['es', 'p']
    res_formatted['es'] = [round(i, 3) for i in res_formatted['es'].tolist()]
    res_formatted['p-adj'] = statsmodels.stats.multitest.fdrcorrection(
        res_formatted['p'].tolist(), alpha=0.05, method='indep', is_sorted=False)[1]
    # reset_index keeps the positional index as an extra 'index' column in the CSV
    res_formatted = res_formatted.reset_index()
    res_formatted.index = COLS_dur
    res_formatted['drug'] = [i.split('--')[0] for i in res_formatted.index.tolist()]
    # exist_ok avoids the check-then-create race when several jobs share the directory
    os.makedirs(enrich_dir, exist_ok=True)
    res_formatted.to_csv(enrich_dir + cell + '.csv')
    return res_formatted


# top and bottom 500 enriched drugs - for website visualization
def format_enrich(ds, cell):
    """Write the 500 highest- and 500 lowest-scoring instances for one cell type.

    Rows are sorted by enrichment score, descending. The first 500 go to
    ``<cell>_INV_es.csv`` and the last 500 to ``<cell>_POS_es.csv``; both frames
    are returned as ``(res_inv, res_pos)``.
    NOTE(review): highest scores land in the INV file and lowest in the POS file;
    confirm this matches the sign convention of the rank file.
    """
    res = pd.DataFrame()
    # split once only, so drug + '--' + CMAP_instance always rebuilds the column label
    res['CMAP_instance'] = [i.split('--', 1)[1] for i in ds.index.tolist()]
    res['Drug'] = ds['drug'].tolist()
    res['Enrichment_score'] = ds['es'].tolist()
    res['P-value'] = ds['p'].tolist()
    res['Adj-p'] = ds['p-adj'].tolist()
    res = res.sort_values('Enrichment_score', ascending=False)
    res_inv = res.head(500)
    res_pos = res.tail(500)
    os.makedirs(top_enrich_dir, exist_ok=True)
    res_inv.to_csv(top_enrich_dir + cell + '_INV_es.csv', index=False)
    res_pos.to_csv(top_enrich_dir + cell + '_POS_es.csv', index=False)
    return res_inv, res_pos


##### format perturbation #####
# format perturbation network to create edge and node file
def get_perturb_zs(cell, direction, es_df, dge, cell_type_deg_z):
    """Write an edge file and a node file for every instance listed in ``es_df``.

    For each row, the instance's z-score column is taken from ``cell_type_deg_z``,
    joined to ``dge`` on gene, and the 30 lowest- and 30 highest-z genes are kept.
    Files go to ``perturb_dir + cell + '/' + direction + '/'``.
    """
    perturb_specific_dir = perturb_dir + cell + '/' + direction + '/'
    os.makedirs(perturb_specific_dir, exist_ok=True)
    for _, row in es_df.iterrows():
        # adding cmap perturbation rank and z score for each gene for top enriched compounds.
        cmap_instance = row['CMAP_instance']
        drug = row['Drug']
        cmap_drug = drug + '--' + cmap_instance
        zs = cell_type_deg_z[[cmap_drug]].copy()
        zs.columns = ['z']
        zs = zs.sort_values('z')
        perturb_df = zs.merge(dge, how='left', left_index=True, right_on='gene')

        # edge of perturbation network
        deg_gene_num = perturb_df.shape[0]
        perturb_edges = pd.DataFrame()
        perturb_edges['Source'] = [drug] * deg_gene_num
        perturb_edges['CMAP_instance'] = [cmap_instance] * deg_gene_num
        perturb_edges['Target'] = perturb_df['gene'].tolist()
        perturb_edges['log2fc'] = perturb_df['stat'].tolist()
        perturb_edges['exp_z'] = perturb_df['z'].tolist()
        # `scaler` is defined elsewhere in this file; it is refit per instance
        perturb_edges['exp_z_norm'] = [
            s[0] for s in scaler.fit_transform(
                np.array(perturb_edges['exp_z'].tolist()).reshape(-1, 1))
        ]
        perturb_edges['abs_log2fc'] = [abs(s) for s in perturb_df['stat'].tolist()]
        perturb_edges['sign_log2fc'] = [np.sign(s) for s in perturb_df['stat'].tolist()]
        perturb_edges = perturb_edges.sort_values('exp_z')
        # with fewer than 60 genes the head and tail overlap and rows repeat
        perturb_edges_sub = pd.concat([perturb_edges.head(30), perturb_edges.tail(30)])
        perturb_edges_sub.to_csv(perturb_specific_dir + cmap_drug + '_perturb.csv', index=False)

        # node of perturbation network
        perturb_nodes = pd.DataFrame()
        perturb_nodes['Id'] = perturb_edges_sub['Target']
        perturb_nodes['Label'] = perturb_edges_sub['Target']
        perturb_nodes['log2fc'] = perturb_edges_sub['log2fc']
        perturb_nodes['sign_log2fc'] = perturb_edges_sub['sign_log2fc']
        perturb_nodes['abs_log2fc'] = perturb_edges_sub['abs_log2fc']
        # the drug itself is the first node, with fixed placeholder values
        drug_row = pd.DataFrame({'Id': [drug],
                                 'Label': [drug],
                                 'log2fc': [10],
                                 'sign_log2fc': [0],
                                 'abs_log2fc': [10]})
        perturb_nodes = pd.concat([drug_row, perturb_nodes])
        perturb_nodes.to_csv(perturb_specific_dir + cmap_drug + '_perturb_nodes.csv', index=False)


##### run enrich + perturb ######
def run_enrich_perturb(dsid, sampleid, deg_dir, cell_direc):
    """Run enrichment and perturbation export for every DEG file in ``cell_direc``.

    ``cell_direc`` holds file names inside ``deg_dir``; the part before ``.csv``
    is used as the cell label. A file is skipped when either direction has fewer
    than 10 or more than 2000 genes. ``dsid`` and ``sampleid`` are only logged.
    """
    for ct in cell_direc:
        # deg
        cell = ct.split('.csv')[0]
        deg = pd.read_csv(deg_dir + '/' + ct)
        print('deg file read in for:', cell)
        deg_up, deg_down = get_up_down_deg(deg)
        print('deg shape:', deg_up.shape, deg_down.shape)
        n_up, n_down = deg_up.shape[0], deg_down.shape[0]
        if n_up > 2000 or n_up < 10 or n_down > 2000 or n_down < 10:
            print('deg too few or too many')
            print('\n')
            continue
        gene_num_up, gene_num_down = hgnc_gene_up_down(deg_up, deg_down)
        print('degs hgnc-formatted:', dsid, sampleid)

        # enrichment
        DATA, N_UP, N_DOWN = cmap2_gsea_setup(gene_num_up, gene_num_down)
        print('cmap matched deg:', N_UP, N_DOWN)
        gsea_res = run_gsea(DATA, N_UP, N_DOWN)
        print('finish gsea, head:', gsea_res[:3])
        gsea_res_form = format_gsea_res(gsea_res, cell)
        enrich_inv, enrich_pos = format_enrich(gsea_res_form, cell)
        # second preview now shows enrich_pos (the original printed enrich_inv twice)
        print('enrich saved, shape:', enrich_inv.shape, enrich_pos.shape,
              enrich_inv.head(1), enrich_pos.head(1))

        # perturbation
        dge = pd.concat([deg_up, deg_down]).sort_values('stat')
        cell_type_deg_z = z_score_df[z_score_df.index.isin(dge['gene'].tolist())]
        print('deg genes matched to cmap:', cell_type_deg_z.shape[0])
        get_perturb_zs(cell, 'INV', enrich_inv, dge, cell_type_deg_z)
        get_perturb_zs(cell, 'POS', enrich_pos, dge, cell_type_deg_z)
        print('inv and pos perturb files saved, shapes:',
              len(os.listdir(perturb_dir + cell + '/' + 'INV/')),
              len(os.listdir(perturb_dir + cell + '/' + 'POS/')))
        print('\n')


run_enrich_perturb(dsid, sampleid, deg_dir, cell_direc)
# --------------------------------------------------------------------------------
# /data_processing/process_non_visium_standard.R:
# --------------------------------------------------------------------------------
#### Description
#### * The purpose of this script is to process non-Visium data stored in standard structure (counts.csv + coordinates.csv). This also applies to Visium data with no h5 + spatial data provided for public download.
#### * This script will loop through the datasets, read, transform, and process the datasets, and save the processed data as well as a metatable for each dataset.
#### * Note that if after QC, there are < 10 spots left, we will exclude the sample from our database.
####
#### Author: Yiming Li, Saya Dennis (edits)

library(stringr)
library(dplyr)
library(Seurat)
library(patchwork)
library(data.table)
# library(SeuratDisk)

dn <- "/projects/b1131/SpatialT/"
args <- commandArgs(trailingOnly = TRUE)
## Both arguments are needed to build the dataset path; fail early instead of
## silently producing paths that contain "NA".
if (length(args) < 2) {
  stop("Expected two arguments: <dataset id without the 'DS' prefix> <technology folder>", call. = FALSE)
}
dsid <- args[1]
tech <- args[2]
dn <- paste0(dn, tech)

#### Get PID
dsid <- paste0("DS", dsid)
pid <- paste0("PID", substr(dsid, start = 3, stop = nchar(dsid) - 1)) # e.g. "PID203"
#### Create empty data frame for meta table
meta <- data.frame(DSID = character(0), SampleID = character(0), Nspots = integer(0), Nspots_postQC = integer(0), Ngenes = integer(0), Ngenes_postQC = integer(0), Condition = character(0))
#### Set dataset directory and get a list of samples
dsdir <- paste0(dn, "/", pid, "/", dsid) # e.g. "/projects/b1131/SpatialT/Slide-seq/PID203/DS203A"
sampleids <- dir(dsdir, pattern = dsid) # e.g. list of elements like "DS203A.1"

cat("\n>>>>>>>> Dataset [", dsdir, "] started <<<<<<<<\n", sep = "")
for (sampleid in sampleids) {
  sample_dir <- paste0(dsdir, "/", sampleid)

  #### Read data
  ## !!! Assumes that the prepared counts.csv and coordinates.csv are ready under original/
  counts <- fread(paste0(sample_dir, "/original/counts.csv"), sep = ",", header = TRUE)
  coordinates <- read.table(paste0(sample_dir, "/original/coordinates.csv"), sep = ",", header = TRUE)
  counts <- counts[counts$gene != "", ] ## Remove empty "genes"
  ## Some datasets have duplicated values in the "gene" or "barcode" column
  ## Remove these "genes" / "barcodes" since it is hard to determine which observation we should keep
  gene_freq <- table(counts$gene)
  dup_genes <- names(gene_freq[gene_freq > 1])
  if (length(dup_genes) > 0) {
    counts <- counts[!counts$gene %in% dup_genes, ]
    cat("# WARNING: Excluded ", as.character(length(dup_genes)), " genes with multiple associated rows.\n", sep = "")
  }
  ## Fully identical coordinate rows are collapsed first; barcodes that still
  ## appear more than once are then removed entirely.
  coordinates <- coordinates[!duplicated(coordinates), ]
  barcode_freq <- table(coordinates$barcode)
  dup_barcodes <- names(barcode_freq[barcode_freq > 1])
  if (length(dup_barcodes) > 0) {
    coordinates <- coordinates[!coordinates$barcode %in% dup_barcodes, ]
    cat("# WARNING: Excluded ", as.character(length(dup_barcodes)), " spot IDs with multiple associated rows.\n", sep = "")
  }

  #### Generate Seurat object
  gene_names <- counts$gene
  counts$gene <- NULL
  counts <- as.matrix(counts)
  rownames(counts) <- gene_names
  seurat_object <- CreateSeuratObject(counts = counts, project = 'SlideSeq', assay = "Spatial")
  rownames(coordinates) <- coordinates$barcode
  ## Using the spot coordinates, instead of pixel coordinates
  ## !!! This Seurat object cannot be directly overlayed on top of the tissue image (if any)
  coordinates <- coordinates[, c("row", "col")]
  # if (sum(is.na(coordinates$imagerow)) + sum(is.na(coordinates$imagecol)) == 0) {
  #   coordinates <- coordinates[,c("imagerow", "imagecol")]
  # } else {
  #   coordinates <- coordinates[,c("row", "col")]
  # }
  colnames(coordinates) <- c("xcoord", "ycoord")
  seurat_object[['images']] <- new(Class = "SlideSeq", assay = "Spatial", coordinates = coordinates)

  nspots <- ncol(seurat_object)
  ngenes <- nrow(seurat_object)

  #### Spot QC

  ## Step 1. Remove the spots with total UMI count < 500 / the total number of genes < 500 / >= 25% mitochondrial reads
  seurat_object[["percent_mt"]] <- PercentageFeatureSet(seurat_object, "^MT-")
  qc_step1 <- seurat_object$nCount_Spatial >= 500 & seurat_object$nFeature_Spatial >= 500 & seurat_object$percent_mt < 25
  if (sum(qc_step1) < 10) {
    cat("# NOTE: Sample has less than 10 spots after QC. This sample will be excluded from the database and metatable.tsv.\n")
    next
  }
  seurat_object <- seurat_object[, qc_step1]

  ## Step 2.
  ## Remove the spots with total UMI count < median(total UMI count) - 3 * SD(total UMI count).
  ## Remove the spots with total number of genes < median(total number of genes) - 3 * SD(total number of genes)
  umi_floor <- median(seurat_object$nCount_Spatial) - 3 * sd(seurat_object$nCount_Spatial)
  gene_floor <- median(seurat_object$nFeature_Spatial) - 3 * sd(seurat_object$nFeature_Spatial)
  qc_step2 <- seurat_object$nCount_Spatial >= umi_floor & seurat_object$nFeature_Spatial >= gene_floor
  if (sum(qc_step2) < 10) {
    cat("# NOTE: Sample has less than 10 spots after QC. This sample will be excluded from the database and metatable.tsv.\n")
    next
  }
  seurat_object <- seurat_object[, qc_step2]

  #### Gene QC
  ## Keep genes detected (count > 0) in at least 5 spots. Counting on the matrix
  ## directly avoids expanding it into a dense data.frame.
  n_spots_per_gene <- Matrix::rowSums(GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts") > 0)
  seurat_object <- seurat_object[n_spots_per_gene >= 5, ]

  nspots_qc <- ncol(seurat_object)
  ngenes_qc <- nrow(seurat_object)

  #### Append to the dataset metatable
  meta <- rbind(meta, data.frame(DSID = dsid, SampleID = sampleid, Nspots = nspots, Nspots_postQC = nspots_qc, Ngenes = ngenes, Ngenes_postQC = ngenes_qc, Condition = NA))

  #### Transform and process seurat_object
  seurat_object <- SCTransform(seurat_object, assay = "Spatial", verbose = FALSE)

  #### Dimensionality reduction and clustering

  #### The number of spots in a ST dataset is often small, need to set the npcs and dims parameters
  n_pcs <- min(min(dim(seurat_object)) - 1, 50)
  seurat_object <- RunPCA(seurat_object, assay = "SCT", verbose = FALSE, npcs = n_pcs)
  n_dims <- min(min(dim(seurat_object)) - 1, 30)
  seurat_object <- FindNeighbors(seurat_object, reduction = "pca", dims = 1:n_dims, verbose = FALSE)

  ## If the number of spots is < 50, UMAP with uwot (default) will fail, and FindClusters with resolution = 1.2 will sometimes fail
  ## https://github.com/satijalab/seurat/issues/4312#issuecomment-812938288
  if (ncol(seurat_object) < 50) {
    seurat_object <- FindClusters(seurat_object, resolution = 1, verbose = FALSE)
    ## resolution 1.2 sometimes fail
    seurat_object <- RunUMAP(seurat_object, umap.method = "umap-learn", reduction = "pca", dims = 1:n_dims, verbose = FALSE)
    cat("# WARNING: Sample has less than 50 spots after QC.\n")
  } else {
    seurat_object <- FindClusters(seurat_object, resolution = 1.2, verbose = FALSE)
    ## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells
    seurat_object <- RunUMAP(seurat_object, reduction = "pca", dims = 1:n_dims, verbose = FALSE)
  }

  ## "processed" folder should already exist for all datasets but just in case
  if (!dir.exists(paste0(sample_dir, "/processed/"))) {
    dir.create(paste0(sample_dir, "/processed/"))
  }

  #### Create spot IDs following R's column name requirements
  spotmeta <- seurat_object@meta.data
  spotmeta$new_spot_id <- paste0("sp", seq_len(nrow(spotmeta)))
  seurat_object[["new_spot_id"]] <- spotmeta$new_spot_id

  ### Save processed Seurat object
  ## Saved once, after new_spot_id is attached; this is the same file content the
  ## earlier save / reload / save sequence ended up with.
  saveRDS(seurat_object, paste0(sample_dir, "/processed/Seurat.RDS"))

  ## Named vector: original barcode -> new spot ID
  spot_id_mapping <- spotmeta$new_spot_id
  names(spot_id_mapping) <- rownames(spotmeta)

  #### Prepare data for deconvolution
  if ("Spatial" %in% names(seurat_object@assays)) {
    counts <- GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts")
  } else {
    ### MERFISH datasets
    counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
  }
  ## Dense spots x genes matrix (rows = original barcodes, columns = genes).
  ## t() replaces the former data.table round trip.
  counts_t <- t(as.matrix(counts))
  saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RDS"))
  rm(counts_t)
  gc()

  #### Retrieve QC-ed counts and coordinates

  ## Counts
  counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
  counts <- as.matrix(counts)
  colnames(counts) <- as.character(spot_id_mapping[colnames(counts)])
  counts_df <- tibble::rownames_to_column(data.frame(counts), "gene")

  ## Coordinates
  image_names <- names(seurat_object@images)
  if ("slice1" %in% image_names) {
    location <- seurat_object@images$slice1@coordinates ### Visium
    location <- location[, c("col", "row")] ### Use the spot coordinates
  } else {
    ## The image above is stored under "images"; `@images$image` only found it
    ## through `$` partial matching. Resolve the name explicitly instead.
    image_name <- if ("image" %in% image_names) {
      "image"
    } else {
      image_names[startsWith(image_names, "image")][1]
    }
    if (is.na(image_name)) {
      stop("No image entry found in the Seurat object for sample ", sampleid, call. = FALSE)
    }
    location <- seurat_object@images[[image_name]]@coordinates ### Others
    if ("x" %in% colnames(location)) {
      location <- location[, c("x", "y")] ### Use the spot coordinates
    } else {
      ### Some prepared MERFISH datasets did not follow the naming standard
      location <- location[, c("xcoord", "ycoord")] ### Use the spot coordinates
      colnames(location) <- c("x", "y")
    }
  }
  location$barcode <- rownames(location)
  colnames(location) <- c("x", "y", "barcode")
  location$barcode <- spot_id_mapping[location$barcode]
  ## Image spots that are absent from the object map to NA; drop them before they
  ## become row names (NA row names are an error).
  location <- location[!is.na(location$barcode), ]

  ## Sometimes (e.g. /share/fsmresfiles/SpatialT/10x/PID153/DS153A/DS153A.1), the number of spots in seurat_object@images is different from ncol(seurat_object)
  ## Not sure why this is the case
  counts_spots <- colnames(counts_df)
  location_spots <- location$barcode
  keep_spots <- intersect(counts_spots, location_spots)
  counts_df <- counts_df[, c("gene", keep_spots)]
  rownames(location) <- location$barcode
  location <- location[keep_spots, ]

  #### Write counts, coordinates, and meta_spots to file
  write.table(counts_df, file = paste0(sample_dir, "/processed/counts.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
  write.table(location, file = paste0(sample_dir, "/processed/coordinates.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
  saveRDS(list("counts" = counts_df, "coordinates" = location), file = paste0(sample_dir, "/processed/data_frames.RDS"))

  #### Prepare data for deconvolution (relative counts)
  if ("Spatial" %in% names(seurat_object@assays)) {
    DefaultAssay(seurat_object) <- "Spatial"
    seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
    counts <- GetAssayData(object = seurat_object, assay = "Spatial")
  } else {
    ### MERFISH datasets
    seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
    counts <- GetAssayData(object = seurat_object, assay = "SCT")
  }
  ## Same spots x genes layout as bk.dat.RDS, holding the RC-normalized values
  counts_t <- t(as.matrix(counts))
  saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RC.RDS"))
  rm(counts_t)
  gc()

  cat("### [", sampleid, "] Finished.\n", sep = "")
}

## metatable_auto.tsv always holds the automatically derived columns; when a
## curated metatable_orig.tsv exists, its columns are merged in on DSID + SampleID
## (replacing the placeholder Condition column).
write.table(meta, file = paste0(dsdir, "/metatable_auto.tsv"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
if (file.exists(paste0(dsdir, "/metatable_orig.tsv"))) {
  meta$Condition <- NULL
  meta_orig <- fread(paste0(dsdir, "/metatable_orig.tsv"))
  meta <- merge(meta, meta_orig, by = c("DSID", "SampleID"))
}
write.table(meta, file = paste0(dsdir, "/metatable.tsv"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

cat(">>>>>>>> Dataset: ", dsid, " completed <<<<<<<<\n", sep = "")

# --------------------------------------------------------------------------------