├── .DS_Store
├── data_analysis
    ├── .DS_Store
    ├── drug_discovery
    │   ├── .DS_Store
    │   ├── DGE
    │   │   ├── .DS_Store
    │   │   ├── deconvoluted
    │   │   │   ├── .DS_Store
    │   │   │   ├── DGE-analysis-dec.sh
    │   │   │   └── DGE-analysis-dec.R
    │   │   ├── annotated
    │   │   │   ├── DGE_annotated.sh
    │   │   │   └── DGE-annotated.R
    │   │   ├── svg_deg_filter_quest.sh
    │   │   └── svg_deg_filter_quest.py
    │   ├── PPI_Drug_Enrichment_Perturbation
    │   │   ├── ppi_quest.sh
    │   │   ├── drug_screen_perturb_quest.sh
    │   │   ├── ppi_quest.py
    │   │   └── drug_screen_perturb_quest.py
    │   └── README.md
    ├── spatial_clustering
    │   ├── quest_stagate_to_seurat_updated_jobarray.R
    │   ├── Archive
    │   │   ├── quest_stagate_to_seurat_jobarray.R
    │   │   ├── quest_step02_stagate_jobarray.R
    │   │   └── quest_step01_stagate_jobarray.py
    │   ├── README.md
    │   └── quest_stagate_updated_jobarray.py
    ├── cell_cell_interaction
    │   ├── distance-based
    │   │   ├── cci-analysis-COMMOT-DGE-step3.R
    │   │   ├── cci-analysis-COMMOT-DGE-step2.py
    │   │   ├── cci-analysis-COMMOT-DGE-step1.R
    │   │   ├── cci-analysis-COMMOT-DGE-step4.py
    │   │   ├── cci-analysis-COMMOT-DGE.sh
    │   │   ├── cci-analysis-COMMOT.py
    │   │   ├── cci-analysis-COMMOT-pull-scores.py
    │   │   └── cci-analysis-COMMOT-DGE-step0.py
    │   └── neighborhood-based
    │   │   └── adj-analysis.R
    ├── cell_typing
    │   ├── deconvolution
    │   │   ├── create_input_files.R
    │   │   ├── quest_deconvolution_jobarray.R
    │   │   └── process_reference_example.R
    │   ├── reference
    │   │   ├── geo-download-scRNA-seq.py
    │   │   └── ref_data_processing_example.R
    │   └── annotation
    │   │   ├── annotation_example.R
    │   │   └── runBrainCellTypeAnnotation-CluHeu.R
    └── spatial_variability
    │   ├── quest_SpatialDE_ct_specific.py
    │   └── quest_SpatialDE_jobarray.py
├── .gitignore
├── data_curation
    └── geo-query.py
├── README.md
└── data_processing
    ├── process_visium_standard.R
    └── process_non_visium_standard.R


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/.DS_Store


--------------------------------------------------------------------------------
/data_analysis/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/.DS_Store


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/.DS_Store


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/DGE/.DS_Store


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/deconvoluted/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoyuanlab/SOAR/HEAD/data_analysis/drug_discovery/DGE/deconvoluted/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | data_analysis/drug_discovery/.DS_Store
 3 | data_analysis/drug_discovery/DGE/.DS_Store
 4 | data_analysis/.DS_Store
 5 | data_analysis/.DS_Store
 6 | data_analysis/drug_discovery/.DS_Store
 7 | data_analysis/drug_discovery/DGE/.DS_Store
 8 | data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/.DS_Store
 9 | data_analysis/drug_discovery/.DS_Store
10 | data_analysis/drug_discovery/.DS_Store
11 | data_analysis/.DS_Store
12 | data_analysis/.DS_Store
13 | data_analysis/drug_discovery/.DS_Store
14 | data_analysis/.DS_Store
15 | data_analysis/cell_cell_interaction/.DS_Store
16 | data_analysis/cell_typing/.DS_Store
17 | data_analysis/drug_discovery/.DS_Store
18 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/deconvoluted/DGE-analysis-dec.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A p31931
 3 | #SBATCH -p normal
 4 | #SBATCH -t 01:00:00
 5 | #SBATCH -N 1
 6 | #SBATCH -n 1
 7 | #SBATCH --array=2-6
 8 | #SBATCH --mem=30G
 9 | #SBATCH --mail-user=yiming.li@northwestern.edu
10 | #SBATCH --mail-type=BEGIN,END,FAIL
11 | #SBATCH --job-name="DGE2%a"
12 | #SBATCH --output=/projects/b1131/ylz8811/pbs-cmds/DGE2_b4_%a.out
13 | 
# Load the R toolchain used by the DGE script.
module purge all
module load R/4.1.1
module load geos/3.8.1

cd /projects/b1131/SpatialT/

# Read the sample list into an array (newline-separated), pick the entry
# indexed by this array task, and take its first space-separated field as
# the sample directory.
IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/deconvoluted_samples.txt
IFS=$' ' read -ra split_dirs <<< "${input_args[${SLURM_ARRAY_TASK_ID}]}"
sample_dir=${split_dirs[0]}

# Fail fast when the task index does not resolve to a sample, instead of
# launching Rscript without an argument.
if [[ -z "${sample_dir}" ]]; then
	echo "No sample directory found for array task ${SLURM_ARRAY_TASK_ID}" >&2
	exit 1
fi

echo "Sample directory: ${sample_dir}"

Rscript --vanilla /projects/b1131/SpatialT/ref_scripts/DGE-analysis-dec.R "${sample_dir}"
27 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/annotated/DGE_annotated.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A b1042
 3 | #SBATCH -p genomics
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH -N 1
 6 | #SBATCH -n 1
 7 | #SBATCH --array=0-8
 8 | #SBATCH --mem=30G
 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu
10 | #SBATCH --mail-type=END,FAIL
11 | #SBATCH --job-name="DGE%a"
12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/deg/out/DGE_%a.out
13 | 
# Load the R toolchain used by the DGE script.
module purge all
module load R/4.4.0

cd /projects/b1131/SpatialT/

# Read the sample list into an array (newline-separated), pick the entry
# indexed by this array task, and take its first space-separated field as
# the sample directory.
IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/cosmx_colon_case_revision_samples.txt
IFS=$' ' read -ra split_dirs <<< "${input_args[${SLURM_ARRAY_TASK_ID}]}"
sample_dir=${split_dirs[0]}

# Fail fast when the task index does not resolve to a sample, instead of
# launching Rscript without an argument.
if [[ -z "${sample_dir}" ]]; then
	echo "No sample directory found for array task ${SLURM_ARRAY_TASK_ID}" >&2
	exit 1
fi

echo "Sample directory: ${sample_dir}"

Rscript --vanilla /projects/b1131/SpatialT/ref_scripts/DGE-annotated.R "${sample_dir}"


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A b1042
 3 | #SBATCH -p genomics
 4 | #SBATCH -t 2:00:00
 5 | #SBATCH -N 1
 6 | #SBATCH -n 1
 7 | #SBATCH --array=0-4
 8 | #SBATCH --mem=10G
 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu
10 | #SBATCH --mail-type=END,FAIL
11 | #SBATCH --job-name="ppi%a"
12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/drug/ppi%a.out
13 | 
# Activate the conda environment that provides the Python dependencies.
module purge all
module load python-miniconda3/4.12.0
source activate myenv

# Read the sample table into an array (newline-separated), pick the entry
# indexed by this array task, and split it on spaces: field 1 is the dataset
# id, field 2 the sample id.
IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/cosmx_patho_samples.tsv
IFS=$' ' read -ra split_dirs <<< "${input_args[${SLURM_ARRAY_TASK_ID}]}"
dsid=${split_dirs[0]}
sampleid=${split_dirs[1]}

# Fail fast when the task index does not resolve to a complete entry, instead
# of starting Python with missing arguments.
if [[ -z "${dsid}" || -z "${sampleid}" ]]; then
	echo "No DSID/sample id found for array task ${SLURM_ARRAY_TASK_ID}" >&2
	exit 1
fi

echo "DSID: ${dsid}"
echo "SampleiD: ${sampleid}"
python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/drug/ppi_quest.py "${dsid}" "${sampleid}"


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/quest_stagate_to_seurat_updated_jobarray.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(data.table)
 3 | library(stringr)
 4 | 
 5 | sample_dir <- commandArgs(trailingOnly=TRUE)
 6 | 
 7 | coord = read.csv(paste0(sample_dir, "/processed/coordinates.csv"))
 8 | 
 9 | so <- readRDS(paste0(sample_dir, "processed/Seurat.RDS"))
10 | meta <- so@meta.data
11 | meta$row <- 1:nrow(meta)
12 | 
13 | clustering_results <- fread(paste0(sample_dir, "analysis/clustering/STAGATE_clusters.csv"))
14 | colnames(clustering_results) <- c("new_spot_id", "STAGATE_cluster")
15 | 
16 | meta <- merge(meta, clustering_results, by = "new_spot_id")
17 | meta <- meta[order(meta$row),]
18 | 
19 | # keep only seurat obs where they are present in the coordinates column, this is the obs in which clustering are done on
20 | so <- so[, so@meta.data$new_spot_id %in% coord$barcode]
21 | so[["STAGATE_cluster"]] <- meta$STAGATE_cluster
22 | saveRDS(so, file = paste0(sample_dir, "processed/Seurat.RDS"))
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A b1042
 3 | #SBATCH -p genomics
 4 | #SBATCH -t 8:00:00
 5 | #SBATCH -N 1
 6 | #SBATCH -n 20
 7 | #SBATCH --array=0-4
 8 | #SBATCH --mem=188G
 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu
10 | #SBATCH --mail-type=END,FAIL
11 | #SBATCH --job-name="drug%a"
12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/drug/out/drug_rerun%a.out
13 | 
# Activate the conda environment that provides the Python dependencies.
module purge all
module load python-miniconda3/4.12.0
source activate myenv

# Read the sample table into an array (newline-separated), pick the entry
# indexed by this array task, and split it on spaces: field 1 is the dataset
# id, field 2 the sample id.
IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/patho_samples_drug_cosmx.tsv
IFS=$' ' read -ra split_dirs <<< "${input_args[${SLURM_ARRAY_TASK_ID}]}"
dsid=${split_dirs[0]}
sampleid=${split_dirs[1]}

# Fail fast when the task index does not resolve to a complete entry, instead
# of starting Python with missing arguments.
if [[ -z "${dsid}" || -z "${sampleid}" ]]; then
	echo "No DSID/sample id found for array task ${SLURM_ARRAY_TASK_ID}" >&2
	exit 1
fi

echo "DSID: ${dsid}"
echo "SampleiD: ${sampleid}"
python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/drug/drug_screen_perturb_quest.py "${dsid}" "${sampleid}"


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/svg_deg_filter_quest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A b1042
 3 | #SBATCH -p genomics
 4 | #SBATCH -t 1:00:00
 5 | #SBATCH -N 1
 6 | #SBATCH -n 1
 7 | #SBATCH --array=0-4
 8 | #SBATCH --mem=24G
 9 | #SBATCH --mail-user=yanyi.ding@northwestern.edu
10 | #SBATCH --mail-type=END,FAIL
11 | #SBATCH --job-name="deg_svg_%a"
12 | #SBATCH --output=/projects/b1131/ydn4687/SpatialT/cosmx_colon_revision/svg/out/deg_svg_%a.out
13 | 
# Activate the conda environment that provides the Python dependencies.
module purge all
module load python-miniconda3/4.12.0
source activate myenv

# Read the sample table into an array (newline-separated), pick the entry
# indexed by this array task, and split it on spaces: field 1 is the dataset
# id, field 2 the sample id, field 3 the technology.
IFS=$'\n' read -d '' -r -a input_args < /projects/b1131/SpatialT/cosmx_patho_samples.tsv
IFS=$' ' read -ra split_dirs <<< "${input_args[${SLURM_ARRAY_TASK_ID}]}"
dsid=${split_dirs[0]}
sampleid=${split_dirs[1]}
tech=${split_dirs[2]}

# Fail fast when the task index does not resolve to a complete entry, instead
# of starting Python with missing arguments.
if [[ -z "${dsid}" || -z "${sampleid}" || -z "${tech}" ]]; then
	echo "No DSID/sample id/tech found for array task ${SLURM_ARRAY_TASK_ID}" >&2
	exit 1
fi

echo "DSID: ${dsid}"
echo "SampleID: ${sampleid}"
echo "Tech: ${tech}"

python /projects/b1131/SpatialT/ST-dataset/analysis/database_utilities/SVG/svg_deg_filter_cosmx_case_quest.py "${dsid}" "${sampleid}" "${tech}"


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/Archive/quest_stagate_to_seurat_jobarray.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(data.table)
 3 | library(stringr)
 4 | 
 5 | sample_dir <- commandArgs(trailingOnly=TRUE)
 6 | 
 7 | coord = read.csv(paste0(sample_dir, "/processed/coordinates.csv"))
 8 | optimal_n <- dir(paste0(sample_dir, "analysis/clustering"), pattern = "obsm_STAGATE_cluster_")
 9 | optimal_n <- str_split(str_split(optimal_n, "_cluster_")[[1]][2], '.csv')[[1]][1]
10 | 
11 | so <- readRDS(paste0(sample_dir, "processed/Seurat.RDS"))
12 | meta <- so@meta.data
13 | meta$row <- 1:nrow(meta)
14 | 
15 | clustering_results <- fread(paste0(sample_dir, "analysis/clustering/obsm_STAGATE_cluster_", optimal_n, ".csv"))
16 | colnames(clustering_results) <- c("new_spot_id", "STAGATE_cluster")
17 | 
18 | meta <- merge(meta, clustering_results, by = "new_spot_id")
19 | meta <- meta[order(meta$row),]
20 | 
21 | # keep only seurat obs where they are present in the coordinates column, this is the obs in which clustering are done on
22 | so <- so[, so@meta.data$new_spot_id %in% coord$barcode]
23 | so[["STAGATE_cluster"]] <- meta$STAGATE_cluster
24 | saveRDS(so, file = paste0(sample_dir, "processed/Seurat.RDS"))
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/Archive/quest_step02_stagate_jobarray.R:
--------------------------------------------------------------------------------
 1 | library(mclust)
 2 | 
 3 | #### Calculating BIC for each number of clusters ####
 4 | args <- commandArgs(trailingOnly=TRUE)
 5 | 
 6 | data <- read.csv(paste0(args, "analysis/clustering/obsm_STAGATE.csv"))
 7 | spot_ids <- data[,1]
 8 | row_size = nrow(data)
 9 | 
10 | print(paste0(">>> ", args, " started calculating BIC <<<"))
11 | bic <- c()
12 | max_cluster = 30
13 | if (nrow(data) < 60) {
14 |   max_cluster = nrow(data)/2
15 |   print(paste0('MAX CLUSTER: ', max_cluster))
16 | } 
17 | 
18 | for (i in 2:max_cluster) {
19 |   res <- Mclust(data[,-1], i, "EEE")
20 |   if(length(res) == 0) {
21 |     res <- Mclust(data[,-1], i)
22 |   }
23 |   bic <- append(bic, res$BIC[1])
24 | }
25 | 
26 | #### Visualize using the optimal number of clusters ####
27 | n_clust <- which.min(bic)+1
28 | print(paste0(">>> Optimal number of cluster for", args, ": ", n_clust, " <<<"))
29 | res <- Mclust(data[,-1], n_clust, "EEE")
30 | if(length(res) == 0) {
31 |   res <- Mclust(data[,-1], n_clust)
32 | }
33 | write.table(data.frame(res$classification), file=paste0(args, "analysis/clustering/obsm_STAGATE_cluster_", n_clust, ".csv"), quote=FALSE, sep=",", row.names=spot_ids, col.names=c("cluster"))
34 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/svg_deg_filter_quest.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import os
 3 | import sys
 4 | 
 5 | dsid = sys.argv[1]
 6 | sampleid = sys.argv[2]
 7 | tech = sys.argv[3]
 8 | pid = 'PID'+dsid.split('DS')[1] 
 9 | 
10 | dge_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_dec/'
11 | svg_dir = '/projects/b1131/SpatialT/'+tech+'/'+pid+'/'+dsid+'/'+sampleid+'/analysis/SVG/'
12 | sample_dge_dir = dge_dir.split('DGE_dec/')[0]
13 | 
14 | svg = pd.read_csv(svg_dir+'SpatialDE_results.tsv', sep='\t')
15 | svg_filter = svg[svg['qval']<=0.1]
16 | print('svg read in')
17 | 
18 | if not os.path.exists(sample_dge_dir+'DGE_dec_SVG'):
19 |     os.makedirs(sample_dge_dir+'DGE_dec_SVG')
20 | 
21 | for file in os.listdir(dge_dir):
22 |     cell_type = file.split('.txt')[0]
23 |     print(cell_type)
24 |     deg = pd.read_csv(dge_dir+file, sep='\t')
25 |     deg=deg[deg['gene'].isin(svg_filter['g'].tolist())]
26 |     print('SVG filtered')
27 | 
28 |     df_up = deg[(deg['stat']>0.5) & (deg['qval']<0.05) ]
29 |     df_down = deg[(deg['stat']<-0.5) & (deg['qval']<0.05) ]
30 |     print('DEG up and down shape:', df_up.shape [0],df_down.shape[0])
31 |     
32 |     deg.to_csv(sample_dge_dir+'DGE_dec_SVG'+'/'+cell_type+'.csv',index=False)   
33 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step3.R:
--------------------------------------------------------------------------------
 1 | library(stringr)
 2 | library(data.table)
 3 | library(tradeSeq)
 4 | library(clusterExperiment)
 5 | 
 6 | args <- commandArgs(trailingOnly=TRUE)
 7 | 
 8 | sample_dir <- args[1]
 9 | thr_type <- args[2]
10 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1"
11 | # thr_type <- "short"
12 | 
13 | out_dir <- paste0(sample_dir, "/analysis/Distance/COMMOT_dec/", thr_type)
14 | pathways <- fread(paste0(out_dir, "/pathways_step1_success.txt"), header = FALSE)$V1
15 | pathways_success <- character(0)
16 | 
17 | for (pathway in pathways) {
18 | 	# pathway = "CXCL"
19 | 	pathway_dir <- paste0(out_dir, "/", pathway)
20 | 	
21 | 	possibleError <- tryCatch( {
22 | 		sce <- readRDS(paste0(pathway_dir, "/step1_sce.RDS"))
23 | 		assoRes <- fread(paste0(pathway_dir, "/assoRes.txt"), sep = "\t")
24 | 		genes <- assoRes$V1
25 | 		assoRes$V1 <- NULL
26 | 		assoRes <- data.frame(assoRes)
27 | 		rownames(assoRes) <- genes
28 | 		
29 | 		source(paste0(pathway_dir, "/step3.R"))
30 | 		fwrite(yhatScaled, paste0(pathway_dir, "/yhatScaled.txt"), sep = "\t", row.names = TRUE)
31 | 	},
32 | 		error=function(e) e
33 | 	)
34 | 	if(!inherits(possibleError, "error")) {
35 | 		pathways_success <- c(pathways_success, pathway)
36 | 	}
37 | }
38 | 
39 | pathways_failed <- pathways[!pathways %in% pathways_success]
40 | fwrite(data.frame(pathways_success), paste0(out_dir, "/pathways_step3_success.txt"), col.names = FALSE)
41 | fwrite(data.frame(pathways_failed), paste0(out_dir, "/pathways_step3_failed.txt"), col.names = FALSE)
42 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step2.py:
--------------------------------------------------------------------------------
 1 | import os, sys, pickle, datetime, anndata
 2 | import commot as ct
 3 | import scanpy as sc
 4 | import pandas as pd
 5 | import numpy as np
 6 | import scipy
 7 | from collections import Counter
 8 | 
 9 | thr_type = sys.argv[2]
10 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec/" + thr_type
11 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short'
12 | 
13 | with open(out_dir + "/pathways_step1_success.txt") as f:
14 |     pathways = [line.rstrip('\n') for line in f]
15 | 
16 | for pathway in pathways:
17 |     # pathway = "MIF"
18 |     pathway_dir = out_dir + "/" + pathway
19 |     
20 |     df_assoRes = pd.read_csv(pathway_dir + "/assoRes.txt", sep = "\t", index_col = 0)
21 |     n_deg_genes = df_assoRes.shape[0]
22 |     
23 |     n_points = 50
24 |     deg_pvalue_cutoff = 0.05
25 |     
26 |     string_step3 = 'assoRes <- assoRes[which(assoRes$pvalue_1 <= %f),]' % deg_pvalue_cutoff
27 |     string_step3 = string_step3 + '\noAsso <- order(assoRes[,"waldStat_1"], decreasing=TRUE)'
28 |     string_cluster = 'clusPat <- clusterExpressionPatterns(sce, nPoints = %d,' % n_points\
29 |         + 'verbose=TRUE, genes = rownames(assoRes)[oAsso][1:min(%d,length(oAsso))],' % n_deg_genes \
30 |         + ' k0s=4:5, alphas=c(0.1))'
31 |     
32 |     string_step3 = string_step3 + '\n' + string_cluster
33 |     string_step3 = string_step3 + '\nyhatScaled <- data.frame(clusPat$yhatScaled)\n'
34 |     
35 |     with open(pathway_dir + "/step3.R", "w") as text_file:
36 |         _ = text_file.write(string_step3)
37 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step1.R:
--------------------------------------------------------------------------------
 1 | library(stringr)
 2 | library(data.table)
 3 | library(tradeSeq)
 4 | library(clusterExperiment)
 5 | 
 6 | args <- commandArgs(trailingOnly=TRUE)
 7 | 
 8 | sample_dir <- args[1]
 9 | thr_type <- args[2]
10 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1"
11 | # thr_type <- "short"
12 | 
13 | out_dir <- paste0(sample_dir, "/analysis/Distance/COMMOT_dec/", thr_type)
14 | pathways <- fread(paste0(out_dir, "/pathways.txt"), header = FALSE)$V1
15 | pathways_success <- character(0)
16 | 
17 | for (pathway in pathways) {
18 | 	# pathway = "MIF"
19 | 	pathway_dir <- paste0(out_dir, "/", pathway)
20 | 	
21 | 	possibleError <- tryCatch( {
22 | 		X <- fread(paste0(pathway_dir, "/step1_X.csv"), header = TRUE)
23 | 		pseudoTime <- fread(paste0(pathway_dir, "/step1_pseudoTime.csv"), header = FALSE)$V1
24 | 		cellWeight <- fread(paste0(pathway_dir, "/step1_cellWeight.csv"), header = FALSE)$V1
25 | 		spot_ids <- X$V1
26 | 		X$V1 <- NULL
27 | 		X <- t(as.matrix(X))
28 | 		colnames(X) <- spot_ids
29 | 		
30 | 		source(paste0(pathway_dir, "/step1.R"))
31 | 		fwrite(assoRes, paste0(pathway_dir, "/assoRes.txt"), sep = "\t", row.names = TRUE)
32 | 		saveRDS(sce, paste0(pathway_dir, "/step1_sce.RDS"))
33 | 	},
34 | 		error=function(e) e
35 | 	)
36 | 	if(!inherits(possibleError, "error")) {
37 | 		pathways_success <- c(pathways_success, pathway)
38 | 	}
39 | }
40 | 
41 | pathways_failed <- pathways[!pathways %in% pathways_success]
42 | fwrite(data.frame(pathways_success), paste0(out_dir, "/pathways_step1_success.txt"), col.names = FALSE)
43 | fwrite(data.frame(pathways_failed), paste0(out_dir, "/pathways_step1_failed.txt"), col.names = FALSE)
44 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step4.py:
--------------------------------------------------------------------------------
 1 | import os, sys, pickle, datetime, anndata
 2 | import commot as ct
 3 | import scanpy as sc
 4 | import pandas as pd
 5 | import numpy as np
 6 | import scipy
 7 | from collections import Counter
 8 | 
 9 | thr_type = sys.argv[2]
10 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec/" + thr_type
11 | # thr_type = "short"
12 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short'
13 | 
14 | with open(out_dir + "/pathways_step3_success.txt") as f:
15 |     pathways = [line.rstrip('\n') for line in f]
16 | 
17 | for pathway in pathways:
18 |     # pathway = "MIF"
19 |     pathway_dir = out_dir + "/" + pathway
20 |     
21 |     yhat_scaled = pd.read_csv(pathway_dir + "/yhatScaled.txt", sep = "\t", index_col = 0)
22 |     df_assoRes = pd.read_csv(pathway_dir + "/assoRes.txt", sep = "\t", index_col = 0)
23 |     
24 |     df_deg = df_assoRes.rename(columns={'waldStat_1':'waldStat', 'df_1':'df', 'pvalue_1':'pvalue'})
25 |     df_deg = df_deg[['waldStat', 'df', 'pvalue']]
26 |     idx = np.argsort(-df_deg['waldStat'].values)
27 |     df_deg = df_deg.iloc[idx]
28 |     df_yhat = yhat_scaled
29 |     
30 |     deg_result = {"df_deg": df_deg, "df_yhat": df_yhat}
31 |     with open(pathway_dir + '/DEG_pt.pkl', 'wb') as handle:
32 |         pickle.dump(deg_result, handle, protocol = pickle.HIGHEST_PROTOCOL)
33 |     
34 |     df_deg_clus, df_yhat_clus = ct.tl.communication_deg_clustering(df_deg, df_yhat, deg_clustering_res=0.4)
35 |     top_de_genes = ct.pl.plot_communication_dependent_genes(df_deg_clus, df_yhat_clus, top_ngene_per_cluster=5, filename = pathway_dir + '/DEG.pdf', font_scale=1.2, return_genes = True)
36 |     
37 |     deg_result = {"df_deg": df_deg, "df_yhat": df_yhat, "df_deg_clus": df_deg_clus, "df_yhat_clus": df_yhat_clus, "top_de_genes": top_de_genes}
38 |     with open(pathway_dir + '/DEG_full.pkl', 'wb') as handle:
39 |         pickle.dump(deg_result, handle, protocol = pickle.HIGHEST_PROTOCOL)
40 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/README.md:
--------------------------------------------------------------------------------
 1 | # Spatial Clustering 
 2 | Implementation of spatial clustering using gene expression and spatial location via the [STAGATE package](https://stagate.readthedocs.io/en/latest/index.html)
 3 | - Requires `counts.csv` and `coordinates.csv` in the processed folder and a list of sample directories for running the shell scripts. 
 4 | - Steps of running:
 5 |     - Run [quest_stagate_updated_jobarray.py](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/spatial_clustering/quest_stagate_updated_jobarray.py) followed by [quest_stagate_to_seurat_updated_jobarray.R](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/spatial_clustering/quest_stagate_to_seurat_updated_jobarray.R)
 6 |         -  quest_stagate_updated_jobarray.py:
 7 |             - **Please note that your provided argument (read into `sample_dir`) should have a "/" at the end of the path.**
 8 |             - Reads counts and coordinates into AnnData object and keeps only spots that have coordinates. 
 9 |             - If the AnnData contains only integer counts and the max count is greater than 20, it is likely not log transformed during processing, so log1p is done. 
10 |             - Conducts normalization then finds the top 3000 highly variable genes by the CellRanger approach (expects normalized and transformed counts). 
11 |             - Initial radius cutoff (`rad_cur`) was set to **2** for STAGATE to find neighbors for each spot. If the initial number of neighbors is less than **5**, step-wise addition of radius cutoff (`rad_add`) is done to reach at least 5 neighbors per spot. 
12 |                 - For non-Visium technologies, `rad_add` might need to be increased (if reaching the optimal number is too slow) or decreased (if there are too many neighbors) so that each spot ends up with between **5 and 15** neighbors. 
13 |             - STAGATE spatial net is then trained. Neighbors are found on the STAGATE spatial+expression-reduced dimensions.
14 |             - Louvain clustering is done using resolution determined based on the number of cells in a sample. 
15 |             - Reduced-dimension data from STAGATE is saved in `STAGATE_30dim.csv` and the cluster assignment is saved in `STAGATE_clusters.csv` in the clustering subfolder.
16 |         - quest_stagate_to_seurat_updated_jobarray.R:
17 |             - Adds cluster assignment of each spot to the metadata of `Seurat.RDS` stored in the processed folder. 
18 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | IFS=$'\n' read -d '' -r -a input_args < sample_list.txt
 4 | IFS=$' ' read -ra split_dirs <<< ${input_args[${SLURM_ARRAY_TASK_ID}]}
 5 | sample_dir=${split_dirs[0]}
 6 | ref_dir=${split_dirs[1]}
 7 | distance_type="short" # Supported arguments: short, medium, long, or xlong
 8 | 
 9 | echo "### Sample directory: ${sample_dir}"
10 | echo "### Reference directory: ${ref_dir}"
11 | echo "### Distance type: ${distance_type}"
12 | 
13 | module purge all
14 | module load python-miniconda3/4.12.0
15 | source activate SpatialT
16 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT.py $sample_dir $ref_dir $distance_type
17 | then
18 | 	echo "COMMOT main analysis completed"
19 | else
20 | 	echo "COMMOT main analysis failed"
21 | 	exit 1
22 | fi
23 | 
24 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step0.py $sample_dir $ref_dir $distance_type
25 | then
26 | 	echo "DGE - Step 0 (Python) completed"
27 | else
28 | 	echo "DGE - Step 0 (Python) failed"
29 | 	exit 1
30 | fi
31 | 
32 | source deactivate
33 | module purge all
34 | module load R/4.1.1
35 | module load geos/3.8.1
36 | if Rscript --vanilla cci-analysis-COMMOT-DGE-step1.R $sample_dir $distance_type
37 | then
38 | 	echo "DGE - Step 1 (R) completed"
39 | else
40 | 	echo "DGE - Step 1 (R) failed"
41 | 	exit 1
42 | fi
43 | 
44 | module purge all
45 | module load python-miniconda3/4.12.0
46 | source activate SpatialT
47 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step2.py $sample_dir $distance_type
48 | then
49 | 	echo "DGE - Step 2 (Python) completed"
50 | else
51 | 	echo "DGE - Step 2 (Python) failed"
52 | 	exit 1
53 | fi
54 | 
55 | source deactivate
56 | module purge all
57 | module load R/4.1.1
58 | module load geos/3.8.1
59 | if Rscript --vanilla cci-analysis-COMMOT-DGE-step3.R $sample_dir $distance_type
60 | then
61 | 	echo "DGE - Step 3 (R) completed"
62 | else
63 | 	echo "DGE - Step 3 (R) failed"
64 | 	exit 1
65 | fi
66 | 
67 | module purge all
68 | module load python-miniconda3/4.12.0
69 | source activate SpatialT
70 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-DGE-step4.py $sample_dir $distance_type
71 | then
72 | 	echo "DGE - Step 4 (Python) completed"
73 | else
74 | 	echo "DGE - Step 4 (Python) failed"
75 | 	exit 1
76 | fi
77 | 
78 | if ${HOME}/.conda/envs/SpatialT/bin/python cci-analysis-COMMOT-pull-scores.py $sample_dir $ref_dir $distance_type
79 | then
80 | 	echo "Pulling scores completed"
81 | else
82 | 	echo "Pulling scores failed"
83 | 	exit 1
84 | fi
85 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/deconvolution/create_input_files.R:
--------------------------------------------------------------------------------
 1 | ### Author: Yiming Li
 2 | ### Example usage:
 3 | ### create_input_files.R $sample_dir
 4 | 
 5 | library(Seurat)
 6 | library(BayesPrism)
 7 | library(data.table)
 8 | library(dplyr)
 9 | library(stringr)
10 | 
### Read the sample directory from the command line
args <- commandArgs(trailingOnly=TRUE)

st_dir <- "/projects/b1131/SpatialT"

sample_dir <- args[1]
# sample_dir <- "/projects/b1131/SpatialT/10x/PID5/DS5A/DS5A.12_151676"

### Read Seurat objects and deconvolution results
seurat_object <- readRDS(paste0(sample_dir, "/processed/Seurat.RDS"))
location <- fread(paste0(sample_dir, "/processed/coordinates.csv"))
output_dir <- paste0(sample_dir, "/analysis/deconvolution")
theta <- readRDS(paste0(output_dir, "/BayesPrism_theta.RDS"))
bp.res <- readRDS(paste0(output_dir, "/BayesPrism_results.RDS"))
meta <- seurat_object@meta.data
# Maps the Seurat cell names (row names of the metadata) to new_spot_id
spot_id_mapping <- meta$new_spot_id
names(spot_id_mapping) <- rownames(meta)

gc()

### Hard-assigned labels
# Each spot gets the cell type with the largest value in its theta row.
# which.max() always yields a single index (the first one on ties); selecting
# with `x == max(x)` returned several names for tied rows, which turned the
# result into a list and corrupted the labels.
theta_mat <- as.matrix(theta)
hard_labels <- vapply(seq_len(nrow(theta_mat)), function(i) {
	top <- which.max(theta_mat[i, ])
	if (length(top) == 0) NA_character_ else colnames(theta_mat)[top]
}, character(1))
names(hard_labels) <- rownames(theta_mat)
# Look labels up by cell name so they stay aligned with the Seurat object even
# when theta does not cover every spot (such spots get NA).
seurat_object[["cell_type_dec_max"]] <- unname(hard_labels[rownames(meta)])
saveRDS(seurat_object, paste0(sample_dir, "/processed/Seurat.RDS"))

rm(seurat_object)
gc()

### Save deconvoluted cell-type-specific expressions
all_cell_types <- colnames(theta)
# File-name-safe variant: "/", " ", "-", "*" and "+" are replaced by "."
all_cell_types_s <- gsub("[/ *+-]", ".", all_cell_types)
for (ct_i in seq_along(all_cell_types)) {
	cell_type <- all_cell_types[ct_i]
	cell_type_s <- all_cell_types_s[ct_i]
	# cell_type <- "CAFs"
	
	ct_exp <- get.exp(bp = bp.res, state.or.type = "type", cell.name = cell_type)
	counts_df <- data.table(ct_exp)
	counts_df$spot <- spot_id_mapping[rownames(ct_exp)]
	# Transpose to genes x spots and keep only spots that have coordinates
	counts_df <- transpose(counts_df, keep.names = "gene", make.names = "spot")
	keep_spots <- intersect(colnames(counts_df), location$barcode)
	keep_spots <- c("gene", keep_spots)
	counts_df <- counts_df[,..keep_spots]
	
	fwrite(counts_df, paste0(output_dir, "/counts_", cell_type_s, "_deconv_only.csv"), sep = ",")
}
fwrite(data.table(cell_type = all_cell_types, cell_type_s = all_cell_types_s), paste0(output_dir, "/all_cell_types.txt"), sep = "\t")
71 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/reference/geo-download-scRNA-seq.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This script queries the GEO using esearch and esummary for scRNA-seq datasets related to a certain organ/species
 3 | 	* Note that this script does NOT check the validity of the arguments
 4 | 	* Two intermediate files will be created during runtime: esearch.xml and esummary.xml (cleaned up at the end)
 5 | 	* Please change any space in the keyword to a plus sign, e.g. spinal+cord instead of spinal cord
 6 | 
 7 | Usage: python geo-download-scRNA-seq.py <organ> <species>
 8 | 	e.g. python geo-download-scRNA-seq.py lymph+node mouse
 9 | 
10 | Author:
11 | 	Yiming Li
12 | """
13 | 
14 | import pandas as pd
15 | import requests
16 | import xml.etree.ElementTree as ET
17 | import time
18 | import sys
19 | import os
20 | 
21 | def loadRSS_esearch(organ, species, keyword):
22 | 	url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=' + keyword + '+' + organ + '+AND+' + species + '[organism]&retmax=100000&usehistory=y'
23 | 	resp = requests.get(url)
24 | 	with open('esearch.xml', 'wb') as f:
25 | 		f.write(resp.content)
26 | 
27 | def parseXML_esearch(xmlfile):
28 | 	tree = ET.parse(xmlfile)
29 | 	root = tree.getroot()
30 | 	items = []
31 | 	for item in root.findall('./IdList/Id'):
32 | 		items.append(item.text)
33 | 	
34 | 	return(items)
35 | 
36 | def loadRSS_esummary(gds_id):
37 | 	url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&id=' + str(gds_id)
38 | 	resp = requests.get(url)
39 | 	with open('esummary.xml', 'wb') as f:
40 | 		f.write(resp.content)
41 | 
42 | def parseXML_esummary(xmlfile):
43 | 	tree = ET.parse(xmlfile)
44 | 	root = tree.getroot()
45 | 	items = []
46 | 	for item in root.findall('./DocSum/Item'):
47 | 		if (item.attrib['Name'] in ["Accession", "title", "summary", "GPL", "taxon", "gdsType", "FTPLink"]):
48 | 			items.append(item.text)
49 | 	
50 | 	return(items)
51 | 
52 | organ = sys.argv[1] # organ = 'spinal+cord'
53 | species = sys.argv[2] # species = "human"
54 | 
55 | 
56 | 
57 | ############
58 | 
59 | 
60 | 
61 | keywords = ['scRNA-seq', 'single+cell+RNA-seq', 'single+cell+RNA+sequencing', 'single+cell+transcriptomics', 'single+cell+transcriptome'] # Change this to make the search broader/narrower
62 | 
63 | ### Step 1. Get GDS IDs
64 | gds_ids = []
65 | for keyword in keywords:
66 | 	loadRSS_esearch(organ, species, keyword)
67 | 	time.sleep(1)
68 | 	gds_ids = gds_ids + parseXML_esearch('esearch.xml')
69 | 
70 | gds_ids = list(set(gds_ids)) # Remove duplicates
71 | 
72 | print(">>>>> [Organ: " + organ + "; Species: " + species + "] " + str(len(gds_ids)) + " GDS IDs found <<<<<\n")
73 | 
74 | ### Step 2. Get meta-information
75 | results = []
76 | for gds_id in gds_ids:
77 | 	loadRSS_esummary(gds_id)
78 | 	time.sleep(1)
79 | 	results.append(parseXML_esummary('esummary.xml'))
80 | 	print("[" + gds_id + "] completed")
81 | 
82 | results = pd.DataFrame(results, columns=["Accession", "title", "summary", "GPL", "taxon", "gdsType", "FTPLink"])
83 | results["GDS_ID"] = gds_ids
84 | results["Species"] = species
85 | results["Organ"] = organ
86 | results.to_csv(species + "-" + organ + ".csv", sep='\t', index = False)
87 | 
88 | os.remove("esearch.xml")
89 | os.remove("esummary.xml")
90 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT.py:
--------------------------------------------------------------------------------
 1 | import os, sys, pickle, datetime
 2 | import commot as ct
 3 | import scanpy as sc
 4 | import pandas as pd
 5 | import numpy as np
 6 | from collections import Counter
 7 | 
 8 | ### Read in data
 9 | data_dir = sys.argv[1] + "/analysis/deconvolution/"
10 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/deconvolution/'
11 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/'
12 | counts_dir = data_dir + 'binded_counts.csv'
13 | coord_dir = data_dir + 'binded_coordinates.csv'
14 | anno_dir = data_dir + 'binded_cell_types.tsv'
15 | counts = pd.read_csv(counts_dir, index_col = 0)
16 | coord = pd.read_csv(coord_dir, index_col = 2)
17 | annotation = pd.read_csv(anno_dir, index_col = 0, sep = "\t")
18 | cell_types = annotation['cell_type'].tolist()
19 | 
20 | ### Get species
21 | ref_dir = sys.argv[2]
22 | # ref_dir = "/projects/b1131/SpatialT/ref/final/Cancer/Breast/Human"
23 | ref_dir = ref_dir.split("/")
24 | species = [i for i in ref_dir if i][-1].lower() # Last non-empty string in ref path
25 | 
26 | ### Get spatial distance type
27 | thr_type = sys.argv[3]
28 | # thr_type = "short"
29 | thr_type_multiplier = { # numerator of dis_thr for each threshold type (units are not stated here -- TODO confirm)
30 |     "short": 500,
31 |     "medium": 1000,
32 |     "long": 1500,
33 |     "xlong": 2500,
34 | }
35 | 
36 | ### Spatial distance constraint
37 | center_to_center_dist_techs = { # denominator of dis_thr, keyed by technology name (presumably the spot/cell spacing -- TODO confirm units)
38 |     "10x": 100,
39 |     "ST": 200,
40 |     "DBiT-seq": 20,
41 |     "Slide-seq": 20,
42 |     "MERFISH": 0.334,
43 |     "osmFISH": 0.13,
44 |     "seqFISH": 0.26,
45 |     "sci-Space": 222,
46 | }
47 | tech =  data_dir.split("/")[4] # 5th "/"-separated piece of the path, e.g. "10x" in /projects/b1131/SpatialT/10x/...
48 | dis_thr = thr_type_multiplier[thr_type] / center_to_center_dist_techs[tech] # KeyError if thr_type or tech is not listed above
49 | 
50 | ### Set up anndata
51 | adata = sc.AnnData(counts.T)
52 | adata.var_names_make_unique()
53 | adata = adata[coord.index,]
54 | coor_df = coord.loc[adata.obs_names, ["x", "y"]]
55 | adata.obsm["spatial"] = coor_df.to_numpy()
56 | adata.raw = adata
57 | 
58 | ### Data processing
59 | sc.pp.normalize_total(adata, inplace=True)
60 | sc.pp.log1p(adata)
61 | adata_disthr = adata.copy()
62 | sc.pp.highly_variable_genes(adata, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
63 | adata = adata[:, adata.var.highly_variable]
64 | sc.tl.pca(adata, svd_solver = 'arpack')
65 | sc.pp.neighbors(adata, n_neighbors = 10, n_pcs = 40)
66 | sc.tl.umap(adata)
67 | sc.tl.leiden(adata, resolution = 0.4)
68 | 
69 | ### Get CellChat ligand-receptors
70 | df_cellchat = ct.pp.ligand_receptor_database(database = 'CellChat', species = species)
71 | df_cellchat_filtered = ct.pp.filter_lr_database(df_cellchat, adata_disthr, min_cell_pct = 0.05)
72 | 
73 | if (df_cellchat_filtered.shape == (0, 0)):
74 |     raise ValueError("ct.pp.filter_lr_database() returns an empty data frame, too few overlapping genes between reference and data")
75 | 
76 | now = datetime.datetime.now()
77 | print("Analysis started: ")
78 | print(now)
79 | 
80 | ct.tl.spatial_communication(adata_disthr, database_name = 'cellchat', df_ligrec = df_cellchat_filtered, dis_thr = dis_thr, heteromeric = True, pathway_sum = True)
81 | 
82 | adata_disthr.write(data_dir + "adata_disthr_" + thr_type + ".h5ad")
83 | 
84 | now = datetime.datetime.now()
85 | print("Analysis finished: ")
86 | print(now)
87 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/Archive/quest_step01_stagate_jobarray.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import numpy as np
  4 | import pandas as pd
  5 | import scanpy as sc
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | import tensorflow as tf
  9 | from sklearn.mixture import GaussianMixture
 10 | from sklearn.metrics.cluster import adjusted_rand_score
 11 | 
 12 | import STAGATE
 13 | 
 14 |     
 15 | sample_dir = sys.argv[1]
 16 | tech = sample_dir.split("/")[4]
 17 | print(">>> " + sample_dir + " started first step STAGE clustering<<<")
 18 | 
 19 | # directory of counts and coordinates
 20 | counts_file = os.path.join(sample_dir, 'processed/counts.csv')
 21 | coor_file = os.path.join(sample_dir, 'processed/coordinates.csv')
 22 | 
 23 | if os.path.isfile(counts_file) and os.path.isfile(coor_file):
 24 | 	# read and format data to anndata 
 25 | 	counts = pd.read_csv(counts_file, index_col=0)
 26 | 	coor_df = pd.read_csv(coor_file) 
 27 | 	coor_df.set_index('barcode', drop=True, inplace=True)
 28 | 	adata = sc.AnnData(counts.T)
 29 | 	adata.var_names_make_unique()
 30 | 
 31 | 	# keep only obs that are in coordinatesfile
 32 | 	adata = adata[coor_df.index,]
 33 | 	coor_df = coor_df.loc[adata.obs_names, ['x', 'y']]
 34 | 	adata.obsm["spatial"] = coor_df.to_numpy()
 35 | 	adata.raw = adata
 36 | 
 37 | 
 38 | 	# check if need to log1p by finding non-int values across columns
 39 | 	int_only = True       
 40 | 	for col in counts.columns.tolist():
 41 | 		col_int = counts[col].astype(str).str.isdigit().all()
 42 | 		if col_int == False:
 43 | 			int_only=False 
 44 | 			print('NON-INT COUNTS')
 45 | 			break
 46 | 
 47 | 	if counts.to_numpy().max() >20 and int_only==True:
 48 | 		sc.pp.log1p(adata)
 49 | 
 50 | 	# normalization
 51 | 	sc.pp.normalize_total(adata, target_sum=1e4)
 52 | 	sc.pp.filter_genes(adata,min_cells=5)
 53 | 	sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
 54 | 
 55 | 	tf.compat.v1.disable_eager_execution()
 56 | 	rad_cur = 2
 57 | 	STAGATE.Cal_Spatial_Net(adata, rad_cutoff=rad_cur)
 58 | 	neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs
 59 | 	print('TECH:', tech, ' INIT NEIGHBORS:', neighbors)
 60 | 
 61 | 	# add radius_cutoff based on technology until reach at least 5 neighbors 
 62 | 	if neighbors < 5 :
 63 | 		while neighbors < 5:
 64 | 			if tech == 'ST':
 65 | 				rad_add = 1
 66 | 			elif tech == 'DBiT-seq':
 67 | 				rad_add = 1
 68 | 			elif tech == '10x':
 69 | 				rad_add = 2
 70 | 			elif tech == 'seqFISH':
 71 | 				rad_add = 5
 72 | 			elif tech == 'MERFISH':
 73 | 				rad_add = 30
 74 | 			elif tech == 'Slide-seq':
 75 | 				rad_add = 30
 76 | 			elif tech == 'osmFISH':
 77 | 				rad_add = 300
 78 | 			else:
 79 | 				rad_add = 10
 80 | 				
 81 | 			rad_cur = rad_cur + rad_add
 82 | 			STAGATE.Cal_Spatial_Net(adata, rad_cutoff= rad_cur)
 83 | 			neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs
 84 | 
 85 | 	print(' FINAL RADIUS CUTOFF:', rad_cur,  'FINAL NEIGHBORS:', neighbors)
 86 | 	#print(adata.uns['Spatial_Net'])
 87 | 
 88 | 	#### Running STAGATE ####
 89 | 	adata = STAGATE.train_STAGATE(adata, alpha=0)
 90 | 
 91 | 	sc.pp.neighbors(adata, use_rep='STAGATE')
 92 | 	sc.tl.umap(adata)
 93 | 
 94 | 	# create clustering folder if does not exist
 95 | 	if not os.path.exists(f'{sample_dir}analysis/clustering'):
 96 | 		os.makedirs(f'{sample_dir}analysis/clustering')
 97 | 
 98 | 	pd.DataFrame(adata.obsm['STAGATE'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/obsm_STAGATE.csv')
 99 | 	print(">>> " + sample_dir + " finished first step STAGATE clustering<<<")
100 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/annotated/DGE-annotated.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ### Author: Jenny, Yiming
 4 | ###
 5 | ### Description: This script performs DGE analysis (single-cell level data)
 6 | 
 7 | library(Seurat)
 8 | library(data.table)
 9 | library(dplyr)
10 | library(stringr)
11 | 
12 | ### Define paths and variables
13 | args <- commandArgs(trailingOnly=TRUE)
14 | st_dir <- "/projects/b1131/SpatialT"
15 | dt_dir <- "/projects/b1131/SpatialT/drug-target/"
16 | sample_dir <- args[1]
17 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/"
18 | # sample_dir <- "/projects/b1131/SpatialT/DBiT-seq/PID150/DS150A/DS150A.GSM4096261/"
19 | 
20 | ds_name <- str_split(sample_dir, '/')[[1]][7]
21 | tech <- str_split(sample_dir, '/')[[1]][5]
22 | p_name <- str_split(sample_dir, '/')[[1]][6]
23 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")
24 | sample_name <- str_split(sample_dir, '/')[[1]][8]
25 | 
26 | ### Read Seurat object
27 | seurat_object_path <- paste0(sample_dir, "processed/Seurat.RDS")
28 | seurat_object <- readRDS(seurat_object_path)
29 | 
30 | ### DGE results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/DGE
31 | output_dir <- paste0(sample_dir, "analysis/DGE")
32 | ### Per-cell-type tables are additionally written under <dt_dir>/<ds_name>/<sample_name>/DGE_anno
33 | dt_output_dir <- paste0(dt_dir, ds_name, "/", sample_name, "/DGE_anno")
34 | # recursive = TRUE also creates missing parent folders; showWarnings = FALSE keeps reruns quiet
35 | for (out_path in c(output_dir, dt_output_dir)) {
36 | 	dir.create(out_path, recursive = TRUE, showWarnings = FALSE)
37 | }
38 | 
39 | 
40 | ### this is to change '/' in cell type names to '.'
41 | seurat_object@meta.data$cell_type_annotation_class <- gsub(
42 |   pattern = "/", 
43 |   replacement = ".", 
44 |   x = seurat_object@meta.data$cell_type_annotation_class
45 | )
46 | annotations <- seurat_object$cell_type_annotation_class
47 | uniq_anno = unique(annotations)
48 | Idents(seurat_object) <- "cell_type_annotation_class"
49 | saveRDS(seurat_object, paste0(sample_dir,'/processed/Seurat_reanno.RDS'))
50 | 
51 | ### Perform DGE analysis on different cell types
52 | dge_file <- paste0(output_dir, "/DGE_cell_types_anno.tsv")
53 | empty_dge <- data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0)) # header-only table written when there is nothing to report
54 | if (length(uniq_anno) == 1) {
55 | 	fwrite(empty_dge, dge_file, sep = "\t")
56 | 	cat("\n\n### Only one annotated cell type -- skipping DGE analysis (cell types).")
57 | 	cat("\n# Writing empty DGE_cell_types_anno.tsv to file.")
58 | } else {
59 | 	# https://satijalab.org/seurat/archive/v3.1/future_vignette.html
60 | 	options(future.globals.maxSize = 5000 * 1024^2)
61 | 	DGE_cell_types <- FindAllMarkers(seurat_object, assay = "SCT", logfc.threshold = 0.2, min.pct = 0.1, verbose = FALSE)
62 | 	if (nrow(DGE_cell_types) == 0) {
63 | 		fwrite(empty_dge, dge_file, sep = "\t")
64 | 		cat("\n\n### DGE analysis (cell types) cannot be performed due to having too few spots in one/many of the cell types.")
65 | 		cat("\n# Writing empty DGE_cell_types_anno.tsv to file.")
66 | 	} else {
67 | 		DGE_cell_types$cluster <- as.character(DGE_cell_types$cluster)
68 | 		fwrite(DGE_cell_types, dge_file, sep = "\t")
69 | 		# One table per cell type under dt_dir, with the columns renamed to gene/stat/pval/qval
70 | 		for (cell_type in sort(unique(DGE_cell_types$cluster))) {
71 | 			cat(paste0(cell_type,' being anlayzed for DEG \n'))
72 | 			DGE_cell_types_less <- DGE_cell_types[DGE_cell_types$cluster == cell_type, c("gene", "avg_log2FC", "p_val", "p_val_adj")]
73 | 			colnames(DGE_cell_types_less) <- c("gene", "stat", "pval", "qval")
74 | 			fwrite(DGE_cell_types_less, paste0(dt_dir, ds_name, "/", sample_name, "/DGE_anno/", cell_type, ".txt"), sep = "\t")
75 | 		}
76 | 		cat("\n\n### DGE analysis (cell types) results written to file.")
77 | 	}
78 | }
79 | tt4 <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
80 | cat(paste0("\n### Analysis completed; max memory consumed: ", as.character(tt4), "M -- [", Sys.time(), "]\n\n"))
81 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/README.md:
--------------------------------------------------------------------------------
 1 | # Drug Discovery
 2 | 
 3 | Drug discovery analysis aims to identify repurposable and established compounds for targeting cell types of interest in pathological samples. Analysis is conducted on spatially variable and differentially expressed (SV-DE) genes for each deconvoluted cell type.
 4 | 
 5 | ## Drug Enrichment and Perturbation [Script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py) 
 6 | 
 7 | ### Input
 8 | - `CMAP L1000 perturbation profile` (e.g., cmap2_6hr_pert_z.npy), which can be downloaded from [CMAP](https://clue.io/data/CMap2020#LINCS2020) signature/level 5 data. This file contains the perturbation MODZ score calculated for each compound perturbation on each gene (12,328 total). We included 6hr perturbations (145,491) to reduce redundancy and avoid the confounding factor of treatment duration. More explanation on the MODZ score can be found in this CMAP [article](https://clue.io/connectopedia/replicate_collapse). Additional metadata can also be downloaded from CMAP to filter for cell type, dosage, etc.
 9 | 
10 | - `CMAP L1000 rank profile` (e.g., cmap2_6hr_pert_rank.csv), which is created by ranking the perturbation MODZ score of genes for each of the 145,491 perturbations.
11 | 
12 | - `Gene name mappings` (e.g., geneinfo_beta.txt and HGNC091923.txt) for mapping between gene IDs and symbols
13 | 
14 | - `Spatially variable (tissue level) cell type-specific differentially expressed genes` (e.g., dsid/sampleid/DGE_dec_SVG/). SVG can be generated from [script](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability), and DGE can be generated from the [script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/DGE). A filter is used to select tissue-level spatially variable (q-value < 0.1) up and down DE gene sets with |log2fc| > 0.5 and q-value < 0.05. For cell types that don't have enough deconvoluted spots for SVG analysis, DGE results are used directly.
15 | 
16 | ### CMAP2 Enrichment Calculation
17 | - Gene set enrichment analysis (GSEA) is conducted on the resulting set of up and down DGE sets (10 <= gene set size <= 2000) for each CMAP compound perturbation. GSEA score difference between up and down DEG set is calculated as the enrichment score for the compound. P-value is calculated by comparing the proportion of enrichment score calculated from random gene rankings greater/less than the compound's enrichment score (depending on the sign). More information can be found at the Methods section of [manuscript](https://www.biorxiv.org/content/10.1101/2022.04.17.488596v3).
18 | - Compounds with the 500 highest (inverse enrichment, suppress DEGs) and lowest (positive enrichment, promote DEGs) enrichment scores are saved for every cell type as output (e.g., dsid/sampleid/Enrichment/).
19 | 
20 | ### CMAP2 Perturbation Network
21 | - For each of the 500 positively and 500 inversely enriched compound perturbations, the CMAP perturbation MODZ score of the compound on the SV-DE genes represents the effect the compound has on each gene target.
22 | - Top and bottom 30 SV-DE genes with the highest absolute value of perturbation MODZ score are saved  along with the log2fc of the SV-DE genes for plotting the perturbation network (e.g., dsid/sampleid/Perturbation/). 
23 | 
24 | 
25 | ## Protein-protein Interaction [Script](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py)
26 | 
27 | ### Input
28 | - `Human protein interactome` (e.g., 'Interactome.tsv'), which contains 350k pairs of PPIs. Mapping of NCBI gene IDs and symbols can be done using the HGNC reference (e.g., 'HGNC.tsv').
29 | 
30 | - `Spatially variable (tissue level) cell type-specific differentially expressed genes` similar to that required for drug screen.
31 | 
32 | ### PPI Network
33 | - The top 300 SV-DE genes with the highest absolute values are used to find matching PPIs in the interactome (both receiver and sender need to be from the top 300 DEG list).
34 | - Results along with log2fc of the SV-DE genes are saved for plotting PPI network. 
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/annotation/annotation_example.R:
--------------------------------------------------------------------------------
 1 | #
 2 | # Cell Type Annotation Example Script
 3 | #
 4 | # Author: Saya Dennis
 5 | #
 6 | # Usage: Rscript --no-save annotation_example.R
 7 | #
 8 | # Requirements:
 9 | # 1. You need to first process your reference scRNA-seq dataset.
10 | # 2. In the code below, edit the directory/file names of your reference dataset (saved into variable ref_data_sce)
11 | # 3. You will need to create an annotation directory under your ST sample directories
12 | #     - This should look like this: /share/fsmresfiles/SpatialT/{tech}/PID{pid}/{dsid}/{sampleid}/analysis/annotation/
13 | #     - To automate generating this directory, refer to script ST-dataset/analysis/cell_type_annotation/01_create_anno_directory.py
14 | # 4. Back up your un-annotated Seurat object
15 | #     - Back up to a file named Seurat.RDS.bk under the same directory.
16 | #     - Refer to ST-dataset/analysis/cell_type_annotation/saya_cell_type_annotation_examples/02_backup_seurat_before_annotation.py
17 | # 5. Below, edit the PID, DSID, and technology directory (e.g. /share/fsmresfiles/SpatialT/DBiT-seq)
18 | #
19 | #
20 | 
21 | library(data.table)
22 | library(SingleCellExperiment)
23 | library(scuttle)
24 | library(Seurat)
25 | library(SingleR)
26 | 
27 | ################################################
28 | #### Cell type annotation on a dataset DS2O ####
29 | ################################################
30 | 
31 | dref <- '/share/fsmresfiles/SpatialT/ref/Heart/Adult/heart-cell-atlas/processed/' # reference directory 
32 | ref_data_sce <- readRDS(paste0(dref, 'sce_heart.RDS'))
33 | 
34 | target_p_name <- "PID70"
35 | target_ds_name <- "DS70B"
36 | target_ds_dir <- paste(c("/share/fsmresfiles/SpatialT/DBiT-seq", target_p_name, target_ds_name), collapse = "/")
37 | target_ds_metatable <- read.table(paste0(target_ds_dir, "/metatable.tsv"), header = TRUE, stringsAsFactors = FALSE)
38 | 
39 | ### Loop through samples and annotate 
40 | for (target_sample_name in target_ds_metatable$SampleID) {
41 |     cat(paste0("Starting annotation for sample ", target_sample_name, " -- ", Sys.time(), "\n"))
42 |     target_sample_dir <- paste0(target_ds_dir, "/", target_sample_name)
43 | 
44 |     # create the annotation directory if needed, so pdf() below has somewhere to write
45 |     anno_dir <- paste0(target_sample_dir, "/analysis/annotation")
46 |     dir.create(anno_dir, recursive = TRUE, showWarnings = FALSE)
47 | 
48 |     # load the backed-up (un-annotated) Seurat object and read its SCT assay data once for both SingleR calls
49 |     seurat_object_tn <- readRDS(paste0(target_sample_dir, "/processed/Seurat.RDS.bk"))
50 |     sct_data <- GetAssayData(seurat_object_tn, assay = "SCT")
51 | 
52 |     ### Perform spot-based cell type annotation and save to Seurat
53 |     annotation <- SingleR(test = sct_data, ref = ref_data_sce, labels = ref_data_sce$label, de.method="wilcox")
54 |     seurat_object_tn[["cell_type_annotation"]] <- annotation$labels
55 | 
56 |     ### Perform cluster-based cell type annotation and save to Seurat
57 |     cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters
58 |     annotation <- SingleR(test = sct_data, ref = ref_data_sce, clusters = cluster_results, labels = ref_data_sce$label, de.method="wilcox")
59 |     # NOTE(review): indexing by cluster_results assumes the rows of `annotation` follow the order of the cluster levels -- confirm
60 |     seurat_object_tn[["cell_type_annotation_clusters"]] <- annotation$labels[cluster_results]
61 | 
62 |     ### Overwrite the previously saved Seurat object with cell type annotated Seurat object 
63 |     saveRDS(seurat_object_tn, file = paste0(target_sample_dir, "/processed/Seurat.RDS"))
64 | 
65 |     ### Visualize annotated cell types
66 |     pdf(paste0(anno_dir, "/cell_type_annotation.pdf")) ### Change to your own save directory/name
67 |     print(SpatialDimPlot(seurat_object_tn))
68 |     print(DimPlot(seurat_object_tn, reduction = "umap"))
69 |     print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation"))
70 |     print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation"))
71 |     dev.off()
72 |     cat(paste0("Finished annotation for sample ", target_sample_name, " -- ", Sys.time(), "\n\n"))
73 | }
74 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_clustering/quest_stagate_updated_jobarray.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import numpy as np
  4 | import pandas as pd
  5 | import scanpy as sc
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | import tensorflow as tf
  9 | from sklearn.mixture import GaussianMixture
 10 | from sklearn.metrics.cluster import adjusted_rand_score
 11 | 
 12 | import STAGATE
 13 | 
 14 |     
 15 | sample_dir = sys.argv[1]
 16 | tech = sample_dir.split("/")[4]
 17 | print(">>> " + sample_dir + " started first step STAGE clustering<<<")
 18 | 
 19 | # directory of counts and coordinates
 20 | counts_file = os.path.join(sample_dir, 'processed/counts.csv')
 21 | coor_file = os.path.join(sample_dir, 'processed/coordinates.csv')
 22 | 
 23 | if os.path.isfile(counts_file) and os.path.isfile(coor_file):
 24 | 	# read and format data to anndata 
 25 | 	counts = pd.read_csv(counts_file, index_col=0)
 26 | 	coor_df = pd.read_csv(coor_file) 
 27 | 	coor_df.set_index('barcode', drop=True, inplace=True)
 28 | 	adata = sc.AnnData(counts.T)
 29 | 	adata.var_names_make_unique()
 30 | 
 31 | 	# keep only obs that are in coordinatesfile
 32 | 	adata = adata[coor_df.index,]
 33 | 	coor_df = coor_df.loc[adata.obs_names, ['x', 'y']]
 34 | 	adata.obsm["spatial"] = coor_df.to_numpy()
 35 | 	adata.raw = adata
 36 | 
 37 | 
 38 | 	# check if need to log1p by finding non-int values across columns
 39 | 	int_only = True       
 40 | 	for col in counts.columns.tolist():
 41 | 		col_int = counts[col].astype(str).str.isdigit().all()
 42 | 		if col_int == False:
 43 | 			int_only=False 
 44 | 			print('NON-INT COUNTS')
 45 | 			break
 46 | 
 47 | 	if counts.to_numpy().max() >20 and int_only==True:
 48 | 		sc.pp.log1p(adata)
 49 | 
 50 | 	# normalization
 51 | 	sc.pp.normalize_total(adata, target_sum=1e4)
 52 | 	sc.pp.filter_genes(adata,min_cells=5)
 53 | 	sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
 54 | 
 55 | 	tf.compat.v1.disable_eager_execution()
 56 | 	rad_cur = 2
 57 | 	STAGATE.Cal_Spatial_Net(adata, rad_cutoff=rad_cur)
 58 | 	neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs
 59 | 	print('TECH:', tech, ' INIT NEIGHBORS:', neighbors)
 60 | 
 61 | 	# add radius_cutoff based on technology until reach at least 5 neighbors 
 62 | 	if neighbors < 5 :
 63 | 		while neighbors < 5:
 64 | 			if tech == 'ST':
 65 | 				rad_add = 1
 66 | 			elif tech == 'DBiT-seq':
 67 | 				rad_add = 1
 68 | 			elif tech == '10x':
 69 | 				rad_add = 2
 70 | 			elif tech == 'seqFISH':
 71 | 				rad_add = 5
 72 | 			elif tech == 'MERFISH':
 73 | 				rad_add = 30
 74 | 			elif tech == 'Slide-seq':
 75 | 				rad_add = 30
 76 | 			elif tech == 'osmFISH':
 77 | 				rad_add = 300
 78 | 			else:
 79 | 				rad_add = 10
 80 | 				
 81 | 			rad_cur = rad_cur + rad_add
 82 | 			STAGATE.Cal_Spatial_Net(adata, rad_cutoff= rad_cur)
 83 | 			neighbors = adata.uns['Spatial_Net'].shape[0]/adata.n_obs
 84 | 
 85 | 	print(' FINAL RADIUS CUTOFF:', rad_cur,  'FINAL NEIGHBORS:', neighbors)
 86 | 	#print(adata.uns['Spatial_Net'])
 87 | 
 88 | 	#### Running STAGATE ####
 89 | 	adata = STAGATE.train_STAGATE(adata, alpha=0)
 90 | 
 91 | 	sc.pp.neighbors(adata, use_rep='STAGATE')
 92 | 	sc.tl.umap(adata)
 93 | 
 94 | 	# determine cluster resolution based on cell size
 95 | 	if adata.shape[0] < 100:
 96 | 	    res=1.2
 97 | 	elif adata.shape[0] >=100 and adata.shape[0] <500:
 98 | 	    res = 0.7
 99 | 	elif adata.shape[0] >=500 and adata.shape[0] <5000:
100 | 	    res = 0.5    
101 | 	elif adata.shape[0] >=5000 and adata.shape[0] <20000:
102 | 	    res = 0.3    
103 | 	elif adata.shape[0] >=20000:
104 | 	    res = 0.1
105 | 
106 | 	# clustering
107 | 	sc.tl.louvain(adata, resolution=res)
108 | 	sc.pl.embedding(adata, basis="spatial", color="louvain",s=6, show=False, title='STAGATE')
109 | 
110 | 	# create clustering folder if does not exist
111 | 	if not os.path.exists(f'{sample_dir}analysis/clustering'):
112 | 		os.makedirs(f'{sample_dir}analysis/clustering')
113 | 
114 | 	pd.DataFrame(adata.obsm['STAGATE'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/STAGATE_30dim.csv')
115 | 	pd.DataFrame(adata.obs['louvain'], index=adata.obs.index).to_csv(f'{sample_dir}analysis/clustering/STAGATE_clusters.csv')
116 | 	print(">>> " + sample_dir + " finished first step STAGATE clustering<<<")
117 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-pull-scores.py:
--------------------------------------------------------------------------------
 1 | import os, sys, pickle, datetime, anndata
 2 | import commot as ct
 3 | import scanpy as sc
 4 | import pandas as pd
 5 | import numpy as np
 6 | import scipy
 7 | from collections import Counter
 8 | 
 9 | ### Read in data
10 | data_dir = sys.argv[1] + "/analysis/deconvolution/"
11 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/deconvolution/'
12 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/'
13 | counts_dir = data_dir + 'binded_counts.csv'
14 | counts = pd.read_csv(counts_dir, index_col = 0)
15 | 
16 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec"
17 | # out_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/Distance/COMMOT_dec'
18 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec'
19 | if not os.path.exists(out_dir):
20 |     os.makedirs(out_dir)
21 | 
22 | ### Get spatial distance type
23 | thr_type = sys.argv[3]
24 | out_dir = out_dir + "/" + thr_type
25 | # thr_type = "short"
26 | # out_dir = '/share/fsmresfiles/SpatialT/10x/PID4/DS4A/DS4A.1/analysis/Distance/COMMOT_dec/thr_type'
27 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec/short'
28 | if not os.path.exists(out_dir):
29 |     os.makedirs(out_dir)
30 | 
31 | ### Get COMMOT object and pathways
32 | adata_disthr = sc.read_h5ad(data_dir + "adata_disthr_" + thr_type + ".h5ad")
33 | with open(out_dir + "/pathways.txt") as f:
34 |     pathways = [line.rstrip('\n') for line in f]
35 | 
36 | ### Get cell types
37 | anno_dir = data_dir + 'binded_cell_types.tsv'
38 | annotation = pd.read_csv(anno_dir, index_col = 0, sep = "\t")
39 | cell_types = annotation['cell_type'].tolist()
40 | 
41 | ### Cell-type-level scores
42 | adata_disthr.obs['cell_type'] = cell_types
43 | for pathway in pathways:
44 |     ct.tl.cluster_communication(adata_disthr, database_name = 'cellchat', pathway_name = pathway, clustering = 'cell_type', n_permutations = 100)
45 | 
46 | ### Pull LR pairs
47 | lrpairs = [str(i).replace("commot-cellchat-", "") for i in adata_disthr.obsp]
48 | lrpairs = [i for i in lrpairs if "-" in i]
49 | lrpairs.sort()
50 | lrpairs.remove("total-total")
51 | for lrpair in lrpairs:
52 |     ct.tl.cluster_communication(adata_disthr, database_name = 'cellchat', pathway_name = lrpair, clustering = 'cell_type', n_permutations = 100)
53 | 
54 | ### Pathway-level
55 | rows_list = []
56 | for pathway in pathways:
57 |     # https://github.com/zcang/COMMOT/issues/10
58 |     # rows (first index) represent senders and the columns (second index) represent receivers
59 |     tmp_mat = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + pathway]["communication_matrix"]
60 |     tmp_p = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + pathway]["communication_pvalue"]
61 |     for i in tmp_mat.index:
62 |         for j in tmp_mat.columns:
63 |             rows_list.append({"pathway": pathway, "cell_type1": i, "cell_type2": j, "score": tmp_mat.loc[i,j], "p_val": tmp_p.loc[i,j]})
64 | 
65 | df = pd.DataFrame(rows_list)
66 | df.to_csv(out_dir + "/communication_scores_pathway.txt", index = False)
67 | 
68 | ### LR-pair-level
69 | ccdb = adata_disthr.uns['commot-cellchat-info']["df_ligrec"]
70 | ccdb["lrpair"] = ccdb["ligand"] + "-" + ccdb["receptor"]
71 | ccdb_dict = dict(zip(ccdb.lrpair, ccdb.pathway))
72 | rows_list = []
73 | lrpairs2 = lrpairs
74 | for lrpair in lrpairs:
75 |     if lrpair in ccdb_dict:
76 |         tmp_mat = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + lrpair]["communication_matrix"]
77 |         tmp_p = adata_disthr.uns['commot_cluster-cell_type-cellchat-' + lrpair]["communication_pvalue"]
78 |         for i in tmp_mat.index:
79 |             for j in tmp_mat.columns:
80 |                 rows_list.append({"pathway": ccdb_dict[lrpair], "lrpair": lrpair, "cell_type1": i, "cell_type2": j, "score": tmp_mat.loc[i,j], "p_val": tmp_p.loc[i,j]})
81 |     else:
82 |         lrpairs2.remove(lrpair)
83 | 
84 | with open(out_dir + '/lrpairs.txt', 'w') as f:
85 |     for lrpair in lrpairs2:
86 |         _ = f.write(f"{lrpair}\n")
87 | 
88 | df = pd.DataFrame(rows_list)
89 | df.to_csv(out_dir + "/communication_scores_lrpair.txt", index = False)
90 | 
91 | ### Overwrite result file
92 | adata_disthr.write(data_dir + "adata_disthr_" + thr_type + "_cs.h5ad")
93 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE-step0.py:
--------------------------------------------------------------------------------
 1 | import os, sys, pickle, datetime, anndata, shutil
 2 | import commot as ct
 3 | import scanpy as sc
 4 | import pandas as pd
 5 | import numpy as np
 6 | import scipy
 7 | from collections import Counter
 8 | 
 9 | ### Read in data
10 | data_dir = sys.argv[1] + "/analysis/deconvolution/"
11 | # data_dir = '/share/fsmresfiles/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/'
12 | # data_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/deconvolution/'
13 | counts_dir = data_dir + 'binded_counts.csv'
14 | counts = pd.read_csv(counts_dir, index_col = 0)
15 | 
16 | out_dir = sys.argv[1] + "/analysis/Distance/COMMOT_dec"
17 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec'
18 | if not os.path.exists(out_dir):
19 |     os.makedirs(out_dir)
20 | 
21 | ### Get spatial distance type
22 | thr_type = sys.argv[3]
23 | out_dir = out_dir + "/" + thr_type
24 | # out_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT_dec/short'
25 | if not os.path.exists(out_dir):
26 |     os.makedirs(out_dir)
27 | # else:
28 | #     shutil.rmtree(out_dir)
29 | #     os.makedirs(out_dir)
30 | 
31 | adata_disthr = sc.read_h5ad(data_dir + "adata_disthr_" + thr_type + ".h5ad")
32 | adata_disthr.layers['counts'] = scipy.sparse.csr_matrix(counts.values.T)
33 | 
34 | pathways = [str(i).replace("commot-cellchat-", "") for i in adata_disthr.obsp]
35 | pathways = [i for i in pathways if "-" not in i]
36 | pathways.sort()
37 | with open(out_dir + '/pathways.txt', 'w') as f:
38 |     for pathway in pathways:
39 |         _ = f.write(f"{pathway}\n")
40 | 
41 | for pathway in pathways:
42 |     # pathway = "MIF"
43 |     
44 |     ### rpy2 does not fully work on Quest; want to achieve the following:
45 |     ### df_deg, df_yhat = ct.tl.communication_deg_detection(adata_disthr, database_name = 'cellchat', pathway = pathway, summary = 'receiver')
46 |     summary = 'receiver'
47 |     database_name = "cellchat"
48 |     pathway_dir = out_dir + "/" + pathway
49 |     # pathway_dir = '/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/analysis/Distance/COMMOT/short/MIF'
50 |     if not os.path.exists(pathway_dir):
51 |         os.makedirs(pathway_dir)
52 |     
53 |     # prepare input adata for R
54 |     adata_deg = anndata.AnnData(X = adata_disthr.layers['counts'], var = pd.DataFrame(index=list(adata_disthr.var_names)), obs = pd.DataFrame(index=list(adata_disthr.obs_names)))
55 |     adata_deg_var = adata_deg.copy()
56 |     sc.pp.filter_genes(adata_deg_var, min_cells=3)
57 |     sc.pp.filter_genes(adata_deg, min_cells=3)
58 |     sc.pp.normalize_total(adata_deg_var, target_sum=1e4)
59 |     sc.pp.log1p(adata_deg_var)
60 |     sc.pp.highly_variable_genes(adata_deg_var, min_mean=0.0125, max_mean=3, min_disp=0.5)
61 |     adata_deg = adata_deg[:, adata_deg_var.var.highly_variable]
62 |     del adata_deg_var
63 |     
64 |     summary_name = 'commot-'+database_name+'-sum-'+summary
65 |     if summary == 'sender':
66 |         summary_abrv = 's'
67 |     else:
68 |         summary_abrv = 'r'
69 |     
70 |     comm_sum = adata_disthr.obsm[summary_name][summary_abrv+'-'+pathway].values.reshape(-1,1)
71 |     cell_weight = np.ones_like(comm_sum).reshape(-1,1)
72 |     
73 |     ### Save data for R
74 |     Xmat = pd.DataFrame(adata_deg.X.toarray())
75 |     Xmat.index = adata_deg.obs.index
76 |     Xmat.columns = adata_deg.var.index
77 |     Xmat.to_csv(pathway_dir + "/step1_X.csv", header = True, index = True)
78 |     pseudoTime = pd.DataFrame(comm_sum)
79 |     pseudoTime.to_csv(pathway_dir + "/step1_pseudoTime.csv", header = False, index = False)
80 |     cellWeight = pd.DataFrame(cell_weight)
81 |     cellWeight.to_csv(pathway_dir + "/step1_cellWeight.csv", header = False, index = False)
82 |     
83 |     nknots = 6
84 |     
85 |     string_fitGAM = 'sce <- fitGAM(counts=X, pseudotime=pseudoTime, cellWeights=cellWeight, nknots=%d, verbose=TRUE)' % nknots
86 |     string_fitGAM = string_fitGAM + '\nassoRes <- data.frame( associationTest(sce, global=FALSE, lineage=TRUE) )'
87 |     string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"waldStat_1"]),"waldStat_1"] <- 0.0'
88 |     string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"df_1"]),"df_1"] <- 0.0'
89 |     string_fitGAM = string_fitGAM + '\nassoRes[is.nan(assoRes[,"pvalue_1"]),"pvalue_1"] <- 1.0\n'
90 |     
91 |     with open(pathway_dir + "/step1.R", "w") as text_file:
92 |         _ = text_file.write(string_fitGAM)
93 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_variability/quest_SpatialDE_ct_specific.py:
--------------------------------------------------------------------------------
  1 | import SpatialDE
  2 | import NaiveDE
  3 | import numpy as np
  4 | import pandas as pd
  5 | from pandas.api.types import is_numeric_dtype
  6 | import os
  7 | import sys
  8 | 
  9 | st_dir = "/projects/b1131/SpatialT/" # On Quest
 10 | 
 11 | sample_dir = sys.argv[1]
 12 | cell_type = sys.argv[2]
 13 | print("\n\n>>> " + sample_dir + "[" + cell_type + "] started <<<")
 14 | 
 15 | ### Read counts and coordinates
 16 | counts = pd.read_csv(sample_dir + 'analysis/deconvolution/counts_' + cell_type + '_deconv_only.csv')
 17 | counts_num = counts._get_numeric_data()
 18 | min_count = counts_num.min().min()
 19 | if (min_count < 0):
 20 |     counts_num[counts_num < 0] = 0
 21 | 
 22 | coordinates = pd.read_csv(sample_dir + 'processed/coordinates.csv')
 23 | counts.loc['Total',:]= counts.sum(axis=0)
 24 | 
 25 | ### Align counts and coordinates index
 26 | error_count = 0
 27 | for i,j in zip (counts.columns.tolist()[1:], coordinates['barcode'].tolist()):
 28 |     if i != j:
 29 |         error_count = error_count + 1
 30 | 
 31 | if error_count > 0:
 32 |     print("[ERROR] " + sample_dir + " has not matching spot IDs.")
 33 |     sys.exit()
 34 | 
 35 | ### Get total counts
 36 | total_counts = counts.iloc[-1][1:].tolist()
 37 | 
 38 | ### Process data
 39 | sample_info = pd.DataFrame()
 40 | if 'x' in coordinates.columns:
 41 |     sample_info['y'] = coordinates['y']
 42 |     sample_info['x'] = coordinates['x']
 43 | else:
 44 |     print("[ERROR] " + sample_dir + " has problematic coordinates column names.")
 45 |     sys.exit(1)
 46 | 
 47 | sample_info['total_counts'] = total_counts
 48 | sample_info.index = coordinates['barcode']
 49 | # sample_info
 50 | reshaped_counts = counts.set_index('gene').iloc[:-1].transpose()
 51 | reshaped_counts.index = coordinates['barcode']
 52 | reshaped_counts = reshaped_counts.T[reshaped_counts.sum(0) >= 3].T
 53 | # reshaped_counts
 54 | 
 55 | ### Run SpatialDE
 56 | try:
 57 |     norm_expr = NaiveDE.stabilize(reshaped_counts.T).T
 58 | except:
 59 |     norm_expr = np.log(reshaped_counts.T).T
 60 | 
 61 | resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T, 'np.log(total_counts)').T
 62 | # sample_resid_expr = resid_expr.sample(n=15202, axis=1, random_state=1)
 63 | X = sample_info[['x', 'y']]
 64 | 
 65 | try:
 66 |     results = SpatialDE.run(X.to_numpy(), resid_expr)
 67 | except:
 68 |     print("[ERROR] " + sample_dir + " SpatialDE failed, probably because the data is too sparse / contains too few spots for this cell type.")
 69 |     sys.exit(1)
 70 | 
 71 | if not os.path.exists(f'{sample_dir}analysis/'):
 72 |     os.makedirs(f'{sample_dir}analysis/')
 73 | 
 74 | if not os.path.exists(f'{sample_dir}analysis/SVG/'):
 75 |     os.makedirs(f'{sample_dir}analysis/SVG/')
 76 | 
 77 | ### Write results to file
 78 | results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_results_' + cell_type + '.tsv', sep = '\t', index = False)
 79 | # g - The name of the gene
 80 | # pval - The P-value for spatial differential expression
 81 | # qval - Significance after correcting for multiple testing
 82 | # l - A parameter indicating the distance scale a gene changes expression over
 83 | print("# SVG analysis [" + cell_type + "] finished.")
 84 | 
 85 | sign_results = results.query('qval < 0.05')
 86 | n_patterns = 5 # Default, hard-wired for now
 87 | 
 88 | if sign_results.shape[0]>0:
 89 |     ### Get average l
 90 |     l = pd.DataFrame(sign_results['l'].value_counts()).index.tolist()
 91 |     count = pd.DataFrame(sign_results['l'].value_counts())['count'].tolist()
 92 |     total_count = sum(count)
 93 |     total = 0
 94 |     for i,j in zip(l, count):
 95 |         ij = i*j
 96 |         total += ij
 97 |     
 98 |     L = round(total/total_count)
 99 |     histology_results, patterns = SpatialDE.aeh.spatial_patterns(X.to_numpy(), resid_expr, sign_results, C = n_patterns, l = L, verbosity = 1, delta_elbo_threshold = 1)
100 |     print("# Pattern analysis [" + cell_type + "] finished.")
101 | else:
102 |     patterns = pd.DataFrame(columns=['0', '1'])
103 |     histology_results = pd.DataFrame(columns=['g', 'pattern', 'membership'])
104 |     print("# [WARNING] Cannot perform pattern analysis [" + cell_type + "], no sig genes.")
105 | 
106 | ### Write results to file
107 | histology_results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_histology_results_' + cell_type + '.tsv', sep = '\t', index = False)
108 | patterns.to_csv(sample_dir + 'analysis/SVG/SpatialDE_patterns_' + cell_type + '.tsv', sep = '\t', index = False)
109 | print(">>> " + sample_dir + " finished <<<")
110 | 


--------------------------------------------------------------------------------
/data_analysis/spatial_variability/quest_SpatialDE_jobarray.py:
--------------------------------------------------------------------------------
  1 | import SpatialDE
  2 | import NaiveDE
  3 | import pandas as pd
  4 | from pandas.api.types import is_numeric_dtype
  5 | import os
  6 | import sys
  7 | 
  8 | # st_dir = "/share/fsmresfiles/SpatialT/" # On FSM servers
  9 | st_dir = "/projects/b1131/SpatialT/" # On Quest
 10 | all_samples = pd.read_csv(st_dir + "master_table_new.txt", sep = "\t") # 1735 samples
 11 | 
 12 | sample_dir = sys.argv[1]
 13 | print(">>> " + sample_dir + " started <<<")
 14 | 
 15 | ### Read counts and coordinates
 16 | counts = pd.read_csv(sample_dir + 'processed/counts.csv')
 17 | counts_num = counts._get_numeric_data()
 18 | min_count = counts_num.min().min()
 19 | if (min_count < 0):
 20 |     counts_num[counts_num < 0] = 0
 21 | 
 22 | coordinates = pd.read_csv(sample_dir + 'processed/coordinates.csv')
 23 | counts.loc['Total',:]= counts.sum(axis=0)
 24 | 
 25 | ### Align counts and coordinates index
 26 | error_count = 0
 27 | for i,j in zip (counts.columns.tolist()[1:], coordinates['barcode'].tolist()):
 28 |     if i != j:
 29 |         error_count = error_count + 1
 30 | 
 31 | if error_count > 0:
 32 |     print("[ERROR] " + sample_dir + " has not matching spot IDs.")
 33 |     sys.exit()
 34 | 
 35 | ### Get total counts
 36 | total_counts = counts.iloc[-1][1:].tolist()
 37 | 
 38 | ### Process data
 39 | sample_info = pd.DataFrame()
 40 | if 'x' in coordinates.columns:
 41 |     sample_info['y'] = coordinates['y']
 42 |     sample_info['x'] = coordinates['x']
 43 | else:
 44 |     print("[ERROR] " + sample_dir + " has problematic coordinates column names.")
 45 |     sys.exit()
 46 | 
 47 | sample_info['total_counts'] = total_counts
 48 | sample_info.index = coordinates['barcode']
 49 | # sample_info
 50 | reshaped_counts = counts.set_index('gene').iloc[:-1].transpose()
 51 | reshaped_counts.index = coordinates['barcode']
 52 | reshaped_counts = reshaped_counts.T[reshaped_counts.sum(0) >= 3].T
 53 | # reshaped_counts
 54 | 
 55 | ### Run SpatialDE
 56 | try:
 57 |     norm_expr = NaiveDE.stabilize(reshaped_counts.T).T
 58 | except:
 59 |     norm_expr = np.log(reshaped_counts.T).T
 60 | 
 61 | resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T, 'np.log(total_counts)').T
 62 | # sample_resid_expr = resid_expr.sample(n=15202, axis=1, random_state=1)
 63 | X = sample_info[['x', 'y']]
 64 | 
 65 | try:
 66 |     results = SpatialDE.run(X, resid_expr)
 67 | except:
 68 |     print("[ERROR] " + sample_dir + " SpatialDE failed, probably because the data is too sparse / contains too few spots for this cell type.")
 69 |     sys.exit()
 70 | 
 71 | if not os.path.exists(f'{sample_dir}analysis/'):
 72 |     os.makedirs(f'{sample_dir}analysis/')
 73 | 
 74 | if not os.path.exists(f'{sample_dir}analysis/SVG/'):
 75 |     os.makedirs(f'{sample_dir}analysis/SVG/')
 76 | 
 77 | ### Write results to file
 78 | results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_results.tsv', sep = '\t', index = False)
 79 | # g - The name of the gene
 80 | # pval - The P-value for spatial differential expression
 81 | # qval - Significance after correcting for multiple testing
 82 | # l - A parameter indicating the distance scale a gene changes expression over
 83 | 
 84 | print("# Whole tissue SVG analysis finished.")
 85 | 
 86 | sign_results = results.query('qval < 0.05')
 87 | n_patterns = 5 # Default, hard-wired for now
 88 | 
 89 | if sign_results.shape[0]>0:
 90 |     ### Get average l
 91 |     l = pd.DataFrame(sign_results['l'].value_counts()).index.tolist()
 92 |     count = pd.DataFrame(sign_results['l'].value_counts())['l'].tolist()
 93 |     total_count = sum(count)
 94 |     total = 0
 95 |     for i,j in zip(l, count):
 96 |         ij = i*j
 97 |         total += ij
 98 | 
 99 |     L = round(total/total_count)
100 |     histology_results, patterns = SpatialDE.aeh.spatial_patterns(X, resid_expr, sign_results, C = n_patterns, l = L, verbosity = 1, delta_elbo_threshold = 1)
101 |     print("# Pattern analysis finished.")
102 | else:
103 |     patterns = pd.DataFrame(columns=['0', '1'])
104 |     histology_results = pd.DataFrame(columns=['g', 'pattern', 'membership'])
105 |     print("# [WARNING] Cannot perform pattern analysis, no sig genes.")
106 | 
107 | ### Write results to file
108 | if not os.path.exists(f'{sample_dir}analysis/SVG'):
109 |     os.makedirs(f'{sample_dir}analysis/SVG')
110 | histology_results.to_csv(sample_dir + 'analysis/SVG/SpatialDE_histology_results.tsv', sep = '\t', index = False)
111 | patterns.to_csv(sample_dir + 'analysis/SVG/SpatialDE_patterns.tsv', sep = '\t', index = False)
112 | 
113 | print(">>> " + sample_dir + " finished <<<")
114 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/deconvolution/quest_deconvolution_jobarray.R:
--------------------------------------------------------------------------------
  1 | ### Author: Yiming Li
  2 | ### Example usage:
  3 | ### quest_deconvolution_jobarray.R $sample_dir $ref_dir [$n_cores]
  4 | 
  5 | library(stringr)
  6 | library(Seurat)
  7 | library(BayesPrism)
  8 | library(data.table)
  9 | 
 10 | # cd /share/fsmresfiles/SpatialT/10x/PID27/DS27A/DS27A_1160920F/processed
 11 | # conda activate R4
 12 | args <- commandArgs(trailingOnly=TRUE)
 13 | 
 14 | ### Change the below parameters if needed
 15 | # An optional third argument overrides the default number of cores
 16 | n_cores <- if (length(args) > 2) as.integer(args[3]) else 20
 17 | chain.length <- 1000
 18 | burn.in <- 500
 19 | maxit <- 10000
 20 | 
 21 | st_dir <- "/projects/b1131/SpatialT"
 22 | sample_dir <- args[1]
 23 | # Path fields are picked by position, i.e. sample_dir is expected to look like
 24 | # /projects/b1131/SpatialT/<tech>/<patient>/<dataset>/<sample>
 25 | path_fields <- str_split(sample_dir, '/')[[1]]
 26 | tech <- path_fields[5]
 27 | p_name <- path_fields[6]
 28 | ds_name <- path_fields[7]
 29 | sample_name <- path_fields[8]
 30 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")
 31 | 
 32 | ### Assumes two files are present under this path:
 33 | ### * sc.dat.filtered.pc.sig.RDS
 34 | ### * cell_types.txt
 35 | ref_dir <- args[2]
 36 | 
 37 | ##### Validate if reference files exist
 38 | 
 39 | cell_types_path <- paste0(ref_dir, "/cell_types.txt")
 40 | reference_path <- paste0(ref_dir, "/sc.dat.filtered.pc.sig.RDS")
 41 | if (!file.exists(cell_types_path)) {
 42 | 	stop(paste0("[", ref_dir, "/cell_types.txt] does not exist\n"))
 43 | }
 44 | if (!file.exists(reference_path)) {
 45 | 	stop(paste0("[", ref_dir, "/sc.dat.filtered.pc.sig.RDS] does not exist\n"))
 46 | }
 47 | final_ref <- readRDS(reference_path)
 48 | cell_types <- fread(cell_types_path)
 49 | # The same labels are used for both cell types and cell states
 50 | cell.type.labels <- cell_types$label
 51 | cell.state.labels <- cell_types$label
 52 | 
 53 | ##### Validate if transposed count matrix exist
 54 | 
 55 | bk_dat_path <- paste0(sample_dir, "/processed/bk.dat.RDS")
 56 | if (!file.exists(bk_dat_path)) {
 57 | 	stop(paste0("[", sample_dir, "/processed/bk.dat.RDS] does not exist\n"))
 58 | }
 59 | bk.dat <- readRDS(bk_dat_path)
 60 | cat(paste0("\n[", sample_name, "] - ST count matrix read from file\n"))
 61 | 
 62 | 
 63 | 
 64 | #####
 65 | 
 66 | 
 67 | 
 68 | ### Create output dir (recursive = TRUE also creates the intermediate "analysis" directory)
 69 | output_dir <- paste0(sample_dir, "/analysis/deconvolution")
 70 | if (!dir.exists(output_dir)) {
 71 | 	dir.create(output_dir, recursive = TRUE)
 72 | }
 73 | 
 74 | ### Run BayesPrism
 75 | myPrism <- new.prism(
 76 | 	reference = final_ref,
 77 | 	mixture = bk.dat,
 78 | 	input.type = "count.matrix",
 79 | 	cell.type.labels = cell.type.labels,
 80 | 	cell.state.labels = cell.state.labels,
 81 | 	# key="tumor",
 82 | 	key = NULL,
 83 | 	outlier.cut = 0.01,
 84 | 	outlier.fraction = 0.1
 85 | )
 86 | cat(paste0("\n[", sample_name, "] - deconvolution started [", Sys.time(), "]\n"))
 87 | bp.res <- run.prism(
 88 | 	prism = myPrism,
 89 | 	n.cores = n_cores,
 90 | 	gibbs.control = list(burn.in = burn.in, chain.length = chain.length),
 91 | 	opt.control = list(maxit = maxit)
 92 | )
 93 | saveRDS(bp.res, paste0(output_dir, "/BayesPrism_results.RDS"))
 94 | gc()
 95 | cat(paste0("\n[", sample_name, "] - deconvolution done [", Sys.time(), "]\n"))
 96 | 
 97 | 
 98 | 
 99 | #####
100 | 
101 | 
102 | 
103 | ### Save thetas
104 | theta.cv <- bp.res@posterior.theta_f@theta.cv
105 | theta <- get.fraction(bp = bp.res, which.theta = "final", state.or.type = "type")
106 | # BayesPrism advises to mask theta with CV above 0.5 (Visium)
107 | theta[theta.cv > 0.5] <- 0
108 | # Renormalize every row so that the remaining fractions sum to one
109 | theta <- theta / rowSums(theta)
110 | # drop = FALSE keeps theta a matrix even when only one cell type is present
111 | theta <- theta[, sort(colnames(theta)), drop = FALSE]
112 | saveRDS(theta, paste0(output_dir, "/BayesPrism_theta.RDS"))
113 | 
114 | ### Save deconvoluted cell-type-specific expressions
115 | seurat_object <- readRDS(paste0(sample_dir, "/processed/Seurat.RDS"))
116 | location <- fread(paste0(sample_dir, "/processed/coordinates.csv"))
117 | meta <- seurat_object@meta.data
118 | # Lookup from the metadata row names to the IDs stored in new_spot_id
119 | spot_id_mapping <- setNames(meta$new_spot_id, rownames(meta))
120 | 
121 | # Genes of the input matrix that are absent from the deconvolved mixture are
122 | # split evenly across the cell types
123 | deconv_genes <- colnames(bp.res@prism@mixture)
124 | not_deconv_genes <- colnames(bk.dat)
125 | not_deconv_genes <- not_deconv_genes[!not_deconv_genes %in% deconv_genes]
126 | # drop = FALSE keeps the gene name as column name when a single gene is left
127 | not_deconv_exp <- bk.dat[, not_deconv_genes, drop = FALSE]
128 | not_deconv_exp <- not_deconv_exp / length(colnames(theta))
129 | 
130 | # File-name-safe cell type names: "/", " ", "-", "*" and "+" are replaced by "."
131 | cell_type_names <- colnames(theta)
132 | all_cell_types <- gsub("[/ *+-]", ".", cell_type_names)
133 | for (ct_i in seq_along(cell_type_names)) {
134 | 	# cell_type <- "CAFs"
135 | 	cell_type <- cell_type_names[ct_i]
136 | 	ct_exp <- get.exp(bp = bp.res, state.or.type = "type", cell.name = cell_type)
137 | 	ct_exp <- cbind(ct_exp, not_deconv_exp)
138 | 	
139 | 	# After the transpose, genes are rows and the mapped spot IDs are column names
140 | 	counts_df <- data.table(ct_exp)
141 | 	counts_df$spot <- spot_id_mapping[rownames(ct_exp)]
142 | 	counts_df <- transpose(counts_df, keep.names = "gene", make.names = "spot")
143 | 	# Keep only the spots that are listed in coordinates.csv
144 | 	keep_spots <- c("gene", intersect(colnames(counts_df), location$barcode))
145 | 	counts_df <- counts_df[, ..keep_spots]
146 | 	
147 | 	#### Write counts to file
148 | 	fwrite(counts_df, paste0(output_dir, "/counts_", all_cell_types[ct_i], ".csv"), sep = ",")
149 | }
150 | write.table(all_cell_types, paste0(output_dir, "/all_cell_types.txt"), col.names = FALSE, row.names = FALSE, quote = FALSE)
151 | cat(paste0("\n[", sample_name, "] - cell-type-specific expression matrices saved [", Sys.time(), "]\n"))
152 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import os
  3 | import numpy as np
  4 | from math import isnan
  5 | import sys
  6 | 
  7 | ds = sys.argv[1]
  8 | sample = sys.argv[2]
  9 | print('PPI analysis for: ', ds, sample)
 10 | ppi_out_dir = '/projects/b1131/SpatialT/drug-target-cmap2-svg/'+ds+'/'+sample+'/PPI/'
 11 | if not os.path.exists(ppi_out_dir):
 12 |     os.makedirs(ppi_out_dir)
 13 | # for single cellular annotated samples
 14 | #deg_dir = '/projects/b1131/SpatialT/drug-target/'+ds+'/'+sample+'/DGE_anno_SVG/'
 15 | # for deconvoluted samples
 16 | deg_dir = '/projects/b1131/SpatialT/drug-target/'+ds+'/'+sample+'/DGE_dec_SVG/'
 17 | 
 18 | # read in gene reference and remove unformatted genes
 19 | gene = pd.read_table('/projects/b1131/SpatialT/cmap_ppi_database/HGNC.tsv')
 20 | gene = gene[['NCBI Gene ID(supplied by NCBI)','Approved symbol']]
 21 | gene.columns = ['num','name']
 22 | lst = []
 23 | for i in gene['num'].tolist():
 24 |     if pd.isna(i) != True:
 25 |         lst.append(int(i))
 26 |     else:
 27 |         lst.append(i)
 28 | gene['num'] = lst
 29 | gene_dict = gene.set_index('num').to_dict()['name']
 30 | 
 31 | clean_hgnc_dict = filter(lambda k: not isnan(k), gene_dict)
 32 | clean_hgnc_dict = {k: gene_dict[k] for k in gene_dict if not isnan(k)}
 33 | clean_hgnc_dict = {int(k):v for k,v in clean_hgnc_dict.items()}
 34 | 
 35 | # interactome for ppi network
 36 | interactome = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/Interactome.tsv', sep='\t')
 37 | 
 38 | # mapping genes to interactome
 39 | unmappable_pro = []
 40 | Protein_A_Name = []
 41 | Protein_B_Name = []
 42 | 
 43 | for i in interactome['#Protein A'].tolist():
 44 |     if i in clean_hgnc_dict.keys(): 
 45 |         Protein_A_Name.append(clean_hgnc_dict[i] )
 46 |     else:
 47 |         if i not in unmappable_pro: 
 48 |             unmappable_pro.append(i)
 49 |             print(i)
 50 |         Protein_A_Name.append('drop')
 51 | 
 52 | for i in interactome['Protein B'].tolist():
 53 |     if i in clean_hgnc_dict.keys(): 
 54 |         Protein_B_Name.append(clean_hgnc_dict[i] )
 55 |     else:
 56 |         if i not in unmappable_pro: 
 57 |             unmappable_pro.append(i)
 58 |             print(i)
 59 |         Protein_B_Name.append('drop')
 60 | 
 61 | # drop unmappable interactome interactions
 62 | interactome['Protein_A_Name'] = Protein_A_Name
 63 | interactome['Protein_B_Name'] = Protein_B_Name
 64 | dropa_index = interactome[interactome['Protein_A_Name'] == 'drop'].index.tolist()
 65 | dropb_index = interactome[interactome['Protein_B_Name'] == 'drop'].index.tolist()
 66 | print(len(dropa_index), len(dropb_index))
 67 | # 538 interactions removed due to unmappable gene number 
 68 | for i in dropa_index:
 69 |     if i not in dropb_index:
 70 |         dropb_index.append(i)
 71 | len(dropb_index)
 72 | clean_interactome = interactome.drop(dropb_index)
 73 | print('interactome cleaned')
 74 | 
 75 | # generate ppi network using SV-DGEs
 76 | for deg_file in os.listdir(deg_dir):
 77 |     # read in deg-svg file and filter (no svg fitler placed on cell types that don't have SVG analysis )
 78 |     cell_type = deg_file.split('.csv')[0]
 79 |     log2fc = pd.read_csv(deg_dir+deg_file)
 80 |     log2fc['abs_stat'] = [abs(i) for i in log2fc['stat'].tolist()]
 81 |     log2fc_top300 = log2fc.sort_values('abs_stat', ascending=False)[:300]
 82 |     log2fc_top300_sig = log2fc_top300[log2fc_top300['qval']<0.05]
 83 |     print('sv-deg file read in')
 84 | 
 85 |     # map degs to inteactome
 86 |     ppi_index_interactome_top300 = []
 87 |     for i in clean_interactome.iterrows():
 88 |         row = i[1]
 89 |         if row['Protein_A_Name'] in log2fc_top300_sig['gene'].tolist() and row['Protein_B_Name'] in log2fc_top300_sig['gene'].tolist():
 90 |             ppi_index_interactome_top300.append(i[0])
 91 |     ppi_top300 = clean_interactome[clean_interactome.index.isin(ppi_index_interactome_top300)][['#Protein A','Protein B', 'Protein_A_Name','Protein_B_Name']]
 92 |     print('sv-deg mapped to interactome')
 93 | 
 94 |     # drop ppis where both proteins are the same
 95 |     same_pro_index_top300 = []
 96 |     for i in ppi_top300.iterrows():
 97 |         row = i[1]
 98 |         if row['#Protein A'] == row['Protein B']:
 99 |             same_pro_index_top300.append(i[0])
100 |     ppi_top300_nored = ppi_top300.drop(same_pro_index_top300)
101 | 
102 |     # format edges list for output
103 |     ppi_top300_nored = ppi_top300_nored[['Protein_A_Name','Protein_B_Name']]
104 |     ppi_top300_nored.columns = ['Source','Target']
105 | 
106 |     # format nodes list for output
107 |     ppi_uniq_pro = list(set(ppi_top300_nored['Source'].tolist()) )
108 |     for i in ppi_top300_nored['Target'].tolist():
109 |         if i not in ppi_top300_nored['Source'].tolist() and i not in ppi_uniq_pro :
110 |             ppi_uniq_pro.append(i)
111 | 
112 |     ppi_nodes_top300 = pd.DataFrame()
113 |     ppi_nodes_top300['ID'] = ppi_uniq_pro
114 |     ppi_nodes_top300['Label'] = ppi_uniq_pro
115 | 
116 |     ppi_nodes_top300 = pd.merge(ppi_nodes_top300, log2fc[['gene','stat']], how = 'left', right_on = 'gene', left_on = 'Label')
117 |     ppi_nodes_top300 = ppi_nodes_top300.drop('gene',axis = 1)
118 | 
119 |     ppi_nodes_top300['sign'] = [np.sign(i) for i in ppi_nodes_top300['stat'].tolist()]
120 |     ppi_nodes_top300['abs_stat'] = [abs(i) for i in ppi_nodes_top300['stat'].tolist()]
121 | 
122 |     # save
123 |     ppi_nodes_top300.to_csv(ppi_out_dir+cell_type+'_ppi_nodes.csv', index=False)
124 |     ppi_top300_nored.to_csv(ppi_out_dir+cell_type+'_ppi.csv', index=False)
125 |     print(cell_type,'ppi saved')
126 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/DGE/deconvoluted/DGE-analysis-dec.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ### Author: Yiming
  4 | ###
  5 | ### Description: This script performs DGE analysis (using pseudo-cell data)
  6 | 
  7 | library(Seurat)
  8 | library(data.table)
  9 | library(dplyr)
 10 | library(stringr)
 11 | 
 12 | ### Define paths and variables
 13 | args <- commandArgs(trailingOnly=TRUE)
 14 | st_dir <- "/projects/b1131/SpatialT"
 15 | dt_dir <- "/projects/b1131/SpatialT/drug-target/"
 16 | sample_dir <- args[1]
 17 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID1/DS1D/DS1D.1/"
 18 | # sample_dir <- "/projects/b1131/SpatialT/DBiT-seq/PID150/DS150A/DS150A.GSM4096261/"
 19 | 
 20 | ds_name <- str_split(sample_dir, '/')[[1]][7]
 21 | tech <- str_split(sample_dir, '/')[[1]][5]
 22 | p_name <- str_split(sample_dir, '/')[[1]][6]
 23 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")
 24 | sample_name <- str_split(sample_dir, '/')[[1]][8]
 25 | 
 26 | ### Read all possible cell types
 27 | all_cell_types_df <- fread(paste0(sample_dir, "/analysis/deconvolution/all_cell_types.txt"))
 28 | all_cell_types <- all_cell_types_df$cell_type_s
 29 | ct_mapping <- all_cell_types_df$cell_type
 30 | names(ct_mapping) <- all_cell_types
 31 | 
 32 | ### Read Seurat object
 33 | seurat_object_tn_path <- paste0(sample_dir, "processed/Seurat.RDS")
 34 | seurat_object_tn <- readRDS(seurat_object_tn_path)
 35 | spot_id_mapping <- seurat_object_tn@meta.data$new_spot_id
 36 | names(spot_id_mapping) <- as.character(rownames(seurat_object_tn@meta.data))
 37 | 
 38 | ### Read cell type fractions
 39 | theta <- readRDS(paste0(sample_dir, "/analysis/deconvolution/BayesPrism_theta.RDS"))
 40 | rownames(theta) <- as.character(spot_id_mapping[rownames(theta)])
 41 | 
 42 | ### DGE results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/DGE
 43 | output_dir <- paste0(sample_dir, "analysis/DGE")
 44 | if (!dir.exists(output_dir)) {
 45 | 	dir.create(output_dir)
 46 | }
 47 | dir.create(paste0(dt_dir, ds_name))
 48 | dir.create(paste0(dt_dir, ds_name, "/", sample_name))
 49 | dir.create(paste0(dt_dir, ds_name, "/", sample_name, "/DGE_dec"))
 50 | 
 51 | ### Create pseudo-cell-level object
 52 | if (file.exists(paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS"))) {
 53 | # if (FALSE) {
 54 | 	binded_so <- readRDS(paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS"))
 55 | } else {
 56 | 	ct_exp <- list()
 57 | 	total_cells_each_ct <- list()
 58 | 	for (cell_type in all_cell_types) {
 59 | 		# cell_type <- "Malignant"
 60 | 		# cell_type <- all_cell_types[6]
 61 | 		exp_mat <- fread(paste0(sample_dir, "/analysis/deconvolution/counts_", cell_type, "_deconv_only.csv"))
 62 | 		genes <- exp_mat$gene
 63 | 		exp_mat$gene <- NULL
 64 | 		exp_mat <- as.matrix(exp_mat)
 65 | 		rownames(exp_mat) <- genes
 66 | 		
 67 | 		# Divide by fraction and leave out the spots with zero fraction of this cell type
 68 | 		fractions <- theta[,as.character(ct_mapping[cell_type])]
 69 | 		non_zero_fraction_spots <- names(which(fractions != 0))
 70 | 		exp_mat <- t(apply(exp_mat, 1, function(x) x / fractions))
 71 | 		exp_mat[is.na(exp_mat)] <- 0
 72 | 		exp_mat <- exp_mat[, non_zero_fraction_spots, drop = FALSE]
 73 | 		
 74 | 		ct_exp[[cell_type]] <- exp_mat
 75 | 		total_cells_each_ct[[cell_type]] <- ncol(exp_mat)
 76 | 	}
 77 | 	ct_exp <- do.call(cbind, ct_exp)
 78 | 	colnames(ct_exp) <- paste0("sp", 1:ncol(ct_exp))
 79 | 	saveRDS(ct_exp, paste0(sample_dir, "analysis/deconvolution/binded_exp.RDS"))
 80 | 	
 81 | 	# Remove "pseudo-cells" with zero deconvoluted expression in all the genes
 82 | 	# Initially this was done because CellChat does not accept objects with zero total expression cells
 83 | 	col_sums <- colSums(ct_exp)
 84 | 	kept_pseudo_cells <- names(which(col_sums > 0))
 85 | 	all_cell_types <- all_cell_types[total_cells_each_ct != 0]
 86 | 	total_cells_each_ct <- total_cells_each_ct[total_cells_each_ct != 0]
 87 | 	meta <- data.frame(labels = rep(all_cell_types, times = total_cells_each_ct))
 88 | 	meta$cell_id <- paste0("sp", 1:ncol(ct_exp))
 89 | 	meta <- meta[meta$cell_id %in% kept_pseudo_cells,]
 90 | 	row.names(meta) <- meta$cell_id
 91 | 	meta$cell_id <- NULL
 92 | 	ct_exp_less <- ct_exp[,kept_pseudo_cells]
 93 | 	
 94 | 	rm(seurat_object_tn)
 95 | 	binded_so <- CreateSeuratObject(ct_exp_less)
 96 | 	binded_so[["cell_type"]] <- meta$labels
 97 | 	binded_so <- SCTransform(binded_so, verbose = FALSE, return.only.var.genes = FALSE)
 98 | 	saveRDS(binded_so, paste0(sample_dir, "analysis/deconvolution/binded_exp_Seurat.RDS"))
 99 | }
100 | 
### Perform DGE analysis on different cell types
### Outputs:
### * <output_dir>/DGE_cell_types_dec.tsv (always written; header-only when no test could be run)
### * one <cell type>.txt per tested cluster under <dt_dir><ds_name>/<sample_name>/DGE_dec/
dge_tsv_path <- paste0(output_dir, "/DGE_cell_types_dec.tsv")

### Helper: write a header-only DGE table so that the output file always exists
write_empty_dge_table <- function() {
	empty_dge <- data.frame(gene = character(0), cluster = integer(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val_adj = numeric(0))
	fwrite(empty_dge, dge_tsv_path, sep = "\t")
}

### Use the cell type labels as the identities compared by FindAllMarkers
cell_type_meta <- binded_so[["cell_type"]]
annotations <- cell_type_meta$cell_type
names(annotations) <- rownames(cell_type_meta)
Idents(binded_so) <- annotations

n_cell_types <- length(table(binded_so[["cell_type"]]))
if (n_cell_types == 1) {
	### Nothing to compare against
	write_empty_dge_table()
	cat("\n\n### Only one annotated cell type -- skipping DGE analysis (cell types).")
	cat("\n# Writing empty DGE_cell_types_dec.tsv to file.")
} else {
	# https://satijalab.org/seurat/archive/v3.1/future_vignette.html
	options(future.globals.maxSize = 5000 * 1024^2)
	DGE_cell_types <- FindAllMarkers(binded_so, assay = "SCT", logfc.threshold = 0.1, min.pct = 0.1, verbose = FALSE)
	if (nrow(DGE_cell_types) > 0) {
		DGE_cell_types$cluster <- as.character(DGE_cell_types$cluster)
		fwrite(DGE_cell_types, dge_tsv_path, sep = "\t")
		
		### Per-cell-type tables with the columns renamed to gene / stat / pval / qval
		per_type_dir <- paste0(dt_dir, ds_name, "/", sample_name, "/DGE_dec/")
		for (cell_type in sort(unique(DGE_cell_types$cluster))) {
			DGE_cell_types_less <- DGE_cell_types[DGE_cell_types$cluster == cell_type, c("gene", "avg_log2FC", "p_val", "p_val_adj")]
			colnames(DGE_cell_types_less) <- c("gene", "stat", "pval", "qval")
			fwrite(DGE_cell_types_less, paste0(per_type_dir, cell_type, ".txt"), sep = "\t")
		}
		cat("\n\n### DGE analysis (cell types) results written to file.")
	} else {
		write_empty_dge_table()
		cat("\n\n### DGE analysis (cell types) cannot be performed due to having too few spots in one/many of the cell types.")
		cat("\n# Writing empty DGE_cell_types_dec.tsv to file.")
	}
}

### Report the memory figure taken from gc() together with a timestamp
tt4 <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
cat(paste0("\n### Analysis completed; max memory consumed: ", as.character(tt4), "M -- [", Sys.time(), "]\n\n"))
134 | 


--------------------------------------------------------------------------------
/data_curation/geo-query.py:
--------------------------------------------------------------------------------
  1 | ### Usage: python3 geo-query.py
  2 | ### Author: Yiming Li
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import requests
  7 | import xml.etree.ElementTree as ET
  8 | import time, os, shutil, sys
  9 | 
 10 | def fetch_species_GDS(species):
 11 | 	# species = "mouse"
 12 | 	urls = ["http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+transcriptomics+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+transcriptome+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+RNA-seq+AND+" + species + "[organism]&retmax=100000&usehistory=y", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=spatial+RNA+sequencing+AND+" + species + "[organism]&retmax=100000&usehistory=y"]
 13 | 	items = []
 14 | 	query_keys = []
 15 | 	WebEnvs = []
 16 | 	for url in urls:
 17 | 		resp = requests.get(url)
 18 | 		with open('tmp2.xml', 'wb') as f:
 19 | 			f.write(resp.content)
 20 | 		
 21 | 		with open('tmp2.xml', 'r') as file:
 22 | 			xml_text = file.read()
 23 | 		
 24 | 		if "API rate limit exceeded" in xml_text:
 25 | 			sys.exit('[ERROR] E-utils API rate limit exceeded')
 26 | 		
 27 | 		tree = ET.parse('tmp2.xml')
 28 | 		root = tree.getroot()
 29 | 		query_keys.append(root.findall('./QueryKey')[0].text)
 30 | 		WebEnvs.append(root.findall('./WebEnv')[0].text)
 31 | 		for item in root.findall('./IdList/Id'):
 32 | 			items.append(item.text)
 33 | 	
 34 | 	return(items, query_keys, WebEnvs)
 35 | 
 36 | def fetch_PMIDs(query_keys, WebEnvs):
 37 | 	pmids = []
 38 | 	items = []
 39 | 	for query_key, WebEnv in zip(query_keys, WebEnvs):
 40 | 		url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=gds&db=pubmed&query_key=' + str(query_key) + '&WebEnv=' + str(WebEnv)
 41 | 		resp = requests.get(url)
 42 | 		with open('tmp3.xml', 'wb') as f:
 43 | 			f.write(resp.content)
 44 | 		
 45 | 		with open('tmp3.xml', 'r') as file:
 46 | 			xml_text = file.read()
 47 | 		
 48 | 		if "API rate limit exceeded" in xml_text:
 49 | 			sys.exit('[ERROR] E-utils API rate limit exceeded')
 50 | 		
 51 | 		tree = ET.parse('tmp3.xml')
 52 | 		root = tree.getroot()
 53 | 		
 54 | 		for item in root.findall('./LinkSet/IdList/Id'):
 55 | 			items.append(item.text)
 56 | 		
 57 | 		for pmid in root.findall('./LinkSet/LinkSetDb/Link/Id'):
 58 | 			pmids.append(pmid.text)
 59 | 	
 60 | 	return(items, pmids)
 61 | 
 62 | def loadRSS(gds_id):
 63 | 	url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&id=' + str(gds_id)
 64 | 	resp = requests.get(url)
 65 | 	with open('tmp.xml', 'wb') as f:
 66 | 		f.write(resp.content)
 67 | 
 68 | def parseXML(xmlfile):
 69 | 	with open(xmlfile, 'r') as file:
 70 | 		xml_text = file.read()
 71 | 	
 72 | 	if "API rate limit exceeded" in xml_text:
 73 | 		sys.exit('[ERROR] E-utils API rate limit exceeded')
 74 | 	
 75 | 	tree = ET.parse(xmlfile)
 76 | 	root = tree.getroot()
 77 | 	items = []
 78 | 	for item in root.findall('./DocSum/Item'):
 79 | 		if (item.attrib['Name'] in ["Accession", "title", "summary", "GPL", "GSE", "taxon", "gdsType", "FTPLink"]):
 80 | 			items.append(item.text)
 81 | 	
 82 | 	return(items)
 83 | 
 84 | def fetch_species_meta(species, save_dir):
 85 | 	print("### Starting initial query for [" + species + "]")
 86 | 	species_ids, query_keys, WebEnvs = fetch_species_GDS(species)
 87 | 	query_keys_df = pd.DataFrame({"query_key": query_keys, "WebEnv": WebEnvs})
 88 | 	query_keys_df.to_csv(save_dir + "/" + species + "_query_keys.tsv", sep='\t', index = False)
 89 | 	
 90 | 	current_species_ids = [line.strip() for line in open(species + "_GDS_current.txt", 'r')]
 91 | 	species_ids = list(set(species_ids) - set(current_species_ids))
 92 | 	species_ids_concat = list(set(species_ids) | set(current_species_ids))
 93 | 	print("### Initial query for [" + species + "] completed\n### Number of species GDS IDs: " + str(len(current_species_ids)) + " (previous), " + str(len(species_ids)) + " (this query), " + str(len(species_ids_concat)) + " (concatenated)")
 94 | 	
 95 | 	### Get meta-information
 96 | 	print("### Starting query for [" + species + "] meta-information")
 97 | 	results = []
 98 | 	for gds_id in species_ids:
 99 | 		loadRSS(gds_id)
100 | 		time.sleep(1)
101 | 		results.append(parseXML('tmp.xml'))
102 | 	
103 | 	results = pd.DataFrame(results, columns=["Accession", "title", "summary", "GPL", "GSE", "taxon", "gdsType", "FTPLink"])
104 | 	results["GDS_ID"] = species_ids
105 | 	results["Organism"] = species
106 | 	results.to_csv(save_dir + "/" + species + ".tsv", sep='\t', index = False)
107 | 	print("### Meta-information of [" + species + "] GDS IDs saved to " + save_dir + "/" + species + ".tsv")
108 | 	
109 | 	### Save GDS lists
110 | 	with open(save_dir + "/" + species + "_GDS_" + save_dir + ".txt", 'w') as f:
111 | 		for line in species_ids:
112 | 			f.write(f"{line}\n")
113 | 	
114 | 	shutil.copyfile(species + "_GDS_current.txt", species + "_GDS_current.txt.bk")
115 | 	
116 | 	with open(species + "_GDS_current.txt", 'w') as f:
117 | 		for line in species_ids_concat:
118 | 			f.write(f"{line}\n")
119 | 	
120 | 	print("### " + species + "_GDS_current.txt overwritten")
121 | 	
122 | 	print("### Starting query for [" + species + "] PMIDs")
123 | 	gds_ids, pmids = fetch_PMIDs(query_keys, WebEnvs)
124 | 	pmids_df = pd.DataFrame({"PMID": pmids})
125 | 	pmids_df.to_csv(save_dir + "/" + species + "_PMIDs.tsv", sep='\t', index = False)
126 | 	print("### PMIDs associated with [" + species + "] saved to " + save_dir + "/" + species + "_PMIDs.tsv")
127 | 	
128 | 	print("### [" + species + "] completed\n")
129 | 	return(results)
130 | 
131 | ##########################################################################################
132 | 
# Driver: query GEO for mouse and human, merge both result tables into one curation
# sheet (<date>/all.tsv), then remove the temporary XML files.
owd = os.getcwd()
os.chdir("/share/fsmresfiles/SpatialT/GEO_query")
try:
	save_dir = time.strftime("%Y%m%d")
	os.makedirs(save_dir, exist_ok = True)
	
	### Get mouse GDS IDs
	results_mouse = fetch_species_meta("mouse", save_dir)
	
	### Get human GDS IDs
	results_human = fetch_species_meta("human", save_dir)
	
	### Combine results (re-read from the files that were just written)
	results_mouse = pd.read_csv(save_dir + "/mouse.tsv", sep = '\t')
	results_human = pd.read_csv(save_dir + "/human.tsv", sep = '\t')
	results_all = pd.concat([results_mouse, results_human])
	results_all = results_all.sort_values(by=['GDS_ID', 'Organism'])
	
	# Accessions starting with "GSM" are moved to their own column
	results_all['Accession_is_GSM'] = results_all['Accession'].str.startswith('GSM', na=False)
	results_all['GSM'] = np.where(results_all['Accession_is_GSM'], results_all['Accession'], "")
	results_all['Accession'] = np.where(results_all['Accession_is_GSM'], "", results_all['Accession'])
	# Empty columns added to the sheet
	results_all['Is ST data'] = ""
	results_all['Technology'] = ""
	results_all['Platform'] = ""
	results_all['Add to SOAR'] = ""
	results_all['PMID'] = ""
	results_all['GSE'] = "GSE" + results_all['GSE'].apply(str)
	
	reordered_columns = ['GDS_ID', 'Organism', 'Accession', "GSE", "GSM", 'Is ST data', 'Technology', 'Platform', 'PMID', 'Add to SOAR', 'title', 'summary', 'GPL', 'taxon', 'gdsType', 'FTPLink']
	results_all = results_all[reordered_columns]
	results_all = results_all.sort_values(by=['GSE'])
	results_all.to_csv(save_dir + "/all.tsv", sep='\t', index = False)
finally:
	### Clean up
	# tmp.xml is only created when at least one new GDS ID was fetched, so each
	# temporary file is removed only if it exists
	for tmp_file in ("tmp.xml", "tmp2.xml", "tmp3.xml"):
		if os.path.exists(tmp_file):
			os.remove(tmp_file)
	# Restore the caller's working directory even if a query failed
	os.chdir(owd)
170 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/deconvolution/process_reference_example.R:
--------------------------------------------------------------------------------
  1 | ### conda activate R4
  2 | library(data.table)
  3 | library(splitstackshape)
  4 | library(Seurat)
  5 | library(BayesPrism)
  6 | library(SingleCellExperiment)
  7 | library(scuttle)
  8 | 
  9 | ### This script uses Thymus, Human as an example
 10 | ### This script assumes that:
 11 | ### * A Seurat object of the identified scRNA-seq reference data has already been created
 12 | ### * The annotated cell types are stored in seurat.object[["label"]]
 13 | organ <- "Thymus"
 14 | species <- "Human"
 15 | seurat.object <- readRDS("/share/fsmresfiles/SpatialT/ref/Tabula_Sapiens/Thymus/Seurat.RDS")
 16 | qc_plot_output_dir <- "~/stbase" # Change this to your own directory
 17 | 
 18 | #### Create save path
 19 | save_path <- paste0("/share/fsmresfiles/SpatialT/ref/final/", organ)
 20 | if (!dir.exists(save_path)) {
 21 | 	dir.create(save_path)
 22 | }
 23 | save_path <- paste0(save_path, "/", species)
 24 | if (!dir.exists(save_path)) {
 25 | 	dir.create(save_path)
 26 | }
 27 | 
 28 | #### Get # cells and # genes before QC
 29 | n_cells <- ncol(seurat.object)
 30 | n_genes <- nrow(seurat.object)
 31 | gc()
 32 | 
 33 | #### Get metadata
 34 | meta <- seurat.object@meta.data
 35 | meta$cell_id <- rownames(meta)
 36 | meta <- meta[,c("cell_id", "label")]
 37 | meta <- meta[!is.na(meta$label),]
 38 | 
 39 | ### Perform stratified sampling (by cell type labels) on the reference if the total number of cells is larger than 30000 to avoid out-of-memory error
 40 | ### You may change 30000 to a slightly larger number or skip this step if QC drops a lot of cells
 41 | if (nrow(meta) > 30000) {
 42 | 	if (organ == "Brain") {
 43 | 		meta <- stratified(meta, "subclass_label", size = 30000 / n_cells)
 44 | 	} else {
 45 | 		meta <- stratified(meta, "label", size = 30000 / n_cells)
 46 | 	}
 47 | }
 48 | seurat.object[["cell_id"]] <- colnames(seurat.object)
 49 | seurat.object <- subset(seurat.object, subset = cell_id %in% meta$cell_id)
 50 | n_cells_strat <- ncol(seurat.object)
 51 | 
 52 | ### Get cell type proportions
 53 | cell_type_string <- table(meta$label)
 54 | percs <- paste0(as.character(round(as.numeric(cell_type_string / sum(cell_type_string) * 100), 2)), "%")
 55 | cell_type_string2 <- paste0(names(cell_type_string), " (", as.character(round(as.numeric(cell_type_string), 2)), ", ", percs, ")")
 56 | cell_type_string2 <- paste(cell_type_string2, collapse = ", ")
 57 | 
 58 | ### Sort metadata by Seurat object's cell order
 59 | meta <- meta[match(colnames(seurat.object), meta$cell_id),]
 60 | sum(meta$cell_id == colnames(seurat.object)) == nrow(meta)
 61 | 
 62 | ### Cell QC
 63 | ### !!! Please perform cell QC case-by-case instead of using uniform thresholds
 64 | seurat.object[["percent_mt"]] <- PercentageFeatureSet(seurat.object, "^MT-")
 65 | seurat.object.bk <- seurat.object
 66 | pdf(paste0(qc_plot_output_dir, "/sc_ref_", organ, "_", species, "_beforeQC.pdf"))
 67 | print(VlnPlot(seurat.object, features = "nCount_RNA"))
 68 | print(VlnPlot(seurat.object, features = "nFeature_RNA"))
 69 | dev.off()
 70 | 
 71 | ### * Change the nCount_RNA and nFeature_RNA thresholds based on the violin plots
 72 | seurat.object <- seurat.object.bk
 73 | seurat.object <- seurat.object[, seurat.object$nCount_RNA > 500 & seurat.object$nFeature_RNA > 250 & seurat.object$percent_mt < 20]
 74 | (n_cells_qc1 <- ncol(seurat.object))
 75 | pdf(paste0(qc_plot_output_dir, "/sc_ref_", organ, "_", species, "_QC1.pdf"))
 76 | print(VlnPlot(seurat.object, features = "nCount_RNA"))
 77 | print(VlnPlot(seurat.object, features = "nFeature_RNA"))
 78 | dev.off()
 79 | 
 80 | ### * Change the two thresholds based on the violin plots
 81 | seurat.object2 <- seurat.object[, seurat.object$nCount_RNA < 30000 & seurat.object$nFeature_RNA < 6000]
 82 | n_cells_qc2 <- ncol(seurat.object2)
 83 | pdf(paste0(qc_plot_output_dir, "/sc_ref_", organ, "_", species, "_QC2.pdf"))
 84 | print(VlnPlot(seurat.object2, features = "nCount_RNA"))
 85 | print(VlnPlot(seurat.object2, features = "nFeature_RNA"))
 86 | dev.off()
 87 | 
 88 | ### Overwrite the Seurat object if the second two violin plots look okay
 89 | # seurat.object
 90 | # seurat.object2
 91 | seurat.object <- seurat.object2
 92 | # seurat.object
 93 | 
 94 | ### Write the QC-ed Seurat object and the metadata to file
 95 | meta <- seurat.object@meta.data
 96 | meta$cell_id <- rownames(meta)
 97 | meta <- meta[,c("cell_id", "label")]
 98 | fwrite(meta, paste0(save_path, "/cell_types.txt"), sep = "\t")
 99 | saveRDS(seurat.object, paste0(save_path, "/Seurat.RDS"))
100 | 
101 | ### Generate SingleCellExperiment object for cell type annotation
102 | counts <- GetAssayData(seurat.object, assay = "RNA")
103 | meta$cell_id <- NULL
104 | ref_data_sce <- SingleCellExperiment(list(counts = counts), colData = meta)
105 | ref_data_sce <- logNormCounts(ref_data_sce)
106 | saveRDS(ref_data_sce, file = paste0(save_path, "/SCE.RDS"))
107 | 
### Generate data for BayesPrism (cell type deconvolution)
### * Counts data: dense matrix with row = cell, column = gene
### NOTE(review): GetAssayData() is called without naming a slot/layer -- confirm that what it
### returns for this object is the raw count matrix that the BayesPrism steps below expect
counts <- GetAssayData(seurat.object, assay = "RNA")
### t() flips the gene x cell matrix directly and keeps the dimnames
### (rownames = cell IDs, colnames = gene names)
counts <- t(as.matrix(counts))
min_count <- min(counts)
max_count <- max(counts)
saveRDS(counts, paste0(save_path, "/mat_transposed.RDS"))
gc()
124 | 
### * BayesPrism gene filtering step 1
### Remove the listed gene groups and genes expressed in too few cells (exp.cells);
### for human data additionally keep protein-coding genes only
if (species == "Human") {
	sc.dat.filtered <- cleanup.genes(
		input = counts,
		input.type = "count.matrix",
		species = "hs",
		gene.group = c("Rb", "Mrp", "other_Rb", "chrM", "MALAT1","chrX","chrY"),
		exp.cells = 5
	)
	sc.dat.filtered.pc <- select.gene.type(sc.dat.filtered, gene.type = "protein_coding") # only works for human
} else {
	sc.dat.filtered.pc <- cleanup.genes(
		input = counts,
		input.type = "count.matrix",
		species = "mm",
		gene.group = c("Rb", "Mrp", "other_Rb", "chrM","chrX","chrY"),
		exp.cells = 5
	)
}

### * BayesPrism gene filtering step 2
### The cell type labels are also used as the cell state labels
cell.type.labels <- meta$label
cell.state.labels <- meta$label
expressed_in_enough_cells <- colSums(counts>0)>3
diff.exp.stat <- get.exp.stat(
	sc.dat = counts[,expressed_in_enough_cells],
	cell.type.labels = cell.type.labels,
	cell.state.labels = cell.state.labels,
	psuedo.count = 0.1,
	cell.count.cutoff = 50,
	n.cores = 1
)

### !!! Check that all cell types have > 50 marker genes
### This threshold can be more lenient for sparser cell types
### Change pval.max and lfc.min to get more genes
sc.dat.filtered.pc.sig <- select.marker(sc.dat = sc.dat.filtered.pc, stat = diff.exp.stat, pval.max = 0.01, lfc.min = 0.1)

### You may check if ngenes_filt2 is around 5000
### If the number is too small (e.g. < 2000), we may need to use ngenes_filt1 for deconvolution instead
ngenes_filt1 <- ncol(sc.dat.filtered.pc)
print(ngenes_filt1)
ngenes_filt2 <- ncol(sc.dat.filtered.pc.sig)
print(ngenes_filt2)

### Save the BayesPrism-filtered references to file
saveRDS(sc.dat.filtered.pc, paste0(save_path, "/sc.dat.filtered.pc.RDS"))
saveRDS(sc.dat.filtered.pc.sig, paste0(save_path, "/sc.dat.filtered.pc.sig.RDS"))

### One-row summary of this reference (sizes before/after each step, cell type composition)
results <- data.frame(
	organ = organ,
	species = species,
	save_path = save_path,
	ncells = n_cells,
	ngenes = n_genes,
	ncells_qc1 = n_cells_qc1,
	ncells_qc2 = n_cells_qc2,
	n_cells_strat = n_cells_strat,
	cell_types = cell_type_string2,
	min_count = min_count,
	max_count = max_count,
	ngenes_filt1 = ngenes_filt1,
	ngenes_filt2 = ngenes_filt2
)
fwrite(results, paste0(save_path, "/summary_stats.txt"), sep = "\t")
gc()
results
cat(paste0("\n>>> ", save_path, " finished\n"))
157 | 


--------------------------------------------------------------------------------
/data_analysis/cell_cell_interaction/neighborhood-based/adj-analysis.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ### Author: Yiming Li
  4 | ###
  5 | ### Description: This script performs neighborhood-based analysis
  6 | ### Usage: adj-analysis.R $sample_dir
  7 | 
  8 | library(Seurat)
  9 | library(plyr)
 10 | library(data.table)
 11 | library(stringr)
 12 | 
 13 | ### Read the list of DSIDs for use in our database
 14 | args <- commandArgs(trailingOnly=TRUE)
 15 | 
 16 | st_dir <- "/projects/b1131/SpatialT"
 17 | # st_dir <- "/share/fsmresfiles"
 18 | 
 19 | sample_dir <- args[1]
 20 | # sample_dir <- "/projects/b1131/SpatialT/10x/PID5/DS5A/DS5A.06_151670"
 21 | # sample_dir <- "/share/fsmresfiles/SpatialT/10x/PID5/DS5A/DS5A.12_151676"
 22 | ds_name <- str_split(sample_dir, '/')[[1]][7]
 23 | tech <- str_split(sample_dir, '/')[[1]][5]
 24 | p_name <- str_split(sample_dir, '/')[[1]][6]
 25 | ds_dir <- paste(c(st_dir, tech, p_name, ds_name), collapse = "/")
 26 | sample_name <- str_split(sample_dir, '/')[[1]][8]
 27 | 
 28 | ### Functions
 29 | get_dist <- function(row1, col1, row2, col2) {
 30 | 	return(sqrt((row1-row2)^2 + (col1-col2)^2))
 31 | }
 32 | 
 33 | get_adjacency <- function(row1, col1, row2, col2) {
 34 | 	return(sum(abs(row1-row2) <= 1 & abs(col1-col2) <= 1))
 35 | }
 36 | 
 37 | get_dist_from_cell_type <- function(row, col, cell_type, one_cell_type_dfs) {
 38 | 	cell_types <- names(one_cell_type_dfs)
 39 | 	distances <- rep(NA, length(cell_types) * 3)
 40 | 	names(distances) <- c(paste0(cell_types, "_min"), paste0(cell_types, "_median"), paste0(cell_types, "_adjacent"))
 41 | 	for (compared_cell_type in cell_types) {
 42 | 		if (cell_type == compared_cell_type) {
 43 | 			next
 44 | 		}
 45 | 		compared_cell_type_df <- one_cell_type_dfs[[compared_cell_type]]
 46 | 		tmp <- get_dist(row, col, compared_cell_type_df$row, compared_cell_type_df$col)
 47 | 		distances[paste0(compared_cell_type, "_min")] <- min(tmp)
 48 | 		distances[paste0(compared_cell_type, "_median")] <- median(tmp)
 49 | 		distances[paste0(compared_cell_type, "_adjacent")] <- get_adjacency(row, col, compared_cell_type_df$row, compared_cell_type_df$col)
 50 | 	}
 51 | 	return(distances)
 52 | }
 53 | 
 54 | ### Start adjacency-based analysis
 55 | ### Results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/Distance
 56 | output_dir <- paste0(sample_dir, "/analysis/Distance")
 57 | if (!dir.exists(output_dir)) {
 58 | 	dir.create(output_dir)
 59 | }
 60 | 
 61 | ### Check if the analysis has already been done
 62 | if (file.exists(paste0(output_dir, "/cci_adj_results.tsv"))) {
 63 | 	tmpdf <- fread(paste0(output_dir, "/cci_adj_results.tsv"))
 64 | 	if ("wilcox_pval_two_sided" %in% colnames(tmpdf)) {
 65 | 		### Save old results to a separate file
 66 | 		file.copy(paste0(output_dir, "/cci_adj_results.tsv"), paste0(output_dir, "/cci_adj_results_old.tsv"), overwrite = TRUE)
 67 | 	} else if ("avg_log2FC" %in% colnames(tmpdf)) {
 68 | 		stop("\n### Adjacency analysis already done.\n")
 69 | 	}
 70 | }
 71 | 
 72 | ### Read in the QC-ed + transformed + processed Seurat object (from process_visium_standard.R)
 73 | seurat_object_tn_path <- paste0(sample_dir, "/processed/Seurat.RDS")
 74 | seurat_object_tn <- readRDS(seurat_object_tn_path)
 75 | cat("\n\n### Processed Seurat object read.\n\n### Starting adjacency analysis...\n")
 76 | ### !!! Assuming that the Seurat object has underwent SCT and clustering
 77 | 
 78 | ### Get the table with calculated distance-based statistics
 79 | if (file.exists(paste0(output_dir, "/distance_stats_dec_max.tsv"))) {
 80 | 	coords <- fread(paste0(output_dir, "/distance_stats_dec_max.tsv"), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
 81 | 	coords$spot <- as.character(coords$spot) ### In case the spot IDs are numeric
 82 | 	cell_types <- sort(unique(seurat_object_tn@meta.data$cell_type_dec_max))
 83 | 	cat("\n### Distance-based statistics read from file.\n")
 84 | } else {
 85 | 	### Get coordinates and cell type annotations
 86 | 	annotations <- seurat_object_tn@meta.data$cell_type_dec_max
 87 | 	if ("slice1" %in% names(seurat_object_tn@images)) {
 88 | 		coords <- seurat_object_tn@images$slice1@coordinates ### Visium
 89 | 		coords <- coords[, c("row", "col")] ### Use the spot coordinates
 90 | 	} else {
 91 | 		coords <- seurat_object_tn@images$image@coordinates ### Others
 92 | 		if ("x" %in% colnames(coords)) {
 93 | 			coords <- coords[, c("x", "y")] ### Use the spot coordinates
 94 | 		} else {
 95 | 			### Some prepared MERFISH datasets did not follow the naming standard
 96 | 			coords <- coords[, c("xcoord", "ycoord")] ### Use the spot coordinates
 97 | 		}
 98 | 	}
 99 | 	colnames(coords) <- c("row", "col")
100 | 	coords <- tibble::rownames_to_column(coords, "spot")
101 | 	coords$annotation <- annotations
102 | 	
103 | 	### Get distance metrics, currently supporting:
104 | 	### * Minimum distance from another cell type (the same cell type -- marked as NA)
105 | 	### * Median distance from another cell type (the same cell type -- marked as NA)
106 | 	### * Whether adjacent to another cell type (0 = FALSE, 1 = TRUE; the same cell type -- marked as NA)
107 | 	cell_types <- sort(unique(annotations))
108 | 	one_cell_type_dfs <- list()
109 | 	for (cell_type in cell_types) {
110 | 		one_cell_type_dfs[[cell_type]] <- coords[coords$annotation == cell_type,]
111 | 	}
112 | 	df_colnames <- c("spot", paste0(cell_types, "_min"), paste0(cell_types, "_median"), paste0(cell_types, "_adjacent"))
113 | 	distance_stats_df <- data.frame(matrix(NA, ncol = length(df_colnames), nrow = 0))
114 | 	colnames(distance_stats_df) <- df_colnames
115 | 	for (i in 1:nrow(coords)) {
116 | 		row <- coords$row[i]
117 | 		col <- coords$col[i]
118 | 		cell_type <- coords$annotation[i]
119 | 		distances <- get_dist_from_cell_type(row, col, cell_type, one_cell_type_dfs)
120 | 		tmpdf <- data.frame(matrix(distances, ncol = length(df_colnames)-1, nrow = 1))
121 | 		rownames(tmpdf) <- coords$spot[i]
122 | 		tmpdf <- tibble::rownames_to_column(tmpdf, "spot")
123 | 		colnames(tmpdf) <- df_colnames
124 | 		distance_stats_df <- rbind(distance_stats_df, tmpdf)
125 | 	}
126 | 	coords <- join(coords, distance_stats_df, by = "spot")
127 | 	coords$row <- NULL
128 | 	coords$col <- NULL
129 | 	fwrite(coords, paste0(output_dir, "/distance_stats_dec_max.tsv"), sep = "\t")
130 | 	cat("\n### Distance-based statistics calculated.\n")
131 | 	
132 | 	### Ensure that coords is a data table
133 | 	coords <- fread(paste0(output_dir, "/distance_stats_dec_max.tsv"), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
134 | 	cell_types <- sort(unique(seurat_object_tn@meta.data$cell_type_dec_max))
135 | }
136 | 
### Test if in all cells of cell_type1, the gene expression levels in those adjacent / not adjacent to cell_type2 differ significantly
### The results file is always written; it only contains the header when no comparison could be made
cci_adj_results_df <- data.frame(gene = character(0), cell_type1 = character(0), cell_type2 = character(0), adjacency = character(0), avg_log2FC = numeric(0), pct.1 = numeric(0), pct.2 = numeric(0), p_val = numeric(0), p_val_adj = numeric(0))
cat("\n### Adjacency analysis started.\n")

if (length(cell_types) == 1) {
	### Nothing to compare: fall through and write the header-only results table
	### (`next` cannot be used here because this code is not inside a loop)
	cat(paste0("\n# Only one cell type present in the sample. Skipping this sample"))
} else {
	cat(paste0("\n# ", as.character(length(cell_types)), " cell types in total."))
	
	### Per-combination results are collected in a list and bound together once at the end
	dge_chunks <- list()
	for (cell_type in cell_types) {
		# cell_type <- cell_types[1]
		coords_cell_type <- coords[coords$annotation == cell_type,]
		other_cell_types <- cell_types[!cell_types == cell_type]
		seurat_cell_type <- seurat_object_tn[, coords_cell_type$spot]
		meta <- seurat_cell_type@meta.data
		meta <- tibble::rownames_to_column(meta, "spot")
		
		for (other_cell_type in other_cell_types) {
			# other_cell_type <- other_cell_types[1]
			### Number of adjacent other_cell_type spots for each cell_type spot
			n_adjacent <- coords_cell_type[[paste0(other_cell_type, "_adjacent")]]
			adjacent_cells <- coords_cell_type$spot[n_adjacent > 0]
			not_adjacent_cells <- coords_cell_type$spot[n_adjacent == 0]
			if (length(adjacent_cells) == 0 || length(not_adjacent_cells) == 0) {
				### All cell_type cells are adjacent to or not adjacent to other_cell_type cells
				### Skip this combination
				next
			}
			adj_df <- rbind(data.frame(spot = adjacent_cells, adjacency = "Adjacent"), data.frame(spot = not_adjacent_cells, adjacency = "Not adjacent"))
			adj_df <- join(meta, adj_df, by = "spot")
			adjacency <- adj_df$adjacency
			names(adjacency) <- adj_df$spot
			Idents(seurat_cell_type) <- adjacency
			
			### Thresholds decided based on: https://www.nature.com/articles/s41467-019-12266-7
			### Genes with absolute log2 fold change threshold > 0.1 and expressed in at least 10% of the cells are considered
			DGE_adjacency <- FindAllMarkers(seurat_cell_type, assay = "SCT", logfc.threshold = 0.1, min.pct = 0.1, verbose = FALSE)
			if (nrow(DGE_adjacency) == 0) {
				### No DGE genes found
				next
			}
			DGE_adjacency$cell_type1 <- cell_type
			DGE_adjacency$cell_type2 <- other_cell_type
			DGE_adjacency$adjacency <- as.character(DGE_adjacency$cluster)
			
			DGE_adjacency <- DGE_adjacency[,c("gene", "cell_type1", "cell_type2", "adjacency", "avg_log2FC", "pct.1", "pct.2", "p_val", "p_val_adj")]
			dge_chunks[[length(dge_chunks) + 1]] <- DGE_adjacency
			gc()
		}
		tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
		cat(paste0("\n# Cell type [", cell_type, "] done; memory consumed: ", as.character(tt), "M -- [", Sys.time(), "]\n\n"))
	}
	if (length(dge_chunks) > 0) {
		cci_adj_results_df <- rbind(cci_adj_results_df, do.call(rbind, dge_chunks))
	}
}

fwrite(cci_adj_results_df, paste0(output_dir, "/cci_adj_results.tsv"), sep = "\t")
cat("\n\n### Adjacency analysis completed. Results written to cci_adj_results.tsv.\n")
192 | 
193 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/reference/ref_data_processing_example.R:
--------------------------------------------------------------------------------
  1 | ### This is a pseudo-script demonstrating the possible steps of processing the downloaded scRNA-seq datasets
  2 | ### All the filenames are hard-wired, and this script is for your reference only
  3 | ### 
  4 | ### Author: Yiming Li
  5 | 
  6 | ### This example uses a brain scRNA-seq dataset:
  7 | ### cd /share/fsmresfiles/SpatialT/ref/Brain/Non_Adult/GSE60361
  8 | 
  9 | 
 10 | 
 11 | ###### Read the count matrix and metadata
 12 | 
 13 | library(data.table) ### For fread -- faster than read.table
 14 | 
 15 | exprMatrix <- fread("exprMatrix.tsv", sep = "\t") ### row = gene, column = cell
 16 | str(exprMatrix)
 17 | # Classes ‘data.table’ and 'data.frame':	19972 obs. of  3006 variables:
 18 | #  $ sample        : chr  "Tspan12" "Tshz1" "Fnbp1l" "Adamts15" ...
 19 | #  $ 1772071015-C02: num  0 2 2 0 1 ...
 20 | #  $ 1772071017-G12: num  0 1 1 0 1 0 0 0 0 0 ...
 21 | #  $ 1772071017-A05: num  0 0 2.81 0 1 ...
 22 | # ......
 23 | 
 24 | meta <- fread("meta.tsv", sep = "\t") ### row = cell
 25 | str(meta)
 26 | # Classes ‘data.table’ and 'data.frame':	3005 obs. of  12 variables:
 27 | #  $ V1             : chr  "1772071015-C02" "1772071017-G12" "1772071017-A05" "1772071014-B06" ...
 28 | #  $ tissue         : chr  "sscortex" "sscortex" "sscortex" "sscortex" ...
 29 | #  $ group          : int  1 1 1 1 1 1 1 1 1 1 ...
 30 | #  $ total mRNA mol : int  21580 21748 31642 32916 21531 24799 31406 20389 23022 24184 ...
 31 | #  $ well           : int  11 95 33 42 48 13 50 66 29 28 ...
 32 | #  $ sex            : int  1 -1 -1 1 1 -1 1 -1 1 1 ...
 33 | #  $ age            : int  21 20 20 21 25 20 25 23 21 21 ...
 34 | #  $ diameter       : num  0 9.56 11.1 11.7 11 11.9 11.3 10.9 12.9 11.2 ...
 35 | #  $ level1class    : chr  "interneurons" "interneurons" "interneurons" "interneurons" ...
 36 | #  $ level2class    : chr  "Int10" "Int10" "Int6" "Int10" ...
 37 | # ......
 38 | 
 39 | ### Here we see that the columns "level1class" and "level2class" are cell type labels
 40 | 
 41 | ### We can check if the number of cells in the count matrix and the metatable are the same
 42 | ### (Assuming that the rows/columns in the metatable / count matrix are unique)
 43 | if (ncol(exprMatrix) == nrow(meta) + 1) {
 44 | 	### "+ 1" because the first column of exprMatrix is not a cell
 45 | 	cat("\nDimensions match\n")
 46 | 	if (sum(sort(colnames(exprMatrix)[2:ncol(exprMatrix)]) == sort(meta$V1)) == nrow(meta)) {
 47 | 		cat("Cell IDs match\n")
 48 | 	} else {
 49 | 		cat("Cell IDs do not match\n")
 50 | 	}
 51 | } else {
 52 | 	cat("\nDimensions do not match\n")
 53 | }
 54 | 
 55 | ###!!!!!!! Note that you will need to filter exprMatrix and meta if the dimensions and/or cell IDs do not match!
 56 | 
 57 | ### To perform harmonization (later with another dataset), we can check the unique cell labels in the metatable
 58 | 
 59 | table(meta$level1class)
 60 | # astrocytes-ependymal    endothelial-mural         interneurons
 61 | #                  224                  235                  290
 62 | #            microglia     oligodendrocytes        pyramidal CA1
 63 | #                   98                  820                  939
 64 | #         pyramidal SS
 65 | #                  399
 66 | 
 67 | table(meta$level2class)
 68 | #  (none)    Astro1    Astro2   CA1Pyr1   CA1Pyr2 CA1PyrInt   CA2Pyr2   Choroid
 69 | #     189        68        61       380       447        49        41        10
 70 | # ClauPyr     Epend      Int1     Int10     Int11     Int12     Int13     Int14
 71 | #       5        20        12        21        10        21        15        22
 72 | #   Int15     Int16      Int2      Int3      Int4      Int5      Int6      Int7
 73 | #      18        20        24        10        15        20        22        23
 74 | #    Int8      Int9      Mgl1      Mgl2    Oligo1    Oligo2    Oligo3    Oligo4
 75 | #      26        11        17        16        45        98        87       106
 76 | #  Oligo5    Oligo6     Peric      Pvm1      Pvm2   S1PyrDL  S1PyrL23   S1PyrL4
 77 | #     125       359        21        32        33        81        74        26
 78 | # S1PyrL5  S1PyrL5a   S1PyrL6  S1PyrL6b    SubPyr     Vend1     Vend2      Vsmc
 79 | #      16        28        39        21        22        32       105        62
 80 | 
 81 | ### We can also write the above tables to file if you would like to work in a spreadsheet later
 82 | write.table(table(meta$level1class), "GSE60361_level1class.txt", quote = FALSE, sep = "\t", row.names = FALSE)
 83 | write.table(table(meta$level2class), "GSE60361_level2class.txt", quote = FALSE, sep = "\t", row.names = FALSE)
 84 | 
 85 | ###!!!!!!! Cell type label harmonization (no generic codes for this)
 86 | 
 87 | ### You can define a function for replacing labels during harmonization: every element of `vector` that equals any value in `old_label` becomes `new_label`; other elements (including NAs) are returned unchanged
 88 | replace_labels <- function(vector, old_label, new_label) {
 89 | 	replace(vector, vector %in% old_label, new_label) ### %in% never yields NA and lets several old labels be merged in one call
 90 | }
 91 | 
 92 | ### For example, if you would like to change the "oligodendrocytes" labels into "OLG" in this dataset
 93 | meta$level1class <- replace_labels(meta$level1class, "oligodendrocytes", "OLG")
 94 | 
 95 | table(meta$level1class) ### See that the labels have changed
 96 | # astrocytes-ependymal    endothelial-mural         interneurons
 97 | #                  224                  235                  290
 98 | #            microglia                  OLG        pyramidal CA1
 99 | #                   98                  820                  939
100 | #         pyramidal SS
101 | #                  399
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ###### Save the count matrix and metadata
109 | 
110 | library(data.table)
111 | library(mltools) ### For the sparsify() function
112 | library(plyr) ### For the join() function
113 | 
114 | exprMatrix <- fread("exprMatrix.tsv", sep = "\t") ### Read into a data.table
115 | gene_names <- exprMatrix$sample ### Store the gene names
116 | exprMatrix$sample <- NULL ### Remove the gene column
117 | 
118 | exprMatrix <- sparsify(exprMatrix, sparsifyNAs = TRUE) ### Convert to dgCMatrix format
119 | rownames(exprMatrix) <- gene_names ### The column names should be there, you can check by colnames(exprMatrix)
120 | saveRDS(exprMatrix, "mtx.rds")
121 | 
122 | ### The below assumes that we will use the meta$level2class labels, and they have been harmonized with other datasets
123 | 
124 | ### Change the order of cells in the metatable
125 | meta <- fread("meta.tsv", sep = "\t")
126 | meta <- meta[,c("V1", "level2class")]
127 | colnames(meta) <- c("cell", "label")
128 | meta_sorted <- data.frame(cell = colnames(exprMatrix))
129 | meta_sorted <- join(meta_sorted, meta, by = "cell")
130 | 
131 | fwrite(meta_sorted, file = "ident.csv")
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | ###### Create a SingleCellExperiment object based on the count matrix and the (harmonized) metatable
139 | 
140 | library(data.table)
141 | library(SingleCellExperiment)
142 | library(scuttle)
143 | 
144 | ### Read the prepared files
145 | exprMatrix <- readRDS("mtx.rds") ### From the previous step
146 | meta_sorted <- fread("ident.csv", sep = ",") ### From the previous step
147 | 
148 | meta_sorted$cell <- NULL
149 | 
150 | ### Create SingleCellExperiment object for SingleR and normalize it
151 | ref_data_sce <- SingleCellExperiment(list(counts = exprMatrix), colData = meta_sorted)
152 | 
153 | ref_data_sce <- logNormCounts(ref_data_sce)
154 | ### If your downloaded counts data is already normalized, the above command will fail.
155 | ### However, SingleR expects a "logcounts" assay in the input SCE object, so you need to run the following command.
156 | # logcounts(ref_data_sce) <- counts(ref_data_sce)
157 | 
158 | ref_data_sce <- ref_data_sce[,ref_data_sce$label != ""] ### Remove empty cell type labels
159 | ### Need to remove the cells with labels like “doublets”, “Not Assigned”, etc., e.g.:
160 | # ref_data_sce <- ref_data_sce[,ref_data_sce$label != "not applicable"]
161 | 
162 | saveRDS(ref_data_sce, file = "GSE60361.RDS")
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | ######### Test cell type annotation on a dataset (3A)
170 | 
171 | library(Seurat)
172 | library(SingleR)
173 | 
174 | ref_data_sce <- readRDS("GSE60361.RDS")
175 | dim(ref_data_sce)
176 | table(ref_data_sce$label)
177 | length(table(ref_data_sce$label))
178 | 
179 | ### Example: human brain Visium dataset
180 | target_p_name <- "PID3"
181 | target_ds_name <- "DS3A"
182 | target_ds_dir <- paste(c("/share/fsmresfiles/SpatialT/10x", target_p_name, target_ds_name), collapse = "/")
183 | target_ds_metatable <- read.table(paste0(target_ds_dir, "/metatable.tsv"), header = TRUE, stringsAsFactors = FALSE)
184 | target_sample_name <- target_ds_metatable$SampleID[1] # For testing only
185 | target_sample_dir <- paste(c(target_ds_dir, "/", target_sample_name), collapse = "")
186 | seurat_object_tn_path <- paste0(target_sample_dir, "/processed/Seurat.RDS")
187 | seurat_object_tn <- readRDS(seurat_object_tn_path)
188 | 
189 | ### Perform spot-based cell type annotation and save to Seurat
190 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, labels = ref_data_sce$label, de.method="wilcox")
191 | seurat_object_tn[["cell_type_annotation"]] <- annotation$labels
192 | 
193 | ### Perform cluster-based cell type annotation and save to Seurat
194 | cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters
195 | annotation <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce, clusters = cluster_results, labels = ref_data_sce$label, de.method="wilcox")
196 | seurat_object_tn[["cell_type_annotation_clusters"]] <- annotation$labels[cluster_results]
197 | 
198 | ### Visualize annotated cell types
199 | pdf("~/stbase/DS3A_test.pdf") ### Change to your own save directory/name
200 | print(SpatialDimPlot(seurat_object_tn))
201 | print(DimPlot(seurat_object_tn, reduction = "umap"))
202 | print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation"))
203 | print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation"))
204 | print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation_clusters"))
205 | print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation_clusters"))
206 | dev.off()
207 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Spatial transcriptOmics Analysis Resource
  2 | 
  3 | This repository contains the data curation, processing, and analysis scripts used by the article "SOAR elucidates disease mechanisms and empowers drug discovery through spatial transcriptomics" [[bioRxiv preprint]](https://www.biorxiv.org/content/10.1101/2022.04.17.488596v2) | [[Website]](https://soar.fsm.northwestern.edu/).
  4 | 
  5 | ## Data curation
  6 | 
  7 | To query the [Gene Expression Omnibus (GEO)](https://www.ncbi.nlm.nih.gov/geo/) for potential human and mouse spatial transcriptomics datasets, please run [the Python script](https://github.com/luoyuanlab/SOAR/tree/main/data_curation/geo-query.py) using different keywords.
  8 | 
  9 | * `python3 geo-query.py`
 10 | 
 11 | The retrieved GDS list with annotated meta-information will be stored in `./<%Y%m%d>/all.csv`.
 12 | 
 13 | ## Data processing
 14 | 
 15 | The data processing scripts are available under [`data_processing/`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing). The scripts automatically perform spot and gene quality control, data transformation, normalization, and dimensionality reduction.
 16 | 
 17 | **10x Visium data** in standard format can be processed using [`process_visium_standard.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing/process_visium_standard.R). The script assumes that the directory contains one of the following:
 18 | 
 19 | 1. (a) `filtered_feature_bc_matrix.h5` or [MEX files](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices) and (b) the image data in a subdirectory called `spatial`
 20 | 2. A Visium Seurat object with `data@images` properly added
 21 | 
 22 | Please note that 10x Visium data with only counts and coordinates and no `spatial/` folder data should be processed using the non-Visium scripts.
 23 | 
 24 | **Other types of spatial transcriptomics data** transformed into a standard format (`counts.csv` and `coordinates.csv`, please see below for the guidelines) can be processed using [`process_non_visium_standard.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_processing/process_non_visium_standard.R). The script can also be used on Visium data with no h5 + spatial data provided for public download.
 25 | 
 26 | `counts.csv`
 27 | 
 28 | * Comma-delimited
 29 | * Header: `gene,<spot ID 1>,...,<spot ID n>`
 30 | * Each row = one gene
 31 | * Gene symbols should be used (not Ensembl IDs, etc.)
 32 | 
 33 | `coordinates.csv`
 34 | 
 35 | * Comma-delimited
 36 | * Header: `barcode,row,col`
 37 | 	* The example file has more columns but only these three columns are required
 38 | 	* Use the spot coordinates (row, col) instead of pixel coordinates (imagerow, imagecol in the example file) if possible
 39 | * Each row = one spot
 40 | 
 41 | The barcode column of `coordinates.csv` should be exactly the same as the `counts.csv` header (after removing "gene"), i.e. the spot IDs should match.
 42 | 
 43 | ## Data analysis
 44 | 
 45 | ### Overview
 46 | 
 47 | The overall flow of data analysis is as below.
 48 | 
 49 | 1. Perform [spatial clustering](#spatial-clustering)
 50 | 2. Perform whole-tissue [spatial variability analysis](#spatial-variability-analysis)
 51 | 3. Check if the spatial transcriptomics technology is at single-cell level
 52 | 	* If so (e.g. MERFISH), perform [cell type annotation](#cell-type-annotation)
 53 | 	* If not (e.g. 10x Visium), perform [cell type deconvolution](#cell-type-deconvolution)
 54 | 4. Perform cell-type-specific [spatial variability analysis](#spatial-variability-analysis)
 55 | 5. Perform [cell-cell interaction analysis](#cell-cell-interaction-analysis)
 56 | 
 57 | ### Spatial clustering
 58 | 
 59 | The scripts for performing spatial clustering are stored in [`data_analysis/spatial_clustering/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_clustering). Please refer to the [README](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_clustering/README.md) for the details.
 60 | 
 61 | ### Cell typing
 62 | 
 63 | In the [`data_analysis/cell_typing/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing) folder, scripts are available for performing [cellular deconvolution](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/deconvolution) (spatial transcriptomics technologies with multiple cells per capture location, e.g. 10x Visium) and [cell type annotation](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation) (single-cell-level spatial transcriptomics technologies, e.g. MERFISH).
 64 | 
 65 | #### scRNA-seq reference identification and processing
 66 | 
 67 | To identify scRNA-seq references for cell typing, users may utilize the [GEO query helper script](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/reference/geo-download-scRNA-seq.py). [`ref_data_processing_example.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/reference/ref_data_processing_example.R) is an example script for processing the downloaded scRNA-seq data. Please note that cell quality control needs to be performed case-by-case, i.e. the thresholds should be chosen manually based on the QC plots.
 68 | 
 69 | #### Cell type annotation
 70 | 
 71 | [`annotation_example.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation/annotation_example.R) is an example script for performing cell type annotation on single-cell-level spatial transcriptomics datasets (e.g. MERFISH) using scRNA-seq reference datasets and SingleR.
 72 | 
 73 | <details><summary>Heuristics-guided cell type annotation for brain datasets (click me)</summary>
 74 | 
 75 | [`runBrainCellTypeAnnotation-CluHeu.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/annotation/runBrainCellTypeAnnotation-CluHeu.R)
 76 | 
 77 | * Usage: `./runBrainCellTypeAnnotation-CluHeu.R > runBrainCellTypeAnnotation-CluHeu.log`
 78 | * Description
 79 |     * This script automatically annotates the cell types of brain Visium datasets using a cluster-based approach guided by some heuristics.
 80 |     * Note that:
 81 |         * This script requires processed mouse and human scRNA-seq references as the input, and the file paths are currently hard-wired:
 82 |             * `/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_human_ctx_smart-seq`
 83 |                 * `aibs_human_ctx_smart-seq_neuronal.RDS`
 84 |                 * `aibs_human_ctx_smart-seq_non_neuronal.RDS`
 85 |                 * `supp.RData`
 86 |             * `/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_mouse_ctx-hpf_10x`
 87 |                 * `aibs_mouse_ctx-hpf_10x_neuronal.RDS`
 88 |                 * `aibs_mouse_ctx-hpf_10x_non_neuronal.RDS`
 89 |                 * `supp.RData`
 90 |         * This script also reads a table listing the DSID, species, and technology (`brain_DSID_list.txt`) and loops over its rows. Line 52 uses a hard-wired path to this file.
 91 |         * The annotations follow the [Common Cell Type Nomenclature (CCN)](https://portal.brain-map.org/explore/classes/nomenclature). `seurat_object[["cell_type_annotation"]]` contains the annotated subclasses, and `seurat_object[["cell_type_annotation_class"]]` contains the annotated classes (i.e., glutamatergic, GABAergic, or non_neuronal).
 92 | </details>
 93 | 
 94 | #### Cell type deconvolution
 95 | 
 96 | [Analysis scripts](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_typing/deconvolution) are available for the cell type deconvolution of spatial transcriptomics datasets using scRNA-seq reference datasets and BayesPrism.
 97 | 
 98 | Steps for running deconvolution
 99 | 
100 | 1. Sample script for processing scRNA-seq reference: `process_reference_example.R`
101 | 2. Deconvolution script: `quest_deconvolution_jobarray.R`
102 | 3. Script for preparing deconvolution results for subsequent analysis (only run this after the deconvolution is complete): `create_input_files.R`
103 | 
104 | ### Spatial variability analysis
105 | 
106 | The scripts for the spatial variability (SV) analysis of spatial transcriptomics data are stored in  [`data_analysis/spatial_variability/`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability).
107 | 
108 | To perform whole-tissue SV analysis, use the script [`quest_SpatialDE_jobarray.py`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability/quest_SpatialDE_jobarray.py). To run the analysis, use the command `python quest_SpatialDE_jobarray.py $sample_directory`.
109 | 
110 | To perform cell-type-specific SV analysis, use the script [`quest_SpatialDE_ct_specific.py`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/spatial_variability/quest_SpatialDE_ct_specific.py). To run the analysis, use the command `python quest_SpatialDE_ct_specific.py $sample_directory $cell_type`.
111 | 
112 | ### Cell-cell interaction analysis
113 | 
114 | To perform neighborhood-based cell-cell interaction analysis, use the script [`adj-analysis.R`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_cell_interaction/neighborhood-based/adj-analysis.R). To run the analysis, use the command `./adj-analysis.R $sample_directory`.
115 | 
116 | To perform distance-based cell-cell interaction analysis, run the bash script [`cci-analysis-COMMOT-DGE.sh`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/cell_cell_interaction/distance-based/cci-analysis-COMMOT-DGE.sh) to call different analysis scripts in the pipeline.
117 | 
118 | ### Drug screen
119 | 
120 | Scripts for drug discovery analysis are stored under [`data_analysis/drug_discovery`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/drug_discovery).
121 | 
122 | The four types of analysis are:
123 | 
124 | 1. Differential gene expression analysis. [`Scripts for deconvoluted and annotated samples`](https://github.com/luoyuanlab/SOAR/tree/main/data_analysis/drug_discovery/DGE).
125 |    
126 | 2. Protein-protein interaction (PPI) network for spatially variable, differentially expressed (DE-SV) genes by cell type. [`Script for generating PPI network`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/ppi_quest.py).
127 |    
128 | 3. CMAP L1000 drug enrichment (compounds with top overall positive/negative enrichment score on SV-DE gene sets of a cell type). [`Script for CMAP drug enrichment analysis`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py).
129 |    
130 | 4. CMAP L1000 drug perturbation (top gene targets perturbed by the top positively/negatively enriched compounds). [`Script for CMAP drug perturbation analysis`](https://github.com/luoyuanlab/SOAR/blob/main/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py) (contained in the same script as above).
131 |    
132 | 


--------------------------------------------------------------------------------
/data_analysis/cell_typing/annotation/runBrainCellTypeAnnotation-CluHeu.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ### Author: Yiming Li
  4 | ###
  5 | ### Description: This script automatically annotates the cell types of brain spatial transcriptomics datasets using a cluster-based approach guided by some heuristics
  6 | ### 	* It reads a table listing the DSID, technology, and species (brain_DSID_list.txt) and loops over its rows
  7 | ### 
  8 | ### Usage: ./runBrainCellTypeAnnotation-CluHeu.R
  9 | ###
 10 | ### Example: To annotate all the brain datasets in our database:
 11 | # chmod 755 runBrainCellTypeAnnotation-CluHeu.R
 12 | # ./runBrainCellTypeAnnotation-CluHeu.R > runBrainCellTypeAnnotation-CluHeu.log
 13 | 
 14 | 
 15 | 
 16 | library(SingleCellExperiment)
 17 | library(Seurat)
 18 | library(SingleR)
 19 | library(dplyr)
 20 | library(plyr)
 21 | library(ggplot2)
 22 | library(data.table)
 23 | library(AUCell)
 24 | 
 25 | 
 26 | 
 27 | ### Read the human brain scRNA-seq reference objects (neuronal / non-neuronal RDS files) later passed to SingleR as `ref`; the directory path is hard-wired and keeps its trailing "/" because file names are appended with paste0()
 28 | human_ref_dir <- "/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_human_ctx_smart-seq/"
 29 | ref_data_sce_neuronal_human <- readRDS(paste0(human_ref_dir, "aibs_human_ctx_smart-seq_neuronal.RDS"))
 30 | ref_data_sce_non_neuronal_human <- readRDS(paste0(human_ref_dir, "aibs_human_ctx_smart-seq_non_neuronal.RDS"))
 31 | ### Mouse counterparts of the two reference objects above (used for every non-"Human" dataset)
 32 | mouse_ref_dir <- "/share/fsmresfiles/SpatialT/ref/Brain/Adult/aibs_mouse_ctx-hpf_10x/"
 33 | ref_data_sce_neuronal_mouse <- readRDS(paste0(mouse_ref_dir, "aibs_mouse_ctx-hpf_10x_neuronal.RDS"))
 34 | ref_data_sce_non_neuronal_mouse <- readRDS(paste0(mouse_ref_dir, "aibs_mouse_ctx-hpf_10x_non_neuronal.RDS"))
 35 | 
 36 | ### Define functions. clustering_seurat(data): runs PCA on the "SCT" assay, builds the neighbor graph and clusters on PCs 1-30, computes a UMAP on the same PCs, and returns the updated object (not called elsewhere in this script)
 37 | clustering_seurat <- function(data) {
 38 | 	data <- RunPCA(data, assay = "SCT", verbose = FALSE)
 39 | 	data <- FindNeighbors(data, reduction = "pca", dims = 1:30, verbose = FALSE)
 40 | 	data <- FindClusters(data, resolution = 1.2, verbose = FALSE)
 41 | 	## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells; the upper end (1.2) is used here
 42 | 	data <- RunUMAP(data, reduction = "pca", dims = 1:30, verbose = FALSE)
 43 | 	return(data)
 44 | }
 45 | 
 46 | 
 47 | 
 48 | ##################
 49 | 
 50 | 
 51 | 
 52 | target_list <- fread("~/stbase/brain_DSID_list.txt", sep = "\t")
 53 | ### Main loop: one iteration per dataset row (columns used: DSID, Species, Technology), then one per sample listed in that dataset's metatable.tsv
 54 | for (i in seq_len(nrow(target_list))) {
 55 | 	# i <- 1 # For testing only
 56 | 	target_ds_name <- as.character(target_list[i, "DSID"])
 57 | 	target_species <- as.character(target_list[i, "Species"])
 58 | 	target_tech <- as.character(target_list[i, "Technology"])
 59 | 	### Do not differentiate between adult and non-adult datasets
 60 | 	target_p_name <- paste0("PID", substr(target_ds_name, start = 3, stop = nchar(target_ds_name) - 1)) ### e.g. "DS1D" -> "PID1"
 61 | 	
 62 | 	target_ds_dir <- paste(c("/share/fsmresfiles/SpatialT", target_tech, target_p_name, target_ds_name), collapse = "/")
 63 | 	if (!dir.exists(target_ds_dir)) {
 64 | 		cat(paste0(target_ds_dir, " does not exist"))
 65 | 		next
 66 | 	}
 67 | 	if (!file.exists(paste0(target_ds_dir, "/metatable.tsv"))) {
 68 | 		cat(paste0(target_ds_dir, "/metatable.tsv does not exist"))
 69 | 		next
 70 | 	}
 71 | 	target_ds_metatable <- read.table(paste0(target_ds_dir, "/metatable.tsv"), header = TRUE, stringsAsFactors = FALSE, sep = "\t")
 72 | 	
 73 | 	cat(paste(c("\n\n>>>>>>>> ", target_ds_dir, " <<<<<<<<"), collapse = ""))
 74 | 	
 75 | 	for (target_sample_name in target_ds_metatable$SampleID) {
 76 | 		# target_sample_name <- target_ds_metatable$SampleID[1] # For testing only
 77 | 		target_sample_dir <- paste(c(target_ds_dir, "/", target_sample_name), collapse = "")
 78 | 		
 79 | 		### Needed since some of the datasets were incorrectly prepared; check the sample directory itself (the dataset directory was already verified above)
 80 | 		if (!dir.exists(target_sample_dir)) {
 81 | 			cat(paste0(target_sample_dir, " does not exist"))
 82 | 			next
 83 | 		}
 84 | 		
 85 | 		cat(paste(c("\n\n>>>>>>>> ", target_sample_dir, " <<<<<<<<"), collapse = ""))
 86 | 		
 87 | 		### Annotation results will be stored in, e.g., 10x/PID1/DS1D/DS1D.1/analysis/annotation
 88 | 		output_dir_1 <- paste0(target_sample_dir, "/analysis")
 89 | 		output_dir <- paste0(output_dir_1, "/annotation")
 90 | 		if (!dir.exists(output_dir_1)) {
 91 | 			dir.create(output_dir_1)
 92 | 		}
 93 | 		if (!dir.exists(output_dir)) {
 94 | 			dir.create(output_dir)
 95 | 		}
 96 | 		
 97 | 		### Read in the QC-ed + transformed + processed Seurat object (from process_visium_standard.R)
 98 | 		seurat_object_tn_path <- paste0(target_sample_dir, "/processed/Seurat.RDS")
 99 | 		seurat_object_tn <- readRDS(seurat_object_tn_path)
100 | 		cat("\n\n### Processed Seurat object read.")
101 | 		if (all(c("cell_type_annotation_class", "cell_type_annotation") %in% colnames(seurat_object_tn@meta.data))) {
102 | 			cat("\n# Seurat object already has cell type annotation results saved, skipping this sample.")
103 | 			next
104 | 		}
105 | 	
106 | 		### Check if the Seurat object is after SCT
107 | 		if (!"SCT" %in% names(seurat_object_tn)) {
108 | 			cat("\n# [Warning] Seurat object does not have an \"SCT\" assay, renaming Spatial to SCT.")
109 | 			### Some MERFISH files prepared by Yawei do not have an SCT assay
110 | 			### However, they are already normalized so we can rename the assay to SCT for ease of subsequent analysis
111 | 			file.copy(seurat_object_tn_path, paste0(target_sample_dir, "/processed/Seurat_bk.RDS"), overwrite = TRUE) ### Keep a backup before overwriting Seurat.RDS
112 | 			seurat_object_tn <- RenameAssays(object = seurat_object_tn, Spatial = "SCT")
113 | 			saveRDS(seurat_object_tn, file = seurat_object_tn_path)
114 | 		}
115 | 		
116 | 		### Check if the ST data underwent dimensionality reduction and clustering
117 | 		if (!"umap" %in% names(seurat_object_tn)) {
118 | 			cat("\n# [Warning] Seurat object does not have UMAP dimensional reduction calculated, skipping this sample.")
119 | 			next
120 | 		}
121 | 	
122 | 		cat("\n\n### Start annotation")
123 | 		coords <- seurat_object_tn@meta.data
124 | 		coords <- tibble::rownames_to_column(coords, "spot")
125 | 		coords$seurat_clusters <- as.character(coords$seurat_clusters)
126 | 		
127 | 		### Read marker gene lists and cell class/subclass mappings (supp.RData is expected to provide `gene_lists` and `cell_type_names`, which are used below)
128 | 		if (target_species == "Human") {
129 | 			load(paste0(human_ref_dir, "supp.RData"))
130 | 		} else {
131 | 			load(paste0(mouse_ref_dir, "supp.RData"))
132 | 		}
133 | 		
134 | 		### Use AUCell to classify clusters into neuronal vs non-neuronal
135 | 		cells_rankings <- AUCell_buildRankings(GetAssayData(seurat_object_tn, assay = "SCT"), verbose = FALSE)
136 | 		### The AUC value represents the fraction of genes, within the top 20% genes in the ranking, that are included in the signature
137 | 		runAUCell <- tryCatch(
138 | 			{
139 | 				cells_AUC <- AUCell_calcAUC(gene_lists, cells_rankings, aucMaxRank = nrow(cells_rankings) * 0.2, verbose = FALSE)
140 | 			},
141 | 			error = function(e) e
142 | 		)
143 | 		if (inherits(runAUCell, "error")){
144 | 			cat("\n# AUCell failed, ST dataset may have a small number of genes. Creating a subsetted gene list.")
145 | 			data_genes <- rownames(seurat_object_tn)
146 | 			for (gene_list_name in names(gene_lists)) {
147 | 				gene_list <- gene_lists[[gene_list_name]]
148 | 				gene_lists[[gene_list_name]] <- gene_list[gene_list %in% data_genes]
149 | 			}
150 | 			gene_lists <- gene_lists[lapply(gene_lists, length) > 0]
151 | 			cells_AUC <- AUCell_calcAUC(gene_lists, cells_rankings, aucMaxRank = nrow(cells_rankings) * 0.2, verbose = FALSE)
152 | 		}
153 | 		### Assign each cluster the gene set with the largest AUC summed over the cluster's spots
154 | 		clusters <- sort(unique(coords$seurat_clusters))
155 | 		cluster_class <- data.frame(seurat_clusters = clusters, subclass = rep(NA, length(clusters)))
156 | 		for (k in seq_along(clusters)) { ### `k` (not `i`) so the dataset index of the outer loop is not shadowed
157 | 			cluster <- clusters[k]
158 | 			spots <- coords[coords$seurat_clusters == cluster, "spot"]
159 | 			tmpmat <- cells_AUC[,colnames(cells_AUC) %in% spots]
160 | 			tmp <- rowSums(tmpmat@assays@data@listData$AUC)
161 | 			cluster_class$subclass[k] <- names(which.max(tmp)) ### which.max() returns exactly one (the first) maximum, even on ties
162 | 		}
163 | 		cluster_class$class <- cell_type_names[cluster_class$subclass]
164 | 		coords <- join(coords, cluster_class, by = "seurat_clusters")
165 | 		coords$class[coords$class == "glutamatergic"] <- "neuronal"
166 | 		coords$class[coords$class == "GABAergic"] <- "neuronal"
167 | 		coords <- coords[, c("spot", "class")]
168 | 		
169 | 		### Use SingleR to annotate the subclasses (once against the neuronal and once against the non-neuronal reference)
170 | 		cluster_results <- seurat_object_tn[["seurat_clusters"]]$seurat_clusters
171 | 		if (target_species == "Human") {
172 | 			annotation_neuronal <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce_neuronal_human, clusters = cluster_results, labels = ref_data_sce_neuronal_human$label, de.method="wilcox")
173 | 			annotation_non_neuronal <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce_non_neuronal_human, clusters = cluster_results, labels = ref_data_sce_non_neuronal_human$label, de.method="wilcox")
174 | 		} else {
175 | 			annotation_neuronal <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce_neuronal_mouse, clusters = cluster_results, labels = ref_data_sce_neuronal_mouse$label, de.method="wilcox")
176 | 			annotation_non_neuronal <- SingleR(test = GetAssayData(seurat_object_tn, assay = "SCT"), ref = ref_data_sce_non_neuronal_mouse, clusters = cluster_results, labels = ref_data_sce_non_neuronal_mouse$label, de.method="wilcox")
177 | 		}
178 | 		### NOTE(review): `$labels[cluster_results]` presumably indexes by the factor's integer codes, i.e. assumes one SingleR row per cluster level in level order - confirm
179 | 		### Process SingleR annotation results
180 | 		tmpdf <- data.frame(spot = colnames(seurat_object_tn), annot_neu = annotation_neuronal$labels[cluster_results])
181 | 		coords <- join(coords, tmpdf, by = "spot")
182 | 		tmpdf2 <- data.frame(spot = colnames(seurat_object_tn), annot_non = annotation_non_neuronal$labels[cluster_results])
183 | 		coords <- join(coords, tmpdf2, by = "spot")
184 | 		coords$cell_type_annotation <- ifelse(coords$class == "neuronal", coords$annot_neu, coords$annot_non) ### Pick the label from the reference matching the cluster's AUCell class
185 | 		coords$cell_type_annotation_class <- cell_type_names[coords$cell_type_annotation]
186 | 		
187 | 		### Save annotation results to Seurat object
188 | 		seurat_object_tn[["cell_type_annotation"]] <- coords$cell_type_annotation
189 | 		seurat_object_tn[["cell_type_annotation_class"]] <- coords$cell_type_annotation_class
190 | 		saveRDS(seurat_object_tn, file = seurat_object_tn_path)
191 | 		cat("\n\n### Seurat.RDS overwritten with annotation results.")
192 | 		
193 | 		### Visualize annotated cell types
194 | 		pdf(paste0(output_dir, "/cell_type_annotation.pdf"))
195 | 		print(SpatialDimPlot(seurat_object_tn))
196 | 		print(DimPlot(seurat_object_tn, reduction = "umap"))
197 | 		print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation_class"))
198 | 		print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation_class"))
199 | 		print(SpatialDimPlot(seurat_object_tn, group.by = "cell_type_annotation"))
200 | 		print(DimPlot(seurat_object_tn, reduction = "umap", group.by = "cell_type_annotation"))
201 | 		dev.off()
202 | 		cat("\n\n### Visualization of annotated cell types completed.")
203 | 	}
204 | }
205 | 


--------------------------------------------------------------------------------
/data_processing/process_visium_standard.R:
--------------------------------------------------------------------------------
  1 | ### The purpose of this script is to process the 10x Visium data stored in standard structure (HDF5 + spatial/).
  2 | ### The dataset IDs to be processed are read from a preexisting file DSID_list_10x_standard.txt in the shared folder. 
  3 | ### This script will loop through the datasets, read, transform, and process the datasets, and save the processed data as well as a meta table for each dataset.
  4 | ### 
  5 | ### Usage: Rscript --no-save process_visium_standard.R > process_visium_standard.log
  6 | ### 
  7 | ### Author: Saya Dennis; Yiming Li; Sanaz Ghotbaldini
  8 | 
  9 | library(stringr)
 10 | library(dplyr)
 11 | library(Seurat)
 12 | library(patchwork)
 13 | library(data.table)
 14 | # library(SeuratDisk)
 15 | 
 16 | dn <- "/projects/b1131/SpatialT/10x"
 17 | args <- commandArgs(trailingOnly=TRUE)
 18 | dsid <- args[1]
 19 | 
 20 | #### Get PID
 21 | dsid <- paste0("DS", dsid)
 22 | pid <- paste0("PID", substr(dsid, start=3, stop=nchar(dsid)-1)) # e.g. "PID1"
 23 | #### Create empty data frame for meta table 
 24 | meta <- data.frame(DSID = character(0), SampleID = character(0), Nspots = integer(0), Nspots_postQC = integer(0), Ngenes = integer(0), Ngenes_postQC = integer(0), Condition = character(0))
 25 | #### Set dataset directory and get a list of samples 
 26 | dsdir <- paste0(dn, "/", pid, "/", dsid) # e.g. "/projects/b1131/SpatialT/10x/PID1/DS1A"
 27 | sampleids <- dir(dsdir, pattern = dsid) # e.g. list of elements like "DS1A.1"
 28 | 
 29 | cat(paste(c("\n>>>>>>>> Dataset [", dsdir, "] started <<<<<<<<\n"), collapse = ""))
 30 | for (sampleid in sampleids) {
	#### Read data
	## Input preference: 10x h5 matrix, then a prepared Seurat object, then MEX files.
	## When an image has to be read explicitly, the hi-res image is used if the low-res one is missing,
	## and its scale factor is copied into the "lowres" slot.
	orig_dir <- paste0(dsdir, "/", sampleid, "/original")
	spatial_dir <- paste0(orig_dir, "/spatial")
	has_lowres <- file.exists(paste0(spatial_dir, "/tissue_lowres_image.png"))
	
	if (file.exists(paste0(orig_dir, "/filtered_feature_bc_matrix.h5"))) {
		if (has_lowres) {
			data <- Load10X_Spatial(orig_dir, assay = "Spatial")
		} else {
			cat(paste0("### Sample: ", sampleid, " [tissue_lowres_image.png] not found, using [tissue_hires_image.png]\n"))
			img <- Read10X_Image(spatial_dir, image.name = 'tissue_hires_image.png')
			data <- Load10X_Spatial(orig_dir, image = img, assay = "Spatial")
			data@images$slice1@scale.factors$lowres <- data@images$slice1@scale.factors$hires
		}
	} else if (file.exists(paste0(orig_dir, "/Seurat.RDS"))) {
		cat(paste0("### Sample: ", sampleid, " [filtered_feature_bc_matrix.h5] not found, using Seurat object\n"))
		data <- readRDS(paste0(orig_dir, "/Seurat.RDS"))
		## Rename the stored image(s) to "slice1", the name the rest of the script looks up
		image_key <- as.character(names(data@images))
		names(data@images)[names(data@images) == image_key] <- "slice1"
	} else {
		cat(paste0("### Sample: ", sampleid, " [filtered_feature_bc_matrix.h5] not found, using MEX files\n"))
		if (!has_lowres) {
			cat(paste0("### Sample: ", sampleid, " [tissue_lowres_image.png] not found, using [tissue_hires_image.png]\n"))
		}
		counts <- Read10X(orig_dir)
		data <- CreateSeuratObject(counts, assay = "Spatial")
		if (has_lowres) {
			img <- Read10X_Image(spatial_dir)
		} else {
			img <- Read10X_Image(spatial_dir, image.name = 'tissue_hires_image.png')
		}
		## Restrict the image to the spots present in the count matrix and attach it
		img <- img[Cells(data)]
		DefaultAssay(img) <- DefaultAssay(data)
		data[["slice1"]] <- img
		if (!has_lowres) {
			data@images$slice1@scale.factors$lowres <- data@images$slice1@scale.factors$hires
		}
	}
 68 | 	
	nspots <- ncol(data)
	ngenes <- nrow(data)
	
	#### Spot QC
	## Every spot filter is evaluated BEFORE subsetting. Subsetting a Seurat object with a mask that
	## keeps no spot raises an error, which would abort the whole dataset instead of skipping the sample.
	## Later filters can only remove more spots, so a sample that drops below the threshold at any
	## step would be excluded by the final check anyway.
	min_spots_post_qc <- 50
	
	## Step 1. Remove the spots with total UMI count < 500 / the total number of genes < 500 / >= 25% mitochondrial reads
	data[["percent_mt"]] <- PercentageFeatureSet(data, "^MT-")
	keep_step1 <- data$nCount_Spatial >= 500 & data$nFeature_Spatial >= 500 & data$percent_mt < 25
	if (sum(keep_step1) < min_spots_post_qc) {
		cat(paste(c("### [", sampleid, "] has < 50 spots after QC, excluded\n"), collapse = ""))
		next
	}
	data <- data[, keep_step1]
	
	## Step 2.
	## Remove the spots with total UMI count < median(total UMI count) - 3 * SD(total UMI count).
	## Remove the spots with total number of genes < median(total number of genes) - 3 * SD(total number of genes)
	keep_step2 <- data$nCount_Spatial >= median(data$nCount_Spatial) - 3 * sd(data$nCount_Spatial) &
		data$nFeature_Spatial >= median(data$nFeature_Spatial) - 3 * sd(data$nFeature_Spatial)
	if (sum(keep_step2) < min_spots_post_qc) {
		cat(paste(c("### [", sampleid, "] has < 50 spots after QC, excluded\n"), collapse = ""))
		next
	}
	data <- data[, keep_step2]
	
	#### Gene QC
	## Keep genes detected (count > 0) in at least 5 spots.
	## Counting on the assay matrix directly avoids expanding it into a dense genes x spots data.frame.
	n_spots_per_gene <- Matrix::rowSums(GetAssayData(object = data, assay = "Spatial", slot = "counts") > 0)
	data <- data[n_spots_per_gene >= 5,]
	
	nspots_qc <- ncol(data)
	ngenes_qc <- nrow(data)

	#### Exclude sample if there are < 50 spots after QC
	if (nspots_qc < min_spots_post_qc) {
		cat(paste(c("### [", sampleid, "] has < 50 spots after QC, excluded\n"), collapse = ""))
		next
	}
 95 | 	
	#### Append to the dataset metatable
	meta <- rbind(meta, data.frame(DSID = dsid, SampleID = sampleid, Nspots = nspots, Nspots_postQC = nspots_qc, Ngenes = ngenes, Ngenes_postQC = ngenes_qc, Condition = NA))
	
	#### Transform and process data
	data <- SCTransform(data, assay = "Spatial", verbose = FALSE)
	
	### Dimensionality reduction and clustering
	data <- RunPCA(data, assay = "SCT", verbose = FALSE)
	data <- FindNeighbors(data, reduction = "pca", dims = 1:30, verbose = FALSE)
	data <- FindClusters(data, resolution = 1.2, verbose = FALSE)
	## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells
	data <- RunUMAP(data, reduction = "pca", dims = 1:30, verbose = FALSE)
	
	### Save processed Seurat object
	## "processed" folder should already exist for all datasets but just in case
	sample_dir <- paste0(dsdir, "/", sampleid)
	processed_dir <- paste0(sample_dir, "/processed/")
	if (!dir.exists(processed_dir)) {
		dir.create(processed_dir, recursive = TRUE)
	}
	
	## The object is written once, after the spot IDs have been added. Previously it was saved,
	## immediately read back, modified and saved again to the same path.
	seurat_object <- data
	
	#### Create spot IDs following R's column name requirements
	spotmeta <- seurat_object@meta.data
	spotmeta$new_spot_id <- paste0("sp", seq_len(nrow(spotmeta)))
	seurat_object[["new_spot_id"]] <- spotmeta$new_spot_id
	saveRDS(seurat_object, paste0(sample_dir, "/processed/Seurat.RDS"))
	
	## Named vector: original barcode -> new spot ID
	spot_id_mapping <- spotmeta$new_spot_id
	names(spot_id_mapping) <- rownames(spotmeta)
129 | 	
130 | 	#### Prepare data for deconvolution
131 | 	if ("Spatial" %in% names(seurat_object@assays)) {
132 | 		counts <- GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts")
133 | 	} else {
134 | 		### MERFISH datasets
135 | 		counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
136 | 	}
137 | 	counts <- as.matrix(counts)
138 | 	gene_names <- rownames(counts)
139 | 	counts_t <- data.table(counts)
140 | 	counts_t$gene <- gene_names
141 | 	gc()
142 | 	counts_t <- transpose(counts_t, keep.names = "cell", make.names = "gene") ### Takes long
143 | 	gc()
144 | 	cell_names <- counts_t$cell
145 | 	counts_t$cell <- NULL ### Remove the gene name column
146 | 	counts_t <- as.matrix(counts_t)
147 | 	rownames(counts_t) <- cell_names
148 | 	saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RDS"))
149 | 	gc()
150 | 	
151 | 	#### Retrieve QC-ed counts and coordinates
152 | 	
153 | 	## Counts
154 | 	counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
155 | 	counts <- as.matrix(counts)
156 | 	colnames(counts) <- as.character(spot_id_mapping[colnames(counts)])
157 | 	counts_df <- tibble::rownames_to_column(data.frame(counts), "gene")
158 | 	
159 | 	## Coordinates
160 | 	if ("slice1" %in% names(seurat_object@images)) {
161 | 		location <- seurat_object@images$slice1@coordinates ### Visium
162 | 		location <- location[, c("col", "row")] ### Use the spot coordinates
163 | 	} else {
164 | 		location <- seurat_object@images$image@coordinates ### Others
165 | 		if ("x" %in% colnames(location)) {
166 | 			location <- location[, c("x", "y")] ### Use the spot coordinates
167 | 		} else {
168 | 			### Some prepared MERFISH datasets did not follow the naming standard
169 | 			location <- location[, c("xcoord", "ycoord")] ### Use the spot coordinates
170 | 			colnames(location) <- c("x", "y")
171 | 		}
172 | 	}
173 | 	location$barcode <- rownames(location)
174 | 	colnames(location) <- c("x", "y", "barcode")
175 | 	location$barcode <- spot_id_mapping[location$barcode]
176 | 	
177 | 	## Sometimes (e.g. /share/fsmresfiles/SpatialT/10x/PID153/DS153A/DS153A.1), the number of spots in seurat_object@images is different from ncol(seurat_object)
178 | 	## Not sure why this is the case
179 | 	counts_spots <- colnames(counts_df)
180 | 	location_spots <- location$barcode
181 | 	keep_spots <- intersect(counts_spots, location_spots)
182 | 	counts_df <- counts_df[,c("gene", keep_spots)]
183 | 	rownames(location) <- location$barcode
184 | 	location <- location[keep_spots,]
185 | 	
186 | 	#### Write counts, coordinates, and meta_spots to file
187 | 	write.table(counts_df, file = paste0(sample_dir, "/processed/counts.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
188 | 	write.table(location, file = paste0(sample_dir, "/processed/coordinates.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
189 | 	saveRDS(list("counts" = counts_df, "coordinates" = location), file = paste0(sample_dir, "/processed/data_frames.RDS"))
190 | 	
191 | 	#### Prepare data for deconvolution (relative counts)
192 | 	if ("Spatial" %in% names(seurat_object@assays)) {
193 | 		DefaultAssay(seurat_object) <- "Spatial"
194 | 		seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
195 | 		counts <- GetAssayData(object = seurat_object, assay = "Spatial")
196 | 	} else {
197 | 		### MERFISH datasets
198 | 		seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
199 | 		counts <- GetAssayData(object = seurat_object, assay = "SCT")
200 | 	}
201 | 	counts <- as.matrix(counts)
202 | 	gene_names <- rownames(counts)
203 | 	counts_t <- data.table(counts)
204 | 	counts_t$gene <- gene_names
205 | 	gc()
206 | 	counts_t <- transpose(counts_t, keep.names = "cell", make.names = "gene") ### Takes long
207 | 	gc()
208 | 	cell_names <- counts_t$cell
209 | 	counts_t$cell <- NULL ### Remove the gene name column
210 | 	counts_t <- as.matrix(counts_t)
211 | 	rownames(counts_t) <- cell_names
212 | 	saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RC.RDS"))
213 | 	
214 | 	cat(paste(c("### [", sampleid, "] Finished.\n"), collapse = ""))
215 | }
#### Write the automatically generated metatable (Condition is NA for every sample)
auto_meta_path <- paste0(dsdir, "/metatable_auto.tsv")
write.table(meta, file = auto_meta_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

#### If a curated metatable exists, replace the placeholder Condition column by its columns.
#### merge() keeps only the samples present in both tables.
orig_meta_path <- paste0(dsdir, "/metatable_orig.tsv")
if (file.exists(orig_meta_path)) {
	meta_orig <- fread(orig_meta_path)
	meta <- meta[, setdiff(colnames(meta), "Condition"), drop = FALSE]
	meta <- merge(meta, meta_orig, by = c("DSID", "SampleID"))
}
final_meta_path <- paste0(dsdir, "/metatable.tsv")
write.table(meta, file = final_meta_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

cat(paste0(">>>>>>>> Dataset: ", dsid, " completed <<<<<<<<\n"))
225 | 


--------------------------------------------------------------------------------
/data_analysis/drug_discovery/PPI_Drug_Enrichment_Perturbation/drug_screen_perturb_quest.py:
--------------------------------------------------------------------------------
  1 | import cmapPy
  2 | import cmapPy.pandasGEXpress.parse
  3 | import json
  4 | import numpy as np
  5 | from collections import Counter
  6 | np.random.seed(1024)
  7 | import pandas as pd
  8 | import statsmodels
  9 | from statsmodels.stats.multitest import fdrcorrection
 10 | import os
 11 | from sklearn.preprocessing import MinMaxScaler
 12 | scaler = MinMaxScaler(feature_range = (-2,2))
 13 | import math
 14 | import sys
 15 | 
 16 | dsid = sys.argv[1]
 17 | sampleid = sys.argv[2]
 18 | # for single cell samples
 19 | #deg_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_anno_SVG'
 20 | # for deconv samples
 21 | deg_dir = '/projects/b1131/SpatialT/drug-target/'+dsid+'/'+sampleid+'/DGE_dec_SVG'
 22 | cell_direc = [i for i in os.listdir(deg_dir)]
 23 | print('STARTING:', dsid, ' ', sampleid)
 24 | 
 25 | # output dir
 26 | out_dir = '/projects/b1131/SpatialT/drug-target-cmap2-svg/'
 27 | enrich_dir = out_dir+dsid+'/'+sampleid+'/GSEA/'
 28 | top_enrich_dir = out_dir+dsid+'/'+sampleid+'/Enrichment/'
 29 | perturb_dir = out_dir+dsid+'/'+sampleid+'/Perturbation/'
 30 | 
 31 | ##### gene processing #####
 32 | # hgnc genes
 33 | hgnc_v2 = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/HGNC091923.txt', sep='\t')
 34 | hgnc_dict = dict(zip(hgnc_v2['Approved symbol'], hgnc_v2['NCBI gene ID']))
 35 | hgnc_v2_prev = hgnc_v2[hgnc_v2['Previous symbol'].notnull()]
 36 | hgnc_dict_prev = dict(zip(hgnc_v2_prev['Previous symbol'], hgnc_v2_prev['NCBI gene ID']))
 37 | 
 38 | #cmap genes
 39 | gene_info = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/geneinfo_beta.txt', sep='\t')
 40 | 
 41 | # mapping gene indices to real names 
 42 | gene_cmap2 = gene_info[['gene_id','gene_symbol']]
 43 | gene_cmap2.columns = ['num','name']
 44 | lst = []
 45 | for i in gene_cmap2['num'].tolist():
 46 |     if pd.isna(i) != True:
 47 |         lst.append(int(i))
 48 |     else:
 49 |         lst.append(i)
 50 | gene_cmap2['num'] = lst
 51 | gene_dict_cmap2 = gene_cmap2.set_index('num').to_dict()['name']
 52 | gene_dict_cmap2_rev = {v:k for k,v in gene_dict_cmap2.items()}
 53 | print('Genes cleaned for mapping')
 54 | 
 55 | 
 56 | ##### cmap2 enrichment setup #####
 57 | # rank file of 6hr perturbations (from z scores)- gene x compound effects
 58 | cmap2_cmap1_perturb_rank_dur = pd.read_csv('/projects/b1131/SpatialT/cmap_ppi_database/cmap2_6hr_pert_rank.csv', engine='c')
 59 | cmap2_cmap1_perturb_rank_dur = cmap2_cmap1_perturb_rank_dur.set_index('Unnamed: 0')
 60 | cmap2_cmap1_perturb_rank_dur_arry = np.array(cmap2_cmap1_perturb_rank_dur)
 61 | 
 62 | # gene and compound name setup for gsea 
 63 | ROWS_dur = [str(gene_dict_cmap2_rev[i]) for i in cmap2_cmap1_perturb_rank_dur.index]
 64 | COLS_dur = cmap2_cmap1_perturb_rank_dur.columns.tolist()
 65 | ROW2IDX2_val_dur = list(range(0,len(ROWS_dur)))
 66 | COL2IDX2_val_dur = list(range(0,len(COLS_dur)))
 67 | ROW2IDX_key_dur = ROWS_dur
 68 | COL2IDX_key_dur = COLS_dur
 69 | ROW2IDX_dur = {k:v for k,v in zip(ROW2IDX_key_dur, ROW2IDX2_val_dur)}
 70 | COL2IDX_dur = {k:v for k,v in zip(COL2IDX_key_dur, COL2IDX2_val_dur)}
 71 | N_GENES, N_PERTURBATIONS = cmap2_cmap1_perturb_rank_dur_arry.shape
 72 | N_REPEATS = 1000
 73 | RANDOM_RANK = np.random.rand(N_GENES, N_REPEATS).argsort(axis=0) + 1  # gene x repeat
 74 | 
 75 | 
 76 | ##### cmap2 perturbation setup #####
 77 | # cmap2 perturbation zscore
 78 | z_score = np.load('/projects/b1131/SpatialT/cmap_ppi_database/cmap2_6hr_pert_z.npy')
 79 | z_score_df = pd.DataFrame(z_score)
 80 | z_score_df.columns = cmap2_cmap1_perturb_rank_dur.columns
 81 | z_score_df.index = cmap2_cmap1_perturb_rank_dur.index
 82 | print('cmap data read in')
 83 | 
 84 | 
 85 | ##### format dge results ######
 86 | def get_up_down_deg (df):
 87 |     df_up = df[(df['stat']>0.5) & (df['qval']<0.05) ]
 88 |     df_down = df[(df['stat']<-0.5) & (df['qval']<0.05) ]
 89 |     return df_up, df_down
 90 | 
 91 | # transform from genename to number
 92 | def hgnc_gene_up_down(up_df, down_df):
 93 |     gene_num_up = []
 94 |     for i in up_df['gene'].tolist():
 95 |         if i in hgnc_dict.keys() and math.isnan(hgnc_dict[i]) == False:
 96 |             gene_num_up.append(str(int(hgnc_dict[i])))
 97 |         elif i in hgnc_dict_prev.keys() and math.isnan(hgnc_dict_prev[i]) == False:
 98 |             gene_num_up.append(str(int(hgnc_dict_prev[i])))
 99 |         else:
100 |             print('SKIP GENE',i)
101 | 
102 |     gene_num_down = []
103 |     for i in down_df['gene'].tolist():
104 |         if i in hgnc_dict.keys() and math.isnan(hgnc_dict[i]) == False:
105 |             gene_num_down.append(str(int(hgnc_dict[i])))
106 |         elif i in hgnc_dict_prev.keys() and math.isnan(hgnc_dict_prev[i]) == False:
107 |             gene_num_down.append(str(int(hgnc_dict_prev[i])))
108 |         else:
109 |             print('SKIP GENE', i) 
110 |     return gene_num_up, gene_num_down
111 | 
112 | ##### GSEA ####
def cmap2_gsea_setup(gene_num_up, gene_num_down):
    """Translate up/down gene IDs into row positions of the CMap rank matrix.

    IDs that are not keys of ``ROW2IDX_dur`` are dropped.

    Returns
    -------
    (DATA, N_UP, N_DOWN)
        ``DATA`` is ``{"U": [...], "D": [...]}`` with the matched row positions;
        ``N_UP`` / ``N_DOWN`` are the lengths of those two lists.
    """
    def _matched_rows(gene_ids):
        return [ROW2IDX_dur[gene_id] for gene_id in gene_ids if gene_id in ROW2IDX_dur]

    up_rows = _matched_rows(gene_num_up)
    down_rows = _matched_rows(gene_num_down)
    DATA = {"U": up_rows, "D": down_rows}
    return DATA, len(up_rows), len(down_rows)
122 | 
123 | # up/down gene set level es
def STEP1(v, n_genes=None):
    """Enrichment score of one gene set from the ranks of its genes.

    Parameters
    ----------
    v : sequence of int
        Ranks of the genes of one set (up or down) under one perturbation.
    n_genes : int, optional
        Total number of ranked genes; defaults to the module-level ``N_GENES``.

    Returns
    -------
    The larger of the two deviations between the gene-set fraction and the rank fraction:
    ``a`` if a > b, ``-b`` if b > a, and 0 if they are equal or ``v`` is empty.
    """
    if n_genes is None:
        n_genes = N_GENES
    ranks = np.sort(np.asarray(v))
    t = ranks.size
    if t == 0:
        # max() over nothing is undefined; an empty gene set carries no signal
        return 0

    positions = np.arange(t)
    rank_fraction = ranks / n_genes
    # (i+1)/t = fraction of the set reached at sorted position i
    # rank/n_genes = fraction of all genes ranked at or above that gene
    a = np.max((positions + 1) / t - rank_fraction)
    b = np.max(rank_fraction - positions / t)

    es = 0
    if a > b:
        es = a
    elif b > a:
        es = -b
    return es
142 | 
143 | # diff of up/down gene set es 
def STEP2(rank_u, rank_d):
    """Combine the up-set and down-set enrichment scores for one perturbation.

    Parameters
    ----------
    rank_u, rank_d : numpy.ndarray
        Ranks of the up- and down-regulated genes; an empty array scores 0.

    Returns
    -------
    0 when both scores have the same sign (including both being 0),
    otherwise ``es_u - es_d``.
    """
    es_u = 0 if not rank_u.size else STEP1(rank_u)
    es_d = 0 if not rank_d.size else STEP1(rank_d)
    same_direction = np.sign(es_u) == np.sign(es_d)
    return 0 if same_direction else es_u - es_d
153 | 
def run_gsea(DATA, N_UP, N_DOWN):
    """Score every perturbation against the up/down gene sets and attach an empirical p-value.

    The null distribution is built from ``RANDOM_RANK``: for each repeat the first
    ``N_UP`` random ranks play the up set and the next ``N_DOWN`` the down set.

    Returns
    -------
    list of (es, p)
        One tuple per perturbation column. ``p`` is the fraction of null scores that are
        more extreme than ``es`` in the direction of its sign, and 1.0 when ``es`` is 0.
    """
    null_scores = [
        STEP2(RANDOM_RANK[:N_UP, rep], RANDOM_RANK[N_UP:N_UP + N_DOWN, rep])
        for rep in range(N_REPEATS)
    ]
    background = np.array(null_scores)

    def _score_perturbation(col):
        es = STEP2(cmap2_cmap1_perturb_rank_dur_arry[DATA["U"], col],
                   cmap2_cmap1_perturb_rank_dur_arry[DATA["D"], col])
        if es > 0:
            p = np.mean(background > es)
        elif es < 0:
            p = np.mean(background < es)
        else:
            p = 1.0
        return (es, p)

    return [_score_perturbation(col) for col in range(N_PERTURBATIONS)]
168 | 
169 | 
170 | ##### format gsea #####
171 | # all drugs enrichment score
def format_gsea_res(result, cell):
    """Tabulate the GSEA result of one cell type and write it to ``enrich_dir``.

    Parameters
    ----------
    result : list of (es, p)
        One tuple per perturbation, in the order of ``COLS_dur``.
    cell : str
        Cell-type name; the table is written to ``<enrich_dir><cell>.csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the perturbation labels in ``COLS_dur`` with columns
        'index' (position), 'es' (rounded to 3 decimals), 'p', 'p-adj' (FDR-corrected p)
        and 'drug' (label text before the first '--').
    """
    # Naming the columns in the constructor also works when `result` is empty
    res_formatted = pd.DataFrame(result, columns=['es', 'p'])
    res_formatted['es'] = [round(i,3) for i in res_formatted['es'].tolist()]
    res_formatted['p-adj']=statsmodels.stats.multitest.fdrcorrection(res_formatted['p'].tolist(), alpha=0.05, method='indep', is_sorted=False)[1]
    # reset_index() keeps the positional index as an 'index' column in the written csv
    res_formatted = res_formatted.reset_index()
    res_formatted.index = COLS_dur
    #res_formatted = res_formatted.drop('level_0',axis = 1)
    res_formatted['drug'] = [i.split('--')[0] for i in res_formatted.index.tolist()]
    # exist_ok avoids failing when the directory appears between a check and the creation
    os.makedirs(enrich_dir, exist_ok=True)
    res_formatted.to_csv(enrich_dir+cell+'.csv')
    return res_formatted
185 | 
186 | 
187 | # top and bottom 500 enriched drugs - for website visualization
def format_enrich(ds, cell):
    """Write the 500 highest- and 500 lowest-scoring perturbations of one cell type.

    Parameters
    ----------
    ds : pandas.DataFrame
        Output of ``format_gsea_res``: index labels of the form '<drug>--<instance>'
        and columns 'drug', 'es', 'p', 'p-adj'.
    cell : str
        Cell-type name used as file-name prefix inside ``top_enrich_dir``.

    Returns
    -------
    (res_inv, res_pos) : tuple of pandas.DataFrame
        ``res_inv`` holds the 500 rows with the highest enrichment score (written to
        '<cell>_INV_es.csv'), ``res_pos`` the 500 rows with the lowest (written to
        '<cell>_POS_es.csv'). With fewer than 1000 perturbations the two tables overlap.
    """
    res = pd.DataFrame()
    # Split on the first '--' only: 'Drug' + '--' + 'CMAP_instance' must reproduce the original
    # label exactly, because it is used later to look the perturbation up again.
    res['CMAP_instance'] = [i.split('--', 1)[1] for i in ds.index.tolist()]
    res['Drug'] = ds['drug'].tolist()
    res['Enrichment_score'] = ds['es'].tolist()
    res['P-value'] = ds['p'].tolist()
    res['Adj-p']= ds['p-adj'].tolist()
    res = res.sort_values('Enrichment_score', ascending=False)
    res_inv = res.head(500)
    res_pos = res.tail(500)
    # exist_ok avoids failing when the directory appears between a check and the creation
    os.makedirs(top_enrich_dir, exist_ok=True)
    res_inv.to_csv(top_enrich_dir+cell+'_INV_es.csv', index=False)
    res_pos.to_csv(top_enrich_dir+cell+'_POS_es.csv', index=False)
    return res_inv, res_pos
203 | 
204 | 
205 | ##### format perturbation #####
206 | # format pertrbation network to create edge and node file
def get_perturb_zs(cell, direction, es_df, dge, cell_type_deg_z):
    """Write the edge and node tables of the drug -> gene perturbation network.

    For every perturbation in ``es_df`` two files are written to
    ``<perturb_dir><cell>/<direction>/``:
    '<drug>--<instance>_perturb.csv' (edges) and '<drug>--<instance>_perturb_nodes.csv' (nodes).

    Parameters
    ----------
    cell : str
        Cell-type name.
    direction : str
        Sub-folder name (the caller passes 'INV' or 'POS').
    es_df : pandas.DataFrame
        Perturbations to export, with columns 'CMAP_instance' and 'Drug'.
    dge : pandas.DataFrame
        DEG table with columns 'gene' and 'stat'.
    cell_type_deg_z : pandas.DataFrame
        z scores, rows = DEG genes, columns = '<drug>--<instance>' labels.
    """
    n_per_side = 30  # number of genes kept at each end of the z-score ordering
    perturb_specific_dir = perturb_dir+cell+'/'+direction+'/'
    # exist_ok avoids failing when the directory appears between a check and the creation
    os.makedirs(perturb_specific_dir, exist_ok=True)
    if cell_type_deg_z.shape[0] == 0:
        # Nothing to export, and the scaler cannot be fitted on zero rows
        print('no deg genes matched to cmap, perturbation skipped for:', cell, direction)
        return

    for _, enriched in es_df.iterrows():
        # adding cmap perturbation rank and z score for each gene for top enriched compounds. 
        cmap_instance = enriched['CMAP_instance']
        drug = enriched['Drug']
        cmap_drug = drug + '--' + cmap_instance
        # rename() works on a new frame instead of relabelling a slice of cell_type_deg_z
        zs = cell_type_deg_z[[cmap_drug]].rename(columns={cmap_drug: 'z'}).sort_values('z')
        perturb_df = zs.merge(dge, how = 'left', left_index=True, right_on = 'gene')

        # edge of perturbation network
        deg_gene_num = perturb_df.shape[0]
        log2fc = perturb_df['stat'].tolist()
        exp_z = perturb_df['z'].tolist()
        perturb_edges = pd.DataFrame({
            'Source': [drug] * deg_gene_num,
            'CMAP_instance': [cmap_instance] * deg_gene_num,
            'Target': perturb_df['gene'].tolist(),
            'log2fc': log2fc,
            'exp_z': exp_z,
            # z scores rescaled by the module-level scaler, refitted for every perturbation
            'exp_z_norm': scaler.fit_transform(np.array(exp_z).reshape(-1, 1))[:, 0],
            'abs_log2fc': [abs(value) for value in log2fc],
            'sign_log2fc': [np.sign(value) for value in log2fc],
        })
        perturb_edges = perturb_edges.sort_values('exp_z')
        if deg_gene_num > 2 * n_per_side:
            perturb_edges_sub = pd.concat([perturb_edges.head(n_per_side), perturb_edges.tail(n_per_side)])
        else:
            # head and tail would overlap and list the same gene twice
            perturb_edges_sub = perturb_edges
        perturb_edges_sub.to_csv(perturb_specific_dir+cmap_drug+'_perturb.csv',index=False)

        # node of perturbation network: the drug first, then one node per exported gene
        perturb_nodes = pd.DataFrame({
            'Id': perturb_edges_sub['Target'],
            'Label': perturb_edges_sub['Target'],
            'log2fc': perturb_edges_sub['log2fc'],
            'sign_log2fc': perturb_edges_sub['sign_log2fc'],
            'abs_log2fc': perturb_edges_sub['abs_log2fc'],
        })
        drug_row = pd.DataFrame({'Id':[drug],
                             'Label':[drug],
                             'log2fc':[10],
                             'sign_log2fc':[0],
                             'abs_log2fc':[10]})
        perturb_nodes = pd.concat([drug_row, perturb_nodes])
        perturb_nodes.to_csv(perturb_specific_dir+cmap_drug+'_perturb_nodes.csv',index=False)
253 | 
254 | ##### run enrich + perturb ######
def run_enrich_perturb(dsid, sampleid, deg_dir, cell_direc):
    """Run drug enrichment and perturbation export for every DEG file of one sample.

    Parameters
    ----------
    dsid, sampleid : str
        Dataset and sample IDs (used for log messages only).
    deg_dir : str
        Directory holding the DEG csv files.
    cell_direc : list of str
        File names inside ``deg_dir``; the text before '.csv' is the cell-type name.

    Cell types with fewer than 10 or more than 2000 up- or down-regulated genes are skipped.
    """
    for ct in cell_direc:
        # deg
        cell = ct.split('.csv')[0]
        deg = pd.read_csv(deg_dir + '/' + ct)
        print('deg file read in for:', cell)
        deg_up, deg_down = get_up_down_deg(deg)
        print('deg shape:', deg_up.shape, deg_down.shape)
        n_up, n_down = deg_up.shape[0], deg_down.shape[0]
        if n_up > 2000 or n_up < 10 or n_down > 2000 or n_down < 10:
            print('deg too few or too many')
            print('\n')
            continue
        gene_num_up, gene_num_down = hgnc_gene_up_down(deg_up, deg_down)
        print('degs hgnc-formatted:', dsid, sampleid)

        # enrichment
        DATA, N_UP, N_DOWN = cmap2_gsea_setup(gene_num_up, gene_num_down)
        print('cmap matched deg:', N_UP, N_DOWN)
        gsea_res = run_gsea(DATA, N_UP, N_DOWN)
        print('finish gsea, head:', gsea_res[:3])
        gsea_res_form = format_gsea_res(gsea_res, cell)
        enrich_inv, enrich_pos = format_enrich(gsea_res_form, cell)
        # Show the first row of BOTH tables (the INV table used to be printed twice)
        print('enrich saved, shape:', enrich_inv.shape, enrich_pos.shape, enrich_inv.head(1), enrich_pos.head(1))

        # perturbation
        dge = pd.concat([deg_up, deg_down]).sort_values('stat')
        cell_type_deg_z = z_score_df[z_score_df.index.isin(dge['gene'].tolist())]
        print('deg genes matched to cmap:', cell_type_deg_z.shape[0])
        get_perturb_zs(cell, 'INV', enrich_inv, dge, cell_type_deg_z)
        get_perturb_zs(cell, 'POS', enrich_pos, dge, cell_type_deg_z)
        print('inv and pos perturb files saved, shapes:', len(os.listdir(perturb_dir+cell+'/'+'INV/')),len(os.listdir(perturb_dir+cell+'/'+'POS/')) )
        print('\n')
287 | 
288 | run_enrich_perturb(dsid, sampleid, deg_dir, cell_direc)


--------------------------------------------------------------------------------
/data_processing/process_non_visium_standard.R:
--------------------------------------------------------------------------------
  1 | #### Description
  2 | #### * The purpose of this script is to process non-Visium data stored in standard structure (counts.csv + coordinates.csv). This also applies to Visium data with no h5 + spatial data provided for public download.
  3 | #### * This script will loop through the datasets, read, transform, and process the datasets, and save the processed data as well as a metatable for each dataset.
  4 | #### * Note that if after QC, there are < 10 spots left, we will exclude the sample from our database.
  5 | #### 
  6 | #### Author: Yiming Li, Saya Dennis (edits)
  7 | 
  8 | library(stringr)
  9 | library(dplyr)
 10 | library(Seurat)
 11 | library(patchwork)
 12 | library(data.table)
 13 | # library(SeuratDisk)
 14 | 
 15 | dn <- "/projects/b1131/SpatialT/"
 16 | args <- commandArgs(trailingOnly=TRUE)
 17 | dsid <- args[1]
 18 | tech <- args[2]
 19 | dn <- paste0(dn, tech)
 20 | 
 21 | #### Get PID
 22 | dsid <- paste0("DS", dsid)
 23 | pid <- paste0("PID", substr(dsid, start=3, stop=nchar(dsid)-1)) # e.g. "PID203"
 24 | #### Create empty data frame for meta table 
 25 | meta <- data.frame(DSID = character(0), SampleID = character(0), Nspots = integer(0), Nspots_postQC = integer(0), Ngenes = integer(0), Ngenes_postQC = integer(0), Condition = character(0))
 26 | #### Set dataset directory and get a list of samples 
 27 | dsdir <- paste0(dn, "/", pid, "/", dsid) # e.g. "/projects/b1131/SpatialT/Slide-seq/PID203/DS203A"
 28 | sampleids <- dir(dsdir, pattern = dsid) # e.g. list of elements like "DS203A.1"
 29 | 
 30 | cat(paste(c("\n>>>>>>>> Dataset [", dsdir, "] started <<<<<<<<\n"), collapse = ""))
 31 | for (sampleid in sampleids) {
 32 | 	#### Read data
 33 | 	## !!! Assumes that the prepared counts.csv and coordinates.csv are ready under original/
 34 | 	counts <- fread(paste0(dsdir, "/", sampleid, "/original/counts.csv"), sep = ",", header = TRUE)
 35 | 	coordinates <- read.table(paste0(dsdir, "/", sampleid, "/original/coordinates.csv"), sep = ",", header = TRUE)
 36 | 	counts <- counts[counts$gene != "",] ## Remove empty "genes"
 37 | 	## Some datasets have duplicated values in the "gene" or "barcode" column
 38 | 	## Remove these "genes" / "barcodes" since it is hard to determine which observation we should keep
 39 | 	tmp <- table(counts$gene)
 40 | 	tmp <- names(tmp[tmp > 1])
 41 | 	if (length(tmp) > 0) {
 42 | 		counts <- counts[!counts$gene %in% tmp,]
 43 | 		cat(paste(c("# WARNING: Excluded ", as.character(length(tmp)), " genes with multiple associated rows.\n"), collapse = ""))
 44 | 	}
 45 | 	coordinates <- coordinates[!duplicated(coordinates),]
 46 | 	tmp <- table(coordinates$barcode)
 47 | 	tmp <- names(tmp[tmp > 1])
 48 | 	if (length(tmp) > 0) {
 49 | 		coordinates <- coordinates[!coordinates$barcode %in% tmp,]
 50 | 		cat(paste(c("# WARNING: Excluded ", as.character(length(tmp)), " spot IDs with multiple associated rows.\n"), collapse = ""))
 51 | 	}
 52 | 	
 53 | 	#### Generate Seurat object
 54 | 	gene_names <- counts$gene
 55 | 	counts$gene <- NULL
 56 | 	counts <- as.matrix(counts)
 57 | 	rownames(counts) <- gene_names
 58 | 	seurat_object <- CreateSeuratObject(counts = counts, project = 'SlideSeq', assay = "Spatial")
 59 | 	rownames(coordinates) <- coordinates$barcode
 60 | 	## Using the spot coordinates, instead of pixel coordinates
 61 | 	## !!! This Seurat object cannot be directly overlayed on top of the tissue image (if any)
 62 | 	coordinates <- coordinates[,c("row", "col")]
 63 | 	# if (sum(is.na(coordinates$imagerow)) + sum(is.na(coordinates$imagecol)) == 0) {
 64 | 	# 	coordinates <- coordinates[,c("imagerow", "imagecol")]
 65 | 	# } else {
 66 | 	# 	coordinates <- coordinates[,c("row", "col")]
 67 | 	# }
 68 | 	colnames(coordinates) <- c("xcoord", "ycoord")
 69 | 	seurat_object[['images']]<- new(Class = "SlideSeq", assay = "Spatial", coordinates = coordinates)
 70 | 
 71 | 	nspots <- ncol(seurat_object)
 72 | 	ngenes <- nrow(seurat_object)
 73 | 	
 74 | 	#### Spot QC
 75 | 	
 76 | 	## Step 1. Remove the spots with total UMI count < 500 / the total number of genes < 500 / >= 25% mitochondrial reads
 77 | 	seurat_object[["percent_mt"]] <- PercentageFeatureSet(seurat_object, "^MT-")
 78 | 	qc_step1 <- seurat_object$nCount_Spatial >= 500 & seurat_object$nFeature_Spatial >= 500 & seurat_object$percent_mt < 25
 79 | 	if (sum(qc_step1) < 10) {
 80 | 		cat(paste(c("# NOTE: Sample has less than 10 spots after QC. This sample will be excluded from the database and metatable.tsv.\n"), collapse = ""))
 81 | 		next
 82 | 	}
 83 | 	seurat_object <- seurat_object[, qc_step1]
 84 | 
 85 | 	## Step 2.
 86 | 	## Remove the spots with total UMI count < median(total UMI count) - 3 * SD(total UMI count).
 87 | 	## Remove the spots with total number of genes < median(total number of genes) - 3 * SD(total number of genes)
 88 | 	qc_step2 <- seurat_object$nCount_Spatial >= median(seurat_object$nCount_Spatial) - 3 * sqrt(var(seurat_object$nCount_Spatial)) & seurat_object$nFeature_Spatial >= median(seurat_object$nFeature_Spatial) - 3 * sqrt(var(seurat_object$nFeature_Spatial))
 89 | 	if (sum(qc_step2) < 10) {
 90 | 		cat(paste(c("# NOTE: Sample has less than 10 spots after QC. This sample will be excluded from the database and metatable.tsv.\n"), collapse = ""))
 91 | 		next
 92 | 	}
 93 | 	seurat_object <- seurat_object[, qc_step2]
 94 | 	
 95 | 	#### Gene QC
 96 | 	counts <- data.frame(GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts"))
 97 | 	counts <- counts > 0
 98 | 	n_spots_per_gene <- rowSums(counts)
 99 | 	seurat_object <- seurat_object[n_spots_per_gene >= 5,]
100 | 	
101 | 	nspots_qc <- ncol(seurat_object)
102 | 	ngenes_qc <- nrow(seurat_object)
103 | 	
104 | 	#### Append to the dataset metatable
105 | 	meta <- rbind(meta, data.frame(DSID = dsid, SampleID = sampleid, Nspots = nspots, Nspots_postQC = nspots_qc, Ngenes = ngenes, Ngenes_postQC = ngenes_qc, Condition = NA))
106 | 	
107 | 	#### Transform and process seurat_object
108 | 	seurat_object <- SCTransform(seurat_object, assay = "Spatial", verbose = FALSE)
109 | 	
110 | 	#### Dimensionality reduction and clustering
111 | 	
112 | 	#### The number of spots in a ST dataset is often small, need to set the npcs and dims parameters
113 | 	n_pcs <- min(min(dim(seurat_object)) - 1, 50)
114 | 	seurat_object <- RunPCA(seurat_object, assay = "SCT", verbose = FALSE, npcs = n_pcs)
115 | 	n_dims <- min(min(dim(seurat_object)) - 1, 30)
116 | 	seurat_object <- FindNeighbors(seurat_object, reduction = "pca", dims = 1:n_dims, verbose = FALSE)
117 | 	
118 | 	## If the number of spots is < 50, UMAP with uwot (default) will fail, and FindClusters with resolution  = 1.2 will sometimes fail
119 | 	## https://github.com/satijalab/seurat/issues/4312#issuecomment-812938288
120 | 	if (ncol(seurat_object) < 50) {
121 | 		seurat_object <- FindClusters(seurat_object, resolution = 1, verbose = FALSE)
122 | 		## resolution 1.2 sometimes fail
123 | 		seurat_object <- RunUMAP(seurat_object, umap.method = "umap-learn", reduction = "pca", dims = 1:n_dims, verbose = FALSE)
124 | 		cat(paste(c("# WARNING: Sample has less than 50 spots after QC.\n"), collapse = ""))
125 | 	} else {
126 | 		seurat_object <- FindClusters(seurat_object, resolution = 1.2, verbose = FALSE)
127 | 		## resolution: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html advises 0.4-1.2 for around 3K cells
128 | 		seurat_object <- RunUMAP(seurat_object, reduction = "pca", dims = 1:n_dims, verbose = FALSE)
129 | 	}
130 | 	
131 | 	### Save processed Seurat object ("processed" folder should already exist for all datasets, but create it just in case)
132 | 	processed_dir <- paste0(dsdir, "/", sampleid, "/processed/")
133 | 	if (!dir.exists(processed_dir)) {
134 | 		dir.create(processed_dir)
135 | 	}
136 | 	saveRDS(seurat_object, file = paste0(processed_dir, "Seurat.RDS"))
137 | 	
138 | 	# #### Retrieve QC-ed + transformed counts and coordinates
139 | 	# ## Counts 
140 | 	# counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
141 | 	# counts <- tibble::rownames_to_column(data.frame(counts), "gene")
142 | 	# ## Coordinates
143 | 	# coords <- seurat_object@images$image@coordinates
144 | 	# coords <- tibble::rownames_to_column(data.frame(coords), "barcode")
145 | 	# coords <- coords[,c("barcode", "x", "y")]
146 | 	
147 | 	# #### Write counts, coordinates, and meta_spots to file
148 | 	# write.table(counts, file = paste0(dsdir, "/", sampleid, "/processed/counts.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
149 | 	# write.table(coords, file = paste0(dsdir, "/", sampleid, "/processed/coordinates.csv"), quote = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
150 | 	# saveRDS(list("counts" = counts, "coordinates" = coords), file = paste0(dsdir, "/", sampleid, "/processed/data_frames.RDS"))
151 | 	# cat(paste(c("### [", sampleid, "] Finished.\n"), collapse = ""))
152 | 	# gc()
153 | 
154 | 	## Rename variables for the below code that is pasted from rewrite_text_files.R ##
155 | 	sample_dir <- paste0(dsdir, "/", sampleid)
156 | 	
157 | 	## The in-memory seurat_object is the one that was just written to processed/Seurat.RDS, so it is not read back from disk here
158 | 	
159 | 	#### Create spot IDs following R's column name requirements ("sp1", "sp2", ... in meta.data row order)
160 | 	spotmeta <- seurat_object@meta.data
161 | 	spotmeta$new_spot_id <- paste0("sp", seq_len(nrow(spotmeta))) ### seq_len() stays empty for zero rows, unlike 1:nrow()
162 | 	seurat_object[["new_spot_id"]] <- spotmeta$new_spot_id
163 | 	saveRDS(seurat_object, paste0(sample_dir, "/processed/Seurat.RDS")) ### Overwrite the saved object so that it carries new_spot_id
164 | 	
165 | 	spot_id_mapping <- spotmeta$new_spot_id ### Named vector: names = original meta.data row names, values = new spot IDs
166 | 	names(spot_id_mapping) <- rownames(spotmeta)
167 | 	
168 | 	#### Prepare data for deconvolution: raw counts as a dense matrix with one row per spot and one column per gene
169 | 	if ("Spatial" %in% names(seurat_object@assays)) {
170 | 		counts <- GetAssayData(object = seurat_object, assay = "Spatial", slot = "counts")
171 | 	} else {
172 | 		### MERFISH datasets
173 | 		counts <- GetAssayData(object = seurat_object, assay = "SCT", slot = "counts")
174 | 	}
175 | 	counts <- as.matrix(counts)
176 | 	gc()
177 | 	## Transpose directly with t(): it yields the same matrix (rows = spots, columns = genes, both keeping
178 | 	## their names) as the previous data.table() -> transpose() -> as.matrix() round trip, but without
179 | 	## building the slow and memory-hungry intermediate data.table copies
180 | 	counts_t <- t(counts)
181 | 	gc()
182 | 	## gene_names / cell_names were side products of the previous implementation; they are still assigned for compatibility
183 | 	gene_names <- rownames(counts)
184 | 	cell_names <- rownames(counts_t)
185 | 	## bk.dat.RDS: the transposed count matrix (row names are the column names of the assay matrix)
186 | 	saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RDS"))
187 | 	gc()
188 | 	
189 | 	#### Retrieve QC-ed counts and coordinates
190 | 	
191 | 	## Counts: "SCT" counts as a dense matrix whose columns are renamed to the new spot IDs, then a data.frame with a leading "gene" column
192 | 	counts <- as.matrix(GetAssayData(object = seurat_object, assay = "SCT", slot = "counts"))
193 | 	colnames(counts) <- as.character(spot_id_mapping[colnames(counts)])
194 | 	counts_df <- data.frame(counts)
195 | 	counts_df <- tibble::rownames_to_column(counts_df, "gene")
196 | 	
197 | 	## Coordinates: pull the spot positions from the image slot and rename the columns to x, y, barcode
198 | 	if ("slice1" %in% names(seurat_object@images)) {
199 | 		location <- seurat_object@images$slice1@coordinates ### Visium
200 | 		location <- location[, c("col", "row")] ### Use the spot coordinates
201 | 	} else {
202 | 		location <- seurat_object@images$image@coordinates ### Others
203 | 		if ("x" %in% colnames(location)) {
204 | 			location <- location[, c("x", "y")] ### Use the spot coordinates
205 | 		} else {
206 | 			### Some prepared MERFISH datasets did not follow the naming standard
207 | 			location <- location[, c("xcoord", "ycoord")] ### Use the spot coordinates
208 | 			colnames(location) <- c("x", "y")
209 | 		}
210 | 	}
211 | 	location$barcode <- rownames(location)
212 | 	colnames(location) <- c("x", "y", "barcode")
213 | 	location$barcode <- spot_id_mapping[location$barcode] ### Original barcode -> new spot ID (NA when the barcode is not in spot_id_mapping)
214 | 	
215 | 	## Sometimes (e.g. /share/fsmresfiles/SpatialT/10x/PID153/DS153A/DS153A.1), the number of spots in seurat_object@images is different from ncol(seurat_object); not sure why this is the case
216 | 	## Keep only the spots present in both tables. Image-only spots have an NA ID and are dropped first, because NA or duplicated row names make the rownames<- assignment below fail
217 | 	location <- location[!is.na(location$barcode), , drop = FALSE]
218 | 	counts_spots <- colnames(counts_df)
219 | 	keep_spots <- intersect(counts_spots, location$barcode)
220 | 	counts_df <- counts_df[, c("gene", keep_spots), drop = FALSE] ### drop = FALSE keeps a data.frame even if no spot is shared
221 | 	rownames(location) <- location$barcode
222 | 	location <- location[keep_spots, , drop = FALSE] ### Same spot order as the columns of counts_df
223 | 	
224 | 	#### Export the aligned tables: counts.csv and coordinates.csv (comma-separated, with header, no row names, unquoted), plus both bundled as a list in data_frames.RDS
225 | 	write.table(counts_df, file = paste0(sample_dir, "/processed/counts.csv"), sep = ",", quote = FALSE, col.names = TRUE, row.names = FALSE)
226 | 	write.table(location, file = paste0(sample_dir, "/processed/coordinates.csv"), sep = ",", quote = FALSE, col.names = TRUE, row.names = FALSE)
227 | 	saveRDS(list(counts = counts_df, coordinates = location), file = paste0(sample_dir, "/processed/data_frames.RDS"))
228 | 	
229 | 	#### Prepare data for deconvolution (relative counts): NormalizeData with method "RC" and scale.factor = 1e6, saved with one row per spot
230 | 	if ("Spatial" %in% names(seurat_object@assays)) {
231 | 		DefaultAssay(seurat_object) <- "Spatial"
232 | 		seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
233 | 		counts <- GetAssayData(object = seurat_object, assay = "Spatial")
234 | 	} else {
235 | 		### MERFISH datasets
236 | 		seurat_object <- NormalizeData(seurat_object, normalization.method = "RC", scale.factor = 1e6)
237 | 		counts <- GetAssayData(object = seurat_object, assay = "SCT")
238 | 	}
239 | 	counts <- as.matrix(counts)
240 | 	gc()
241 | 	## Transpose directly with t(): it yields the same matrix (rows = spots, columns = genes, both keeping
242 | 	## their names) as the previous data.table() -> transpose() -> as.matrix() round trip, but without
243 | 	## building the slow and memory-hungry intermediate data.table copies
244 | 	counts_t <- t(counts)
245 | 	gc()
246 | 	## gene_names / cell_names were side products of the previous implementation; they are still assigned for compatibility
247 | 	gene_names <- rownames(counts)
248 | 	cell_names <- rownames(counts_t)
249 | 	## bk.dat.RC.RDS: the transposed matrix of the values returned by GetAssayData after the RC normalization
250 | 	saveRDS(counts_t, paste0(sample_dir, "/processed/bk.dat.RC.RDS"))
251 | 	
252 | 	cat(paste(c("### [", sampleid, "] Finished.\n"), collapse = ""))
253 | }
254 | ### Write the table collected in the loop as metatable_auto.tsv; if a metatable_orig.tsv exists, inner-join it on DSID/SampleID before writing metatable.tsv
255 | write.table(meta, file = paste0(dsdir, "/metatable_auto.tsv"), sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
256 | orig_meta_path <- paste0(dsdir, "/metatable_orig.tsv")
257 | if (file.exists(orig_meta_path)) {
258 | 	meta$Condition <- NULL ### Drop the Condition column before merging in the columns of metatable_orig.tsv
259 | 	meta <- merge(meta, fread(orig_meta_path), by = c("DSID", "SampleID"))
260 | }
261 | write.table(meta, file = paste0(dsdir, "/metatable.tsv"), sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
262 | cat(">>>>>>>> Dataset: ", dsid, " completed <<<<<<<<\n", sep = "")
263 | 


--------------------------------------------------------------------------------