├── .gitignore ├── .readthedocs.yaml ├── Archive ├── MaxFuse_devo │ └── 09302022V │ │ ├── graph.py │ │ ├── match.py │ │ ├── match_utils.py │ │ ├── metrics.py │ │ ├── utils.py │ │ └── utils_bk.py ├── abseq-bmc │ └── code │ │ ├── abseq-dataprep.Rmd │ │ ├── analysis │ │ ├── allmetrices_extraction.ipynb │ │ ├── match_utils.py │ │ ├── metrics.py │ │ └── utils.py │ │ └── benchmark │ │ ├── calculate_metrics.R │ │ ├── methods_running │ │ ├── bsc_ab.R │ │ ├── harm_ab.R │ │ ├── liger_ab.R │ │ ├── maxfuse_ab.py │ │ └── seurat_ab.R │ │ ├── metrics.R │ │ ├── step1.sh │ │ └── step2.sh ├── asapseq-pbmc │ └── code │ │ ├── analysis │ │ ├── asap_metric_extraction.ipynb │ │ ├── match_utils.py │ │ ├── metrics.py │ │ └── utils.py │ │ ├── asap_dataprep.Rmd │ │ └── benchmark │ │ ├── calculate_metrics.R │ │ ├── methods_run │ │ ├── MaxFuse_cite.py │ │ ├── bsc_cite.R │ │ ├── harm_cite.R │ │ ├── liger_cite.R │ │ └── seurat_cite.R │ │ ├── metrics.R │ │ ├── step1.sh │ │ └── step2.sh ├── citeseq-bmc │ └── code │ │ ├── analysis │ │ ├── allmetrices_extraction.ipynb │ │ ├── match_utils.py │ │ ├── metrics.py │ │ └── utils.py │ │ ├── benchmark │ │ ├── calculate_metrics.R │ │ ├── methods_running │ │ │ ├── bsc_cite.R │ │ │ ├── harm_cite.R │ │ │ ├── liger_cite.R │ │ │ ├── maxfuse_cite.py │ │ │ └── seurat_cite.R │ │ ├── metrics.R │ │ ├── step1.sh │ │ └── step2.sh │ │ └── citeseq-bmc-dataprep.Rmd ├── citeseq-pbmc │ └── code │ │ ├── analysis │ │ ├── CM_extraction-drop.ipynb │ │ ├── CM_extraction.ipynb │ │ ├── allmetrices_extraction-drop.ipynb │ │ ├── allmetrices_extraction.ipynb │ │ ├── match_utils.py │ │ ├── metrics.py │ │ ├── plot.Rmd │ │ ├── plot_drop.Rmd │ │ ├── plot_reduction.Rmd │ │ └── utils.py │ │ ├── benchmark │ │ ├── calculate_metrics.R │ │ ├── method_running │ │ │ ├── bsc_cite.R │ │ │ ├── bsc_cite_drop.R │ │ │ ├── bsc_cite_reduction-drop.R │ │ │ ├── bsc_cite_reduction.R │ │ │ ├── harm_cite.R │ │ │ ├── harm_cite_drop.R │ │ │ ├── harm_cite_reduc-drop.R │ │ │ ├── harm_cite_reduc.R │ │ │ ├── 
liger_cite.R │ │ │ ├── liger_cite_drop.R │ │ │ ├── liger_cite_reduction-drop.R │ │ │ ├── liger_cite_reduction.R │ │ │ ├── maxfues_cite-drop.py │ │ │ ├── maxfuse_cite.py │ │ │ ├── maxfuse_cite_reduction-drop.py │ │ │ ├── maxfuse_cite_reduction.py │ │ │ ├── seurat_cite.R │ │ │ ├── seurat_cite_drop.R │ │ │ ├── seurat_cite_reduc-drop.R │ │ │ └── seurat_cite_reduc.R │ │ ├── metrics.R │ │ ├── reduction.sh │ │ ├── step1-drop.sh │ │ ├── step1.sh │ │ └── step2.sh │ │ └── citeseq-pbmc-dataprep.Rmd ├── hubmap │ └── code │ │ ├── analysis │ │ ├── CL_tri-integration.ipynb │ │ ├── SB_tri-integration.ipynb │ │ ├── plot_cl_results.Rmd │ │ └── plot_sb_results.Rmd │ │ └── preparation │ │ ├── atac │ │ └── prep_hubmap_atac.Rmd │ │ ├── codex │ │ ├── patient_tissues_select.ipynb │ │ └── prep_hubmapCODEX.Rmd │ │ └── rna │ │ └── prep_hubmapRNA.Rmd ├── hubmap_nature │ ├── CL_0719production.py │ ├── SB_0718production.py │ ├── figure_plotting.Rmd │ └── readme.md ├── strong-link │ ├── 10xe18 │ │ ├── analysis │ │ │ ├── calculate_metrics.R │ │ │ ├── check_metrics_e18.ipynb │ │ │ ├── metrics.R │ │ │ └── step2.sh │ │ ├── method_running │ │ │ ├── R_workflow_maestro_10xe18mouse.R │ │ │ ├── glue_e18_mouse_preprocessing.ipynb │ │ │ ├── glue_e18_mouse_training.ipynb │ │ │ ├── glue_e18_prepare_data_h5.R │ │ │ ├── mf_10x_e18.ipynb │ │ │ ├── scj_add_prep_10xe18mouse.R │ │ │ └── scj_config_e18_celltype.py │ │ └── prep_e18.Rmd │ ├── 10xpbmc │ │ ├── analysis │ │ │ ├── calculate_metrics.R │ │ │ ├── check_metrics.ipynb │ │ │ ├── metrics.R │ │ │ └── step2.sh │ │ ├── method_running │ │ │ ├── R_workflow_maestro_10xpbmc.R │ │ │ ├── glue_pbmc_prepare_data_h5.R │ │ │ ├── glue_pbmc_preprocessing.ipynb │ │ │ ├── glue_pbmc_training.ipynb │ │ │ ├── mf_pbmc.ipynb │ │ │ ├── scj_add_prep_data_10xpbmc.R │ │ │ └── scj_config_pbmc.py │ │ └── prep_pbmc.Rmd │ ├── cortical │ │ ├── analysis │ │ │ ├── calculate_metrics.R │ │ │ ├── check_metrics_greenleaf.ipynb │ │ │ ├── metrics.R │ │ │ └── step2.sh │ │ ├── method_running │ │ │ ├── 
glue_greenleaf_prepare_data_h5.R │ │ │ ├── glue_greenleaf_preprocessing.ipynb │ │ │ ├── glue_greenleaf_training.ipynb │ │ │ ├── maestro_10xgreenleaf.R │ │ │ ├── mf_cortical.ipynb │ │ │ ├── scj_config_greenleaf_celltype.py │ │ │ └── scj_prep_data_10xgreenleaf.R │ │ └── prep_cortical.Rmd │ └── retina │ │ ├── analysis │ │ ├── calculate_metrics.R │ │ ├── metrics.R │ │ ├── retina_extrac_all_metric.ipynb │ │ └── step2.sh │ │ ├── method_running │ │ ├── R_workflow_maestro_retina.R │ │ ├── mf_retina.ipynb │ │ ├── retina_prepare_data_h5.R │ │ ├── retina_preprocessing.ipynb │ │ ├── retina_training.ipynb │ │ ├── scj_config_retina_celltype.py │ │ └── scj_prepare_data_retina.R │ │ └── prep_retina.Rmd ├── teaseq-pbmc │ └── code │ │ ├── analysis │ │ ├── match_utils.py │ │ ├── metrics.py │ │ ├── tea_metric_extraction.ipynb │ │ └── utils.py │ │ ├── benchmark │ │ ├── calculate_metrics.R │ │ ├── methods_running │ │ │ ├── bsc_cite.R │ │ │ ├── harm_cite.R │ │ │ ├── liger_cite.R │ │ │ ├── maxfuse_cite.py │ │ │ └── seurat_cite.R │ │ ├── metrics.R │ │ ├── step1.sh │ │ └── step2.sh │ │ └── teaseq_dataprep.Rmd └── tonsil │ └── code │ ├── analysis │ ├── allmetrices_extraction_tonsil.ipynb │ ├── full_data_postprocessing.ipynb │ ├── match_utils.py │ ├── metrics.py │ ├── plot_tonsil_gcrelated_analysis.Rmd │ ├── plot_tonsil_met.Rmd │ ├── plot_tonsil_umap.Rmd │ └── utils.py │ ├── benchmark │ ├── calculate_metrics.R │ ├── method_running │ │ ├── bsc_batch.R │ │ ├── bsc_full.R │ │ ├── harm_batch.R │ │ ├── hm_full.R │ │ ├── lg_full.R │ │ ├── liger_batch.R │ │ ├── mf_batch.py │ │ ├── mf_full.py │ │ ├── seurat_batch.R │ │ └── sr_full.R │ ├── metrics.R │ ├── step1.sh │ └── step2.sh │ └── preparation_code │ ├── add_centroide_toinput.ipynb │ ├── prep_subsetting_andMore.Rmd │ ├── prepare_gc_related.ipynb │ ├── tonsilcodex_dataprep.Rmd │ └── tonsilrna_dataprep.Rmd ├── LICENSE ├── README.md ├── docs ├── .DS_Store ├── Makefile ├── _static │ └── .Rhistory ├── _templates │ ├── README.md │ ├── class.rst │ └── 
module.rst ├── api.rst ├── api │ ├── maxfuse.graph.construct_graph.rst │ ├── maxfuse.graph.get_nearest_neighbors.rst │ ├── maxfuse.graph.get_umap_embeddings.rst │ ├── maxfuse.graph.graph_clustering.rst │ ├── maxfuse.graph.leiden_clustering.rst │ ├── maxfuse.graph.rst │ ├── maxfuse.match_utils.address_matching_redundancy.rst │ ├── maxfuse.match_utils.get_initial_matching.rst │ ├── maxfuse.match_utils.get_refined_matching.rst │ ├── maxfuse.match_utils.get_refined_matching_one_iter.rst │ ├── maxfuse.match_utils.match_cells.rst │ ├── maxfuse.match_utils.rst │ ├── maxfuse.metrics.get_foscttm.rst │ ├── maxfuse.metrics.get_knn_alignment_score.rst │ ├── maxfuse.metrics.get_matching_acc.rst │ ├── maxfuse.metrics.get_matching_alignment_score.rst │ ├── maxfuse.metrics.rst │ ├── maxfuse.model.Fusor.construct_graphs.rst │ ├── maxfuse.model.Fusor.filter_bad_matches.rst │ ├── maxfuse.model.Fusor.find_initial_pivots.rst │ ├── maxfuse.model.Fusor.get_embedding.rst │ ├── maxfuse.model.Fusor.get_matching.rst │ ├── maxfuse.model.Fusor.plot_canonical_correlations.rst │ ├── maxfuse.model.Fusor.plot_matching_scores.rst │ ├── maxfuse.model.Fusor.plot_singular_values.rst │ ├── maxfuse.model.Fusor.propagate.rst │ ├── maxfuse.model.Fusor.refine_pivots.rst │ ├── maxfuse.model.Fusor.rst │ ├── maxfuse.model.Fusor.split_into_batches.rst │ ├── maxfuse.model.rst │ ├── maxfuse.rst │ ├── maxfuse.spatial_utils.bind_spatial.rst │ ├── maxfuse.spatial_utils.get_neighborhood_composition.rst │ ├── maxfuse.spatial_utils.get_spatial_knn_indices.rst │ ├── maxfuse.spatial_utils.rst │ ├── maxfuse.utils.cca_embedding.rst │ ├── maxfuse.utils.cdist_correlation.rst │ ├── maxfuse.utils.center_scale.rst │ ├── maxfuse.utils.dict_to_list.rst │ ├── maxfuse.utils.drop_zero_variability_columns.rst │ ├── maxfuse.utils.filter_bad_matches.rst │ ├── maxfuse.utils.get_centroids.rst │ ├── maxfuse.utils.graph_smoothing.rst │ ├── maxfuse.utils.list_to_dict.rst │ ├── maxfuse.utils.pearson_correlation.rst │ ├── 
maxfuse.utils.process_count_data.rst │ ├── maxfuse.utils.recode.rst │ ├── maxfuse.utils.robust_svd.rst │ ├── maxfuse.utils.rst │ ├── maxfuse.utils.shrink_towards_centroids.rst │ ├── maxfuse.utils.sort_dict.rst │ ├── maxfuse.utils.summarize_clustering.rst │ ├── maxfuse.utils.svd_denoise.rst │ └── maxfuse.utils.svd_embedding.rst ├── citeseq_pbmc_evaluate.ipynb ├── conf.py ├── index.rst ├── make.bat ├── protein_gene_conversion.csv ├── requirements.txt ├── tonsil_codex_rnaseq.ipynb └── tutorials.rst ├── maxfuse ├── __init__.py ├── graph.py ├── match_utils.py ├── metrics.py ├── model.py ├── spatial_utils.py └── utils.py ├── media ├── ai_generated_icon.png ├── fig1.png └── temp.md └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # custom 132 | .idea/ 133 | data/ 134 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.8" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | # formats: 24 | # - pdf 25 | 26 | # Optionally declare the Python requirements required to build your docs 27 | python: 28 | install: 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 
7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("./") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | 27 | # progress messages are stamped with Sys.time(): Sys.Date() carries no time of day, so "%c" would always print midnight 28 | # calculate structure alignment metrics 29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...')) 30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname, 31 | n_idx=n_idx, data_idx='x') 32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 33 | n_idx=n_idx, data_idx='y') 34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 35 | 36 | # calculate Silhouette width 37 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...')) 38 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 39 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 40 | # calculate ARI 41 | 
print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...')) 42 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 43 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 44 | 45 | write_csv(metrics, metrics_fname) 46 | -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/methods_running/harm_ab.R: -------------------------------------------------------------------------------- 1 | #harmony benchmark 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/abseq/output/" 8 | in_root = "/abseq/data_prep/" 9 | out_indx = 15 10 | 11 | out_dir =paste0(out_root,"hm/") 12 | in_dir = in_root 13 | dir.create(out_root) 14 | dir.create(out_dir) 15 | # read 16 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt")) 17 | rna = as.matrix(rna) 18 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv")) 19 | protein = protein[,-1]# remove the row index 20 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv")) 21 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # rna names always the same 22 | colnames(rna) = rna_names$names 23 | 24 | rna = rna[meta$hm_annotate != "dirty",] 25 | protein = protein[meta$hm_annotate != "dirty",] 26 | meta = meta[meta$hm_annotate != "dirty",] 27 | 28 | # change name 29 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 30 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 31 | rna_list = c() 32 | protein_list = c() 33 | for (i in c(1:dim(correspondence)[1])){ 34 | protein_n = as.character(correspondence[i,1]) 35 | rna_n = as.character(correspondence[i,2]) 36 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 37 | next 38 | } 39 | rna_n = strsplit(rna_n, '/')[[1]] 40 | for(r in rna_n){ 41 | if (r %in% rna_names$names){ 42 | rna_list = c(rna_list, r) 43 | protein_list 
= c(protein_list, protein_n) 44 | } 45 | } 46 | } 47 | # change name end 48 | # first filtering step should be same as in sp 49 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (rna features that have a matching protein) 50 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same feature order as rna.shared) 51 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 52 | # copy sp filtering 53 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2] 54 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 55 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 56 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 57 | # then we construct the seurat objects 58 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 59 | x_obj <- NormalizeData(x_obj) 60 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 61 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 62 | # add seurat object for data y 63 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 64 | y_obj <- NormalizeData(y_obj) 65 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 66 | #list_modality=list(x_obj,y_obj) 67 | # get shared clean features 68 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 69 | # run harmony in seurat, need to make a new seurat object 70 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 71 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 72 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE) 73 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell, in cbind order: "x" = rna, "y" = protein; harmony integrates across these two groups 74 | # cbind together, scale within modality is better 75 | xy_obj <- 
xy_obj %>% RunHarmony("orig") 76 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 77 | name_1 = "full_embed_x0.csv" 78 | name_2 = "full_embed_y0.csv" 79 | # does not directly produce matching info, produce later using knn with embedding distance matrix 80 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 81 | row.names=FALSE) # need to decide output pca cell 82 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 83 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 84 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 85 | 86 | ## 87 | -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/methods_running/liger_ab.R: -------------------------------------------------------------------------------- 1 | #liger benchmark 2 | library(rliger) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/abseq/output/" 7 | in_root = "/abseq/data_prep/" 8 | out_indx = 15 9 | 10 | out_dir =paste0(out_root,"lg/") 11 | in_dir = in_root 12 | dir.create(out_root) 13 | dir.create(out_dir) 14 | # read 15 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt")) 16 | rna = as.matrix(rna) 17 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv")) 18 | protein = protein[,-1]# remove the row index 19 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv")) 20 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # rna names always the same 21 | colnames(rna) = rna_names$names 22 | 23 | rna = rna[meta$hm_annotate != "dirty",] 24 | protein = protein[meta$hm_annotate != "dirty",] 25 | meta = meta[meta$hm_annotate != "dirty",] 26 | 27 | # change name 28 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 30 | rna_list = c() 31 | protein_list = c() 32 | for (i in c(1:dim(correspondence)[1])){ 33 | 
protein_n = as.character(correspondence[i,1]) 34 | rna_n = as.character(correspondence[i,2]) 35 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 36 | next 37 | } 38 | rna_n = strsplit(rna_n, '/')[[1]] 39 | for(r in rna_n){ 40 | if (r %in% rna_names$names){ 41 | rna_list = c(rna_list, r) 42 | protein_list = c(protein_list, protein_n) 43 | } 44 | } 45 | } 46 | # change name end 47 | # first filtering step should be same as in sp 48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (rna features that have a matching protein) 49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same feature order as rna.shared) 50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 51 | # copy sp filtering to produce better output 52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2] 53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 54 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 55 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 56 | # then we construct the liger objects 57 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE) 58 | ###Start integration 59 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality 60 | # default preprocessing 61 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 62 | # do not need to select genes 63 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 64 | ligerobj@var.genes=features # just use all 65 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 66 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE) 67 | ligerobj <- quantile_norm(ligerobj) 68 | embedding = ligerobj@H.norm[,c(1:out_indx)] 69 | name_1 = "full_embed_x0.csv" 70 | name_2 = "full_embed_y0.csv" 71 | # no available matching information from 
liger thus not saved out 72 | # will use knn to search matching on embedding in downstream analysis 73 | # check what cell is filtered out 74 | `%notin%` <- Negate(`%in%`) 75 | filtered = 76 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)] 77 | filtered_id = as.integer(gsub("d1", "", filtered)) # filter id 12774 78 | # extract numbers 79 | write.csv(embedding[c(1:12901),], 80 | paste0(out_dir,name_1), row.names=FALSE) # note one cell got deleted by liger process, so the x/y row ranges here are hard-coded for this dataset 81 | write.csv(embedding[c(12902:25803),], 82 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 83 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 84 | 85 | # also get the right original pca values for cells, since one cell got deleted; NOTE(review): x_orig[-filtered_id,] returns zero rows if filtered_id is empty - confirm a cell was actually filtered before reuse 86 | x_orig = read.csv("/abseq/data_prep/orig_x.csv") 87 | x_orig_sub = x_orig[-filtered_id,] 88 | write.csv(x_orig_sub, "/abseq/data_prep/orig_lg_x.csv", row.names=FALSE) 89 | # save out filtered id 90 | write.csv(data.frame(id = filtered_id), "/abseq/output/lg/filt_id.csv") 91 | 92 | -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/methods_running/seurat_ab.R: -------------------------------------------------------------------------------- 1 | #seurat benchmark 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/abseq/output/" 7 | in_root = "/abseq/data_prep/" 8 | out_indx = 15 9 | 10 | out_dir =paste0(out_root,"sr/") 11 | in_dir = in_root 12 | dir.create(out_root) 13 | dir.create(out_dir) 14 | # read 15 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt")) 16 | rna = as.matrix(rna) 17 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv")) 18 | protein = protein[,-1]# remove the row index 19 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv")) 20 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # 
rna names always the same 21 | colnames(rna) = rna_names$names 22 | 23 | rna = rna[meta$hm_annotate != "dirty",] 24 | protein = protein[meta$hm_annotate != "dirty",] 25 | meta = meta[meta$hm_annotate != "dirty",] 26 | 27 | # change name 28 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 30 | rna_list = c() 31 | protein_list = c() 32 | for (i in c(1:dim(correspondence)[1])){ 33 | protein_n = as.character(correspondence[i,1]) 34 | rna_n = as.character(correspondence[i,2]) 35 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 36 | next 37 | } 38 | rna_n = strsplit(rna_n, '/')[[1]] 39 | for(r in rna_n){ 40 | if (r %in% rna_names$names){ 41 | rna_list = c(rna_list, r) 42 | protein_list = c(protein_list, protein_n) 43 | } 44 | } 45 | } 46 | # change name end 47 | # first filtering step should be same as in sp 48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (rna features that have a matching protein) 49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same feature order as rna.shared) 50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 51 | # copy sp filtering 52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2] 53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 54 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) 55 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 56 | # then we construct the seurat objects 57 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 58 | x_obj <- NormalizeData(x_obj) 59 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case 60 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 61 | # add seurat object for data y 62 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 63 | y_obj <- NormalizeData(y_obj) 64 
| y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 65 | list_modality=list(x_obj,y_obj) 66 | # get transfer anchor 67 | features=intersect(rownames(x_obj),rownames(y_obj)) 68 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj, 69 | dims = 1:20, features = features) 70 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj), 71 | dims = 1:20) 72 | full_df = data.frame(idx2 = c(1:length(predictions$predicted.id)) -1, idx1 = as.integer(predictions$predicted.id) -1, 73 | score = predictions$prediction.score.max) # mind the r index difference: the -1 turns r's 1-based indices into 0-based ones 74 | # get integration embedding 75 | print("starting seurat integration") 76 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality, 77 | dims = 1:20, anchor.features =features, k.filter = 10) 78 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 10) 79 | # 80 | DefaultAssay(xy_int) <- "integrated" 81 | xy_int <- ScaleData(xy_int, verbose = FALSE) 82 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion 83 | embedding = xy_int@reductions$pca@cell.embeddings 84 | name_1 = "full_embed_x0.csv" 85 | name_2 = "full_embed_y0.csv" 86 | #pathout = out_dir 87 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell 88 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 89 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 90 | write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # need to decide 91 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 92 | -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/step1.sh: -------------------------------------------------------------------------------- 1 | ## run this in the algo python conda env 2 | python maxfuse_ab.py & 3 | /usr/bin/Rscript seurat_ab.R & 
4 | /usr/bin/Rscript liger_ab.R & 5 | /usr/bin/Rscript harm_ab.R & 6 | /usr/bin/Rscript bsc_ab.R -------------------------------------------------------------------------------- /Archive/abseq-bmc/code/benchmark/step2.sh: -------------------------------------------------------------------------------- 1 | # no conda env requirement 2 | # used to calc slt and ari for all methods 3 | 4 | # for mf 5 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/mf/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/mf/full_embed' 0 & 6 | 7 | # for sr 8 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/sr/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/sr/full_embed' 0 & 9 | 10 | # for lg; NOTE(review): liger_ab.R writes its output under /abseq/output/lg/, not lgunimf/ - confirm this path 11 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/lgunimf/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/lgunimf/full_embed' 0 & 12 | 13 | # for hm 14 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/hm/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/hm/full_embed' 0 & 15 | 16 | # for bsc 17 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/bsc/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/bsc/full_embed' 0 -------------------------------------------------------------------------------- /Archive/asapseq-pbmc/code/benchmark/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the 
better) 15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("./") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | # progress messages are stamped with Sys.time(): Sys.Date() carries no time of day, so "%c" would always print midnight 27 | # calculate structure alignment metrics 28 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...')) 29 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname, 30 | n_idx=n_idx, data_idx='x') 31 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 32 | n_idx=n_idx, data_idx='y') 33 | 34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 35 | 36 | # calculate Silhouette width 37 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...')) 38 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 39 | #print(slt_res) 40 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 41 | #print(metrics) 42 | # calculate ARI 43 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...')) 44 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 45 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 46 | 47 | # save metrics, because the calculation of kBET is substantially slower. 
48 | write_csv(metrics, metrics_fname) -------------------------------------------------------------------------------- /Archive/asapseq-pbmc/code/benchmark/methods_run/harm_cite.R: -------------------------------------------------------------------------------- 1 | #harmony benchmark 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/asap/output/" 8 | in_root = "/asap/data/" 9 | out_indx = 15 10 | 11 | out_dir =paste0(out_root,"hm/") 12 | in_dir = in_root 13 | dir.create(out_root) 14 | dir.create(out_dir) 15 | # read 16 | 17 | protein = read.csv(paste0(in_dir,"adt_pbmc.csv")) 18 | protein = protein[,!(names(protein) %in% c("X","barcode","CD4.1",'CD8a','CD11b.1'))]# not used channels; logical mask because -which() would select zero columns if none of these names were present 19 | colnames(protein) = gsub('\\.','-', colnames(protein)) 20 | colnames(protein) = gsub('-$','', colnames(protein)) 21 | 22 | meta = read.csv(paste0(in_dir,"asap_pbmc_meta.csv")) 23 | 24 | atacactivity = readMM(paste0(in_dir,"genescore_pbmc.txt")) 25 | atacactivity = as.matrix(atacactivity) 26 | gas_names = read.csv(paste0(in_dir ,'genescore_names_pbmc.csv')) 27 | colnames(atacactivity) = gas_names$names 28 | 29 | ## remove cells annotated as "dirt" from all three tables 30 | atacactivity = atacactivity[meta$human_ann != "dirt",] 31 | protein = protein[meta$human_ann != "dirt",] 32 | meta = meta[meta$human_ann != "dirt",] 33 | ## 34 | 35 | # change name: map protein names to gene names via the conversion table 36 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 37 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 38 | rna_list = c() 39 | protein_list = c() 40 | for (i in seq_len(nrow(correspondence))){ # seq_len: no iterations on an empty table (1:0 would visit rows 1 and 0) 41 | protein_n = as.character(correspondence[i,1]) 42 | rna_n = as.character(correspondence[i,2]) 43 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 44 | next 45 | } 46 | rna_n = strsplit(rna_n, '/')[[1]] 47 | for(r in rna_n){ 48 | if (r %in% gas_names$names){ 49 | rna_list = c(rna_list, r) 50 | protein_list = c(protein_list, protein_n) 51 | } 52 | } 53 | } 54 | 55 | act.shared = 
as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity columns that have a matching protein 56 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 57 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 58 | 59 | # copy sp filtering to produce better output 60 | act.shared.sub = act.shared[,colSds(act.shared)>0.5] 61 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 62 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub)))) 63 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 64 | # then we construct the seurat objects 65 | x_obj=CreateSeuratObject(counts=t(act.shared.sub),assay="x") 66 | #x_obj <- NormalizeData(x_obj) 67 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 68 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 69 | # add seurat object for data y 70 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 71 | y_obj <- NormalizeData(y_obj) 72 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 73 | #list_modality=list(x_obj,y_obj) 74 | # get shared clean features 75 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub)) 76 | # run harmony in seurat, need to make a new seurat object 77 | xy_obj = CreateSeuratObject(counts=cbind(t(act.shared.sub[,features]), t(protein.shared.sub[,features]))) 78 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 79 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE) 80 | xy_obj@meta.data$orig = c(rep("x",dim(act.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # second block is "y": with both labelled "x" Harmony saw one batch and had nothing to integrate 81 | # cbind together, scale within modality is better 82 | xy_obj <- xy_obj %>% RunHarmony("orig") 83 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 84 | 
name_1 = "full_embed_x0.csv" 85 | name_2 = "full_embed_y0.csv" 86 | # does not directly produce matching info, produce later using knn with embedding distance matrix 87 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 88 | row.names=FALSE) # first ncol(x_obj) rows: x cells 89 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 90 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows: y cells 91 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 92 | -------------------------------------------------------------------------------- /Archive/asapseq-pbmc/code/benchmark/methods_run/liger_cite.R: -------------------------------------------------------------------------------- 1 | #liger benchmark 2 | library(rliger) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/asap/output/" 7 | in_root = "/asap/data/" 8 | out_indx = 15 9 | 10 | out_dir =paste0(out_root,"lg/") 11 | in_dir = in_root 12 | dir.create(out_root) 13 | dir.create(out_dir) 14 | # read 15 | 16 | protein = read.csv(paste0(in_dir,"adt_pbmc.csv")) 17 | protein = protein[,!(names(protein) %in% c("X","barcode","CD4.1",'CD8a','CD11b.1'))]# not used channels; logical mask because -which() would select zero columns if none of these names were present 18 | colnames(protein) = gsub('\\.','-', colnames(protein)) 19 | colnames(protein) = gsub('-$','', colnames(protein)) 20 | 21 | meta = read.csv(paste0(in_dir,"asap_pbmc_meta.csv")) 22 | 23 | atacactivity = readMM(paste0(in_dir,"genescore_pbmc.txt")) 24 | atacactivity = as.matrix(atacactivity) 25 | gas_names = read.csv(paste0(in_dir ,'genescore_names_pbmc.csv')) 26 | colnames(atacactivity) = gas_names$names 27 | 28 | ## remove cells annotated as "dirt" from all three tables 29 | atacactivity = atacactivity[meta$human_ann != "dirt",] 30 | protein = protein[meta$human_ann != "dirt",] 31 | meta = meta[meta$human_ann != "dirt",] 32 | ## 33 | 34 | # change name: map protein names to gene names via the conversion table 35 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 36 | correspondence = 
correspondence[!apply(correspondence == "", 1, all),] 37 | rna_list = c() 38 | protein_list = c() 39 | for (i in c(1:dim(correspondence)[1])){ 40 | protein_n = as.character(correspondence[i,1]) 41 | rna_n = as.character(correspondence[i,2]) 42 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 43 | next 44 | } 45 | rna_n = strsplit(rna_n, '/')[[1]] 46 | for(r in rna_n){ 47 | if (r %in% gas_names$names){ 48 | rna_list = c(rna_list, r) 49 | protein_list = c(protein_list, protein_n) 50 | } 51 | } 52 | } 53 | 54 | act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity columns that have a matching protein 55 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 56 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 57 | 58 | # copy sp filtering to produce better output 59 | act.shared.sub = act.shared[,colSds(act.shared)>0.5] 60 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 61 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub)))) 62 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 63 | # then we construct the liger objects 64 | ligerobj=createLiger( list(x = t(act.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE) 65 | ###Start integration 66 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality 67 | # default preprocessing 68 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 69 | # do not need to select genes 70 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 71 | ligerobj@var.genes=features # just use all 72 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 73 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE) 74 | ligerobj <- quantile_norm(ligerobj) 75 | embedding = ligerobj@H.norm[,c(1:out_indx)] 76 | name_1 = 
"full_embed_x0.csv" 77 | name_2 = "full_embed_y0.csv" 78 | # no available matching information from liger thus not saved out 79 | # will use knn to search matching on embedding in downstream analysis 80 | # check what cell is filtered out 81 | `%notin%` <- Negate(`%in%`) 82 | filtered = 83 | c(rownames(act.shared.sub), rownames(protein.shared.sub))[c(rownames(act.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)] 84 | filtered_id = as.integer(gsub("d1", "", filtered)) # no cells filtered 85 | 86 | write.csv(embedding[startsWith(rownames(embedding), "d1"),], # split on the "d1"/"d2" cell-name prefix set above instead of the hard-coded 4360/8720 row counts 87 | paste0(out_dir,name_1), row.names=FALSE) # x cells (rows named "d1...") 88 | write.csv(embedding[startsWith(rownames(embedding), "d2"),], 89 | paste0(out_dir,name_2), row.names=FALSE) # y cells (rows named "d2...") 90 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 91 | 92 | -------------------------------------------------------------------------------- /Archive/asapseq-pbmc/code/benchmark/step1.sh: -------------------------------------------------------------------------------- 1 | ## run this in the algo python conda env 2 | python MaxFuse_cite.py & # the script in methods_run/ is named MaxFuse_cite.py; "Maxfuse_cite.py" does not exist on a case-sensitive filesystem 3 | /usr/bin/Rscript seurat_cite.R & 4 | /usr/bin/Rscript liger_cite.R & 5 | /usr/bin/Rscript harm_cite.R & 6 | /usr/bin/Rscript bsc_cite.R 7 | -------------------------------------------------------------------------------- /Archive/asapseq-pbmc/code/benchmark/step2.sh: -------------------------------------------------------------------------------- 1 | # code to calc slt and ari f1 2 | 3 | # for mf 4 | /usr/bin/Rscript calculate_metrics.R '/asap/output/mf/metrics.csv' '/asap/data/orig' '/asap/output/mf/full_embed' 0 & 5 | 6 | # for sr 7 | /usr/bin/Rscript calculate_metrics.R '/asap/output/sr/metrics.csv' '/asap/data/orig' '/asap/output/sr/full_embed' 0 & 8 | 9 | # for lg 10 | /usr/bin/Rscript calculate_metrics.R '/asap/output/lg/metrics.csv' '/asap/data/orig' '/asap/output/lg/full_embed' 0 & 11 | 12 | # for hm 13 
| /usr/bin/Rscript calculate_metrics.R '/asap/output/hm/metrics.csv' '/asap/data/orig' '/asap/output/hm/full_embed' 0 & 14 | 15 | # for bsc 16 | /usr/bin/Rscript calculate_metrics.R '/asap/output/bsc/metrics.csv' '/asap/data/orig' '/asap/output/bsc/full_embed' 0 -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("./") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | 27 | 28 | # calculate structure alignment metrics 29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...')) 30 | sam_x = 
sam(orig_fname=orig_fname, embed_fname=embed_fname, 31 | n_idx=n_idx, data_idx='x') 32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 33 | n_idx=n_idx, data_idx='y') 34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 35 | # calculate Silhouette width (progress stamps use Sys.time(): Sys.Date() carries no time of day) 36 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...')) 37 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 38 | #print(slt_res) 39 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 40 | #print(metrics) 41 | # calculate ARI 42 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...')) 43 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 44 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 45 | 46 | write_csv(metrics, metrics_fname) 47 | -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/methods_running/bsc_cite.R: -------------------------------------------------------------------------------- 1 | # bindsc benchmark 2 | library(bindSC) 3 | library(Seurat) 4 | library(Matrix) 5 | library(matrixStats) 6 | # read in files 7 | out_root = "/bench_test4/output/" 8 | in_root = "/bench_test4/input/" 9 | out_indx = 15 10 | 11 | out_dir =paste0(out_root,"bsc/") 12 | in_dir = in_root 13 | dir.create(out_root) 14 | dir.create(out_dir) 15 | # read 16 | rna = readMM(paste0(in_dir,"rna200.txt")) 17 | rna = as.matrix(rna) 18 | protein = read.csv(paste0(in_dir,"pro200.csv")) 19 | meta = read.csv(paste0(in_dir,"meta200.csv")) 20 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same 21 | colnames(rna) = rna_names$names 22 | 23 | #### for bsc 24 | rownames(rna) = paste0("rna", c(1:nrow(rna))) 25 | rownames(protein) = paste0("pro", 
c(1:nrow(protein))) 26 | 27 | # change name: map protein names to gene names via the conversion table 28 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 30 | rna_list = c() 31 | protein_list = c() 32 | for (i in c(1:dim(correspondence)[1])){ 33 | protein_n = as.character(correspondence[i,1]) 34 | rna_n = as.character(correspondence[i,2]) 35 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 36 | next 37 | } 38 | rna_n = strsplit(rna_n, '/')[[1]] 39 | for(r in rna_n){ 40 | if (r %in% rna_names$names){ 41 | rna_list = c(rna_list, r) 42 | protein_list = c(protein_list, protein_n) 43 | } 44 | } 45 | } 46 | # change name end 47 | # first filtering step should be same as in sp 48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns that have a matching protein 49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 51 | # copy sp filtering 52 | # keep only features whose sd exceeds 0.1 in both modalities, so both matrices keep the same columns 53 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1 & colSds(protein.shared)>0.1] 54 | protein.shared.sub = protein.shared[,colSds(rna.shared)>0.1 & colSds(protein.shared)>0.1] 55 | 56 | # get cluster for bindsc x, using all x features 57 | xc_obj=CreateSeuratObject(counts=t(protein),assay="x") 58 | xc_obj <- NormalizeData(xc_obj) 59 | xc_obj <- ScaleData(xc_obj, features = rownames(xc_obj)) 60 | xc_obj <- RunPCA(xc_obj, features = rownames(xc_obj)) 61 | xc_obj <- FindNeighbors(xc_obj, dims = 1:15) 62 | xc_obj <- FindClusters(xc_obj, resolution = 1) 63 | x_cluster = as.factor(paste0('x_',as.character(Idents(xc_obj)))) 64 | 65 | # seurat object on the shared, filtered protein features (normalized only; no clustering here) 66 | x_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="x") 67 | x_obj <- NormalizeData(x_obj) 68 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))# not used 69 | 70 | # get cluster for bindsc y, using all y features 
(variable) 71 | y_obj=CreateSeuratObject(counts=t(rna),assay="y") 72 | y_obj <- NormalizeData(y_obj) 73 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 74 | y_obj <- FindVariableFeatures(y_obj, nfeatures = 3000) 75 | y_obj <- RunPCA(y_obj, features = VariableFeatures(object = y_obj)) 76 | y_obj <- FindNeighbors(y_obj, dims = 1:15) 77 | y_obj <- FindClusters(y_obj, resolution = 1) 78 | y_cluster = as.factor(paste0('y_',as.character(Idents(y_obj)))) 79 | 80 | y_input_features = VariableFeatures(object = y_obj) 81 | 82 | ## for Z0: normalized rna values over the shared, filtered features 83 | z_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="z") 84 | z_obj <- NormalizeData(z_obj) 85 | 86 | ## now gather all the actual inputs 87 | x_input = x_obj@assays$x@data 88 | y_input = as.matrix(as.data.frame(y_obj@assays$y@data[y_input_features,])) 89 | z0_input = z_obj@assays$z@data 90 | 91 | # start bindsc 92 | res <- BiCCA( X = x_input , 93 | Y = y_input, 94 | Z0 = z0_input, 95 | X.clst = x_cluster, 96 | Y.clst = y_cluster, 97 | alpha = 0.1, 98 | lambda = 0.7, 99 | K = 15, 100 | temp.path = "out", 101 | num.iteration = 50, 102 | tolerance = 0.01, 103 | save = TRUE, 104 | parameter.optimize = FALSE, 105 | block.size = 0) 106 | 107 | name_1 = "full_embed_x0.csv" 108 | name_2 = "full_embed_y0.csv" 109 | pathout = out_dir 110 | write.csv(data.frame(res$r)[,c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # rna embed 111 | write.csv(data.frame(res$u)[,c(1:out_indx)], paste0(out_dir,name_2), row.names=FALSE) # pro embed 112 | write.csv(data.frame(method = "bsc"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 113 | -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/methods_running/harm_cite.R: -------------------------------------------------------------------------------- 1 | # harmony benchmark 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/bench_test4/output/" 8 | in_root 
= "/bench_test4/input/" 9 | out_indx = 15 10 | 11 | out_dir =paste0(out_root,"hm/") 12 | in_dir = in_root 13 | dir.create(out_root) 14 | dir.create(out_dir) 15 | # read 16 | rna = readMM(paste0(in_dir,"rna200.txt")) 17 | rna = as.matrix(rna) 18 | protein = read.csv(paste0(in_dir,"pro200.csv")) 19 | meta = read.csv(paste0(in_dir,"meta200.csv")) 20 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same 21 | colnames(rna) = rna_names$names 22 | # change name: map protein names to gene names via the conversion table 23 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 24 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 25 | rna_list = c() 26 | protein_list = c() 27 | for (i in c(1:dim(correspondence)[1])){ 28 | protein_n = as.character(correspondence[i,1]) 29 | rna_n = as.character(correspondence[i,2]) 30 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 31 | next 32 | } 33 | rna_n = strsplit(rna_n, '/')[[1]] 34 | for(r in rna_n){ 35 | if (r %in% rna_names$names){ 36 | rna_list = c(rna_list, r) 37 | protein_list = c(protein_list, protein_n) 38 | } 39 | } 40 | } 41 | # change name end 42 | # first filtering step should be same as in sp 43 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns that have a matching protein 44 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 45 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 46 | # copy sp filtering 47 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1] 48 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 49 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 50 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 51 | # then we construct the seurat objects 52 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 53 | x_obj <- NormalizeData(x_obj) 54 | x_obj <- FindVariableFeatures(x_obj, 
selection.method = "vst", nfeatures = 3000) 55 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 56 | # add seurat object for data y 57 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 58 | y_obj <- NormalizeData(y_obj) 59 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 60 | #list_modality=list(x_obj,y_obj) 61 | # get shared clean features 62 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 63 | # run harmony in seurat, need to make a new seurat object 64 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 65 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 66 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE) 67 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # second block is "y": with both labelled "x" Harmony saw one batch and had nothing to integrate 68 | # cbind together, scale within modality is better 69 | xy_obj <- xy_obj %>% RunHarmony("orig") 70 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 71 | name_1 = "full_embed_x0.csv" 72 | name_2 = "full_embed_y0.csv" 73 | # does not directly produce matching info, produce later using knn with embedding distance matrix 74 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 75 | row.names=FALSE) # first ncol(x_obj) rows: x cells 76 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 77 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows: y cells 78 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 79 | -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/methods_running/liger_cite.R: -------------------------------------------------------------------------------- 1 | # liger benchmark 2 | library(rliger) 3 | library(Matrix) 4 | 
library(matrixStats) 5 | # read in files 6 | out_root = "/bench_test4/output/" 7 | in_root = "/bench_test4/input/" 8 | out_indx = 15 9 | 10 | out_dir =paste0(out_root,"lg/") 11 | in_dir = in_root 12 | dir.create(out_root) 13 | dir.create(out_dir) 14 | # read 15 | rna = readMM(paste0(in_dir,"rna200.txt")) 16 | rna = as.matrix(rna) 17 | protein = read.csv(paste0(in_dir,"pro200.csv")) 18 | #meta = read.csv(paste0(in_dir,"meta200.csv")) 19 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same 20 | colnames(rna) = rna_names$names 21 | # change name: map protein names to gene names via the conversion table 22 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 23 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 24 | rna_list = c() 25 | protein_list = c() 26 | for (i in c(1:dim(correspondence)[1])){ 27 | protein_n = as.character(correspondence[i,1]) 28 | rna_n = as.character(correspondence[i,2]) 29 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 30 | next 31 | } 32 | rna_n = strsplit(rna_n, '/')[[1]] 33 | for(r in rna_n){ 34 | if (r %in% rna_names$names){ 35 | rna_list = c(rna_list, r) 36 | protein_list = c(protein_list, protein_n) 37 | } 38 | } 39 | } 40 | # change name end 41 | # first filtering step should be same as in sp 42 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns that have a matching protein 43 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 44 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 45 | # copy sp filtering to produce better output 46 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1] 47 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 48 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 49 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 50 | # then we construct the liger objects 51 | ligerobj=createLiger( list(x = 
t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE) 52 | ###Start integration 53 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality 54 | # default preprocessing 55 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 56 | # do not need to select genes 57 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 58 | ligerobj@var.genes=features # just use all 59 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 60 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE) 61 | ligerobj <- quantile_norm(ligerobj) 62 | embedding = ligerobj@H.norm[,c(1:out_indx)] 63 | name_1 = "full_embed_x0.csv" 64 | name_2 = "full_embed_y0.csv" 65 | # no available matching information from liger thus not saved out 66 | # will use knn to search matching on embedding in downstream analysis 67 | # check what cell is filtered out 68 | `%notin%` <- Negate(`%in%`) 69 | filtered = 70 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)] 71 | filtered_id = as.integer(sub("^d1", "", filtered[startsWith(filtered, "d1")])) # row numbers of the x ("d1") cells dropped by liger; "d2" names are skipped because they would parse to NA 72 | # split the embedding on the "d1"/"d2" cell-name prefix instead of hard-coded row counts (was 1:18726 / 18727:38726) 73 | write.csv(embedding[startsWith(rownames(embedding), "d1"),], 74 | paste0(out_dir,name_1), row.names=FALSE) # incomplete dataset: x cells that survived liger 75 | write.csv(embedding[startsWith(rownames(embedding), "d2"),], 76 | paste0(out_dir,name_2), row.names=FALSE) # y cells 77 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 78 | 79 | # also get the right original pca values for cells for downstream analysis 80 | x_orig = read.csv("/bench_test4/input/orig_x.csv") 81 | x_orig_sub = if (length(filtered_id) > 0) x_orig[-filtered_id,] else x_orig # x[-integer(0),] would return zero rows, so keep everything when nothing was filtered 82 | write.csv(x_orig_sub, "/bench_test4/input/orig_lg_x.csv") 83 | # save out filtered id 84 | write.csv(data.frame(id = filtered_id), "/bench_test4/output/lg/filt_id.csv") 85 | 86 | 
-------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/methods_running/seurat_cite.R: -------------------------------------------------------------------------------- 1 | # seurat benchmark 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/bench_test4/output/" 7 | in_root = "/bench_test4/input/" 8 | out_indx = 15 9 | 10 | out_dir =paste0(out_root,"sr/") 11 | in_dir = in_root 12 | dir.create(out_root) 13 | dir.create(out_dir) 14 | # read 15 | rna = readMM(paste0(in_dir,"rna200.txt")) 16 | rna = as.matrix(rna) 17 | protein = read.csv(paste0(in_dir,"pro200.csv")) 18 | meta = read.csv(paste0(in_dir,"meta200.csv")) 19 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same 20 | colnames(rna) = rna_names$names 21 | # change name: map protein names to gene names via the conversion table 22 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 23 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 24 | rna_list = c() 25 | protein_list = c() 26 | for (i in c(1:dim(correspondence)[1])){ 27 | protein_n = as.character(correspondence[i,1]) 28 | rna_n = as.character(correspondence[i,2]) 29 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 30 | next 31 | } 32 | rna_n = strsplit(rna_n, '/')[[1]] 33 | for(r in rna_n){ 34 | if (r %in% rna_names$names){ 35 | rna_list = c(rna_list, r) 36 | protein_list = c(protein_list, protein_n) 37 | } 38 | } 39 | } 40 | # change name end 41 | # first filtering step should be same as in sp 42 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns that have a matching protein 43 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns that have a matching gene 44 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 45 | # copy sp filtering 46 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1] 47 | protein.shared.sub = 
protein.shared[,colSds(protein.shared)>0.1] 48 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) 49 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 50 | # then we construct the seurat objects 51 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 52 | x_obj <- NormalizeData(x_obj) 53 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case 54 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 55 | # add seurat object for data y 56 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 57 | y_obj <- NormalizeData(y_obj) 58 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 59 | list_modality=list(x_obj,y_obj) 60 | # get transfer anchors (x as reference, y as query) and transfer the x cell names onto y cells 61 | features=intersect(rownames(x_obj),rownames(y_obj)) 62 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj, 63 | dims = 1:20, features = features) 64 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj), 65 | dims = 1:20) 66 | full_df = data.frame(idx2 = c(1:length(predictions$predicted.id)) -1, idx1 = as.integer(predictions$predicted.id) -1, 67 | score = predictions$prediction.score.max) # subtract 1 so the saved indices are 0-based rather than R's 1-based 68 | # get integration embedding 69 | print("starting seurat integration") 70 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality, 71 | dims = 1:20, anchor.features =features, k.filter = 10) 72 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 10) 73 | # 74 | DefaultAssay(xy_int) <- "integrated" 75 | xy_int <- ScaleData(xy_int, verbose = FALSE) 76 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion 77 | embedding = xy_int@reductions$pca@cell.embeddings 78 | name_1 = "full_embed_x0.csv" 79 | name_2 = "full_embed_y0.csv" 80 | #pathout = out_dir 81 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # 
need to decide output pca cell 82 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 83 | paste0(out_dir,name_2), row.names=FALSE) # rows after the first ncol(x_obj): y cells 84 | write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # transfer-based matching table 85 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 86 | 87 | -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/step1.sh: -------------------------------------------------------------------------------- 1 | ## quick code to run all methods 2 | python maxfuse_cite.py & 3 | /usr/bin/Rscript seurat_cite.R & 4 | /usr/bin/Rscript liger_cite.R & 5 | /usr/bin/Rscript harm_cite.R & 6 | /usr/bin/Rscript bsc_cite.R -------------------------------------------------------------------------------- /Archive/citeseq-bmc/code/benchmark/step2.sh: -------------------------------------------------------------------------------- 1 | # quick code to calc slt ari f1 scores for all methods 2 | # for mf 3 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/mf/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/mf/full_embed' 0 & 4 | 5 | # for sr 6 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/sr/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/sr/full_embed' 0 & 7 | 8 | # for lg -- NOTE(review): reads output/lgunimf/, while liger_cite.R for this dataset writes to output/lg/; confirm the directory 9 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/lgunimf/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/lgunimf/full_embed' 0 & 10 | 11 | # for hm -- NOTE(review): no trailing '&' here, so this job runs in the foreground before bsc starts; confirm that is intended 12 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/hm/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/hm/full_embed' 0 13 | 14 | # for bsc 15 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/bsc/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/bsc/full_embed' 0 -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/analysis/plot.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "plot_cite" 3 | output: html_document 4 | --- 5 | Script to produce citeseq pbmc plots (full panel version) 6 | These values were calculated before, check the ipynb in this folder for detail 7 | 8 | ```{r} 9 | library(ggplot2) 10 | metrics = read.csv("/bench_test3/output/batch5_resultsV2.csv") # metrics prev calced 11 | metrics$method <- factor(metrics$method,levels = c("mf", "sr", "lg", "hm","bsc")) 12 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 13 | ``` 14 | 15 | ```{r} 16 | # slt f1 score + matching annotation lv1; y limits go inside scale_y_continuous because a later ylim() replaces that scale and drops minor_breaks 17 | p = ggplot(metrics) + geom_point(aes(x = ann1, y = slt_f1, color = method), size =2, alpha = 0.5) + 18 | theme_minimal() + scale_color_manual(values = colorv) + 19 | scale_y_continuous(minor_breaks = seq(0, 1, 0.05), limits = c(0.35,0.6)) + xlim(c(0.35,0.97)) 20 | ggsave("/bench_test3/plots/p1V3.svg", height = 3, width = 4.5) 21 | p 22 | ``` 23 | 24 | ```{r} 25 | # ari f1 score + matching annotation lv2; y limits go inside scale_y_continuous for the same reason as above 26 | p = ggplot(metrics) + geom_point(aes(x = ann2, y = ari_f1, color = method), size =2, alpha = 0.5) + 27 | theme_minimal() + 28 | scale_color_manual(values = colorv) + scale_y_continuous(minor_breaks = seq(0, 1, 0.05), limits = c(0.4,0.65)) + 29 | xlim(c(0.28,0.87)) 30 | ggsave("/bench_test3/plots/p2V3.svg", height = 3, width = 4.5) 31 | p 32 | ``` 33 | 34 | 35 | ```{r} 36 | # plot foscttm score, since 5 repeats calc sd 37 | 38 | library(dplyr) 39 | # Data 40 | data <- metrics %>% select(method, foscttm) 41 | # Calculates mean, sd, se and IC 42 | my_sum <- data %>% 43 | group_by(method) %>% 44 | dplyr::summarise( 45 | n=n(), 46 | mean=mean(foscttm), 47 | sd=sd(foscttm) 48 | ) %>% 49 | mutate( se=sd/sqrt(n)) %>% 50 | mutate( ic=se * qt((1-0.05)/2 + .5, n-1)) 51 | 52 | # Standard deviation 53 | p = ggplot(my_sum) + 54 | geom_bar( aes(x=method, y=mean, fill=method), stat="identity", alpha=0.7, width = 0.4) + 55 | geom_errorbar( 
aes(x=method, ymin=mean-sd, ymax=mean+sd), width=0.08, colour="black", alpha=0.9, size=0.2) + 56 | ggtitle("using standard deviation") + theme_minimal() + scale_fill_manual(values = colorv) #+ coord_cartesian(ylim=c(0.6,0.97)) 57 | 58 | ggsave("/bench_test3/plots/p3V2.svg", height = 3, width = 4.5) 59 | p 60 | ``` 61 | 62 | 63 | ```{r} 64 | # plot foscKNN plot along the Ks 65 | 66 | knnsearch = read.csv("/bench_test3/output/batch5_knntmpV2.csv") 67 | knnsearch$step = knnsearch$step+1 # python index dif 68 | knnsearch$method <- factor(knnsearch$method,levels = c("mf", "sr", "lg", "hm", "bsc")) 69 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 70 | 71 | p = ggplot(knnsearch,aes(x=step,y=knn_tmp, colour=method,fill = method)) + 72 | stat_summary(geom = "line", fun.y = mean, size = 0.2) + 73 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.3, colour = NA) + 74 | theme_minimal()+ ggtitle("KNN search true match") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv) 75 | 76 | #ggsave("/home/bkzhu/super_mario/bench_test3/plots/p4V2.svg", height = 3, width = 5) 77 | p 78 | ``` 79 | 80 | 81 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/analysis/plot_drop.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "plot_drop" 3 | output: html_document 4 | --- 5 | 6 | Script to produce citeseq pbmc plots (drop panel version) 7 | These values were calculated before, check the ipynb in this folder for detail 8 | 9 | ```{r} 10 | library(ggplot2) 11 | library(reshape2) 12 | 13 | # read metrics calculated prev 14 | mdrop = read.csv("/bench_test3/output/drop4_batch5_resultsV2.csv", row.names = 1) 15 | mdrop$method <- factor(mdrop$method,levels = c("mf", "sr", "lg", "hm","bsc")) 16 | mdrop$drop <- factor(mdrop$drop,levels = c("dropLv0", "dropLv1", "dropLv2", "dropLv3")) 17 | colorv = 
c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 18 | ``` 19 | 20 | 21 | ```{r} 22 | temp = melt(mdrop, id = c("method","batch","drop")) 23 | temp2 = subset(temp, temp$variable == "ann1") 24 | 25 | temp2$method <- factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc")) 26 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 27 | 28 | # plot matching accu annotation lv1 when dropps 29 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) + 30 | stat_summary(geom = "line", fun.y = mean, size = 0.2) + 31 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) + 32 | theme_minimal()+ ggtitle("ann1") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+ 33 | scale_x_discrete(expand=c(0.05, 0.05)) + ylim(c(0.3,1)) 34 | ggsave("/bench_test3/plots/drop-p1V2.svg", height = 3, width = 5) 35 | p 36 | ``` 37 | 38 | ```{r} 39 | temp = melt(mdrop, id = c("method","batch","drop")) 40 | temp2 = subset(temp, temp$variable == "ann2") 41 | 42 | temp2$method <- factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc")) 43 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 44 | 45 | # plot matching accu annotation lv2 when dropps 46 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) + 47 | stat_summary(geom = "line", fun.y = mean, size = 0.2) + 48 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) + 49 | theme_minimal()+ ggtitle("ann2") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv) + 50 | scale_x_discrete(expand=c(0.05, 0.05)) + ylim(c(0.1,0.85)) 51 | 52 | ggsave("/bench_test3/plots/drop-p2V2.svg", height = 3, width = 5) 53 | p 54 | ``` 55 | 56 | 57 | ```{r} 58 | temp = melt(mdrop, id = c("method","batch","drop")) 59 | temp2 = subset(temp, temp$variable == "foscttm") 60 | 61 | temp2$method <- 
factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc")) 62 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 63 | 64 | # plot foscttm when dropps 65 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) + 66 | stat_summary(geom = "line", fun.y = mean, size = 0.2) + 67 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) + 68 | theme_minimal()+ ggtitle("foscttm") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+ 69 | scale_x_discrete(expand=c(0.05, 0.05))+ ylim(c(0,0.35)) 70 | 71 | ggsave("/bench_test3/plots/drop-p3V2.svg", height = 3, width = 5) 72 | p 73 | ``` 74 | 75 | 76 | ```{r} 77 | knn_drop = read.csv("/bench_test3/output/drop4_batch5_knntmpV2.csv") 78 | knn_drop_50 = subset(knn_drop, knn_drop$step == 99) 79 | knn_drop_50$method <- factor(knn_drop_50$method,levels = c("mf", "sr", "lg", "hm","bsc")) 80 | knn_drop_50$X <- NULL 81 | knn_drop_50$step <- NULL 82 | knn_drop_50$batch <- NULL 83 | temp3 = melt(knn_drop_50, id = c("method","drop")) 84 | 85 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 86 | 87 | # plot foscKNN when dropps (k set to 100) 88 | p = ggplot(temp3,aes(x=drop,y=value, colour=method,fill = method, group = method)) + 89 | stat_summary(geom = "line", fun.y = mean, size = 0.2) + 90 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.3, colour = NA) + 91 | theme_minimal()+ ggtitle("foscknn") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+ 92 | scale_x_discrete(expand=c(0.05, 0.05))+ ylim(c(0,0.25)) 93 | 94 | ggsave("/bench_test3/plots/drop-p4V2.svg", height = 3, width = 5) 95 | p 96 | ``` 97 | 98 | 99 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/calculate_metrics.R: -------------------------------------------------------------------------------- 
1 | # script used benmark calculation: slt-f1 and ari-f1 2 | 3 | #!/usr/bin/env Rscript 4 | args = commandArgs(trailingOnly=TRUE) 5 | 6 | metrics_fname = args[1] 7 | orig_fname = args[2] 8 | embed_fname = args[3] 9 | n_idx = as.integer(args[4]) 10 | 11 | # Compute the following metrics: 12 | # - sam_x: structure alignment metric for x data (the larger, the better) 13 | # - sam_y: structure alignment metric for y data (the larger, the better) 14 | # - slt_mix: mixing via Silhouette width (the larger, the better) 15 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 16 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 17 | # - ari_mix: mixing via adjusted random index (the larger, the better) 18 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better) 19 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 20 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 21 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 22 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 23 | setwd("./") 24 | source("metrics.R")# metric calc code 25 | 26 | # load existing metrics 27 | metrics = read_csv(metrics_fname, col_types=cols()) 28 | 29 | 30 | # calculate structure alignment metrics 31 | print(paste0(format(Sys.Date(), "%c"), ': calculating structure alignment metrics...')) 32 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname, 33 | n_idx=n_idx, data_idx='x') 34 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 35 | n_idx=n_idx, data_idx='y') 36 | print(sam_x) 37 | print(sam_y) 38 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 39 | #print(metrics) 40 | # calculate Silhouette width 41 | print(paste0(format(Sys.Date(), "%c"), ': 
calculating Silhouette width...')) 42 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 43 | 44 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 45 | 46 | # calculate ARI 47 | print(paste0(format(Sys.Date(), "%c"), ': calculating adjusted random index...')) 48 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 49 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 50 | 51 | write_csv(metrics, metrics_fname) 52 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...')) 53 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite.R: -------------------------------------------------------------------------------- 1 | # harmony benchmark, full antibody panel 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/bench_test3/output/" 8 | in_root = "/bench_test3/input/" 9 | batch = 5 10 | out_indx = 15 11 | 12 | for(i in c(1:batch)){ 13 | batch_name = paste0("b",as.character(i),"/") 14 | out_dir =paste0(out_root,batch_name,"hm/") 15 | in_dir = paste0(in_root,batch_name) 16 | dir.create(paste0(out_root,batch_name)) 17 | dir.create(out_dir) 18 | # read 19 | rna = readMM(paste0(in_dir,"rna.txt")) 20 | protein = read.csv(paste0(in_dir,"pro.csv")) 21 | meta = read.csv(paste0(in_dir,"meta.csv")) 22 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 23 | colnames(rna) = rna_names$names 24 | # change name 25 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 26 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 27 | rna_list = c() 28 | protein_list = c() 29 | for (i in c(1:dim(correspondence)[1])){ 30 | protein_n = 
as.character(correspondence[i,1]) 31 | rna_n = as.character(correspondence[i,2]) 32 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 33 | next 34 | } 35 | rna_n = strsplit(rna_n, '/')[[1]] 36 | for(r in rna_n){ 37 | if (r %in% rna_names$names){ 38 | rna_list = c(rna_list, r) 39 | protein_list = c(protein_list, protein_n) 40 | } 41 | } 42 | } 43 | # change name end 44 | # first filtering step should be same as in sp 45 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 46 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 47 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 48 | # copy sp filtering 49 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 50 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 51 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 52 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 53 | # then we construct the seurat objects 54 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 55 | x_obj <- NormalizeData(x_obj) 56 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 57 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 58 | # add suerat object datay 59 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 60 | y_obj <- NormalizeData(y_obj) 61 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 62 | #list_modality=list(x_obj,y_obj) 63 | # get shared clean features 64 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 65 | # run harmony in seurat, need to make a new seurat object 66 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 67 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], 
y_obj@assays$y@scale.data[features,])) # takes very long 68 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE) 69 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell: rna = "x", protein = "y"; both were "x" before, leaving RunHarmony with a single group and nothing to integrate 70 | # cbind together, scale within modality is better 71 | xy_obj <- xy_obj %>% RunHarmony("orig") 72 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 73 | name_1 = "full_embed_x0.csv" 74 | name_2 = "full_embed_y0.csv" 75 | # does not directly produce matching info, produce later using knn with embedding distance matrix 76 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 77 | row.names=FALSE) # first ncol(x_obj) rows are the rna (x) cells 78 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 79 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows are the protein (y) cells 80 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 81 | } 82 | 83 | ## 84 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite_reduc-drop.R: -------------------------------------------------------------------------------- 1 | #harmony for umap viz and cf matrix, drop antibody version 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/bench_test3/output/reduction-drop/" 8 | in_root = "/bench_test3/input/reduction/" 9 | 10 | out_dir =paste0(out_root,"hm/") 11 | in_dir = in_root 12 | out_indx = 15 13 | 14 | # read 15 | dropped_pro = read.csv("/bench_test3/input/rank30.csv") 16 | target = as.character(dropped_pro$target) 17 | 18 | rna = readMM(paste0(in_dir,"rna.txt")) 19 | protein = read.csv(paste0(in_dir,"pro.csv")) 20 | protein = protein[,target] 21 | 22 | meta = read.csv(paste0(in_dir,"meta.csv")) 23 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 24 | 
colnames(rna) = rna_names$names 25 | # change name 26 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 27 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 28 | rna_list = c() 29 | protein_list = c() 30 | for (i in c(1:dim(correspondence)[1])){ 31 | protein_n = as.character(correspondence[i,1]) 32 | rna_n = as.character(correspondence[i,2]) 33 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 34 | next 35 | } 36 | rna_n = strsplit(rna_n, '/')[[1]] 37 | for(r in rna_n){ 38 | if (r %in% rna_names$names){ 39 | rna_list = c(rna_list, r) 40 | protein_list = c(protein_list, protein_n) 41 | } 42 | } 43 | } 44 | # change name end 45 | # first filtering step should be same as in sp 46 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 47 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 48 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 49 | dim(protein.shared) 50 | dim(rna.shared) 51 | # copy sp filtering 52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.05] 54 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) 55 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 56 | # 57 | dim(protein.shared.sub) 58 | dim(rna.shared.sub) 59 | # 60 | 61 | # then we construct the seurat objects 62 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 63 | x_obj <- NormalizeData(x_obj) 64 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 65 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 66 | # add suerat object datay 67 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 68 | y_obj <- NormalizeData(y_obj) 69 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 70 | #list_modality=list(x_obj,y_obj) 71 | # get shared 
clean features 72 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 73 | # run harmony in seurat, need to make a new seurat object 74 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 75 | #xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 76 | xy_obj <- NormalizeData(xy_obj) 77 | xy_obj <- ScaleData(xy_obj, features = rownames(xy_obj)) 78 | xy_obj <- RunPCA(xy_obj, features = rownames(xy_obj), npc = length(features)) 79 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell: rna = "x", protein = "y"; both were "x" before, leaving RunHarmony with a single group and nothing to integrate 80 | # cbind together, scale within modality is better 81 | xy_obj <- xy_obj %>% RunHarmony("orig") 82 | 83 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 84 | name_1 = "full_embed_x0.csv" 85 | name_2 = "full_embed_y0.csv" 86 | 87 | # does not directly produce matching info, produce later using knn with embedding distance matrix 88 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 89 | row.names=FALSE) # first ncol(x_obj) rows are the rna (x) cells 90 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 91 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows are the protein (y) cells 92 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 93 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite_reduc.R: -------------------------------------------------------------------------------- 1 | #harmony for umap viz and cf matrix, full antibody version 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/bench_test3/output/reduction/" 8 | in_root = "/bench_test3/input/reduction/" 9 | out_indx = 15 10 | 11 | out_dir =paste0(out_root,"hm/") 12 | 
in_dir = in_root 13 | # read 14 | rna = readMM(paste0(in_dir,"rna.txt")) 15 | protein = read.csv(paste0(in_dir,"pro.csv")) 16 | meta = read.csv(paste0(in_dir,"meta.csv")) 17 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 18 | colnames(rna) = rna_names$names 19 | # change name 20 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 21 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 22 | rna_list = c() 23 | protein_list = c() 24 | for (i in c(1:dim(correspondence)[1])){ 25 | protein_n = as.character(correspondence[i,1]) 26 | rna_n = as.character(correspondence[i,2]) 27 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 28 | next 29 | } 30 | rna_n = strsplit(rna_n, '/')[[1]] 31 | for(r in rna_n){ 32 | if (r %in% rna_names$names){ 33 | rna_list = c(rna_list, r) 34 | protein_list = c(protein_list, protein_n) 35 | } 36 | } 37 | } 38 | # change name end 39 | # first filtering step should be same as in sp 40 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 41 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 42 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 43 | # copy sp filtering 44 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 45 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 46 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 47 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 48 | # then we construct the seurat objects 49 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 50 | x_obj <- NormalizeData(x_obj) 51 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 52 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 53 | # add suerat object datay 54 | 
y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 55 | y_obj <- NormalizeData(y_obj) 56 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 57 | #list_modality=list(x_obj,y_obj) 58 | # get shared clean features 59 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 60 | # run harmony in seurat, need to make a new seurat object 61 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 62 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 63 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE) 64 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell: rna = "x", protein = "y"; both were "x" before, leaving RunHarmony with a single group and nothing to integrate 65 | # cbind together, scale within modality is better 66 | xy_obj <- xy_obj %>% RunHarmony("orig") 67 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 68 | name_1 = "full_embed_x0.csv" 69 | name_2 = "full_embed_y0.csv" 70 | # does not directly produce matching info, produce later using knn with embedding distance matrix 71 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 72 | row.names=FALSE) # first ncol(x_obj) rows are the rna (x) cells 73 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 74 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows are the protein (y) cells 75 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 76 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite.R: -------------------------------------------------------------------------------- 1 | # liger benchmark, full antibody panel 2 | library(rliger) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/bench_test3/output/" 7 | in_root = "/bench_test3/input/" 8 | batch = 5 9 
| out_indx = 15 10 | 11 | for(i in c(1:batch)){ 12 | batch_name = paste0("b",as.character(i),"/") 13 | out_dir =paste0(out_root,batch_name,"lg/") 14 | in_dir = paste0(in_root,batch_name) 15 | dir.create(paste0(out_root,batch_name)) 16 | dir.create(out_dir) 17 | # read 18 | rna = readMM(paste0(in_dir,"rna.txt")) 19 | protein = read.csv(paste0(in_dir,"pro.csv")) 20 | meta = read.csv(paste0(in_dir,"meta.csv")) 21 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 22 | colnames(rna) = rna_names$names 23 | # change name 24 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 25 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 26 | rna_list = c() 27 | protein_list = c() 28 | for (i in c(1:dim(correspondence)[1])){ 29 | protein_n = as.character(correspondence[i,1]) 30 | rna_n = as.character(correspondence[i,2]) 31 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 32 | next 33 | } 34 | rna_n = strsplit(rna_n, '/')[[1]] 35 | for(r in rna_n){ 36 | if (r %in% rna_names$names){ 37 | rna_list = c(rna_list, r) 38 | protein_list = c(protein_list, protein_n) 39 | } 40 | } 41 | } 42 | # change name end 43 | # first filtering step should be same as in sp 44 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 45 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 46 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 47 | # copy sp filtering to produce better output 48 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 49 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 50 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 51 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 52 | # then we construct the liger objects 53 | ligerobj=createLiger( list(x = t(rna.shared.sub), y 
= t(protein.shared.sub)), remove.missing = FALSE) 54 | ###Start integration 55 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features accross datasets with good quality 56 | # default preprocessing 57 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 58 | # do not need to select genes 59 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 60 | ligerobj@var.genes=features # just use all 61 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 62 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE) 63 | ligerobj <- quantile_norm(ligerobj) 64 | embedding = ligerobj@H.norm[,c(1:out_indx)] 65 | if (dim(embedding)[1] != 20000) { 66 | break 67 | } 68 | name_1 = "full_embed_x0.csv" 69 | name_2 = "full_embed_y0.csv" 70 | # no avaliable matching information from liger thus not saved out 71 | # will use knn to serach matching on embedding in downstreatm analysis 72 | write.csv(embedding[c(1:nrow(rna.shared.sub)),c(1:out_indx)], 73 | paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell 74 | write.csv(embedding[c((nrow(rna.shared.sub) + 1):(nrow(rna.shared.sub) + nrow(protein.shared.sub))),c(1:out_indx)], 75 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 76 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 77 | } -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite_reduction-drop.R: -------------------------------------------------------------------------------- 1 | #liger for umap viz and cf matrix, drop antibody version 2 | library(rliger) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/bench_test3/output/reduction-drop/" 7 | in_root = "/bench_test3/input/reduction/" 8 | 9 | out_dir =paste0(out_root,"lg/") 10 | in_dir = in_root 11 | out_indx = 15 12 | 13 | # read 14 | dropped_pro = 
read.csv("/bench_test3/input/rank30.csv") 15 | target = as.character(dropped_pro$target) # NOTE(review): target is never applied in this script, while harm_cite_reduc-drop.R subsets protein = protein[,target]; confirm whether the antibody panel should be dropped here too 16 | 17 | rna = readMM(paste0(in_dir,"rna.txt")) 18 | protein = read.csv(paste0(in_dir,"pro.csv")) 19 | meta = read.csv(paste0(in_dir,"meta.csv")) 20 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 21 | colnames(rna) = rna_names$names 22 | # change name: build parallel lists pairing each protein with the rna name(s) found in rna_names 23 | correspondence = read.csv('protein_rna_name_conversionV11.csv') 24 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 25 | rna_list = c() 26 | protein_list = c() 27 | for (i in c(1:dim(correspondence)[1])){ 28 | protein_n = as.character(correspondence[i,1]) 29 | rna_n = as.character(correspondence[i,2]) 30 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 31 | next 32 | } 33 | rna_n = strsplit(rna_n, '/')[[1]] 34 | for(r in rna_n){ 35 | if (r %in% rna_names$names){ 36 | rna_list = c(rna_list, r) 37 | protein_list = c(protein_list, protein_n) 38 | } 39 | } 40 | } 41 | # change name end 42 | # first filtering step should be same as in sp 43 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna matrix, columns with a matched protein 44 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein matrix, matched columns 45 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 46 | # copy sp filtering to produce better output 47 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 48 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 49 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 50 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 51 | 52 | # then we construct the liger objects (FALSE, not False: R has no object named False) 53 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE) 54 | ###Start integration 55 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared 
features accross datasets with good quality 56 | max_comp = length(features) 57 | # default preprocessing 58 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 59 | # do not need to select genes 60 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 61 | ligerobj@var.genes=features # just use all 62 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 63 | ligerobj <- optimizeALS(ligerobj, k = (max_comp-1),remove.missing = FALSE) 64 | ligerobj <- quantile_norm(ligerobj) 65 | 66 | if (max_comp <= out_indx) { 67 | out_indx = max_comp - 1 68 | } 69 | embedding = ligerobj@H.norm[,c(1:out_indx)] 70 | name_1 = "full_embed_x0.csv" 71 | name_2 = "full_embed_y0.csv" 72 | 73 | # cells present in the input but missing from the liger output 74 | `%notin%` <- Negate(`%in%`) 75 | filtered = 76 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)] 77 | filtered_id = as.integer(gsub("d1", "", filtered)) # d119958, one cell got deleted 78 | 79 | # no available matching information from liger thus not saved out 80 | # will use knn to search matching on embedding in downstream analysis 81 | write.csv(embedding[grepl("^d1", rownames(embedding)),c(1:out_indx)], 82 | paste0(out_dir,name_1), row.names=FALSE) # rna (x) rows picked by their "d1" name prefix instead of a hard-coded 1:19999, so the split stays right however many cells liger deletes; keep track of deleted cells during downstream analysis 83 | write.csv(embedding[grepl("^d2", rownames(embedding)),c(1:out_indx)], 84 | paste0(out_dir,name_2), row.names=FALSE) # protein (y) rows picked by their "d2" name prefix 85 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 86 | 87 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite_reduction.R: -------------------------------------------------------------------------------- 1 | #liger for umap viz and cf matrix, full antibody version 2 | library(rliger) 3 | library(Matrix) 4 | library(matrixStats) 5 | # read in files 6 | out_root = "/home/bkzhu/super_mario/bench_test3/output/reduction/" 7 | in_root = 
"/home/bkzhu/super_mario/bench_test3/input/reduction/" 8 | 9 | out_dir =paste0(out_root,"lg/") 10 | in_dir = in_root 11 | out_indx = 10 12 | 13 | # read 14 | rna = readMM(paste0(in_dir,"rna.txt")) 15 | protein = read.csv(paste0(in_dir,"pro.csv")) 16 | meta = read.csv(paste0(in_dir,"meta.csv")) 17 | rna_names = read.csv("/home/bkzhu/super_mario/bench_test3/input/citeseq_rna_names.csv") # rna names always the same 18 | colnames(rna) = rna_names$names 19 | # change name 20 | correspondence = read.csv('/home/bkzhu/super_mario/production/hubmap/protein_rna_name_conversionV7.csv') 21 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 22 | rna_list = c() 23 | protein_list = c() 24 | for (i in c(1:dim(correspondence)[1])){ 25 | protein_n = as.character(correspondence[i,1]) 26 | rna_n = as.character(correspondence[i,2]) 27 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 28 | next 29 | } 30 | rna_n = strsplit(rna_n, '/')[[1]] 31 | for(r in rna_n){ 32 | if (r %in% rna_names$names){ 33 | rna_list = c(rna_list, r) 34 | protein_list = c(protein_list, protein_n) 35 | } 36 | } 37 | } 38 | # change name end 39 | # first filtering step should be same as in sp 40 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 41 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 42 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 43 | # copy sp filtering to produce better output 44 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 45 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 46 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 47 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 48 | # then we construct the liger objects 49 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = F) 50 | 
###Start integration 51 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality 52 | # default preprocessing 53 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 54 | # do not need to select genes 55 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1) 56 | ligerobj@var.genes=features # just use all 57 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 58 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE) 59 | ligerobj <- quantile_norm(ligerobj) 60 | embedding = ligerobj@H.norm[,c(1:out_indx)] 61 | name_1 = "full_embed_x0.csv" 62 | name_2 = "full_embed_y0.csv" 63 | 64 | # cells present in the input but missing from the liger output 65 | `%notin%` <- Negate(`%in%`) 66 | filtered = 67 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)] 68 | filtered_id = as.integer(gsub("d1", "", filtered)) #d119958 69 | # 70 | # no available matching information from liger thus not saved out 71 | # will use knn to search matching on embedding in downstream analysis 72 | write.csv(embedding[grepl("^d1", rownames(embedding)),c(1:out_indx)], 73 | paste0(out_dir,name_1), row.names=FALSE) # rna (x) rows picked by their "d1" name prefix instead of a hard-coded 1:19999, so the split stays right however many cells liger deletes 74 | write.csv(embedding[grepl("^d2", rownames(embedding)),c(1:out_indx)], 75 | paste0(out_dir,name_2), row.names=FALSE) # protein (y) rows picked by their "d2" name prefix 76 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 77 | 78 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/reduction.sh: -------------------------------------------------------------------------------- 1 | ## run all methods, result used for umap viz and confusion matrix plotting 2 | python maxfuse_cite_reduction.py & 3 | /usr/bin/Rscript seurat_cite_reduc.R & 4 | /usr/bin/Rscript liger_cite_reduction.R & 5 | /usr/bin/Rscript harm_cite_reduc.R & 6 | /usr/bin/Rscript bsc_cite_reduction.R 7 | 
-------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/step1-drop.sh: -------------------------------------------------------------------------------- 1 | ## script to run all methods at the same time: 2 | ## step one of benchmarking, dropping antibody panel version 3 | python maxfuse_cite-drop.py & 4 | /usr/bin/Rscript seurat_cite_drop.R & 5 | /usr/bin/Rscript liger_cite_drop.R & 6 | /usr/bin/Rscript harm_cite_drop.R & 7 | /usr/bin/Rscript bsc_cite_drop.R 8 | -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/step1.sh: -------------------------------------------------------------------------------- 1 | ## script to run all methods at the same time: 2 | ## step one of benchmarking, full antibody panel version 3 | python maxfuse_cite.py & 4 | /usr/bin/Rscript seurat_cite.R & 5 | /usr/bin/Rscript liger_cite.R & 6 | /usr/bin/Rscript harm_cite.R & 7 | /usr/bin/Rscript bsc_cite.R -------------------------------------------------------------------------------- /Archive/citeseq-pbmc/code/benchmark/step2.sh: -------------------------------------------------------------------------------- 1 | # script to produce ARI f1 score and SLT f1 score 2 | # only done on the full panel, not done on the dropping versions 3 | 4 | # b1-5 for mf 5 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/mf/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/mf/full_embed' 0 & 6 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/mf/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/mf/full_embed' 0 & 7 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/mf/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/mf/full_embed' 0 & 8 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/mf/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/mf/full_embed' 0 & 9 | 
/usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/mf/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/mf/full_embed' 0 & 10 | wait 11 | # b1-5 for sr 12 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/sr/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/sr/full_embed' 0 & 13 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/sr/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/sr/full_embed' 0 & 14 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/sr/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/sr/full_embed' 0 & 15 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/sr/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/sr/full_embed' 0 & 16 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/sr/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/sr/full_embed' 0 & 17 | wait 18 | # b1-5 for lg 19 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/lgunimf/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/lgunimf/full_embed' 0 & 20 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/lgunimf/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/lgunimf/full_embed' 0 & 21 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/lgunimf/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/lgunimf/full_embed' 0 & 22 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/lgunimf/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/lgunimf/full_embed' 0 & 23 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/lgunimf/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/lgunimf/full_embed' 0 & 24 | wait 25 | # b1-5 for hm 26 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/hm/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/hm/full_embed' 0 & 27 | 
/usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/hm/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/hm/full_embed' 0 & 28 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/hm/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/hm/full_embed' 0 & 29 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/hm/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/hm/full_embed' 0 & 30 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/hm/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/hm/full_embed' 0 31 | wait 32 | # b1-5 for bsc 33 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/bsc/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/bsc/full_embed' 0 & 34 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/bsc/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/bsc/full_embed' 0 & 35 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/bsc/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/bsc/full_embed' 0 & 36 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/bsc/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/bsc/full_embed' 0 & 37 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/bsc/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/bsc/full_embed' 0 -------------------------------------------------------------------------------- /Archive/hubmap_nature/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the analysis performed related to MaxFuse in the paper **"High resolution single cell maps reveals distinct cell organization and function across different regions of the human intestine"**. 
2 | 3 | Note that this set of analyses used the development version of MaxFuse, which is also deposited in this [folder](https://github.com/shuxiaoc/maxfuse/tree/main/Archive/MaxFuse_devo/09302022V). 4 | 5 | Since similar data were used in the MaxFuse paper and the preprocessing of the data was identical, for that part please refer to the code that relates to ```codex``` and ```rna``` in these [folders](https://github.com/shuxiaoc/maxfuse/tree/main/Archive/hubmap/code/preparation). 6 | 7 | The scripts in this folder only cover the running of MaxFuse and the relevant downstream analysis presented in the paper "High resolution single cell maps reveals distinct cell organization and function across different regions of the human intestine". 8 | -------------------------------------------------------------------------------- /Archive/strong-link/10xe18/analysis/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 15 | # - ari_mix: mixing via adjusted random index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: 
mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | 27 | 28 | # calculate structure alignment metrics 29 | print(paste0(format(Sys.Date(), "%c"), ': calculating structure alignment metrics...')) 30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname, 31 | n_idx=n_idx, data_idx='x') 32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 33 | n_idx=n_idx, data_idx='y') 34 | #print(sam_x) 35 | #print(sam_y) 36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 37 | #print(metrics) 38 | # calculate Silhouette width 39 | print(paste0(format(Sys.Date(), "%c"), ': calculating Silhouette width...')) 40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 41 | #print(slt_res) 42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 43 | #print(metrics) 44 | # calculate ARI 45 | print(paste0(format(Sys.Date(), "%c"), ': calculating adjusted random index...')) 46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 48 | 49 | # calculate LISI 50 | print(paste0(format(Sys.Date(), "%c"), ': calculating Local Inverse Simpson’s Index...')) 51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2]) 53 | 54 | # calculate mixing averaged over clusters 55 | print(paste0(format(Sys.Date(), "%c"), ': calculating mixing quality...')) 56 | avg_mix = 
mix(orig_fname=orig_fname, embed_fname=embed_fname, 57 | n_idx=n_idx) 58 | metrics = metrics %>% add_column(avg_mix=avg_mix) 59 | 60 | # save metrics, because the calculation of kBET is substantially slower. 61 | write_csv(metrics, metrics_fname) 62 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...')) 63 | 64 | #### not calculating kBet here because too slow for this stage 65 | # calculate kBET 66 | #print(paste0(format(Sys.Date(), "%c"), ': calculating kBET...')) 67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 68 | #metrics = metrics %>% add_column(kBET=kbet_res) 69 | 70 | #write_csv(metrics, metrics_fname) 71 | #print(paste0(format(Sys.Date(), "%c"), ': done!')) -------------------------------------------------------------------------------- /Archive/strong-link/10xe18/analysis/step2.sh: -------------------------------------------------------------------------------- 1 | # no condo env requirement 2 | # calculate ari and slt f1 scores 3 | 4 | # for mf 5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/mf/full_embed' 0 & 6 | 7 | # for scjoint 8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/scjoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/scjoint/raw_labels/full_embed' 0 & 9 | 10 | # for maestro 11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/maestro/full_embed' 0 & 12 | 13 | # for glue 14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' 
'/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/glue/full_embed' 0 -------------------------------------------------------------------------------- /Archive/strong-link/10xe18/method_running/glue_e18_prepare_data_h5.R: -------------------------------------------------------------------------------- 1 | ### prepare data for scglue ### 2 | #setwd("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/glue/") 3 | # shell environment setup, run in the terminal before starting R (commented out so this file parses as R): 4 | # module unload python/python-3.6.2 5 | # module load python/python-3.8.2 6 | # conda activate scglue2 7 | # R 8 | library(BiocGenerics, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library") 9 | library(S4Vectors, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library") 10 | library(IRanges, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library") 11 | library(GenomeInfoDb, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library") 12 | 13 | library(anndata) 14 | 15 | library(Seurat) 16 | library(Signac,lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library") 17 | library(tables) 18 | library(reticulate) 19 | 20 | library(SingleCellExperiment) 21 | library(DropletUtils) 22 | library(scater) 23 | library(ggplot2) 24 | 25 | #### e18 mouse data #### 26 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/data/10x_RNA_ATAC_EmbryonicMouseBrain/") 27 | #setwd("/Users/sijia_work/Dropbox/SingleCellAlignment/data/10x_RNA_ATAC_EmbryonicMouseBrain/") 28 | 29 | e18=readRDS("e18.4.20210917.rds") 30 | table(e18$celltype) 31 | 32 | e18mouseRNA = e18@assays[["RNA"]]@counts 33 | e18mouseATAC = e18@assays[["ATAC"]]@counts 34 | 35 | e18.obj.rna <- CreateSeuratObject( 36 | counts = e18mouseRNA, 37 | assay = "RNA" 38 | ) 39 | e18.obj.atac <- CreateSeuratObject( 40 | counts = e18mouseATAC, 41 | assay = "RNA" 42 | ) 43 | 44 | e18.obj.rna$celltype 
<- e18$celltype 45 | e18.obj.atac$celltype <- e18$celltype 46 | 47 | e18.obj.rna@meta.data$domain <- "scRNA-seq" 48 | e18.obj.atac@meta.data$domain <- "scATAC-seq" 49 | 50 | 51 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/e18mouse") 52 | library(SeuratDisk) 53 | SaveH5Seurat(e18.obj.rna, filename = "e18mouse_RNA_v2.h5Seurat") 54 | Convert("e18mouse_RNA_v2.h5Seurat", dest = "h5ad") 55 | SaveH5Seurat(e18.obj.atac, filename = "e18mouse_ATAC_v2.h5Seurat") 56 | Convert("e18mouse_ATAC_v2.h5Seurat", dest = "h5ad") 57 | -------------------------------------------------------------------------------- /Archive/strong-link/10xe18/method_running/scj_add_prep_10xe18mouse.R: -------------------------------------------------------------------------------- 1 | # qlogin -now no 2 | # module unload python/python-3.6.2 3 | # module load python/python-3.8.2 4 | # conda activate MAESTRO 5 | # R 6 | # (the five lines above are shell commands to run in the terminal before starting R; commented out so this file parses as R) 7 | 8 | ### prepare scJoint input of h5 file ### 9 | 10 | library(Seurat) 11 | library(Signac) 12 | library(tables) 13 | library(reticulate) 14 | 15 | library(SingleCellExperiment) 16 | library(DropletUtils) 17 | library(scater) 18 | library(ggplot2) 19 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/data/10x_RNA_ATAC_EmbryonicMouseBrain/") 20 | 21 | e18=readRDS("e18.4.20210917.rds") 22 | 23 | table(e18$celltype) 24 | 25 | e18mouseRNA = e18@assays[["RNA"]]@counts 26 | 27 | e18mouseATAC = e18@assays[["ATAC"]]@counts 28 | 29 | DefaultAssay(e18)='ATAC' 30 | Annotation(e18) 31 | frags=UpdatePath(Fragments(e18)[[1]], new.path = 'e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz') 32 | Fragments(e18)=NULL 33 | e18=SetAssayData(e18, slot = "fragments", new.data = frags) 34 | 35 | e18gene.activities <- GeneActivity(e18) 36 | 37 | library(Seurat) 38 | library(Signac) 39 | library(EnsDb.Hsapiens.v86) 40 | library(GenomeInfoDb) 41 | library(dplyr) 42 | library(ggplot2) 43 | 44 | 45 | e18[["ACTIVITY"]] <- CreateAssayObject(counts = e18gene.activities) 46 | DefaultAssay(e18) 
<- "ACTIVITY" 47 | # SCTransform normalization and PCA dimensional reduction on gene activity 48 | e18<- SCTransform(e18, assay="ACTIVITY", verbose = FALSE, new.assay.name = 'SCT.ACTIVITY') %>% RunPCA(verbose=F, reduction.name = 'pca.activity') %>% RunUMAP(verbose=F, dims = 1:20, reduction='pca.activity', reduction.name='umap.activity') 49 | 50 | 51 | e18$celltype -> e18_celltype 52 | 53 | e18.obj.rna <- CreateSeuratObject( 54 | counts = e18mouseRNA, 55 | assay = "RNA" 56 | ) 57 | 58 | # Only keep common genes between two dataset 59 | common_genes <- intersect(rownames(e18.obj.rna), 60 | rownames(e18gene.activities)) 61 | length(common_genes) 62 | 63 | e18.obj.activity <- CreateSeuratObject( 64 | counts = e18gene.activities, 65 | assay = "RNA" 66 | ) 67 | 68 | ### create logcounts ### 69 | activity.sce <- as.SingleCellExperiment(e18.obj.activity) 70 | rna.sce <- as.SingleCellExperiment(e18.obj.rna) 71 | 72 | # Extract the logcounts data from sce object 73 | exprs_atac <- logcounts(activity.sce[common_genes, ]) 74 | exprs_rna <- logcounts(rna.sce[common_genes, ]) 75 | 76 | source("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/data_to_h5.R") 77 | #source("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/data_to_h5.R") 78 | write_h5_scJoint(exprs_list = list(rna = exprs_rna, 79 | atac = exprs_atac), 80 | h5file_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/exprs_10xe18_rna.h5", 81 | "/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/exprs_10xe18_atac.h5")) 82 | 83 | write_csv_scJoint(cellType_list = list(names(e18_celltype)), 84 | csv_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/cellname_cellType_10xe18.csv")) 85 | write_csv_scJoint(cellType_list = list(e18_celltype), 86 | csv_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/cellType_10xe18.csv")) 87 | 88 
| 89 | 90 | ### final output ### 91 | e18_predict_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/e18_predictlabel_celltype.csv") 92 | e18_rna_pred <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_rna_predictions.txt",header=F,sep=" ") 93 | e18_atac_pred_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_knn_predictions.txt",header=F) 94 | e18_atac_pred <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_predictions.txt",header=F,sep=" ") 95 | e18_idx_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/label_to_idx.txt",header=F,sep=" ") 96 | 97 | e18_atac_embed <-read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_embeddings.txt",header=F,sep=" ") 98 | e18_rna_embed <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_rna_embeddings.txt",header=F,sep=" ") 99 | -------------------------------------------------------------------------------- /Archive/strong-link/10xe18/prep_e18.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "prep_e18" 3 | output: html_document 4 | --- 5 | 6 | code to prep 10x e18 (mouse embryonic brain scATAC/scRNA multiome dataset from 10x genomics): 7 | rna information from "e18.4.20210917.rds" 8 | atac information from "e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz" 9 | 10 | @Nancy Zhang and @Sijia Huang for source of these two files or any preprocessing related to the original data. 11 | 12 | ```{r} 13 | # calculate gene activity score by signac 14 | # gene activity score is used by MaxFuse. 
Other methods directly use Fragments 15 | 16 | library(Signac) 17 | 18 | e18 = readRDS("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/e18.4.20210917.rds") 19 | DefaultAssay(e18)='ATAC' 20 | Annotation(e18) 21 | frags=UpdatePath(Fragments(e18)[[1]], 22 | new.path = '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz') 23 | Fragments(e18)=NULL 24 | e18=SetAssayData(e18, slot = "fragments", new.data = frags) 25 | gene.activities <- GeneActivity(e18) 26 | 27 | # csnk2a1 duplication problem 28 | temp = gene.activities[14703,] + gene.activities[14704,] 29 | gene.activities = gene.activities[-c(14703, 14704),] 30 | gene.activities = rbind(gene.activities,temp) 31 | rownames(gene.activities)[21977] = 'Csnk2a1' 32 | ``` 33 | 34 | 35 | ```{r} 36 | ## okay start saving out 37 | e18_rna_sct = as.data.frame(t(e18@assays$SCT@data)) 38 | e18_rna_sct_names = colnames(e18_rna_sct) 39 | 40 | library(Matrix) 41 | ## rna 42 | e18_rna_sct = as(as.matrix(e18_rna_sct), "dgCMatrix") 43 | writeMM(e18_rna_sct, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna.txt") 44 | write.csv(data.frame(names = e18_rna_sct_names), "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna_names.csv") 45 | 46 | ## atac_GAS 47 | e18_gas = as.data.frame(t(gene.activities)) 48 | e18_gas_names = colnames(e18_gas) 49 | e18_gas = as(as.matrix(e18_gas), "dgCMatrix") 50 | writeMM(e18_gas, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_GAS.txt") 51 | write.csv(data.frame(names = e18_gas_names), "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_GAS_names.csv") 52 | 53 | ## atac_lsi 54 | e18_lsi = e18@reductions$lsi@cell.embeddings[,c(2:50)] 55 | write.csv(e18_lsi, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_LSI49.csv") 56 | 57 | ## meta 58 | meta_data = e18@meta.data 59 | write.csv(meta_data, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_meta.csv") 60 | ``` 61 | 62 | 63 | 
############## produce RNA and ATAC embedding for slt and ari calculation 64 | ## ATAC embedding can just directly use LSI scores; RNA use PCA embedding; both 15 dimensions 65 | 66 | ```{r} 67 | ## rna 68 | rna = readMM("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna.txt") 69 | rna = as.matrix(rna) 70 | rna_names = read.csv('/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/rna_names.csv') # NOTE(review): the saving chunk writes 10x_e18_rna_names.csv; confirm rna_names.csv is the intended file 71 | colnames(rna) = rna_names$names 72 | rownames(rna) = paste0("cell",c(1:nrow(rna))) 73 | 74 | library(Seurat) 75 | temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna") 76 | temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm 77 | temp_obj1 = ScaleData(temp_obj1) 78 | temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000) 79 | temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1)) 80 | meta = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_meta.csv") 81 | pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)]) 82 | pca$label = meta$annotation 83 | write.csv(pca, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig_x.csv", row.names = F) 84 | 85 | # lsi 86 | lsi = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_LSI49.csv", row.names = 1) # first CSV column holds the row names written by write.csv; without row.names = 1 it would be counted as one of the 15 embedding columns 87 | lsi_orig = lsi[,c(1:15)] 88 | lsi_orig$label = meta$annotation 89 | write.csv(lsi_orig, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig_y.csv", row.names = F) 90 | ``` 91 | 92 | 93 | -------------------------------------------------------------------------------- /Archive/strong-link/10xpbmc/analysis/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure 
alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 15 | # - ari_mix: mixing via adjusted random index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | 27 | 28 | # calculate structure alignment metrics 29 | print(paste0(format(Sys.Date(), "%c"), ': calculating structure alignment metrics...')) 30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname, 31 | n_idx=n_idx, data_idx='x') 32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname, 33 | n_idx=n_idx, data_idx='y') 34 | #print(sam_x) 35 | #print(sam_y) 36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y) 37 | #print(metrics) 38 | # calculate Silhouette width 39 | print(paste0(format(Sys.Date(), "%c"), ': calculating Silhouette width...')) 40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 41 | #print(slt_res) 42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 43 | 
#print(metrics) 44 | # calculate ARI 45 | print(paste0(format(Sys.Date(), "%c"), ': calculating adjusted random index...')) 46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 48 | 49 | # calculate LISI 50 | print(paste0(format(Sys.Date(), "%c"), ': calculating Local Inverse Simpson’s Index...')) 51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2]) 53 | 54 | # calculate mixing averaged over clusters 55 | print(paste0(format(Sys.Date(), "%c"), ': calculating mixing quality...')) 56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname, 57 | n_idx=n_idx) 58 | metrics = metrics %>% add_column(avg_mix=avg_mix) 59 | 60 | # save metrics, because the calculation of kBET is substantially slower. 61 | write_csv(metrics, metrics_fname) 62 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...')) 63 | 64 | #### not calculating kBet here because too slow for this stage 65 | # calculate kBET 66 | #print(paste0(format(Sys.Date(), "%c"), ': calculating kBET...')) 67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 68 | #metrics = metrics %>% add_column(kBET=kbet_res) 69 | 70 | #write_csv(metrics, metrics_fname) 71 | #print(paste0(format(Sys.Date(), "%c"), ': done!')) -------------------------------------------------------------------------------- /Archive/strong-link/10xpbmc/analysis/step2.sh: -------------------------------------------------------------------------------- 1 | # no condo env requirement 2 | # calculate ari and slt f1 scores 3 | 4 | # for mf 5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' 
'/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/mf/full_embed' 0 & 6 | 7 | # for scjoint 8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/scJoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/scJoint/raw_labels/full_embed' 0 & 9 | 10 | # for maestro 11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/maestro/full_embed' 0 & 12 | 13 | # for glue 14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/glue/full_embed' 0 -------------------------------------------------------------------------------- /Archive/strong-link/10xpbmc/method_running/glue_pbmc_prepare_data_h5.R: -------------------------------------------------------------------------------- 1 | # module unload python/python-3.6.2 2 | # module load python/python-3.8.2 3 | # conda activate scglue2 4 | # R 5 | # (the four lines above are shell commands to run in the terminal before starting R; commented out so this file parses as R) 6 | library(anndata) 7 | #### pbmc data #### 8 | library(Seurat) 9 | library(Signac) 10 | library(EnsDb.Hsapiens.v86) 11 | library(GenomeInfoDb) 12 | library(dplyr) 13 | library(ggplot2) 14 | 15 | 16 | setwd("/home/mnt/nzh/nzhanglab/project/SingleCellAlignment/data/10x_RNA_ATAC_PBMC") 17 | load("pbmc_chromvar_annotated.rda") 18 | pbmc.rna = pbmc@assays[["RNA"]]@counts 19 | pbmc.obj.rna <- CreateSeuratObject( 20 | counts = pbmc.rna, 21 | assay = "RNA" 22 | ) 23 | 24 | pbmc.atac = pbmc@assays[["ATAC"]]@counts 25 | pbmc.obj.atac <- CreateSeuratObject( 26 | counts = pbmc.atac, 27 | assay = "RNA" 28 | ) 29 | 30 | pbmc$citeseq.celltype -> pbmc_celltype 31 | 32 | pbmc.obj.rna$celltype <- pbmc_celltype 33 | pbmc.obj.atac$celltype <- pbmc_celltype 34 | 35 | pbmc.obj.rna@meta.data$domain <- 
"scRNA-seq" 36 | pbmc.obj.atac@meta.data$domain <- "scATAC-seq" 37 | 38 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/pbmc") 39 | library(SeuratDisk) 40 | SaveH5Seurat(pbmc.obj.rna, filename = "pbmc_RNA_v2.h5Seurat") 41 | Convert("pbmc_RNA_v2.h5Seurat", dest = "h5ad") 42 | SaveH5Seurat(pbmc.obj.atac, filename = "pbmc_ATAC_v2.h5Seurat") 43 | Convert("pbmc_ATAC_v2.h5Seurat", dest = "h5ad") 44 | -------------------------------------------------------------------------------- /Archive/strong-link/10xpbmc/prep_pbmc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "prep_pbmc" 3 | output: html_document 4 | --- 5 | 6 | code to prep 10x pbmc (pbmc scATAC/scRNA multiome dataset from 10x genomics): 7 | rna information from "pbmc_chromvar_annotated.rda" 8 | atac information from "pbmc_chromvar_annotated.rda" # gene activity score pre calculated by signac already avaliable in the seurat object 9 | 10 | @Nancy Zhang and @Sijia Huang for source of this file or any preprocessing related to the original data. 
```{r}
# Load the annotated 10x PBMC multiome Seurat object (`pbmc`) and recompute
# SCTransform on the RNA assay, regressing out mitochondrial percentage.
load("/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/pbmc_chromvar_annotated.rda")

DefaultAssay(object = pbmc) <- "RNA"
# do sctransform as zmm requirement
pbmc <- PercentageFeatureSet(pbmc, pattern = "^MT-", col.name = "percent.mt")
pbmc[["SCT"]] <- NULL  # drop any existing SCT assay so it is rebuilt below
pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) # reached iteration limit
```

```{r}
## okay start saving out
library(Matrix)
data_dir = "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data"

## rna: SCT-normalised matrix, cells x genes, plus the gene names
pbmc_rna_sct = as.data.frame(t(pbmc@assays$SCT@data))
pbmc_rna_sct_names = colnames(pbmc_rna_sct)
pbmc_rna_sct = as(as.matrix(pbmc_rna_sct), "dgCMatrix")
writeMM(pbmc_rna_sct, file.path(data_dir, "10x_pbmc_rna.txt"))
write.csv(data.frame(names = pbmc_rna_sct_names), file.path(data_dir, "10x_pbmc_rna_names.csv"))

## atac_GAS: gene activity scores, cells x genes, plus the gene names
pbmc_gas = as.data.frame(t(pbmc@assays$ACTIVITY@data))
pbmc_gas_names = colnames(pbmc_gas)
pbmc_gas = as(as.matrix(pbmc_gas), "dgCMatrix")
writeMM(pbmc_gas, file.path(data_dir, "10x_pbmc_GAS.txt"))
write.csv(data.frame(names = pbmc_gas_names), file.path(data_dir, "10x_pbmc_GAS_names.csv"))

## atac_lsi: components 2..50 (the first LSI component is skipped)
pbmc_lsi = pbmc@reductions$lsi@cell.embeddings[,c(2:50)]
write.csv(pbmc_lsi, file.path(data_dir, "10x_pbmc_LSI49.csv"))  # row names (barcodes) are written too

## meta
meta_data = pbmc@meta.data
write.csv(meta_data, file.path(data_dir, "10x_pbmc_meta.csv"))
```


############## produce RNA and ATAC embedding for slt and ari calculation
## ATAC embedding can just directly use LSI scores; RNA use PCA embedding; both 15 dimensions

```{r}
library(Matrix)
library(Seurat)
data_dir = "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data"

## rna
rna = readMM(file.path(data_dir, "10x_pbmc_rna.txt"))
rna = as.matrix(rna)
# Read the gene-name file that the previous chunk actually wrote
# ("10x_pbmc_rna_names.csv"); the old code pointed at "rna_names.csv".
rna_names = read.csv(file.path(data_dir, "10x_pbmc_rna_names.csv"))
colnames(rna) = rna_names$names
rownames(rna) = paste0("cell",c(1:nrow(rna)))

temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna")
temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm
temp_obj1 = ScaleData(temp_obj1)
temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000)
# PCA is run on every feature, not only the variable ones selected above.
temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1))
meta = read.csv(file.path(data_dir, "10x_pbmc_meta.csv"))
pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)])
pca$label = meta$annotation
write.csv(pca, file.path(data_dir, "orig_x.csv"), row.names = F)

# lsi
# The LSI file was written with row names, so its first CSV column holds the
# cell barcodes. Consume it as row names; otherwise columns 1:15 would be the
# barcode column plus only 14 LSI dimensions.
lsi = read.csv(file.path(data_dir, "10x_pbmc_LSI49.csv"), row.names = 1)
lsi_orig = lsi[,c(1:15)]
lsi_orig$label = meta$annotation
write.csv(lsi_orig, file.path(data_dir, "orig_y.csv"), row.names = F)
```




--------------------------------------------------------------------------------
/Archive/strong-link/cortical/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

metrics_fname = args[1]
orig_fname = args[2]
embed_fname = args[3]
n_idx = as.integer(args[4])

# Compute the following metrics:
#   - sam_x: structure alignment metric for x data (the larger, the better)
#   - sam_y: structure alignment metric for y data (the larger, the better)
#   - slt_mix: mixing via Silhouette width (the larger, the better)
#   - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
#   - slt_f1: 
an integrated metric using both slt_mix and slt_clust (the larger, the better)
#   - ari_mix: mixing via adjusted random index (the larger, the better)
#   - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
#   - ari_f1: third column returned by ari(); presumably the ARI analogue of slt_f1 -- confirm in metrics.R
#   - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
#   - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
#   - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
#           NOTE: kBET is currently disabled (see the commented block at the end).
#   - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/")
# metrics.R is expected to define sam/slt/ari/lisi/mix and to load the packages
# providing read_csv, write_csv, add_column and %>% (none are loaded here).
source("metrics.R")

# Print a progress message prefixed with the current date and time.
# Sys.time() is used because format(Sys.Date(), "%c") always shows 00:00:00.
log_progress = function(msg) {
  print(paste0(format(Sys.time(), "%c"), ': ', msg))
}

# load existing metrics
metrics = read_csv(metrics_fname, col_types=cols())


# calculate structure alignment metrics
log_progress('calculating structure alignment metrics...')
sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='x')
sam_y = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='y')
metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)

# calculate Silhouette width (columns: mix, clust, f1)
log_progress('calculating Silhouette width...')
slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])

# calculate ARI (columns: mix, clust, f1)
log_progress('calculating adjusted random index...')
ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])

# calculate LISI (columns: mix, clust)
log_progress('calculating Local Inverse Simpson’s Index...')
lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])

# calculate mixing averaged over clusters
log_progress('calculating mixing quality...')
avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
              n_idx=n_idx)
metrics = metrics %>% add_column(avg_mix=avg_mix)

# save metrics, because the calculation of kBET is substantially slower.
# The input file is overwritten in place with the added columns.
write_csv(metrics, metrics_fname)
#log_progress('nearly done...')

#### not calculating kBet here because too slow for this stage
# calculate kBET
#log_progress('calculating kBET...')
#kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
#metrics = metrics %>% add_column(kBET=kbet_res)

#write_csv(metrics, metrics_fname)
#log_progress('done!')
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/analysis/step2.sh:
--------------------------------------------------------------------------------
# no conda env requirement
# calculate ari and slt f1 scores
# each call: calculate_metrics.R <metrics csv> <orig prefix> <embedding prefix> <n_idx>

# for mf
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/mf/full_embed' 0 &

# for scjoint
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/scjoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' 
'/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/scjoint/raw_labels/full_embed' 0 &

# for maestro
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/maestro/full_embed' 0 &

# for glue
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/glue/full_embed' 0

# block until the backgrounded jobs above have finished as well
wait
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/method_running/glue_greenleaf_prepare_data_h5.R:
--------------------------------------------------------------------------------
# Export the Greenleaf cortical multiome RNA and ATAC count matrices as
# separate h5Seurat / h5ad files for scGLUE.
#
# Environment setup used before starting R (shell commands, not R code; kept
# as comments so that this file parses as R):
#   module unload python/python-3.6.2
#   module load python/python-3.8.2
#   conda activate scglue2
#   R

library(anndata)
#### greenleaf data ####
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(GenomeInfoDb)
library(dplyr)
library(ggplot2)


setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/data/10x_RNA_ATAC_GreenleafCortical")

# provides the Seurat object `greenleaf`
load("Writeup14n_10x_greenleaf.RData")

greenleaf.rna = greenleaf@assays$RNA@counts
greenleaf.atac = greenleaf@assays$ATAC@counts

greenleaf.obj.rna <- CreateSeuratObject(
  counts = greenleaf.rna,
  assay = "RNA"
)

# NOTE(review): the ATAC counts are also stored under an assay named "RNA";
# presumably intentional for the h5ad conversion below -- confirm.
greenleaf.obj.atac <- CreateSeuratObject(
  counts = greenleaf.atac,
  assay = "RNA"
)

# both objects come from the same cells, so they share one cell-type vector
greenleaf_celltype <- greenleaf$celltype

greenleaf.obj.rna$celltype <- greenleaf_celltype
greenleaf.obj.atac$celltype <- greenleaf_celltype

greenleaf.obj.rna@meta.data$domain <- "scRNA-seq"
greenleaf.obj.atac@meta.data$domain <- "scATAC-seq"


setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/greenleaf")
library(SeuratDisk)
# write h5Seurat first, then convert each file to h5ad
SaveH5Seurat(greenleaf.obj.rna, filename = "greenleaf_RNA_v2.h5Seurat")
Convert("greenleaf_RNA_v2.h5Seurat", dest = "h5ad")
SaveH5Seurat(greenleaf.obj.atac, filename = "greenleaf_ATAC_v2.h5Seurat")
Convert("greenleaf_ATAC_v2.h5Seurat", dest = "h5ad")

--------------------------------------------------------------------------------
/Archive/strong-link/cortical/prep_cortical.Rmd:
--------------------------------------------------------------------------------
---
title: "prep_cortical"
output: html_document
---

code to prep the human cerebral cortical data (scATAC/scRNA multiome dataset from https://pubmed.ncbi.nlm.nih.gov/34390642/ by greenleaf lab):
rna information from "Writeup14n_10x_greenleaf.RData"
atac information from "Writeup14n_10x_greenleaf.RData" # gene activity score pre-calculated by signac, already available in the seurat object

@Nancy Zhang and @Sijia Huang for source of this file.

This processing code written by Zongming Ma

```{r}
# Packages this chunk relies on (previously assumed to be attached already):
library(Matrix)  # writeMM
library(Seurat)  # DefaultAssay
library(Signac)  # RunTFIDF, FindTopFeatures, RunSVD

load("Writeup14n_10x_greenleaf.RData")

# cells x features matrices
dat1=t(greenleaf@assays$SCT@data)
dat2=t(greenleaf@assays$geneActivity@data)
names1= colnames(dat1)
names2= colnames(dat2)

# Strip the "ATAC-" prefix from the gene-activity feature names. sub() keeps
# one output per input name, so names without the prefix cannot shift the
# alignment (the old strsplit/every-second-element approach could).
names2=sub("^ATAC-", "", names2)

sum(names1 %in% names2)
sum(names2 %in% names1)

# make sure column names match
mm = match(names1, names2)
dat1=dat1[,!is.na(mm)]
dat2=dat2[,mm[!is.na(mm)]]

# subset the RNA assay data to the same genes, in the same order as dat1
dat0=t(greenleaf@assays$RNA@data)
names0 = colnames(dat0)
names1 = colnames(dat1)
sum(names0 %in% names1)
sum(names1 %in% names0)
mm0 = match(names1, names0)
dat0=dat0[,mm0[!is.na(mm0)]]

dim(dat0)
# dim(dat1)
dim(dat2)
# make sure row names match
# rnacells=rownames(dat1)
rnacells=rownames(dat0)
ataccells=rownames(dat2)
plot(match(rnacells, ataccells))  # visual check: should be the identity line

# write out the cell metadata.
write.table(greenleaf@meta.data, file="greenleaf_cortical_meta.csv", sep=",", col.names=TRUE, row.names=FALSE)
# write out names of genes for RNA.
rna_names = as.data.frame(colnames(dat1))
colnames(rna_names)="names"
write.table(rna_names, file="greenleaf_cortical_rna_names.csv", sep=",", col.names=TRUE, row.names=TRUE)

# write out the sparse matrices.
writeMM(dat0, "greenleaf_cortical_RNAcount_data.mtx")
writeMM(dat1, "greenleaf_cortical_SCT_data.mtx")
writeMM(dat2, "greenleaf_cortical_GENEACTIVITY_data.mtx")

# calculate additional LSI scores
DefaultAssay(greenleaf) <- "ATAC"
greenleaf <- RunTFIDF(greenleaf)
greenleaf <- FindTopFeatures(greenleaf, min.cutoff = 'q0')
greenleaf <- RunSVD(greenleaf, n = 200)

dat3 = greenleaf@reductions$lsi@cell.embeddings
dim(dat3)
peakcells = rownames(dat3)
plot(match(rnacells, peakcells)) # make sure row names match.
# Drop the first LSI component. With row.names=TRUE and col.names=TRUE the
# header has one field fewer than the data rows, so read.csv() later treats
# the first column as row names automatically.
write.table(dat3[,-1], file="greenleaf_cortical_peak_lsi.csv", sep=",", col.names=TRUE, row.names=TRUE)
```


############## produce RNA and ATAC embedding for slt and ari calculation
## ATAC embedding can just directly use LSI scores; RNA use PCA embedding; both 15 dimensions

```{r}
library(Matrix)  # readMM
library(Seurat)

## rna
# NOTE(review): this reads the RNA-assay matrix (dat0), while the comment on
# SetAssayData below says the input is already SCT-normalised -- confirm which
# matrix is intended.
rna = readMM("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_RNAcount_data.mtx")
rna = as.matrix(rna)
rna_names = read.csv('/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_rna_names.csv')
colnames(rna) = rna_names$names
rownames(rna) = paste0("cell",c(1:nrow(rna)))

temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna")
temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm
temp_obj1 = ScaleData(temp_obj1)
temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000)
# PCA is run on every feature, not only the variable ones selected above.
temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1))
meta = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_meta.csv")
pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)])
# NOTE(review): assumes the metadata has an `annotation` column; if it does
# not, this assignment silently adds no label column. The scGLUE export for
# the same object uses `celltype` -- verify the column name.
pca$label = meta$annotation
write.csv(pca, "/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig_x.csv", row.names = F)

# lsi
lsi = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_peak_lsi.csv")
lsi_orig = lsi[,c(1:15)]
lsi_orig$label = meta$annotation
write.csv(lsi_orig, "/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig_y.csv", row.names = F)
```






--------------------------------------------------------------------------------
/Archive/strong-link/retina/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

metrics_fname = args[1]
orig_fname = args[2]
embed_fname = args[3]
n_idx = as.integer(args[4])

# Compute the following metrics:
#   - sam_x: structure alignment metric for x data (the larger, the better)
#   - sam_y: structure alignment metric for y data (the larger, the better)
#   - slt_mix: mixing via Silhouette width (the larger, the better)
#   - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
#   - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
#   - ari_mix: mixing via adjusted random index (the larger, the better)
#   - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
#   - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
#   - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
#   - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
#   - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/")
source("metrics.R")

# Print a progress message prefixed with the current date and time.
# Sys.time() is used because format(Sys.Date(), "%c") always shows 00:00:00.
log_progress = function(msg) {
  print(paste0(format(Sys.time(), "%c"), ': ', msg))
}

# load existing metrics
metrics = read_csv(metrics_fname, col_types=cols())


# calculate structure alignment metrics
log_progress('calculating structure alignment metrics...')
sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='x')
sam_y = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='y')
metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)

# calculate Silhouette width (columns: mix, clust, f1)
log_progress('calculating Silhouette width...')
slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])

# calculate ARI (columns: mix, clust, f1)
log_progress('calculating adjusted random index...')
ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])

# calculate LISI (columns: mix, clust)
log_progress('calculating Local Inverse Simpson’s Index...')
lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])

# calculate mixing averaged over clusters
log_progress('calculating mixing quality...')
avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
              n_idx=n_idx)
metrics = metrics %>% add_column(avg_mix=avg_mix)

# save metrics, because the calculation of kBET is substantially slower.
# The input file is overwritten in place with the added columns.
write_csv(metrics, metrics_fname)
#log_progress('nearly done...')

#### not calculating kBet here because too slow for this stage
# calculate kBET
#log_progress('calculating kBET...')
#kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
#metrics = metrics %>% add_column(kBET=kbet_res)

#write_csv(metrics, metrics_fname)
#log_progress('done!')
--------------------------------------------------------------------------------
/Archive/strong-link/retina/analysis/step2.sh:
--------------------------------------------------------------------------------
# no conda env requirement
# calculate ari and slt f1 scores
# each call: calculate_metrics.R <metrics csv> <orig prefix> <embedding prefix> <n_idx>

# for mf
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/mf/full_embed' 0 &

# for scjoint
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/scj/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/scj/full_embed' 0 &

# for maestro
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/ms/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/ms/full_embed' 0 &

# for glue
/usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/glue/full_embed' 0

# block until the backgrounded jobs above have finished as well
wait
--------------------------------------------------------------------------------
/Archive/strong-link/retina/method_running/retina_prepare_data_h5.R:
--------------------------------------------------------------------------------
# Export the retina RNA and peak count matrices (restricted to the cells listed
# in data/meta_20k.csv) as h5Seurat / h5ad files for scGLUE.
#
# Environment setup used before starting R (shell commands, not R code; kept
# as comments so that this file parses as R):
#   module unload python/python-3.6.2
#   module load python/python-3.8.2
#   conda activate /home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2
#   R

#### retina data ####
library(Seurat)
library(Signac)
library(GenomeInfoDb)
library(dplyr)
library(ggplot2)


setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/data/Retina/")
retina<-readRDS("data/retina_peak.rds")

retina.rna = retina@assays$RNA@counts

# The earlier full-metadata lookup (data/meta20k.csv -> meta1 ->
# retina_celltype) was removed: its result was overwritten below before any
# use, so it only added an extra file dependency.

# Metadata of the selected cells; the first column holds the cell barcodes.
meta_subset <- read.csv("data/meta_20k.csv")
colnames(meta_subset)[1]<-c("barcode")
subset_retina.rna <- retina.rna[,meta_subset$barcode]
# labels are in the same order as the subset columns selected above
retina_celltype <- meta_subset$annotation


retina.obj.rna <- CreateSeuratObject(
  counts = subset_retina.rna,
  assay = "RNA"
)
retina.obj.rna$celltype <- retina_celltype


retina.atac = retina@assays$peak@counts
subset_retina.atac <- retina.atac[,meta_subset$barcode]


# NOTE(review): the peak counts are also stored under an assay named "RNA";
# presumably intentional for the h5ad conversion below -- confirm.
retina.obj.atac <- CreateSeuratObject(
  counts = subset_retina.atac,
  assay = "RNA"
)

retina.obj.atac$celltype <- retina_celltype

retina.obj.rna@meta.data$domain <- "scRNA-seq"
retina.obj.atac@meta.data$domain <- "scATAC-seq"


setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/retina")
library(SeuratDisk)
# write h5Seurat first, then convert each file to h5ad
SaveH5Seurat(retina.obj.rna, filename = "retina_RNA.h5Seurat")
Convert("retina_RNA.h5Seurat", dest = "h5ad")
SaveH5Seurat(retina.obj.atac, filename = "retina_ATAC.h5Seurat")
Convert("retina_ATAC.h5Seurat", dest = "h5ad")

--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Positional arguments: <metrics csv> <orig file prefix> <embedding file prefix> <n_idx>
args = commandArgs(trailingOnly=TRUE)

metrics_fname = args[1]
orig_fname = args[2]
embed_fname = args[3]
n_idx = as.integer(args[4])

# Compute the following metrics:
#   - sam_x: structure alignment metric for x data (the larger, the better)
#   - sam_y: structure alignment metric for y data (the larger, the better)
#   - slt_mix: mixing via Silhouette width (the larger, the better)
#   - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
#   - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
#   - ari_mix: mixing via adjusted random index (the larger, the better)
#   - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
#   - ari_f1: third column returned by ari(); presumably the ARI analogue of slt_f1 -- confirm in metrics.R
#   - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
#   - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
#   - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
#           NOTE: kBET is currently disabled (see the commented block at the end).
#   - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
setwd("./")
# metrics.R is expected to define sam/slt/ari/lisi/mix and to load the packages
# providing read_csv, write_csv, add_column and %>% (none are loaded here).
source("metrics.R")

# Print a progress message prefixed with the current date and time.
# Sys.time() is used because format(Sys.Date(), "%c") always shows 00:00:00.
log_progress = function(msg) {
  print(paste0(format(Sys.time(), "%c"), ': ', msg))
}

# load existing metrics
metrics = read_csv(metrics_fname, col_types=cols())


# calculate structure alignment metrics
log_progress('calculating structure alignment metrics...')
sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='x')
sam_y = sam(orig_fname=orig_fname, embed_fname=embed_fname,
            n_idx=n_idx, data_idx='y')
metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)

# calculate Silhouette width (columns: mix, clust, f1)
log_progress('calculating Silhouette width...')
slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])

# calculate ARI (columns: mix, clust, f1)
log_progress('calculating adjusted random index...')
ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])

# calculate LISI (columns: mix, clust)
log_progress('calculating Local Inverse Simpson’s Index...')
lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])

# calculate mixing averaged over clusters
log_progress('calculating mixing quality...')
avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
              n_idx=n_idx)
metrics = metrics %>% add_column(avg_mix=avg_mix)

# save metrics, because the calculation of kBET is substantially slower.
# The input file is overwritten in place with the added columns.
write_csv(metrics, metrics_fname)
#log_progress('nearly done...')

#### not calculating kBet here because too slow for this stage
# calculate kBET
#log_progress('calculating kBET...')
#kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
#metrics = metrics %>% add_column(kBET=kbet_res)

#write_csv(metrics, metrics_fname)
#log_progress('done!')
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/harm_cite.R:
--------------------------------------------------------------------------------
#script for harmony fusion (run through Seurat)
library(Seurat)
library(Matrix)
library(matrixStats)
library(harmony)
# read in files
out_root = "/ICICLE/output/"
in_root = "/ICICLE/data/"
out_indx = 15  # number of PCs / harmony dimensions written out

out_dir =paste0(out_root,"hm/")
in_dir = in_root
dir.create(out_root)
dir.create(out_dir)
# read

protein = read.csv(paste0(in_dir,"adt.csv"))
# read.csv turned "-" into "."; restore it and drop any trailing "-"
colnames(protein) = gsub('\\.','-', colnames(protein))
colnames(protein) = gsub('-$','', colnames(protein))
protein$cell_barcode <- NULL
protein$total <- NULL

meta = read.csv(paste0(in_dir,"atac_meta.csv"))

atacactivity = readMM(paste0(in_dir,"genescore_tea.txt"))
atacactivity = as.matrix(atacactivity)
gas_names = read.csv(paste0(in_dir ,'genescore_names_tea.csv'))
colnames(atacactivity) = gas_names$names

# change name
# NOTE(review): this path is absolute ('/conversion_v12.csv') while the sibling
# liger/seurat scripts read 'conversion_v12.csv' from the working directory --
# confirm which one is intended.
correspondence = read.csv('/conversion_v12.csv')
correspondence = correspondence[!apply(correspondence == "", 1, all),]  # drop fully empty rows
rna_list = c()
protein_list = c()
# Column 1 is the protein name, column 2 the matching gene name(s) separated
# by '/'; rows whose gene entry contains "Ignore" are skipped.
for (i in c(1:dim(correspondence)[1])){
  protein_n = as.character(correspondence[i,1])
  rna_n = as.character(correspondence[i,2])
  if (grepl("Ignore", rna_n, fixed = TRUE)){
    next
}
  # keep every listed gene that has a gene-activity score, paired with its protein
  rna_n = strsplit(rna_n, '/')[[1]]
  for(r in rna_n){
    if (r %in% gas_names$names){
      rna_list = c(rna_list, r)
      protein_list = c(protein_list, protein_n)
    }
  }
}

act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity (ATAC) side
protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein (ADT) side
colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same

# copy sp filtering to produce better output: drop near-constant features
act.shared.sub = act.shared[,colSds(act.shared)>0.36]
protein.shared.sub = protein.shared[,colSds(protein.shared)>3.6]
# cell names: "d1<i>" for the ATAC cells, "d2<i>" for the protein cells
rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
# then we construct the seurat objects
x_obj=CreateSeuratObject(counts=t(act.shared.sub),assay="x")
#x_obj <- NormalizeData(x_obj)
#x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
x_obj <- ScaleData(x_obj, features = rownames(x_obj))
# add seurat object for data y
y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
y_obj <- NormalizeData(y_obj)
y_obj <- ScaleData(y_obj, features = rownames(y_obj))
#list_modality=list(x_obj,y_obj)
# get shared clean features
features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub))
# run harmony in seurat, need to make a new seurat object
xy_obj = CreateSeuratObject(counts=cbind(t(act.shared.sub[,features]), t(protein.shared.sub[,features])))
# cbind together, scale within modality is better
xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
# Batch label per cell: "x" for the ATAC cells, "y" for the protein cells.
# Both groups used to be labelled "x", which left Harmony with a single batch
# and therefore nothing to integrate.
xy_obj@meta.data$orig = c(rep("x",dim(act.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1]))
xy_obj <- RunHarmony(xy_obj, "orig")
embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
name_1 = "full_embed_x0.csv"
name_2 = "full_embed_y0.csv"
# does not directly produce matching info, produce later using knn with embedding distance matrix
# the first ncol(x_obj) rows are the x cells, the remaining rows the y cells
write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
          row.names=FALSE) # need to decide output pca cell
write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
          paste0(out_dir,name_2), row.names=FALSE) # need to decide
write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
##

--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/liger_cite.R:
--------------------------------------------------------------------------------
#liger benchmark
library(rliger)
library(Matrix)
library(matrixStats)
# read in files
out_root = "/ICICLE/output/"
in_root = "/ICICLE/data/"
out_indx = 15  # number of embedding dimensions written out

out_dir =paste0(out_root,"lg/")
in_dir = in_root
dir.create(out_root)
dir.create(out_dir)
# read

protein = read.csv(paste0(in_dir,"adt.csv"))
# read.csv turned "-" into "."; restore it and drop any trailing "-"
colnames(protein) = gsub('\\.','-', colnames(protein))
colnames(protein) = gsub('-$','', colnames(protein))
protein$cell_barcode <- NULL
protein$total <- NULL

meta = read.csv(paste0(in_dir,"atac_meta.csv"))

atacactivity = readMM(paste0(in_dir,"genescore_tea.txt"))
atacactivity = as.matrix(atacactivity)
gas_names = read.csv(paste0(in_dir ,'genescore_names_tea.csv'))
colnames(atacactivity) = gas_names$names

# change name
correspondence = read.csv('conversion_v12.csv')
correspondence = correspondence[!apply(correspondence == "", 1, all),]  # drop fully empty rows
rna_list = c()
protein_list = c()
# Column 1 is the protein name, column 2 the matching gene name(s) separated
# by '/'; rows whose gene entry contains "Ignore" are skipped.
for (i in c(1:dim(correspondence)[1])){
  protein_n = as.character(correspondence[i,1])
  rna_n = as.character(correspondence[i,2])
  if (grepl("Ignore", rna_n, fixed = TRUE)){
    next
  }
  rna_n = strsplit(rna_n, '/')[[1]]
  for(r in rna_n){
    if (r %in% gas_names$names){
      rna_list = c(rna_list, r)
      protein_list = c(protein_list, protein_n)
    }
  }
}

act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity (ATAC) side
protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein (ADT) side
colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same

# copy sp filtering to produce better output: drop near-constant features
act.shared.sub = act.shared[,colSds(act.shared)>0.36]
protein.shared.sub = protein.shared[,colSds(protein.shared)>3.6]
# cell names: "d1<i>" for the ATAC cells, "d2<i>" for the protein cells
rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
# then we construct the liger objects
ligerobj=createLiger( list(x = t(act.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
###Start integration
features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality
# default preprocessing
ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
# do not need to select genes
#ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
ligerobj@var.genes=features # just use all
ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE)
ligerobj <- quantile_norm(ligerobj)
embedding = ligerobj@H.norm[,c(1:out_indx)]
name_1 = "full_embed_x0.csv"
name_2 = "full_embed_y0.csv"
# no available matching information from liger thus not saved out
# will use knn to search matching on embedding in downstream analysis

# check what cell is filtered out: liger may drop cells, so compare the input
# cell names with the row names of the normalised factor matrix
all_cells = c(rownames(act.shared.sub), rownames(protein.shared.sub))
filtered = all_cells[!(all_cells %in% rownames(ligerobj@H.norm))]
filtered_x = grep("^d1", filtered, value = TRUE)
filtered_y = grep("^d2", filtered, value = TRUE)
# 1-based positions of the dropped cells within their own modality
filtered_id = as.integer(sub("^d1", "", filtered_x))
filtered_id_y = as.integer(sub("^d2", "", filtered_y))

# Split the embedding by cell-name prefix instead of hard-coded row ranges
# (formerly 1:7472 and 7473:14954), so the split stays correct whatever number
# of cells liger removes.
is_x = grepl("^d1", rownames(embedding))
write.csv(embedding[is_x, , drop = FALSE],
          paste0(out_dir,name_1), row.names=FALSE) # some cells got deleted during liger process
write.csv(embedding[!is_x, , drop = FALSE],
          paste0(out_dir,name_2), row.names=FALSE)
write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)

#### cells got filtered out, remake the pca lsi embedding files for downstream calc of slt and ari scores
orig_x = read.csv("/ICICLE/data/orig_x.csv")
orig_y = read.csv("/ICICLE/data/orig_y.csv")

# Keep rows by positive index: x[-integer(0), ] would drop every row when no
# cell was filtered. Row i of orig_x / meta is assumed to be cell "d1<i>"
# (same assumption as before); likewise row i of orig_y is assumed to be
# cell "d2<i>" -- TODO confirm.
keep_x = setdiff(seq_len(nrow(orig_x)), filtered_id)
keep_y = setdiff(seq_len(nrow(orig_y)), filtered_id_y)
keep_meta = setdiff(seq_len(nrow(meta)), filtered_id)

write.csv(orig_x[keep_x,], "/ICICLE/data/orig_lg_x.csv" , row.names=FALSE)
write.csv(orig_y[keep_y,], "/ICICLE/data/orig_lg_y.csv" , row.names=FALSE)
write.csv(meta[keep_meta,], "/ICICLE/data/atac_meta_lgdrop.csv" , row.names=FALSE)
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/seurat_cite.R:
--------------------------------------------------------------------------------
#seurat benchmark
library(Seurat)
library(Matrix)
library(matrixStats)
# read in files
out_root = "/ICICLE/output/"
in_root = "/ICICLE/data/"
out_indx = 15

out_dir =paste0(out_root,"sr/")
in_dir = in_root
dir.create(out_root)
dir.create(out_dir)
# read

protein = read.csv(paste0(in_dir,"adt.csv"))
colnames(protein) = gsub('\\.','-', colnames(protein))
colnames(protein) = gsub('-$','', 
colnames(protein)) 19 | protein$cell_barcode <- NULL 20 | protein$total <- NULL 21 | 22 | meta = read.csv(paste0(in_dir,"atac_meta.csv")) 23 | 24 | atacactivity = readMM(paste0(in_dir,"genescore_tea.txt")) 25 | atacactivity = as.matrix(atacactivity) 26 | gas_names = read.csv(paste0(in_dir ,'genescore_names_tea.csv')) 27 | colnames(atacactivity) = gas_names$names 28 | 29 | # change name 30 | correspondence = read.csv('conversion_v12.csv') 31 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 32 | rna_list = c() 33 | protein_list = c() 34 | for (i in c(1:dim(correspondence)[1])){ 35 | protein_n = as.character(correspondence[i,1]) 36 | rna_n = as.character(correspondence[i,2]) 37 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 38 | next 39 | } 40 | rna_n = strsplit(rna_n, '/')[[1]] 41 | for(r in rna_n){ 42 | if (r %in% gas_names$names){ 43 | rna_list = c(rna_list, r) 44 | protein_list = c(protein_list, protein_n) 45 | } 46 | } 47 | } 48 | 49 | act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # protein object 50 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 51 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 52 | 53 | # copy sp filtering 54 | act.shared.sub = act.shared[,colSds(act.shared)>0.36] 55 | protein.shared.sub = protein.shared[,colSds(protein.shared)>3.6] 56 | rownames(act.shared.sub) = as.character(c(1:nrow(act.shared.sub))) 57 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 58 | 59 | # then we construct the seurat objects 60 | x_obj=CreateSeuratObject(counts=t(act.shared.sub),assay="x") 61 | #x_obj <- NormalizeData(x_obj) # atac skip norm 62 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case 63 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 64 | # add suerat object datay 65 | 
y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 66 | y_obj <- NormalizeData(y_obj) 67 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 68 | list_modality=list(x_obj,y_obj) 69 | # get transfer anchor 70 | features=intersect(rownames(x_obj),rownames(y_obj)) 71 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj, 72 | dims = 1:20, features = features) 73 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj), 74 | dims = 1:20) 75 | full_df = data.frame(idx2 = c(1:length(predictions$predicted.id)) -1, idx1 = as.integer(predictions$predicted.id) -1, 76 | score = predictions$prediction.score.max) # mind the r index difference 77 | 78 | # get integration embedding 79 | print("starting seurat integration") 80 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality, 81 | dims = 1:20, anchor.features =features, k.filter = 10) 82 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 20) 83 | # 84 | DefaultAssay(xy_int) <- "integrated" 85 | xy_int <- ScaleData(xy_int, verbose = FALSE) 86 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion 87 | embedding = xy_int@reductions$pca@cell.embeddings 88 | name_1 = "full_embed_x0.csv" 89 | name_2 = "full_embed_y0.csv" 90 | #pathout = out_dir 91 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell 92 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 93 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 94 | write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # need to decide 95 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 96 | -------------------------------------------------------------------------------- /Archive/teaseq-pbmc/code/benchmark/step1.sh: 
-------------------------------------------------------------------------------- 1 | ## run this in the algo python conda env; the methods run in parallel as background jobs 2 | python maxfuse_cite.py & 3 | /usr/bin/Rscript seurat_cite.R & 4 | /usr/bin/Rscript liger_cite.R & 5 | /usr/bin/Rscript harm_cite.R & 6 | /usr/bin/Rscript bsc_cite.R 7 | wait # block until every background job has finished, so step2.sh never starts on missing outputs -------------------------------------------------------------------------------- /Archive/teaseq-pbmc/code/benchmark/step2.sh: -------------------------------------------------------------------------------- 1 | # no conda env requirement 2 | # for mf 3 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/mf/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/mf/full_embed' 0 & 4 | 5 | # for sr 6 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/sr/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/sr/full_embed' 0 & 7 | 8 | # for lg 9 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/lg/metrics.csv' '/ICICLE/data/orig_lg' '/ICICLE/output/lg/full_embed' 0 & 10 | 11 | # for hm 12 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/hm/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/hm/full_embed' 0 & 13 | 14 | # for bsc 15 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/bsc/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/bsc/full_embed' 0 -------------------------------------------------------------------------------- /Archive/tonsil/code/analysis/plot_tonsil_met.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "plot_tonsil_met" 3 | output: html_document 4 | --- 5 | 6 | Script to produce metrics related to the batched data ( 10k 30k, 5 batches in total). 
7 | 8 | 9 | ```{r} 10 | library(ggplot2) 11 | metrics = read.csv("tonsil_v2/match/bench_out/batch_metrics_resultV2.csv") 12 | metrics$method <- factor(metrics$method,levels = c("mf", "sr", "lg", "hm","bsc")) 13 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA") 14 | ``` 15 | 16 | 17 | ```{r} 18 | library(dplyr) 19 | # Data 20 | data <- metrics %>% select(method, ann1) 21 | #data$foscttm = 1 - data$foscttm 22 | # Calculates mean, sd, se and IC 23 | my_sum <- data %>% 24 | group_by(method) %>% 25 | dplyr::summarise( 26 | n=n(), 27 | mean=mean(ann1), 28 | sd=sd(ann1) 29 | ) %>% 30 | mutate( se=sd/sqrt(n)) %>% 31 | mutate( ic=se * qt((1-0.05)/2 + .5, n-1)) 32 | 33 | # Standard deviation 34 | 35 | 36 | ### this is batched matching accuracy 37 | p = ggplot() + 38 | geom_bar(data=my_sum, aes(x=method, y=mean, fill=method), stat="identity", alpha=0.7, width = 0.4) + 39 | geom_errorbar(data=my_sum, aes(x=method, ymin=mean-sd, ymax=mean+sd), width=0.08, colour="black", alpha=0.9, size=0.2) + 40 | ggtitle("using standard deviation") + theme_minimal() + scale_fill_manual(values = colorv) + coord_cartesian(ylim=c(0.25,0.97)) + 41 | geom_point(data=data, aes(y=ann1, x=method, fill=method),alpha=0.5, size=0.5) 42 | #ggsave("tonsil_v2/plots/batch_ann.svg", height = 3, width = 4.5) 43 | p 44 | ``` 45 | 46 | ```{r} 47 | # slt and ari f1 48 | 49 | p = ggplot(metrics) + geom_point(aes(x = slt_f1, y = ari_f1, color = method), size =2, alpha = 0.5) + 50 | theme_minimal() + scale_color_manual(values = colorv) + 51 | ylim(c(0.35,0.62)) + xlim(c(0.35,0.55)) 52 | #ggsave("tonsil_v2/plots/slt_ari.svg", height = 3, width = 4.5) 53 | p 54 | ``` 55 | 56 | ```{r} 57 | # closed up version 58 | p = ggplot(metrics) + geom_point(aes(x = slt_f1, y = ari_f1, color = method), size =2, alpha = 0.5) + 59 | theme_minimal() + scale_color_manual(values = colorv) + 60 | ylim(c(0.56,0.62)) + xlim(c(0.51,0.55)) 61 | ggsave("tonsil_v2/plots/slt_ari_small.svg", height = 3, width = 4.5) 62 | p 63 | 
``` 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/calculate_metrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | metrics_fname = args[1] 5 | orig_fname = args[2] 6 | embed_fname = args[3] 7 | n_idx = as.integer(args[4]) 8 | 9 | # Compute the following metrics: 10 | # - sam_x: structure alignment metric for x data (the larger, the better) 11 | # - sam_y: structure alignment metric for y data (the larger, the better) 12 | # - slt_mix: mixing via Silhouette width (the larger, the better) 13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better) 14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better) 15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better) 16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better) 17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) 18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) 19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) 20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) 21 | setwd("./") 22 | source("metrics.R") 23 | 24 | # load existing metrics 25 | metrics = read_csv(metrics_fname, col_types=cols()) 26 | 27 | # calculate Silhouette width (progress stamp uses Sys.time, not Sys.Date, so it carries the time of day) 28 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...')) 29 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 30 | #print(slt_res) 31 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3]) 32 | #print(metrics) 33 | # calculate ARI (adjusted Rand index), same columns layout as the Silhouette result: mix, clust, f1 34 | 
print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...')) 35 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx) 36 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3]) 37 | write_csv(metrics, metrics_fname) -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/method_running/harm_batch.R: -------------------------------------------------------------------------------- 1 | #harmony benchmark on 10k30k 5 batch cells, result used to produce matching accu and slt ari repeats 2 | library(Seurat) 3 | library(Matrix) 4 | library(matrixStats) 5 | library(harmony) 6 | # read in files 7 | out_root = "/tonsil_v2/match/bench_out/" 8 | in_root = "/tonsil_v2/match/bench_input/" 9 | batch = 5 10 | out_indx = 15 11 | 12 | for(i in c(1:5)){ 13 | batch_name = paste0("b",as.character(i),"/") 14 | out_dir =paste0(out_root,batch_name,"hm/") 15 | in_dir = paste0(in_root,batch_name) 16 | dir.create(paste0(out_root,batch_name)) 17 | dir.create(out_dir) 18 | # read 19 | rna = readMM(paste0(in_dir,"rna.txt")) 20 | protein = read.csv(paste0(in_dir,"pro.csv")) 21 | meta_rna = read.csv(paste0(in_dir,"meta_rna.csv")) 22 | meta_pro = read.csv(paste0(in_dir,"meta_pro.csv")) 23 | 24 | # note this version caused name different, correct back 25 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV' 26 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR' 27 | 28 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same 29 | colnames(rna) = rna_names$names 30 | # change name 31 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV8.csv') 32 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 33 | rna_list = c() 34 | protein_list = c() 35 | for (j in c(1:dim(correspondence)[1])){ 36 | protein_n = 
as.character(correspondence[j,1]) 37 | rna_n = as.character(correspondence[j,2]) 38 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 39 | next 40 | } 41 | rna_n = strsplit(rna_n, '/')[[1]] 42 | for(r in rna_n){ 43 | if (r %in% rna_names$names){ 44 | rna_list = c(rna_list, r) 45 | protein_list = c(protein_list, protein_n) 46 | } 47 | } 48 | } 49 | # change name end 50 | # first filtering step should be same as in sp 51 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 52 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 53 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 54 | # copy sp filtering 55 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 56 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 57 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 58 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 59 | # then we construct the seurat objects 60 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 61 | x_obj <- NormalizeData(x_obj) 62 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 63 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 64 | # add suerat object datay 65 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 66 | y_obj <- NormalizeData(y_obj) 67 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 68 | #list_modality=list(x_obj,y_obj) 69 | # get shared clean features 70 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 71 | # run harmony in seurat, need to make a new seurat object 72 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 73 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], 
y_obj@assays$y@scale.data[features,])) # takes very long 74 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = 15, verbose = FALSE) 75 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) 76 | # modality label per cell (x = rna rows, y = protein rows): harmony needs two distinct groups to align; cbind together, scale within modality is better 77 | xy_obj <- xy_obj %>% RunHarmony("orig") 78 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 79 | name_1 = "full_embed_x0.csv" 80 | name_2 = "full_embed_y0.csv" 81 | # does not directly produce matching info, produce later using knn with embedding distance matrix 82 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 83 | row.names=FALSE) # need to decide output pca cell 84 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 85 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 86 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 87 | } -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/method_running/hm_full.R: -------------------------------------------------------------------------------- 1 | # full data set run for related spatial analysis 2 | # for harmony 3 | 4 | library(Seurat) 5 | library(harmony) 6 | library(Matrix) 7 | library(matrixStats) 8 | 9 | root_dir = '/tonsil_v2/' 10 | out_dir = '/tonsil_v2/match/match_output/full/' 11 | out_indx = 15 12 | 13 | ## 14 | out_dir =paste0(out_dir,"hm/") 15 | dir.create(out_dir) 16 | 17 | rna = readMM(paste0(root_dir,"/RNA/tonsil_rna_0510.txt")) 18 | protein = read.csv(paste0(root_dir,"/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv")) 19 | 20 | meta_rna = read.csv(paste0(root_dir,"/RNA/tonsil_rna_0510_meta.csv")) 21 | 22 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV' 23 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR' 24 | 25 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # 
rna names always the same 26 | colnames(rna) = rna_names$names 27 | 28 | #### for bsc 29 | rownames(rna) = paste0("rna", c(1:nrow(rna))) 30 | rownames(protein) = paste0("pro", c(1:nrow(protein))) 31 | 32 | # change name 33 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv') 34 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 35 | rna_list = c() 36 | protein_list = c() 37 | for (j in c(1:dim(correspondence)[1])){ 38 | protein_n = as.character(correspondence[j,1]) 39 | rna_n = as.character(correspondence[j,2]) 40 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 41 | next 42 | } 43 | rna_n = strsplit(rna_n, '/')[[1]] 44 | for(r in rna_n){ 45 | if (r %in% rna_names$names){ 46 | rna_list = c(rna_list, r) 47 | protein_list = c(protein_list, protein_n) 48 | } 49 | } 50 | } 51 | # get clean shared features 52 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 53 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 54 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 55 | 56 | # copy sp filtering 57 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 58 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 59 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) 60 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 61 | 62 | # then we construct the seurat objects 63 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 64 | x_obj <- NormalizeData(x_obj) 65 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case 66 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 67 | # add suerat object datay 68 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 69 | y_obj <- NormalizeData(y_obj) 70 | y_obj <- ScaleData(y_obj, 
features = rownames(y_obj)) 71 | list_modality=list(x_obj,y_obj) 72 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) 73 | # run harmony in seurat, need to make a new seurat object 74 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features]))) 75 | #xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long 76 | xy_obj <- NormalizeData(xy_obj) 77 | xy_obj = ScaleData(xy_obj) 78 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = 15, verbose = FALSE) 79 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) 80 | # modality label per cell (x = rna rows, y = protein rows): harmony needs two distinct groups to align; cbind together, scale within modality is better 81 | xy_obj <- xy_obj %>% RunHarmony("orig") 82 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)] 83 | name_1 = "full_embed_x0.csv" 84 | name_2 = "full_embed_y0.csv" 85 | # does not directly produce matching info, produce later using knn with embedding distance matrix 86 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), 87 | row.names=FALSE) # need to decide output pca cell 88 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 89 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 90 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 91 | -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/method_running/lg_full.R: -------------------------------------------------------------------------------- 1 | # full data set run for related spatial analysis 2 | # for liger 3 | 4 | library(rliger) 5 | library(Matrix) 6 | library(matrixStats) 7 | 8 | root_dir = '/tonsil_v2/' 9 | out_dir = '/tonsil_v2/match/match_output/full/' 10 | out_indx = 15 11 | `%notin%` <- Negate(`%in%`) 12 | 13 | out_dir =paste0(out_dir,"lg/") 14 | 
dir.create(out_dir) 15 | 16 | rna = readMM(paste0(root_dir,"/RNA/tonsil_rna_0510.txt")) 17 | protein = read.csv(paste0(root_dir,"/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv")) 18 | 19 | meta_rna = read.csv(paste0(root_dir,"/RNA/tonsil_rna_0510_meta.csv")) 20 | 21 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV' 22 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR' 23 | 24 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same 25 | colnames(rna) = rna_names$names 26 | 27 | # change name 28 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv') 29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 30 | rna_list = c() 31 | protein_list = c() 32 | for (j in c(1:dim(correspondence)[1])){ 33 | protein_n = as.character(correspondence[j,1]) 34 | rna_n = as.character(correspondence[j,2]) 35 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 36 | next 37 | } 38 | rna_n = strsplit(rna_n, '/')[[1]] 39 | for(r in rna_n){ 40 | if (r %in% rna_names$names){ 41 | rna_list = c(rna_list, r) 42 | protein_list = c(protein_list, protein_n) 43 | } 44 | } 45 | } 46 | # get clean shared features 47 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 48 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 49 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 50 | 51 | # copy sp filtering 52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 54 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) 55 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) 56 | 57 | # then we construct the liger objects 58 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), 
remove.missing = FALSE) 59 | ###Start integration 60 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features accross datasets with good quality 61 | # default preprocessing 62 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) 63 | # do not need to select genes 64 | #ligerobj <- selectGenes(ligerobj, var.thres= 0,unshared = TRUE, 65 | # unshared.datasets = list(2), unshared.thresh= 0, alpha.thresh = 1) # unimf version of liger 66 | ligerobj@var.genes=features # only used for length 67 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE) 68 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE) 69 | #ligerobj <- optimizeALS(ligerobj, use.unshared = TRUE, k = 20,remove.missing = FALSE) 70 | ligerobj <- quantile_norm(ligerobj) 71 | embedding = ligerobj@H.norm[,c(1:out_indx)] 72 | 73 | name_1 = "full_embed_x0.csv" 74 | name_2 = "full_embed_y0.csv" 75 | # no avaliable matching information from liger thus not saved out 76 | # will use knn to serach matching on embedding in downstreatm analysis 77 | 78 | 79 | # before proceed, make sure what cells got deleted 80 | a1 = rownames(rna.shared.sub)[rownames(rna.shared.sub) %notin% rownames(embedding)] 81 | b1 = length(a1) # 39 rna cells got removed 82 | 83 | a2 = rownames(protein.shared.sub)[rownames(protein.shared.sub) %notin% rownames(embedding)] 84 | b2 = length(a2) # 8 cdx cells got removed 85 | 86 | rn = nrow(rna.shared.sub) 87 | pn = nrow(protein.shared.sub) 88 | 89 | write.csv(embedding[c(1:(rn - b1)),c(1:out_indx)], 90 | paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell 91 | write.csv(embedding[c((rn + 1 - b1):(rn + pn - b1 - b2)),c(1:out_indx)], 92 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 93 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 94 | 95 | ## get ids of missing cells 96 | 97 | a1_s = as.integer(gsub("d1", "", a1)) 98 | a2_s = as.integer(gsub("d2", "", 
a2)) 99 | 100 | write.csv(data.frame(id = a1_s),paste0(out_dir,'d1_id.csv')) 101 | write.csv(data.frame(id = a2_s),paste0(out_dir,'d2_id.csv')) 102 | -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/method_running/sr_full.R: -------------------------------------------------------------------------------- 1 | # full data set run for related spatial analysis 2 | # for seurat 3 | 4 | library(Seurat) 5 | library(Matrix) 6 | library(matrixStats) 7 | 8 | root_dir = '/tonsil_v2/' 9 | out_dir = '/tonsil_v2/match/match_output/full/' 10 | out_indx = 15 11 | ## 12 | out_dir =paste0(out_dir,"sr/") 13 | dir.create(out_dir) 14 | 15 | rna = readMM(paste0(root_dir,"/RNA/tonsil_rna_0510.txt")) 16 | protein = read.csv(paste0(root_dir,"/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv")) 17 | 18 | meta_rna = read.csv(paste0(root_dir,"/RNA/tonsil_rna_0510_meta.csv")) 19 | 20 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV' 21 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR' 22 | 23 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same 24 | colnames(rna) = rna_names$names 25 | 26 | #### for bsc 27 | rownames(rna) = paste0("rna", c(1:nrow(rna))) 28 | rownames(protein) = paste0("pro", c(1:nrow(protein))) 29 | 30 | # change name: pair each protein with the rna name(s) listed for it in the conversion table 31 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv') 32 | correspondence = correspondence[!apply(correspondence == "", 1, all),] 33 | rna_list = c() 34 | protein_list = c() 35 | for (j in c(1:dim(correspondence)[1])){ 36 | protein_n = as.character(correspondence[j,1]) 37 | rna_n = as.character(correspondence[j,2]) 38 | if (grepl("Ignore", rna_n, fixed = TRUE)){ 39 | next 40 | } 41 | rna_n = strsplit(rna_n, '/')[[1]] 42 | for(r in rna_n){ 43 | if (r %in% rna_names$names){ 44 | rna_list = c(rna_list, r) 45 | protein_list = c(protein_list, protein_n) 46 | } 47 | } 48 | } 49 | # get clean 
shared features 50 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object 51 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object 52 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same 53 | 54 | # copy sp filtering 55 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5] 56 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1] 57 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) 58 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub))) 59 | 60 | # then we construct the seurat objects 61 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x") 62 | x_obj <- NormalizeData(x_obj) 63 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case 64 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 65 | # add suerat object datay 66 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y") 67 | y_obj <- NormalizeData(y_obj) 68 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 69 | list_modality=list(x_obj,y_obj) 70 | # get transfer anchor 71 | features=intersect(rownames(x_obj),rownames(y_obj)) 72 | #pre.anchors <- FindTransferAnchors(reference = y_obj, query = x_obj, 73 | # dims = 1:20, features = features) 74 | #predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(y_obj), 75 | # dims = 1:20) 76 | #full_df = data.frame(idx1 = c(1:length(predictions$predicted.id)) -1, idx2 = as.integer(predictions$predicted.id) -1, 77 | # score = predictions$prediction.score.max) # mind the r index difference 78 | # get integration embedding 79 | print("starting seurat integration") 80 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality, 81 | dims = 1:20, anchor.features =features, k.filter = 10) 82 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, 
k.weight = 10) 83 | # 84 | DefaultAssay(xy_int) <- "integrated" 85 | xy_int <- ScaleData(xy_int, verbose = FALSE) 86 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion 87 | embedding = xy_int@reductions$pca@cell.embeddings 88 | name_1 = "full_embed_x0.csv" 89 | name_2 = "full_embed_y0.csv" 90 | #pathout = out_dir 91 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell 92 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)], 93 | paste0(out_dir,name_2), row.names=FALSE) # need to decide 94 | #write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # need to decide 95 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE) 96 | -------------------------------------------------------------------------------- /Archive/tonsil/code/benchmark/step1.sh: -------------------------------------------------------------------------------- 1 | ## run this in the algo python conda env; the methods run in parallel as background jobs 2 | python mf_batch.py & 3 | /usr/bin/Rscript seurat_batch.R & 4 | /usr/bin/Rscript liger_batch.R & 5 | /usr/bin/Rscript harm_batch.R & 6 | /usr/bin/Rscript bsc_batch.R 7 | wait # block until every background job has finished before the script returns 8 | -------------------------------------------------------------------------------- /Archive/tonsil/code/preparation_code/prep_subsetting_andMore.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "subset" 3 | output: html_document 4 | --- 5 | 6 | Script to prepare data (subsetting etc) for different benchmarking tests, related to codex -rna -tonsil scenario 7 | 8 | ################### first: cells used to benchmark matching accu, slt and ari F1 scores ########################## 9 | Since for this task we can not include too many cells (slt and ari can not run on > 40k cells due to cpu limitation) 10 | so we subsampled 10k scrnaseq cells and 30k codex cells to be used in this benchmarking 
process, and there are 5 batches intotal 11 | 12 | ```{r} 13 | # script to produce test batches 14 | # for codex rna matching 15 | library(Matrix) 16 | library(Seurat) 17 | 18 | rna_full = readMM("/tonsil_v2/RNA/tonsil_rna_0510.txt") 19 | protei_full = read.csv("/tonsil_v2/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715_wstepV2.csv") 20 | meta_full = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_meta.csv") 21 | root = "/tonsil_v2/match/bench_input/" 22 | batch = 5 23 | 24 | c2u = colnames(protei_full)[6:51] # just protein columns 25 | for (i in c(1:5)){ # locked in case miss press 26 | batch_name = paste0("b",as.character(i),"/") 27 | out_dir =paste0(root,batch_name) 28 | dir.create(out_dir) 29 | # create files 30 | set.seed(i) 31 | randix1 = sample(dim(rna_full)[1], 10000) # every batch test 10k cells 32 | randix2 = sample(dim(protei_full)[1], 30000) # every batch test 30k cells 33 | 34 | rna = rna_full[randix1,] 35 | pro = protei_full[randix2,c2u] 36 | meta1 = meta_full[randix1,] # rna meta 37 | meta2 = protei_full[randix2,c(2:5,52:57)] # pro meta info 38 | 39 | write.csv(meta1, paste0(out_dir,"meta_rna.csv"),row.names = FALSE) 40 | write.csv(meta2, paste0(out_dir,"meta_pro.csv"),row.names = FALSE) 41 | write.csv(pro, paste0(out_dir,"pro.csv"),row.names = FALSE) 42 | writeMM(rna, paste0(out_dir,"rna.txt")) 43 | # create pca reduction orgin files 44 | rna_names = read.csv('/tonsil_v2/RNA/tonsil_rna_0510_names.csv') 45 | colnames(rna) = rna_names$names 46 | rownames(rna) = as.character(c(1:nrow(rna))) 47 | # pro 48 | rownames(pro) = as.character(c(1:nrow(pro))) 49 | # meta 50 | # use seurat as standard to produce reduction 51 | x_obj=CreateSeuratObject(counts=t(rna),assay="x") 52 | x_obj <- NormalizeData(x_obj) 53 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) 54 | x_obj <- ScaleData(x_obj, features = rownames(x_obj)) 55 | x_obj <- RunPCA(x_obj, features = VariableFeatures(object = x_obj)) 56 | pca_rna = 
as.data.frame(x_obj@reductions$pca@cell.embeddings[,c(1:15)]) 57 | pca_rna$label = meta1$cluster.info 58 | write.csv(pca_rna, paste0(out_dir,"orig_x.csv"), row.names=FALSE) 59 | 60 | # produce adt reduction 61 | y_obj=CreateSeuratObject(counts=t(pro),assay="y") 62 | y_obj <- NormalizeData(y_obj) 63 | y_obj <- ScaleData(y_obj, features = rownames(y_obj)) 64 | y_obj <- RunPCA(y_obj, features = rownames(y_obj)) 65 | pca_pro = as.data.frame(y_obj@reductions$pca@cell.embeddings[,c(1:15)]) 66 | pca_pro$label = meta2$cluster.term #### could change if we want different labels 67 | write.csv(pca_pro, paste0(out_dir,"orig_y.csv"), row.names=FALSE) 68 | 69 | } 70 | ``` 71 | 72 | 73 | ############### the second case for full analysis, all the cells in the codex tonsil subregion and all the rna cells were used ############ 74 | in this case we just use the original dataset, this cells were used for all methods to do GC related analysis + confusion matrix plotting 75 | 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Academic Software License Agreement (12/2/2021) 2 | 3 | STANFORD ACADEMIC SOFTWARE LICENSE AGREEMENT FOR "Integration of spatial and single-cell data across modalities with weak linkage (MaxFuse)" 4 | 5 | 6 | 1. This is a legal agreement (“Agreement”) between ______________________ (“RECIPIENT” or “you”), and THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY (“STANFORD”) and THE TRUSTEES OF THE UNIVERSITY OF PENNSYLVANIA (“PENN”). Stanford and Penn have assignments to “Integrative Matching and Analysis of Cells across single-cell multi-omics datasets with overlapping and non-overlapping features” (“Software”) which was developed in the laboratory of Professor Garry Nolan at Stanford and Zongming Ma at Penn. 7 | 2. 
By accepting, receiving, and using Software, including any accompanying information, materials or manuals you are agreeing to be bound by the terms of this Agreement. If you do not agree to the terms of this Agreement, promptly return the Software to STANFORD OR PENN. 8 | 3. STANFORD AND PENN grant to RECIPIENT a royalty-free, nonexclusive, and nontransferable license to use the Software furnished hereunder, upon the terms and conditions set out below. 9 | 4. RECIPIENT acknowledges that the Software is a research tool still in the development stage and that it is being supplied as is, without any accompanying services, support or improvements from STANFORD OR PENN. STANFORD AND PENN MAKE NO REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED OTHER THAN SET OUT IN THIS AGREEMENT. 10 | 5. STANFORD AND PENN do not grant any licenses under any STANFORD AND PENN patent or patent application by this Agreement. 11 | 6. RECIPIENT agrees to use the Software solely for internal academic non-commercial purposes and shall not distribute or transfer it to another location or to any other person without prior written permission from STANFORD AND PENN. In particular, no article in this license grants commercial use rights to RECIPIENT. 12 | 7. RECIPIENT agrees not to reverse engineer, reverse assemble, reverse compile decompile, disassemble, or otherwise attempt to re-create the source code for the Software. RECIPIENT acknowledges that any programs created based on the Software will be considered a derivative of Software and owned by STANFORD AND PENN. 13 | 8. RECIPIENT may NOT make modifications to the Software or integrate Software into RECIPIENT’s own software. 14 | 9. RECIPIENT may not further distribute Software without express written permission of STANFORD AND PENN. 
If permission to transfer the Software is given, RECIPIENT warrants that RECIPIENT will not remove or export any part of the Software from the United States except in full compliance with all United States export regulations and other applicable laws. 15 | 10. RECIPIENT will use the Software in compliance with all applicable laws, policies and regulations including, but not limited to, any approvals, informed consent and patient confidentiality principles. 16 | 11. RECIPIENT will indemnify, hold harmless, and defend STANFORD AND PENN against any claim of any kind arising out of or related to the exercise of any rights granted under this Agreement or the breach of this Agreement by RECIPIENT. 17 | 12. Title and copyright to the Software and any derivatives and any associated documentation shall at all times remain with STANFORD AND PENN, and RECIPIENT agrees to preserve same. 18 | 13. If RECIPIENT plans to publish any peer reviewed papers, abstracts, or similar publications, RECIPIENT agrees to acknowledge Software and its creators in a manner consistent with academic (industry) practice. 19 | 14. This agreement may be terminated by either party upon thirty (30) days written notice to the other party. In the event of termination, RECIPIENT shall destroy or return immediately all Software and all copies thereof to STANFORD OR PENN upon STANFORD’s OR PENN’S request. 20 | 15. The parties to this document agree that a copy of the original signature (including an electronic copy) may be used for any and all purposes for which the original signature may have been used. The parties further waive any right to challenge the admissibility or authenticity of this document in a court of law based solely on the absence of an original signature. 
21 | 22 | RECIPIENT 23 | Signature _____________________________________________________ 24 | Name ________________________________________________________ 25 | Title _________________________________________________________ 26 | Date _________________________________________________________ 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaxFuse: MAtching X-modality via FUzzy Smoothed Embedding 2 | 3 | 4 | 5 | 6 | ## Description 7 | 8 | MaxFuse is a Python package for integrating single-cell datasets from different modalities with no overlapping features and/or under low signal-to-noise ratio regimes. For most single-cell cross modality integration methods, the feasibility of cross-modal integration relies on the existence of highly correlated, a priori 'linked' features. When such linked features are few or uninformative, a scenario that we call 'weak linkage', existing methods fail. We developed MaxFuse, a cross-modal data integration method that, through iterative co-embedding, data smoothing, and cell matching, leverages all information in each modality to obtain high-quality integration. A prototypical example of weak linkage is the integration of **spatial proteomic data** with **single-cell sequencing data**. For details, please refer to the [paper](https://www.nature.com/articles/s41587-023-01935-0). 9 | 10 | This work has been led by Shuxiao Chen from [Ma Lab](http://www-stat.wharton.upenn.edu/~zongming/) @Upenn and Bokai Zhu from [Nolan lab](https://web.stanford.edu/group/nolan/) @Stanford. 11 | 12 | 13 | 14 | ## Installation 15 | MaxFuse is hosted on `pypi` and can be installed via `pip`. We recommend working with a fresh virtual environment. In the following example we use conda. 
16 | 17 | ``` 18 | conda create -n maxfuse python=3.8 19 | conda activate maxfuse 20 | python -m pip install maxfuse 21 | ``` 22 | 23 | ## Vignettes 24 | 25 | 26 | 27 | 28 | Example1: Protein -- RNA test run on ground-truth CITE-seq [here](https://github.com/shuxiaoc/maxfuse/blob/main/docs/citeseq_pbmc_evaluate.ipynb). 29 | 30 | Example2: Protein -- RNA test run on tissue [here](https://github.com/shuxiaoc/maxfuse/blob/main/docs/tonsil_codex_rnaseq.ipynb). 31 | 32 | Note that when integrating single-cell data across **protein** and **RNA** modalities, the nomenclature of features often differs (e.g., mRNA ```ITGAM``` could be named ```CD11b-1``` when used as an antibody). We gathered a [.csv](https://github.com/shuxiaoc/maxfuse/blob/main/docs/protein_gene_conversion.csv) file that covers many such naming conversions and is used during the ```MaxFuse``` process. Of course, this is not a complete conversion, and users should manually add new naming conversions if they are not included in this .csv file. 33 | 34 | ## API documentation 35 | 36 | For detailed documentation of the ```MaxFuse``` API, you can visit our [readthedocs](https://maxfuse.readthedocs.io/en/latest/) page. 37 | 38 | ## Code archive 39 | 40 | The analysis presented in the [manuscript](https://www.biorxiv.org/content/10.1101/2023.01.12.523851) was also deposited in this GitHub repository, under this [folder](https://github.com/shuxiaoc/maxfuse/tree/main/Archive). Note that in the manuscript we used a development version of ```MaxFuse``` with slightly different syntax, which can also be found there. If you require additional information on the analysis/data, please contact Zongming Ma (zongming.ma@yale.edu). 41 | 42 | ## License 43 | 44 | ```MaxFuse``` is under the [Academic Software License Agreement](https://github.com/shuxiaoc/maxfuse/blob/main/LICENSE), please use accordingly. 
45 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/docs/.DS_Store -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | # defaults to the sphinx-build found on PATH; to use a specific version, override it, e.g. `make html SPHINXBUILD=/path/to/sphinx-build` 8 | SPHINXBUILD ?= sphinx-build 9 | SOURCEDIR = . 10 | BUILDDIR = _build 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/_static/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/docs/_static/.Rhistory -------------------------------------------------------------------------------- /docs/_templates/README.md: -------------------------------------------------------------------------------- 1 | # Templates 2 | 3 | These templates are adapted from [JamesALeedham/Sphinx-Autosummary-Recursion](https://github.com/JamesALeedham/Sphinx-Autosummary-Recursion). 
4 | -------------------------------------------------------------------------------- /docs/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :show-inheritance: 7 | 8 | {% block methods %} 9 | {% if methods %} 10 | .. rubric:: {{ _('Methods') }} 11 | 12 | .. autosummary:: 13 | :toctree: 14 | :nosignatures: 15 | {% for item in methods %} 16 | {%- if item in members and item not in inherited_members and not item.startswith('_') %} 17 | ~{{ name }}.{{ item }} 18 | {%- endif -%} 19 | {%- endfor %} 20 | {% endif %} 21 | {% endblock %} 22 | 23 | {% block attributes %} 24 | {% if attributes %} 25 | .. rubric:: {{ _('Attributes') }} 26 | 27 | .. autosummary:: 28 | {% for item in attributes %} 29 | {%- if item in members and item not in inherited_members and not item.startswith('_') %} 30 | ~{{ name }}.{{ item }} 31 | {%- endif -%} 32 | {%- endfor %} 33 | {% endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /docs/_templates/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | 5 | {% block attributes %} 6 | {% if attributes %} 7 | .. rubric:: Module attributes 8 | 9 | .. autosummary:: 10 | :toctree: 11 | {% for item in attributes %} 12 | {{ item }} 13 | {%- endfor %} 14 | {% endif %} 15 | {% endblock %} 16 | 17 | {% block functions %} 18 | {% if functions %} 19 | .. rubric:: {{ _('Functions') }} 20 | 21 | .. autosummary:: 22 | :toctree: 23 | :nosignatures: 24 | {% for item in functions %} 25 | {{ item }} 26 | {%- endfor %} 27 | {% endif %} 28 | {% endblock %} 29 | 30 | {% block classes %} 31 | {% if classes %} 32 | .. rubric:: {{ _('Classes') }} 33 | 34 | .. 
autosummary:: 35 | :toctree: 36 | :template: class.rst 37 | :nosignatures: 38 | {% for item in classes %} 39 | {{ item }} 40 | {%- endfor %} 41 | {% endif %} 42 | {% endblock %} 43 | 44 | {% block exceptions %} 45 | {% if exceptions %} 46 | .. rubric:: {{ _('Exceptions') }} 47 | 48 | .. autosummary:: 49 | :toctree: 50 | {% for item in exceptions %} 51 | {{ item }} 52 | {%- endfor %} 53 | {% endif %} 54 | {% endblock %} 55 | 56 | {% block modules %} 57 | {% if modules %} 58 | .. rubric:: Submodules 59 | 60 | .. autosummary:: 61 | :toctree: 62 | :template: module.rst 63 | :recursive: 64 | {% for item in modules %} 65 | {{ item }} 66 | {%- endfor %} 67 | {% endif %} 68 | {% endblock %} 69 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | This section provides detailed API documentation for all public functions 5 | and classes in the ``MaxFuse`` package. 6 | 7 | .. autosummary:: 8 | :toctree: api 9 | :template: module.rst 10 | :recursive: 11 | 12 | maxfuse 13 | -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.construct_graph.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph.construct\_graph 2 | ============================== 3 | 4 | .. currentmodule:: maxfuse.graph 5 | 6 | .. autofunction:: construct_graph -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.get_nearest_neighbors.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph.get\_nearest\_neighbors 2 | ===================================== 3 | 4 | .. currentmodule:: maxfuse.graph 5 | 6 | .. 
autofunction:: get_nearest_neighbors -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.get_umap_embeddings.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph.get\_umap\_embeddings 2 | =================================== 3 | 4 | .. currentmodule:: maxfuse.graph 5 | 6 | .. autofunction:: get_umap_embeddings -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.graph_clustering.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph.graph\_clustering 2 | =============================== 3 | 4 | .. currentmodule:: maxfuse.graph 5 | 6 | .. autofunction:: graph_clustering -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.leiden_clustering.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph.leiden\_clustering 2 | ================================ 3 | 4 | .. currentmodule:: maxfuse.graph 5 | 6 | .. autofunction:: leiden_clustering -------------------------------------------------------------------------------- /docs/api/maxfuse.graph.rst: -------------------------------------------------------------------------------- 1 | maxfuse.graph 2 | ============= 3 | 4 | .. automodule:: maxfuse.graph 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | .. rubric:: Functions 13 | 14 | .. 
autosummary:: 15 | :toctree: 16 | :nosignatures: 17 | 18 | construct_graph 19 | get_nearest_neighbors 20 | get_umap_embeddings 21 | graph_clustering 22 | leiden_clustering 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.address_matching_redundancy.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils.address\_matching\_redundancy 2 | ================================================== 3 | 4 | .. currentmodule:: maxfuse.match_utils 5 | 6 | .. autofunction:: address_matching_redundancy -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.get_initial_matching.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils.get\_initial\_matching 2 | =========================================== 3 | 4 | .. currentmodule:: maxfuse.match_utils 5 | 6 | .. autofunction:: get_initial_matching -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.get_refined_matching.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils.get\_refined\_matching 2 | =========================================== 3 | 4 | .. currentmodule:: maxfuse.match_utils 5 | 6 | .. autofunction:: get_refined_matching -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.get_refined_matching_one_iter.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils.get\_refined\_matching\_one\_iter 2 | ====================================================== 3 | 4 | .. currentmodule:: maxfuse.match_utils 5 | 6 | .. 
autofunction:: get_refined_matching_one_iter -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.match_cells.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils.match\_cells 2 | ================================= 3 | 4 | .. currentmodule:: maxfuse.match_utils 5 | 6 | .. autofunction:: match_cells -------------------------------------------------------------------------------- /docs/api/maxfuse.match_utils.rst: -------------------------------------------------------------------------------- 1 | maxfuse.match\_utils 2 | ==================== 3 | 4 | .. automodule:: maxfuse.match_utils 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | .. rubric:: Functions 13 | 14 | .. autosummary:: 15 | :toctree: 16 | :nosignatures: 17 | 18 | address_matching_redundancy 19 | get_initial_matching 20 | get_refined_matching 21 | get_refined_matching_one_iter 22 | match_cells 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/api/maxfuse.metrics.get_foscttm.rst: -------------------------------------------------------------------------------- 1 | maxfuse.metrics.get\_foscttm 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.metrics 5 | 6 | .. autofunction:: get_foscttm -------------------------------------------------------------------------------- /docs/api/maxfuse.metrics.get_knn_alignment_score.rst: -------------------------------------------------------------------------------- 1 | maxfuse.metrics.get\_knn\_alignment\_score 2 | ========================================== 3 | 4 | .. currentmodule:: maxfuse.metrics 5 | 6 | .. 
autofunction:: get_knn_alignment_score -------------------------------------------------------------------------------- /docs/api/maxfuse.metrics.get_matching_acc.rst: -------------------------------------------------------------------------------- 1 | maxfuse.metrics.get\_matching\_acc 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.metrics 5 | 6 | .. autofunction:: get_matching_acc -------------------------------------------------------------------------------- /docs/api/maxfuse.metrics.get_matching_alignment_score.rst: -------------------------------------------------------------------------------- 1 | maxfuse.metrics.get\_matching\_alignment\_score 2 | =============================================== 3 | 4 | .. currentmodule:: maxfuse.metrics 5 | 6 | .. autofunction:: get_matching_alignment_score -------------------------------------------------------------------------------- /docs/api/maxfuse.metrics.rst: -------------------------------------------------------------------------------- 1 | maxfuse.metrics 2 | =============== 3 | 4 | .. automodule:: maxfuse.metrics 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | .. rubric:: Functions 13 | 14 | .. autosummary:: 15 | :toctree: 16 | :nosignatures: 17 | 18 | get_foscttm 19 | get_knn_alignment_score 20 | get_matching_acc 21 | get_matching_alignment_score 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.construct_graphs.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.construct\_graphs 2 | ===================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. 
automethod:: Fusor.construct_graphs -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.filter_bad_matches.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.filter\_bad\_matches 2 | ======================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.filter_bad_matches -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.find_initial_pivots.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.find\_initial\_pivots 2 | ========================================= 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.find_initial_pivots -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.get_embedding.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.get\_embedding 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.get_embedding -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.get_matching.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.get\_matching 2 | ================================= 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.get_matching -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.plot_canonical_correlations.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.plot\_canonical\_correlations 2 | ================================================= 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. 
automethod:: Fusor.plot_canonical_correlations -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.plot_matching_scores.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.plot\_matching\_scores 2 | ========================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.plot_matching_scores -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.plot_singular_values.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.plot\_singular\_values 2 | ========================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.plot_singular_values -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.propagate.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.propagate 2 | ============================= 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.propagate -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.refine_pivots.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.refine\_pivots 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.refine_pivots -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor 2 | =================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. autoclass:: Fusor 7 | :show-inheritance: 8 | 9 | 10 | 11 | .. rubric:: Methods 12 | 13 | .. 
autosummary:: 14 | :toctree: 15 | :nosignatures: 16 | 17 | ~Fusor.construct_graphs 18 | ~Fusor.filter_bad_matches 19 | ~Fusor.find_initial_pivots 20 | ~Fusor.get_embedding 21 | ~Fusor.get_matching 22 | ~Fusor.plot_canonical_correlations 23 | ~Fusor.plot_matching_scores 24 | ~Fusor.plot_singular_values 25 | ~Fusor.propagate 26 | ~Fusor.refine_pivots 27 | ~Fusor.split_into_batches 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /docs/api/maxfuse.model.Fusor.split_into_batches.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model.Fusor.split\_into\_batches 2 | ======================================== 3 | 4 | .. currentmodule:: maxfuse.model 5 | 6 | .. automethod:: Fusor.split_into_batches -------------------------------------------------------------------------------- /docs/api/maxfuse.model.rst: -------------------------------------------------------------------------------- 1 | maxfuse.model 2 | ============= 3 | 4 | .. automodule:: maxfuse.model 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | .. rubric:: Classes 17 | 18 | .. autosummary:: 19 | :toctree: 20 | :template: class.rst 21 | :nosignatures: 22 | 23 | Fusor 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /docs/api/maxfuse.rst: -------------------------------------------------------------------------------- 1 | maxfuse 2 | ======= 3 | 4 | .. automodule:: maxfuse 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | .. rubric:: Submodules 25 | 26 | .. 
autosummary:: 27 | :toctree: 28 | :template: module.rst 29 | :recursive: 30 | 31 | maxfuse.graph 32 | maxfuse.match_utils 33 | maxfuse.metrics 34 | maxfuse.model 35 | maxfuse.spatial_utils 36 | maxfuse.utils 37 | 38 | -------------------------------------------------------------------------------- /docs/api/maxfuse.spatial_utils.bind_spatial.rst: -------------------------------------------------------------------------------- 1 | maxfuse.spatial\_utils.bind\_spatial 2 | ==================================== 3 | 4 | .. currentmodule:: maxfuse.spatial_utils 5 | 6 | .. autofunction:: bind_spatial -------------------------------------------------------------------------------- /docs/api/maxfuse.spatial_utils.get_neighborhood_composition.rst: -------------------------------------------------------------------------------- 1 | maxfuse.spatial\_utils.get\_neighborhood\_composition 2 | ===================================================== 3 | 4 | .. currentmodule:: maxfuse.spatial_utils 5 | 6 | .. autofunction:: get_neighborhood_composition -------------------------------------------------------------------------------- /docs/api/maxfuse.spatial_utils.get_spatial_knn_indices.rst: -------------------------------------------------------------------------------- 1 | maxfuse.spatial\_utils.get\_spatial\_knn\_indices 2 | ================================================= 3 | 4 | .. currentmodule:: maxfuse.spatial_utils 5 | 6 | .. autofunction:: get_spatial_knn_indices -------------------------------------------------------------------------------- /docs/api/maxfuse.spatial_utils.rst: -------------------------------------------------------------------------------- 1 | maxfuse.spatial\_utils 2 | ====================== 3 | 4 | .. automodule:: maxfuse.spatial_utils 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | .. rubric:: Functions 13 | 14 | .. 
autosummary:: 15 | :toctree: 16 | :nosignatures: 17 | 18 | bind_spatial 19 | get_neighborhood_composition 20 | get_spatial_knn_indices 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.cca_embedding.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.cca\_embedding 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: cca_embedding -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.cdist_correlation.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.cdist\_correlation 2 | ================================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: cdist_correlation -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.center_scale.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.center\_scale 2 | =========================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: center_scale -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.dict_to_list.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.dict\_to\_list 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: dict_to_list -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.drop_zero_variability_columns.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.drop\_zero\_variability\_columns 2 | ============================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. 
autofunction:: drop_zero_variability_columns -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.filter_bad_matches.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.filter\_bad\_matches 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: filter_bad_matches -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.get_centroids.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.get\_centroids 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: get_centroids -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.graph_smoothing.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.graph\_smoothing 2 | ============================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: graph_smoothing -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.list_to_dict.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.list\_to\_dict 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: list_to_dict -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.pearson_correlation.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.pearson\_correlation 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. 
autofunction:: pearson_correlation -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.process_count_data.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.process\_count\_data 2 | ================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: process_count_data -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.recode.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.recode 2 | ==================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: recode -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.robust_svd.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.robust\_svd 2 | ========================= 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: robust_svd -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils 2 | ============= 3 | 4 | .. automodule:: maxfuse.utils 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | .. rubric:: Functions 13 | 14 | .. 
autosummary:: 15 | :toctree: 16 | :nosignatures: 17 | 18 | cca_embedding 19 | cdist_correlation 20 | center_scale 21 | dict_to_list 22 | drop_zero_variability_columns 23 | filter_bad_matches 24 | get_centroids 25 | graph_smoothing 26 | list_to_dict 27 | pearson_correlation 28 | process_count_data 29 | recode 30 | robust_svd 31 | shrink_towards_centroids 32 | sort_dict 33 | summarize_clustering 34 | svd_denoise 35 | svd_embedding 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.shrink_towards_centroids.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.shrink\_towards\_centroids 2 | ======================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: shrink_towards_centroids -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.sort_dict.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.sort\_dict 2 | ======================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: sort_dict -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.summarize_clustering.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.summarize\_clustering 2 | =================================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: summarize_clustering -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.svd_denoise.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.svd\_denoise 2 | ========================== 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. 
autofunction:: svd_denoise -------------------------------------------------------------------------------- /docs/api/maxfuse.utils.svd_embedding.rst: -------------------------------------------------------------------------------- 1 | maxfuse.utils.svd\_embedding 2 | ============================ 3 | 4 | .. currentmodule:: maxfuse.utils 5 | 6 | .. autofunction:: svd_embedding -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Sphinx configuration 3 | """ 4 | # add system path to make maxfuse importable 5 | import inspect 6 | import sphinx_autodoc_typehints 7 | 8 | project = 'MaxFuse' 9 | version = '0.0.1' 10 | release = '0.0.1' 11 | author = "Shuxiao Chen, Bokai Zhu" 12 | 13 | extensions = [ 14 | 'sphinx.ext.autodoc', 15 | 'sphinx.ext.autosummary', 16 | 'sphinx.ext.intersphinx', 17 | 'sphinx.ext.napoleon', 18 | 'sphinx.ext.viewcode', 19 | 'sphinx.ext.mathjax', 20 | 'sphinx_autodoc_typehints', 21 | 'sphinx_copybutton', 22 | 'nbsphinx' 23 | ] 24 | 25 | templates_path = ['_templates'] 26 | html_static_path = ['_static'] 27 | html_css_files = ['custom.css'] 28 | source_suffix = '.rst' 29 | master_doc = 'index' 30 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 31 | 32 | html_show_sourcelink = True 33 | set_type_checking_flag = True 34 | typehints_fully_qualified = True 35 | napoleon_use_rtype = False 36 | autosummary_generate = True 37 | autosummary_generate_overwrite = True 38 | autodoc_preserve_defaults = True 39 | autodoc_inherit_docstrings = True 40 | autodoc_default_options = { 41 | 'autosummary': True 42 | } 43 | 44 | # html_favicon = '_static/favicon.ico' 45 | html_theme = 'sphinx_rtd_theme' 46 | 47 | intersphinx_mapping = dict( 48 | python=('https://docs.python.org/3/', None), 49 | numpy=('https://numpy.org/doc/stable/', None), 50 | scipy=('https://docs.scipy.org/doc/scipy/', None), 51 | 
def format_annotation(annotation, config, fully_qualified=True):  # pylint: disable=unused-argument
    r"""
    Render a type annotation, preferring the public alias of known classes.

    Classes whose ``module.qualname`` appears in ``qualname_overrides`` are
    rendered as a ``:py:class:`` cross-reference to the overriding name; every
    other annotation is delegated to the original formatter saved in ``fa_orig``.

    Adapted from https://github.com/agronholm/sphinx-autodoc-typehints/issues/38#issuecomment-448517805
    """
    # non-class annotations are never overridden
    if not inspect.isclass(annotation):
        return fa_orig(annotation, config)

    qualified = f'{annotation.__module__}.{annotation.__qualname__}'
    replacement = qualname_overrides.get(qualified)
    if replacement is None:
        return fa_orig(annotation, config)
    return f':py:class:`~{replacement}`'
is a Python package for integrating single-cell datasets from different modalities with no overlapping features and/or under low signal-to-noise ratio regimes. 6 | For most single-cell cross-modality integration methods, the feasibility of cross-modal integration relies on the existence of highly correlated, a priori 'linked' features. 7 | When such linked features are few or uninformative, a scenario that we call 'weak linkage', existing methods fail. 8 | We developed MaxFuse, a cross-modal data integration method that, through iterative co-embedding, data smoothing, and cell matching, leverages all information in each modality to obtain high-quality integration. 9 | A prototypical example of weak linkage is the integration of spatial proteomic data with single-cell sequencing data. 10 | For details, please refer to `the manuscript `__. 11 | 12 | 13 | *************** 14 | Getting started 15 | *************** 16 | 17 | The ``MaxFuse`` package can be installed via pip: 18 | 19 | .. code-block:: bash 20 | :linenos: 21 | 22 | conda create -n maxfuse python=3.8 23 | conda activate maxfuse 24 | python -m pip install maxfuse 25 | 26 | .. note:: 27 | To avoid potential dependency conflicts, we recommend 28 | installing within a Python virtual environment such as conda. 29 | 30 | Now you are all set! Please proceed to `tutorials `__ 31 | for a list of examples. 32 | 33 | Note that when integrating single-cell data across protein and RNA modalities, 34 | the nomenclature of features often differs (e.g., mRNA ITGAM could be named CD11b-1 when used as an antibody). 35 | We gathered a `.csv `__ file that covers many such naming conversions and is used during the MaxFuse process. 36 | This list is not complete, and users should manually add new naming conversions if they are not included in this .csv file.
37 | 38 | 39 | *************** 40 | Code archive 41 | *************** 42 | The analysis presented in `the manuscript `__ was also 43 | deposited in `this `__ GitHub repository, under `this `__ folder. 44 | Note that in the manuscript we used a development version of MaxFuse with slightly different syntax; it can also be found there. 45 | If you require additional information on the analysis/data, please contact Zongming Ma (zongming@wharton.upenn.edu). 46 | 47 | 48 | *************** 49 | License 50 | *************** 51 | MaxFuse is released under the `Academic Software License Agreement `__; please use it accordingly. 52 | 53 | 54 | 55 | .. toctree:: 56 | :maxdepth: 2 57 | :caption: Contents 58 | 59 | tutorials 60 | api 61 | 62 | ****************** 63 | Indices and tables 64 | ****************** 65 | 66 | * :ref:`genindex` 67 | * :ref:`modindex` 68 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | maxfuse 2 | sphinx_autodoc_typehints 3 | sphinx_copybutton 4 | nbsphinx 5 | sphinx_rtd_theme 6 | -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | .. toctree:: 5 | 6 | citeseq_pbmc_evaluate.ipynb 7 | tonsil_codex_rnaseq.ipynb 8 | -------------------------------------------------------------------------------- /maxfuse/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | MaxFuse 3 | """ 4 | from . 
"""
Utility functions for dealing with spatial data
"""

import numpy as np


def bind_spatial(features, nbhd, wt_on_features=0.7):
    """
    Return a new array of form [wt_on_features * features / feature_norm, (1-wt_on_features) * nbhd / nbhd_norm]

    Each block is divided by its own Frobenius norm (``np.linalg.norm`` of the
    whole matrix) before weighting. A block whose norm is zero is left
    unscaled instead of being divided by zero.

    Parameters
    ----------
    features: np.ndarray of shape (n_samples, n_features)
        Feature matrix
    nbhd: np.ndarray of shape (n_samples, n_clusters)
        Cell neighborhood composition matrix
    wt_on_features: float, default=0.7
        Weight to put on the feature matrix.

    Returns
    -------
    res: np.ndarray of shape (n_samples, n_features+n_clusters)

    """
    features = np.asarray(features)
    nbhd = np.asarray(nbhd)

    # normalize two kinds of info for easier tuning of weight
    feature_norm = np.linalg.norm(features)
    nbhd_norm = np.linalg.norm(nbhd)
    # an all-zero block has norm 0; dividing by it would fill the block with NaN
    feature_scale = feature_norm if feature_norm > 0 else 1.0
    nbhd_scale = nbhd_norm if nbhd_norm > 0 else 1.0

    res = np.concatenate((
        wt_on_features * features / feature_scale,
        (1 - wt_on_features) * nbhd / nbhd_scale
    ), axis=1)
    return res


def get_spatial_knn_indices(locations, n_neighbors=15, method='kd_tree'):
    """
    Compute k-nearest neighbors of locations.

    The query set is the fitted set itself, and ``kneighbors`` is called with
    explicit query points, so each row is searched against all samples
    including the sample it belongs to.

    Parameters
    ----------
    locations: np.ndarray of shape (n_samples, 2)
        Data matrix
    n_neighbors: int
        Number of nearest neighbors
    method: str, default='kd_tree'
        Method to use when computing the nearest neighbors, one of ['ball_tree', 'kd_tree', 'brute']

    Returns
    -------
    knn_indices: np.ndarray of shape (n_samples, n_neighbors)
        Each row represents the knn of that sample

    Raises
    ------
    AssertionError
        If ``n_neighbors`` exceeds the number of samples.
    """
    # imported here so the numpy-only helpers in this module do not need
    # scikit-learn to be importable
    from sklearn.neighbors import NearestNeighbors

    locations = np.asarray(locations)
    assert n_neighbors <= locations.shape[0], (
        'n_neighbors must not exceed the number of samples'
    )
    # k-NN indices, may be asymmetric
    _, knn_indices = NearestNeighbors(
        n_neighbors=n_neighbors, algorithm=method
    ).fit(locations).kneighbors(locations)
    return knn_indices


def get_neighborhood_composition(knn_indices, labels, log1p=False):
    """
    Compute the composition of neighbors for each sample.

    Parameters
    ----------
    knn_indices: np.ndarray of shape (n_samples, n_neighbors)
        Each row represents the knn of that sample.
        Entries equal to -1 are treated as missing neighbors and skipped.
    labels: np.ndarray of shape (n_samples, )
        Cluster labels
    log1p: bool, default=False
        Whether to apply log1p transformation

    Returns
    -------
    comp: np.ndarray of shape (n_samples, n_clusters)
        Entry (i, j) is the number of neighbors of sample i that carry the
        j-th label in sorted order (the order of ``np.unique(labels)``).
        These are raw counts, not proportions; with ``log1p=True`` they are
        ``log1p`` of the counts.

    Raises
    ------
    ValueError
        If ``knn_indices`` is not two-dimensional.
    """
    knn_indices = np.asarray(knn_indices)
    if knn_indices.ndim != 2:
        raise ValueError('knn_indices must be of shape (n_samples, n_neighbors)')

    labels = list(labels)
    # label_codes[s] is the column of comp that sample s's label maps to
    unique_clusters, label_codes = np.unique(labels, return_inverse=True)
    label_codes = label_codes.reshape(-1)

    comp = np.zeros((knn_indices.shape[0], len(unique_clusters)))
    # positions of real neighbors; -1 marks a missing neighbor
    rows, cols = np.nonzero(knn_indices != -1)
    neighbor_codes = label_codes[knn_indices[rows, cols]]
    # np.add.at accumulates repeated (row, cluster) pairs, unlike fancy-index +=
    np.add.at(comp, (rows, neighbor_codes), 1)

    if log1p:
        comp = np.log1p(comp)
    return comp
3", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "igraph", 21 | "leidenalg", 22 | "numpy", 23 | "pandas", 24 | "scanpy", 25 | "scipy", 26 | "scikit-learn", 27 | "matplotlib", 28 | "requests", 29 | ] 30 | 31 | [project.urls] 32 | "Homepage" = "https://github.com/shuxiaoc/maxfuse" 33 | "Bug Tracker" = "https://github.com/shuxiaoc/maxfuse/issues" 34 | 35 | [options.packages.find] 36 | where = '.' 37 | --------------------------------------------------------------------------------