├── .gitignore
├── .readthedocs.yaml
├── Archive
├── MaxFuse_devo
│ └── 09302022V
│ │ ├── graph.py
│ │ ├── match.py
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ ├── utils.py
│ │ └── utils_bk.py
├── abseq-bmc
│ └── code
│ │ ├── abseq-dataprep.Rmd
│ │ ├── analysis
│ │ ├── allmetrices_extraction.ipynb
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ └── utils.py
│ │ └── benchmark
│ │ ├── calculate_metrics.R
│ │ ├── methods_running
│ │ ├── bsc_ab.R
│ │ ├── harm_ab.R
│ │ ├── liger_ab.R
│ │ ├── maxfuse_ab.py
│ │ └── seurat_ab.R
│ │ ├── metrics.R
│ │ ├── step1.sh
│ │ └── step2.sh
├── asapseq-pbmc
│ └── code
│ │ ├── analysis
│ │ ├── asap_metric_extraction.ipynb
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ └── utils.py
│ │ ├── asap_dataprep.Rmd
│ │ └── benchmark
│ │ ├── calculate_metrics.R
│ │ ├── methods_run
│ │ ├── MaxFuse_cite.py
│ │ ├── bsc_cite.R
│ │ ├── harm_cite.R
│ │ ├── liger_cite.R
│ │ └── seurat_cite.R
│ │ ├── metrics.R
│ │ ├── step1.sh
│ │ └── step2.sh
├── citeseq-bmc
│ └── code
│ │ ├── analysis
│ │ ├── allmetrices_extraction.ipynb
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ └── utils.py
│ │ ├── benchmark
│ │ ├── calculate_metrics.R
│ │ ├── methods_running
│ │ │ ├── bsc_cite.R
│ │ │ ├── harm_cite.R
│ │ │ ├── liger_cite.R
│ │ │ ├── maxfuse_cite.py
│ │ │ └── seurat_cite.R
│ │ ├── metrics.R
│ │ ├── step1.sh
│ │ └── step2.sh
│ │ └── citeseq-bmc-dataprep.Rmd
├── citeseq-pbmc
│ └── code
│ │ ├── analysis
│ │ ├── CM_extraction-drop.ipynb
│ │ ├── CM_extraction.ipynb
│ │ ├── allmetrices_extraction-drop.ipynb
│ │ ├── allmetrices_extraction.ipynb
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ ├── plot.Rmd
│ │ ├── plot_drop.Rmd
│ │ ├── plot_reduction.Rmd
│ │ └── utils.py
│ │ ├── benchmark
│ │ ├── calculate_metrics.R
│ │ ├── method_running
│ │ │ ├── bsc_cite.R
│ │ │ ├── bsc_cite_drop.R
│ │ │ ├── bsc_cite_reduction-drop.R
│ │ │ ├── bsc_cite_reduction.R
│ │ │ ├── harm_cite.R
│ │ │ ├── harm_cite_drop.R
│ │ │ ├── harm_cite_reduc-drop.R
│ │ │ ├── harm_cite_reduc.R
│ │ │ ├── liger_cite.R
│ │ │ ├── liger_cite_drop.R
│ │ │ ├── liger_cite_reduction-drop.R
│ │ │ ├── liger_cite_reduction.R
│ │ │ ├── maxfues_cite-drop.py
│ │ │ ├── maxfuse_cite.py
│ │ │ ├── maxfuse_cite_reduction-drop.py
│ │ │ ├── maxfuse_cite_reduction.py
│ │ │ ├── seurat_cite.R
│ │ │ ├── seurat_cite_drop.R
│ │ │ ├── seurat_cite_reduc-drop.R
│ │ │ └── seurat_cite_reduc.R
│ │ ├── metrics.R
│ │ ├── reduction.sh
│ │ ├── step1-drop.sh
│ │ ├── step1.sh
│ │ └── step2.sh
│ │ └── citeseq-pbmc-dataprep.Rmd
├── hubmap
│ └── code
│ │ ├── analysis
│ │ ├── CL_tri-integration.ipynb
│ │ ├── SB_tri-integration.ipynb
│ │ ├── plot_cl_results.Rmd
│ │ └── plot_sb_results.Rmd
│ │ └── preparation
│ │ ├── atac
│ │ └── prep_hubmap_atac.Rmd
│ │ ├── codex
│ │ ├── patient_tissues_select.ipynb
│ │ └── prep_hubmapCODEX.Rmd
│ │ └── rna
│ │ └── prep_hubmapRNA.Rmd
├── hubmap_nature
│ ├── CL_0719production.py
│ ├── SB_0718production.py
│ ├── figure_plotting.Rmd
│ └── readme.md
├── strong-link
│ ├── 10xe18
│ │ ├── analysis
│ │ │ ├── calculate_metrics.R
│ │ │ ├── check_metrics_e18.ipynb
│ │ │ ├── metrics.R
│ │ │ └── step2.sh
│ │ ├── method_running
│ │ │ ├── R_workflow_maestro_10xe18mouse.R
│ │ │ ├── glue_e18_mouse_preprocessing.ipynb
│ │ │ ├── glue_e18_mouse_training.ipynb
│ │ │ ├── glue_e18_prepare_data_h5.R
│ │ │ ├── mf_10x_e18.ipynb
│ │ │ ├── scj_add_prep_10xe18mouse.R
│ │ │ └── scj_config_e18_celltype.py
│ │ └── prep_e18.Rmd
│ ├── 10xpbmc
│ │ ├── analysis
│ │ │ ├── calculate_metrics.R
│ │ │ ├── check_metrics.ipynb
│ │ │ ├── metrics.R
│ │ │ └── step2.sh
│ │ ├── method_running
│ │ │ ├── R_workflow_maestro_10xpbmc.R
│ │ │ ├── glue_pbmc_prepare_data_h5.R
│ │ │ ├── glue_pbmc_preprocessing.ipynb
│ │ │ ├── glue_pbmc_training.ipynb
│ │ │ ├── mf_pbmc.ipynb
│ │ │ ├── scj_add_prep_data_10xpbmc.R
│ │ │ └── scj_config_pbmc.py
│ │ └── prep_pbmc.Rmd
│ ├── cortical
│ │ ├── analysis
│ │ │ ├── calculate_metrics.R
│ │ │ ├── check_metrics_greenleaf.ipynb
│ │ │ ├── metrics.R
│ │ │ └── step2.sh
│ │ ├── method_running
│ │ │ ├── glue_greenleaf_prepare_data_h5.R
│ │ │ ├── glue_greenleaf_preprocessing.ipynb
│ │ │ ├── glue_greenleaf_training.ipynb
│ │ │ ├── maestro_10xgreenleaf.R
│ │ │ ├── mf_cortical.ipynb
│ │ │ ├── scj_config_greenleaf_celltype.py
│ │ │ └── scj_prep_data_10xgreenleaf.R
│ │ └── prep_cortical.Rmd
│ └── retina
│ │ ├── analysis
│ │ ├── calculate_metrics.R
│ │ ├── metrics.R
│ │ ├── retina_extrac_all_metric.ipynb
│ │ └── step2.sh
│ │ ├── method_running
│ │ ├── R_workflow_maestro_retina.R
│ │ ├── mf_retina.ipynb
│ │ ├── retina_prepare_data_h5.R
│ │ ├── retina_preprocessing.ipynb
│ │ ├── retina_training.ipynb
│ │ ├── scj_config_retina_celltype.py
│ │ └── scj_prepare_data_retina.R
│ │ └── prep_retina.Rmd
├── teaseq-pbmc
│ └── code
│ │ ├── analysis
│ │ ├── match_utils.py
│ │ ├── metrics.py
│ │ ├── tea_metric_extraction.ipynb
│ │ └── utils.py
│ │ ├── benchmark
│ │ ├── calculate_metrics.R
│ │ ├── methods_running
│ │ │ ├── bsc_cite.R
│ │ │ ├── harm_cite.R
│ │ │ ├── liger_cite.R
│ │ │ ├── maxfuse_cite.py
│ │ │ └── seurat_cite.R
│ │ ├── metrics.R
│ │ ├── step1.sh
│ │ └── step2.sh
│ │ └── teaseq_dataprep.Rmd
└── tonsil
│ └── code
│ ├── analysis
│ ├── allmetrices_extraction_tonsil.ipynb
│ ├── full_data_postprocessing.ipynb
│ ├── match_utils.py
│ ├── metrics.py
│ ├── plot_tonsil_gcrelated_analysis.Rmd
│ ├── plot_tonsil_met.Rmd
│ ├── plot_tonsil_umap.Rmd
│ └── utils.py
│ ├── benchmark
│ ├── calculate_metrics.R
│ ├── method_running
│ │ ├── bsc_batch.R
│ │ ├── bsc_full.R
│ │ ├── harm_batch.R
│ │ ├── hm_full.R
│ │ ├── lg_full.R
│ │ ├── liger_batch.R
│ │ ├── mf_batch.py
│ │ ├── mf_full.py
│ │ ├── seurat_batch.R
│ │ └── sr_full.R
│ ├── metrics.R
│ ├── step1.sh
│ └── step2.sh
│ └── preparation_code
│ ├── add_centroide_toinput.ipynb
│ ├── prep_subsetting_andMore.Rmd
│ ├── prepare_gc_related.ipynb
│ ├── tonsilcodex_dataprep.Rmd
│ └── tonsilrna_dataprep.Rmd
├── LICENSE
├── README.md
├── docs
├── .DS_Store
├── Makefile
├── _static
│ └── .Rhistory
├── _templates
│ ├── README.md
│ ├── class.rst
│ └── module.rst
├── api.rst
├── api
│ ├── maxfuse.graph.construct_graph.rst
│ ├── maxfuse.graph.get_nearest_neighbors.rst
│ ├── maxfuse.graph.get_umap_embeddings.rst
│ ├── maxfuse.graph.graph_clustering.rst
│ ├── maxfuse.graph.leiden_clustering.rst
│ ├── maxfuse.graph.rst
│ ├── maxfuse.match_utils.address_matching_redundancy.rst
│ ├── maxfuse.match_utils.get_initial_matching.rst
│ ├── maxfuse.match_utils.get_refined_matching.rst
│ ├── maxfuse.match_utils.get_refined_matching_one_iter.rst
│ ├── maxfuse.match_utils.match_cells.rst
│ ├── maxfuse.match_utils.rst
│ ├── maxfuse.metrics.get_foscttm.rst
│ ├── maxfuse.metrics.get_knn_alignment_score.rst
│ ├── maxfuse.metrics.get_matching_acc.rst
│ ├── maxfuse.metrics.get_matching_alignment_score.rst
│ ├── maxfuse.metrics.rst
│ ├── maxfuse.model.Fusor.construct_graphs.rst
│ ├── maxfuse.model.Fusor.filter_bad_matches.rst
│ ├── maxfuse.model.Fusor.find_initial_pivots.rst
│ ├── maxfuse.model.Fusor.get_embedding.rst
│ ├── maxfuse.model.Fusor.get_matching.rst
│ ├── maxfuse.model.Fusor.plot_canonical_correlations.rst
│ ├── maxfuse.model.Fusor.plot_matching_scores.rst
│ ├── maxfuse.model.Fusor.plot_singular_values.rst
│ ├── maxfuse.model.Fusor.propagate.rst
│ ├── maxfuse.model.Fusor.refine_pivots.rst
│ ├── maxfuse.model.Fusor.rst
│ ├── maxfuse.model.Fusor.split_into_batches.rst
│ ├── maxfuse.model.rst
│ ├── maxfuse.rst
│ ├── maxfuse.spatial_utils.bind_spatial.rst
│ ├── maxfuse.spatial_utils.get_neighborhood_composition.rst
│ ├── maxfuse.spatial_utils.get_spatial_knn_indices.rst
│ ├── maxfuse.spatial_utils.rst
│ ├── maxfuse.utils.cca_embedding.rst
│ ├── maxfuse.utils.cdist_correlation.rst
│ ├── maxfuse.utils.center_scale.rst
│ ├── maxfuse.utils.dict_to_list.rst
│ ├── maxfuse.utils.drop_zero_variability_columns.rst
│ ├── maxfuse.utils.filter_bad_matches.rst
│ ├── maxfuse.utils.get_centroids.rst
│ ├── maxfuse.utils.graph_smoothing.rst
│ ├── maxfuse.utils.list_to_dict.rst
│ ├── maxfuse.utils.pearson_correlation.rst
│ ├── maxfuse.utils.process_count_data.rst
│ ├── maxfuse.utils.recode.rst
│ ├── maxfuse.utils.robust_svd.rst
│ ├── maxfuse.utils.rst
│ ├── maxfuse.utils.shrink_towards_centroids.rst
│ ├── maxfuse.utils.sort_dict.rst
│ ├── maxfuse.utils.summarize_clustering.rst
│ ├── maxfuse.utils.svd_denoise.rst
│ └── maxfuse.utils.svd_embedding.rst
├── citeseq_pbmc_evaluate.ipynb
├── conf.py
├── index.rst
├── make.bat
├── protein_gene_conversion.csv
├── requirements.txt
├── tonsil_codex_rnaseq.ipynb
└── tutorials.rst
├── maxfuse
├── __init__.py
├── graph.py
├── match_utils.py
├── metrics.py
├── model.py
├── spatial_utils.py
└── utils.py
├── media
├── ai_generated_icon.png
├── fig1.png
└── temp.md
└── pyproject.toml
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # custom
132 | .idea/
133 | data/
134 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the version of Python and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.8"
13 | # You can also specify other tool versions:
14 | # nodejs: "19"
15 | # rust: "1.64"
16 | # golang: "1.19"
17 |
18 | # Build documentation in the docs/ directory with Sphinx
19 | sphinx:
20 | configuration: docs/conf.py
21 |
22 | # If using Sphinx, optionally build your docs in additional formats such as PDF
23 | # formats:
24 | # - pdf
25 |
26 | # Optionally declare the Python requirements required to build your docs
27 | python:
28 | install:
29 | - requirements: docs/requirements.txt
30 |
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 | # positional arguments: <metrics csv> <orig file name> <embedding file name> <n_idx>
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Metrics appended to the csv by this script (sam_*, slt_* and ari_* only):
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better); ari_f1 (third column returned by ari()) is stored as well
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) [not computed in this script]
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) [not computed in this script]
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) [not computed in this script]
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) [not computed in this script]
21 | setwd("./")
22 | source("metrics.R") # presumably defines sam(), slt(), ari() and loads the csv/pipe helpers used below -- TODO confirm
23 |
24 | # load existing metrics
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 |
28 | # calculate structure alignment metrics (Sys.time() so the log line carries the actual time of day; Sys.Date() always formats as midnight)
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 | n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 | n_idx=n_idx, data_idx='y')
34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
35 |
36 | # calculate Silhouette width; columns 1-3 of the result are stored as slt_mix, slt_clust, slt_f1
37 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
38 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
39 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
40 | # calculate ARI; columns 1-3 of the result are stored as ari_mix, ari_clust, ari_f1
41 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
42 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
43 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
44 | # overwrite the input csv with the extended table
45 | write_csv(metrics, metrics_fname)
46 |
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/methods_running/harm_ab.R:
--------------------------------------------------------------------------------
1 | # harmony benchmark: integrate abseq RNA (x) and protein (y) on their shared features and save the harmony embeddings
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # input / output locations
7 | out_root = "/abseq/output/"
8 | in_root = "/abseq/data_prep/"
9 | out_indx = 15 # number of PCs computed and number of embedding columns written out
10 |
11 | out_dir =paste0(out_root,"hm/")
12 | in_dir = in_root
13 | dir.create(out_root)
14 | dir.create(out_dir)
15 | # read the RNA matrix, the protein table, the cell metadata and the RNA feature names
16 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt"))
17 | rna = as.matrix(rna)
18 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv"))
19 | protein = protein[,-1]# remove the row index
20 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv"))
21 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # rna names always the same
22 | colnames(rna) = rna_names$names
23 | # drop cells annotated as "dirty" from all three tables (meta last, since it is the filter)
24 | rna = rna[meta$hm_annotate != "dirty",]
25 | protein = protein[meta$hm_annotate != "dirty",]
26 | meta = meta[meta$hm_annotate != "dirty",]
27 |
28 | # change name: build parallel lists of matching (rna name, protein name) pairs from the conversion table
29 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
30 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop rows in which every field is empty
31 | rna_list = c()
32 | protein_list = c()
33 | for (i in c(1:dim(correspondence)[1])){
34 | protein_n = as.character(correspondence[i,1])
35 | rna_n = as.character(correspondence[i,2])
36 | if (grepl("Ignore", rna_n, fixed = TRUE)){
37 | next
38 | }
39 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated rna names
40 | for(r in rna_n){
41 | if (r %in% rna_names$names){
42 | rna_list = c(rna_list, r)
43 | protein_list = c(protein_list, protein_n)
44 | }
45 | }
46 | }
47 | # change name end
48 | # first filtering step should be same as in sp
49 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns whose paired protein is measured
50 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # the matching protein columns
51 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
52 | # copy sp filtering: keep features whose column standard deviation exceeds the threshold
53 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2]
54 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
55 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
56 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
57 | # then we construct the seurat objects
58 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
59 | x_obj <- NormalizeData(x_obj)
60 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
61 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
62 | # seurat object for the y (protein) data
63 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
64 | y_obj <- NormalizeData(y_obj)
65 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
66 | #list_modality=list(x_obj,y_obj)
67 | # get shared clean features
68 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
69 | # run harmony in seurat, need to make a new seurat object
70 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
71 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
72 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
73 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell; the two modalities must differ or harmony has no batch to correct
74 | # cbind together, scale within modality is better
75 | xy_obj <- xy_obj %>% RunHarmony("orig")
76 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
77 | name_1 = "full_embed_x0.csv"
78 | name_2 = "full_embed_y0.csv"
79 | # does not directly produce matching info, produce later using knn with embedding distance matrix
80 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
81 | row.names=FALSE) # x cells are the first ncol(x_obj) rows of the combined embedding
82 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
83 | paste0(out_dir,name_2), row.names=FALSE) # y cells are the remaining rows
84 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
85 |
86 | ##
87 |
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/methods_running/liger_ab.R:
--------------------------------------------------------------------------------
1 | # liger benchmark: integrate abseq RNA (x) and protein (y) on their shared features and save the liger embeddings
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # input / output locations
6 | out_root = "/abseq/output/"
7 | in_root = "/abseq/data_prep/"
8 | out_indx = 15 # number of embedding columns written out
9 |
10 | out_dir =paste0(out_root,"lg/")
11 | in_dir = in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 | # read the RNA matrix, the protein table, the cell metadata and the RNA feature names
15 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt"))
16 | rna = as.matrix(rna)
17 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv"))
18 | protein = protein[,-1]# remove the row index
19 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv"))
20 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # rna names always the same
21 | colnames(rna) = rna_names$names
22 | # drop cells annotated as "dirty" from all three tables (meta last, since it is the filter)
23 | rna = rna[meta$hm_annotate != "dirty",]
24 | protein = protein[meta$hm_annotate != "dirty",]
25 | meta = meta[meta$hm_annotate != "dirty",]
26 |
27 | # change name: build parallel lists of matching (rna name, protein name) pairs from the conversion table
28 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop rows in which every field is empty
30 | rna_list = c()
31 | protein_list = c()
32 | for (i in c(1:dim(correspondence)[1])){
33 | protein_n = as.character(correspondence[i,1])
34 | rna_n = as.character(correspondence[i,2])
35 | if (grepl("Ignore", rna_n, fixed = TRUE)){
36 | next
37 | }
38 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated rna names
39 | for(r in rna_n){
40 | if (r %in% rna_names$names){
41 | rna_list = c(rna_list, r)
42 | protein_list = c(protein_list, protein_n)
43 | }
44 | }
45 | }
46 | # change name end
47 | # first filtering step should be same as in sp
48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns whose paired protein is measured
49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # the matching protein columns
50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
51 | # copy sp filtering to produce better output
52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2]
53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
54 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub)))) # x cells are named d1<row>
55 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub)))) # y cells are named d2<row>
56 | # then we construct the liger objects
57 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
58 | ###Start integration
59 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality
60 | # default preprocessing
61 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
62 | # do not need to select genes
63 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
64 | ligerobj@var.genes=features # just use all
65 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
66 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE)
67 | ligerobj <- quantile_norm(ligerobj)
68 | embedding = ligerobj@H.norm[,c(1:out_indx)] # keeps the first out_indx of the k = 20 factors
69 | name_1 = "full_embed_x0.csv"
70 | name_2 = "full_embed_y0.csv"
71 | # no available matching information from liger thus not saved out
72 | # will use knn to search matching on embedding in downstream analysis
73 | # check which cells were filtered out by liger (present in the input, absent from H.norm)
74 | `%notin%` <- Negate(`%in%`)
75 | filtered =
76 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)]
77 | filtered_id = as.integer(sub("^d1", "", filtered[startsWith(filtered, "d1")])) # row ids of the dropped x cells (12774 in the original run)
78 | is_x = startsWith(rownames(embedding), "d1") # split by cell-name prefix instead of hard-coded row ranges
79 | write.csv(embedding[is_x,,drop=FALSE],
80 | paste0(out_dir,name_1), row.names=FALSE) # note one cell got deleted by liger process
81 | write.csv(embedding[!is_x,,drop=FALSE],
82 | paste0(out_dir,name_2), row.names=FALSE) # y cells: every row that is not an x cell
83 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
84 | if (any(startsWith(filtered, "d2"))) warning("liger also dropped y cells; the original y values are not subset by this script")
85 | # also get the right original pca values for cells, since liger may delete cells
86 | x_orig = read.csv(paste0(in_dir,"orig_x.csv"))
87 | x_orig_sub = if (length(filtered_id) > 0) x_orig[-filtered_id,] else x_orig # x[-integer(0),] would drop every row
88 | write.csv(x_orig_sub, paste0(in_dir,"orig_lg_x.csv"), row.names=FALSE)
89 | # save out filtered id
90 | write.csv(data.frame(id = filtered_id), paste0(out_dir,"filt_id.csv"))
91 |
92 |
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/methods_running/seurat_ab.R:
--------------------------------------------------------------------------------
1 | # seurat benchmark: label transfer and integration of abseq RNA (x) and protein (y) on their shared features
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | # input / output locations
6 | out_root = "/abseq/output/"
7 | in_root = "/abseq/data_prep/"
8 | out_indx = 15 # number of PCs computed on the integrated assay and written out
9 |
10 | out_dir =paste0(out_root,"sr/")
11 | in_dir = in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 | # read the RNA matrix, the protein table, the cell metadata and the RNA feature names
15 | rna = readMM(paste0(in_dir,"abseqwta_rna.txt"))
16 | rna = as.matrix(rna)
17 | protein = read.csv(paste0(in_dir,"abseqwta_pro.csv"))
18 | protein = protein[,-1]# remove the row index
19 | meta = read.csv(paste0(in_dir,"abseqwta_meta.csv"))
20 | rna_names = read.csv(paste0(in_dir,"abseqwta_rna_names.csv")) # rna names always the same
21 | colnames(rna) = rna_names$names
22 | # drop cells annotated as "dirty" from all three tables (meta last, since it is the filter)
23 | rna = rna[meta$hm_annotate != "dirty",]
24 | protein = protein[meta$hm_annotate != "dirty",]
25 | meta = meta[meta$hm_annotate != "dirty",]
26 |
27 | # change name: build parallel lists of matching (rna name, protein name) pairs from the conversion table
28 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
29 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop rows in which every field is empty
30 | rna_list = c()
31 | protein_list = c()
32 | for (i in c(1:dim(correspondence)[1])){
33 | protein_n = as.character(correspondence[i,1])
34 | rna_n = as.character(correspondence[i,2])
35 | if (grepl("Ignore", rna_n, fixed = TRUE)){
36 | next
37 | }
38 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated rna names
39 | for(r in rna_n){
40 | if (r %in% rna_names$names){
41 | rna_list = c(rna_list, r)
42 | protein_list = c(protein_list, protein_n)
43 | }
44 | }
45 | }
46 | # change name end
47 | # first filtering step should be same as in sp
48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna columns whose paired protein is measured
49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # the matching protein columns
50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
51 | # copy sp filtering: keep features whose column standard deviation exceeds the threshold
52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.2]
53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
54 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub))) # cells are named by their 1-based row number
55 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub)))
56 | # then we construct the seurat objects
57 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
58 | x_obj <- NormalizeData(x_obj)
59 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case
60 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
61 | # seurat object for the y (protein) data
62 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
63 | y_obj <- NormalizeData(y_obj)
64 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
65 | list_modality=list(x_obj,y_obj)
66 | # get transfer anchors (x as reference, y as query) and transfer the x cell names onto the y cells
67 | features=intersect(rownames(x_obj),rownames(y_obj))
68 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj,
69 | dims = 1:20, features = features)
70 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj),
71 | dims = 1:20)
72 | full_df = data.frame(idx2 = c(1:length(predictions$predicted.id)) -1, idx1 = as.integer(predictions$predicted.id) -1,
73 | score = predictions$prediction.score.max) # the -1 converts R's 1-based indices to 0-based
74 | # get integration embedding
75 | print("starting seurat integration")
76 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality,
77 | dims = 1:20, anchor.features =features, k.filter = 10)
78 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 10)
79 | # scale and run PCA on the integrated assay; the PCA cell embeddings are the output embedding
80 | DefaultAssay(xy_int) <- "integrated"
81 | xy_int <- ScaleData(xy_int, verbose = FALSE)
82 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion
83 | embedding = xy_int@reductions$pca@cell.embeddings
84 | name_1 = "full_embed_x0.csv"
85 | name_2 = "full_embed_y0.csv"
86 | #pathout = out_dir
87 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # x cells are the first ncol(x_obj) rows
88 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
89 | paste0(out_dir,name_2), row.names=FALSE) # y cells are the remaining rows
90 | write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # matching table: y index, predicted x index, score
91 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
92 |
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## run this in the algo python conda env; launches all five benchmark methods in parallel
2 | python maxfuse_ab.py &
3 | /usr/bin/Rscript seurat_ab.R &
4 | /usr/bin/Rscript liger_ab.R &
5 | /usr/bin/Rscript harm_ab.R &
6 | /usr/bin/Rscript bsc_ab.R
7 | wait # do not return until the backgrounded jobs above have finished as well
--------------------------------------------------------------------------------
/Archive/abseq-bmc/code/benchmark/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # used to calc slt and ari for all methods; arguments are <metrics csv> <orig file name> <embedding file name> <n_idx>
3 |
4 | # for mf
5 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/mf/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/mf/full_embed' 0 &
6 |
7 | # for sr
8 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/sr/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/sr/full_embed' 0 &
9 |
10 | # for lg -- NOTE(review): liger_ab.R writes its output under /abseq/output/lg/ and a subset orig_lg_x.csv; confirm the lgunimf directory and the orig argument used here
11 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/lgunimf/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/lgunimf/full_embed' 0 &
12 |
13 | # for hm
14 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/hm/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/hm/full_embed' 0 &
15 |
16 | # for bsc
17 | /usr/bin/Rscript calculate_metrics.R '/abseq/output/bsc/metrics.csv' '/abseq/data_prep/orig' '/abseq/output/bsc/full_embed' 0
18 | wait # do not return until the backgrounded jobs above have finished as well
--------------------------------------------------------------------------------
/Archive/asapseq-pbmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 | # positional arguments: <metrics csv> <orig file name> <embedding file name> <n_idx>
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Metrics appended to the csv by this script (sam_*, slt_* and ari_* only):
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better); ari_f1 (third column returned by ari()) is stored as well
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better) [not computed in this script]
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better) [not computed in this script]
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better) [not computed in this script]
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better) [not computed in this script]
21 | setwd("./")
22 | source("metrics.R") # presumably defines sam(), slt(), ari() and loads the csv/pipe helpers used below -- TODO confirm
23 |
24 | # load existing metrics
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # calculate structure alignment metrics (Sys.time() so the log line carries the actual time of day; Sys.Date() always formats as midnight)
28 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
29 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
30 | n_idx=n_idx, data_idx='x')
31 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
32 | n_idx=n_idx, data_idx='y')
33 |
34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
35 |
36 | # calculate Silhouette width; columns 1-3 of the result are stored as slt_mix, slt_clust, slt_f1
37 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
38 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
39 | #print(slt_res)
40 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
41 | #print(metrics)
42 | # calculate ARI; columns 1-3 of the result are stored as ari_mix, ari_clust, ari_f1
43 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
44 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
45 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
46 |
47 | # overwrite the input csv with the extended table (kBET is not computed in this script)
48 | write_csv(metrics, metrics_fname)
--------------------------------------------------------------------------------
/Archive/asapseq-pbmc/code/benchmark/methods_run/harm_cite.R:
--------------------------------------------------------------------------------
1 | # harmony benchmark: embed ASAP-seq gene activity (x) and protein (y) cells with PCA + Harmony and save the embeddings
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # read in files
7 | out_root = "/asap/output/"
8 | in_root = "/asap/data/"
9 | out_indx = 15
10 |
11 | out_dir =paste0(out_root,"hm/")
12 | in_dir = in_root
13 | dir.create(out_root)
14 | dir.create(out_dir)
15 | # read
16 |
17 | protein = read.csv(paste0(in_dir,"adt_pbmc.csv"))
18 | protein = protein[, !(names(protein) %in% c("X","barcode","CD4.1",'CD8a','CD11b.1'))] # drop unused channels; a logical mask (unlike -which()) keeps every column when nothing matches
19 | colnames(protein) = gsub('\\.','-', colnames(protein))
20 | colnames(protein) = gsub('-$','', colnames(protein))
21 |
22 | meta = read.csv(paste0(in_dir,"asap_pbmc_meta.csv"))
23 |
24 | atacactivity = readMM(paste0(in_dir,"genescore_pbmc.txt"))
25 | atacactivity = as.matrix(atacactivity)
26 | gas_names = read.csv(paste0(in_dir ,'genescore_names_pbmc.csv'))
27 | colnames(atacactivity) = gas_names$names
28 |
29 | ## remove cells annotated as "dirt" from all three tables
30 | atacactivity = atacactivity[meta$human_ann != "dirt",]
31 | protein = protein[meta$human_ann != "dirt",]
32 | meta = meta[meta$human_ann != "dirt",]
33 | ##
34 |
35 | # change name: pair each protein with the gene-activity feature name(s) listed in the conversion table
36 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
37 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
38 | rna_list = c()
39 | protein_list = c()
40 | for (i in seq_len(nrow(correspondence))){
41 | protein_n = as.character(correspondence[i,1])
42 | rna_n = as.character(correspondence[i,2])
43 | if (grepl("Ignore", rna_n, fixed = TRUE)){
44 | next
45 | }
46 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated gene names
47 | for(r in rna_n){
48 | if (r %in% gas_names$names){
49 | rna_list = c(rna_list, r)
50 | protein_list = c(protein_list, protein_n)
51 | }
52 | }
53 | }
54 |
55 | act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity columns that have a matching protein
56 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
57 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
58 |
59 | # copy sp filtering to produce better output
60 | act.shared.sub = act.shared[,colSds(act.shared)>0.5]
61 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
62 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
63 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
64 | # then we construct the seurat objects
65 | x_obj=CreateSeuratObject(counts=t(act.shared.sub),assay="x")
66 | #x_obj <- NormalizeData(x_obj)
67 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
68 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
69 | # add seurat object for data y
70 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
71 | y_obj <- NormalizeData(y_obj)
72 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
73 | #list_modality=list(x_obj,y_obj)
74 | # get shared clean features
75 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub))
76 | # run harmony in seurat, need to make a new seurat object
77 | xy_obj = CreateSeuratObject(counts=cbind(t(act.shared.sub[,features]), t(protein.shared.sub[,features])))
78 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
79 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
80 | xy_obj@meta.data$orig = c(rep("x",dim(act.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell; the second group must be "y", otherwise Harmony sees a single batch
81 | # cbind together, scale within modality is better
82 | xy_obj <- xy_obj %>% RunHarmony("orig")
83 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
84 | name_1 = "full_embed_x0.csv"
85 | name_2 = "full_embed_y0.csv"
86 | # does not directly produce matching info, produce later using knn with embedding distance matrix
87 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
88 | row.names=FALSE) # x cells come first in the combined object
89 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
90 | paste0(out_dir,name_2), row.names=FALSE) # y cells follow
91 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
92 |
--------------------------------------------------------------------------------
/Archive/asapseq-pbmc/code/benchmark/methods_run/liger_cite.R:
--------------------------------------------------------------------------------
1 | # liger benchmark: integrate ASAP-seq gene activity (x) and protein (y) with LIGER and save the embeddings
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/asap/output/"
7 | in_root = "/asap/data/"
8 | out_indx = 15
9 |
10 | out_dir =paste0(out_root,"lg/")
11 | in_dir = in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 | # read
15 |
16 | protein = read.csv(paste0(in_dir,"adt_pbmc.csv"))
17 | protein = protein[, !(names(protein) %in% c("X","barcode","CD4.1",'CD8a','CD11b.1'))] # drop unused channels; a logical mask (unlike -which()) keeps every column when nothing matches
18 | colnames(protein) = gsub('\\.','-', colnames(protein))
19 | colnames(protein) = gsub('-$','', colnames(protein))
20 |
21 | meta = read.csv(paste0(in_dir,"asap_pbmc_meta.csv"))
22 |
23 | atacactivity = readMM(paste0(in_dir,"genescore_pbmc.txt"))
24 | atacactivity = as.matrix(atacactivity)
25 | gas_names = read.csv(paste0(in_dir ,'genescore_names_pbmc.csv'))
26 | colnames(atacactivity) = gas_names$names
27 |
28 | ## remove cells annotated as "dirt" from all three tables
29 | atacactivity = atacactivity[meta$human_ann != "dirt",]
30 | protein = protein[meta$human_ann != "dirt",]
31 | meta = meta[meta$human_ann != "dirt",]
32 | ##
33 |
34 | # change name: pair each protein with the gene-activity feature name(s) listed in the conversion table
35 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
36 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
37 | rna_list = c()
38 | protein_list = c()
39 | for (i in seq_len(nrow(correspondence))){
40 | protein_n = as.character(correspondence[i,1])
41 | rna_n = as.character(correspondence[i,2])
42 | if (grepl("Ignore", rna_n, fixed = TRUE)){
43 | next
44 | }
45 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated gene names
46 | for(r in rna_n){
47 | if (r %in% gas_names$names){
48 | rna_list = c(rna_list, r)
49 | protein_list = c(protein_list, protein_n)
50 | }
51 | }
52 | }
53 |
54 | act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # gene-activity columns that have a matching protein
55 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
56 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
57 |
58 | # copy sp filtering to produce better output
59 | act.shared.sub = act.shared[,colSds(act.shared)>0.5]
60 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
61 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
62 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
63 | # then we construct the liger objects
64 | ligerobj=createLiger( list(x = t(act.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
65 | ###Start integration
66 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality
67 | # default preprocessing
68 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
69 | # do not need to select genes
70 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
71 | ligerobj@var.genes=features # just use all
72 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
73 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE)
74 | ligerobj <- quantile_norm(ligerobj)
75 | embedding = ligerobj@H.norm[,c(1:out_indx)]
76 | name_1 = "full_embed_x0.csv"
77 | name_2 = "full_embed_y0.csv"
78 | # no available matching information from liger thus not saved out
79 | # will use knn to search matching on embedding in downstream analysis
80 | # check what cell is filtered out
81 | `%notin%` <- Negate(`%in%`)
82 | filtered =
83 | c(rownames(act.shared.sub), rownames(protein.shared.sub))[c(rownames(act.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)]
84 | filtered_id = as.integer(sub("^d1", "", filtered[startsWith(filtered, "d1")])) # indices of x cells dropped by liger (none in the original run); d2 names are excluded so they cannot become NA
85 | is_x = startsWith(rownames(embedding), "d1") # split rows by the d1/d2 name prefix set above instead of hard-coded 4360/8720 row counts
86 | write.csv(embedding[is_x,], # x (gene activity) cells
87 | paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell
88 | write.csv(embedding[!is_x,], # y (protein) cells
89 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
90 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
91 |
92 |
--------------------------------------------------------------------------------
/Archive/asapseq-pbmc/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## run this in the algo python conda env; launches all benchmark methods in parallel
2 | python MaxFuse_cite.py &
3 | /usr/bin/Rscript seurat_cite.R &
4 | /usr/bin/Rscript liger_cite.R &
5 | /usr/bin/Rscript harm_cite.R &
6 | /usr/bin/Rscript bsc_cite.R
7 | wait # block until every background job has finished, so the script only returns once all outputs are written
--------------------------------------------------------------------------------
/Archive/asapseq-pbmc/code/benchmark/step2.sh:
--------------------------------------------------------------------------------
1 | # code to calc slt and ari f1 (one calculate_metrics.R run per method, in parallel)
2 |
3 | # for mf
4 | /usr/bin/Rscript calculate_metrics.R '/asap/output/mf/metrics.csv' '/asap/data/orig' '/asap/output/mf/full_embed' 0 &
5 |
6 | # for sr
7 | /usr/bin/Rscript calculate_metrics.R '/asap/output/sr/metrics.csv' '/asap/data/orig' '/asap/output/sr/full_embed' 0 &
8 |
9 | # for lg
10 | /usr/bin/Rscript calculate_metrics.R '/asap/output/lg/metrics.csv' '/asap/data/orig' '/asap/output/lg/full_embed' 0 &
11 |
12 | # for hm
13 | /usr/bin/Rscript calculate_metrics.R '/asap/output/hm/metrics.csv' '/asap/data/orig' '/asap/output/hm/full_embed' 0 &
14 |
15 | # for bsc
16 | /usr/bin/Rscript calculate_metrics.R '/asap/output/bsc/metrics.csv' '/asap/data/orig' '/asap/output/bsc/full_embed' 0
17 | wait # do not return until the background metric jobs above have finished
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 |
4 | metrics_fname = args[1] # csv of existing metrics; read below and overwritten with the added columns
5 | orig_fname = args[2] # passed through to sam()/slt()/ari() as orig_fname
6 | embed_fname = args[3] # passed through to sam()/slt()/ari() as embed_fname
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted Rand index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better; not computed by the calls below)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better; not computed by the calls below)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better; not computed by the calls below)
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better; not computed by the calls below)
21 | setwd("./")
22 | source("metrics.R")
23 |
24 | # load existing metrics
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 |
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...')) # Sys.time(), not Sys.Date(): a Date has no time of day, so the stamp was always midnight
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 | n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 | n_idx=n_idx, data_idx='y')
34 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
35 | # calculate Silhouette width
36 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
37 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
38 | #print(slt_res)
39 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
40 | #print(metrics)
41 | # calculate ARI (columns 1-3 of the result are stored as ari_mix, ari_clust, ari_f1)
42 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
43 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
44 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
45 |
46 | write_csv(metrics, metrics_fname)
47 |
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/methods_running/bsc_cite.R:
--------------------------------------------------------------------------------
1 | # bindsc benchmark: run bindSC BiCCA with protein as X, RNA as Y and protein-matched RNA features as Z0, then save the embeddings
2 | library(bindSC)
3 | library(Seurat)
4 | library(Matrix)
5 | library(matrixStats)
6 | # read in files
7 | out_root = "/bench_test4/output/"
8 | in_root = "/bench_test4/input/"
9 | out_indx = 15
10 |
11 | out_dir =paste0(out_root,"bsc/")
12 | in_dir = in_root
13 | dir.create(out_root)
14 | dir.create(out_dir)
15 | # read
16 | rna = readMM(paste0(in_dir,"rna200.txt"))
17 | rna = as.matrix(rna)
18 | protein = read.csv(paste0(in_dir,"pro200.csv"))
19 | meta = read.csv(paste0(in_dir,"meta200.csv"))
20 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same
21 | colnames(rna) = rna_names$names
22 |
23 | #### for bsc: give cells distinct row names per modality
24 | rownames(rna) = paste0("rna", c(1:nrow(rna)))
25 | rownames(protein) = paste0("pro", c(1:nrow(protein)))
26 |
27 | # change name: pair each protein with the RNA feature name(s) listed in the conversion table
28 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
29 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
30 | rna_list = c()
31 | protein_list = c()
32 | for (i in c(1:dim(correspondence)[1])){
33 | protein_n = as.character(correspondence[i,1])
34 | rna_n = as.character(correspondence[i,2])
35 | if (grepl("Ignore", rna_n, fixed = TRUE)){
36 | next
37 | }
38 | rna_n = strsplit(rna_n, '/')[[1]]
39 | for(r in rna_n){
40 | if (r %in% rna_names$names){
41 | rna_list = c(rna_list, r)
42 | protein_list = c(protein_list, protein_n)
43 | }
44 | }
45 | }
46 | # change name end
47 | # first filtering step should be same as in sp
48 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # RNA columns that have a matching protein
49 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
50 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
51 | # copy sp filtering
52 | # keep only features whose sd exceeds 0.1 in BOTH modalities, so the two matrices keep identical columns
53 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1 & colSds(protein.shared)>0.1]
54 | protein.shared.sub = protein.shared[,colSds(rna.shared)>0.1 & colSds(protein.shared)>0.1]
55 |
56 | # get cluster for bindsc x, using all x features
57 | xc_obj=CreateSeuratObject(counts=t(protein),assay="x")
58 | xc_obj <- NormalizeData(xc_obj)
59 | xc_obj <- ScaleData(xc_obj, features = rownames(xc_obj))
60 | xc_obj <- RunPCA(xc_obj, features = rownames(xc_obj))
61 | xc_obj <- FindNeighbors(xc_obj, dims = 1:15)
62 | xc_obj <- FindClusters(xc_obj, resolution = 1)
63 | x_cluster = as.factor(paste0('x_',as.character(Idents(xc_obj))))
64 |
65 | # x input for bindsc: normalized shared protein features only (no clustering on this object)
66 | x_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="x")
67 | x_obj <- NormalizeData(x_obj)
68 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))# not used
69 |
70 | # get cluster for bindsc y, using all y features (variable)
71 | y_obj=CreateSeuratObject(counts=t(rna),assay="y")
72 | y_obj <- NormalizeData(y_obj)
73 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
74 | y_obj <- FindVariableFeatures(y_obj, nfeatures = 3000)
75 | y_obj <- RunPCA(y_obj, features = VariableFeatures(object = y_obj))
76 | y_obj <- FindNeighbors(y_obj, dims = 1:15)
77 | y_obj <- FindClusters(y_obj, resolution = 1)
78 | y_cluster = as.factor(paste0('y_',as.character(Idents(y_obj))))
79 |
80 | y_input_features = VariableFeatures(object = y_obj)
81 |
82 | ## for Z0: normalized RNA values of the shared (protein-matched) features
83 | z_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="z")
84 | z_obj <- NormalizeData(z_obj)
85 |
86 | ## now gather all the actual inputs
87 | x_input = x_obj@assays$x@data
88 | y_input = as.matrix(as.data.frame(y_obj@assays$y@data[y_input_features,]))
89 | z0_input = z_obj@assays$z@data
90 |
91 | # start bindsc
92 | res <- BiCCA( X = x_input ,
93 | Y = y_input,
94 | Z0 = z0_input,
95 | X.clst = x_cluster,
96 | Y.clst = y_cluster,
97 | alpha = 0.1,
98 | lambda = 0.7,
99 | K = 15,
100 | temp.path = "out",
101 | num.iteration = 50,
102 | tolerance = 0.01,
103 | save = TRUE,
104 | parameter.optimize = FALSE,
105 | block.size = 0)
106 |
107 | name_1 = "full_embed_x0.csv"
108 | name_2 = "full_embed_y0.csv"
109 | pathout = out_dir
110 | write.csv(data.frame(res$r)[,c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # rna embed
111 | write.csv(data.frame(res$u)[,c(1:out_indx)], paste0(out_dir,name_2), row.names=FALSE) # pro embed
112 | write.csv(data.frame(method = "bsc"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
113 |
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/methods_running/harm_cite.R:
--------------------------------------------------------------------------------
1 | # harmony benchmark: embed CITE-seq RNA (x) and protein (y) cells with PCA + Harmony and save the embeddings
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # read in files
7 | out_root = "/bench_test4/output/"
8 | in_root = "/bench_test4/input/"
9 | out_indx = 15
10 |
11 | out_dir =paste0(out_root,"hm/")
12 | in_dir = in_root
13 | dir.create(out_root)
14 | dir.create(out_dir)
15 | # read
16 | rna = readMM(paste0(in_dir,"rna200.txt"))
17 | rna = as.matrix(rna)
18 | protein = read.csv(paste0(in_dir,"pro200.csv"))
19 | meta = read.csv(paste0(in_dir,"meta200.csv"))
20 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same
21 | colnames(rna) = rna_names$names
22 | # change name: pair each protein with the RNA feature name(s) listed in the conversion table
23 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
24 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
25 | rna_list = c()
26 | protein_list = c()
27 | for (i in seq_len(nrow(correspondence))){
28 | protein_n = as.character(correspondence[i,1])
29 | rna_n = as.character(correspondence[i,2])
30 | if (grepl("Ignore", rna_n, fixed = TRUE)){
31 | next
32 | }
33 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated gene names
34 | for(r in rna_n){
35 | if (r %in% rna_names$names){
36 | rna_list = c(rna_list, r)
37 | protein_list = c(protein_list, protein_n)
38 | }
39 | }
40 | }
41 | # change name end
42 | # first filtering step should be same as in sp
43 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # RNA columns that have a matching protein
44 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
45 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
46 | # copy sp filtering
47 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1]
48 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
49 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
50 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
51 | # then we construct the seurat objects
52 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
53 | x_obj <- NormalizeData(x_obj)
54 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
55 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
56 | # add seurat object for data y
57 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
58 | y_obj <- NormalizeData(y_obj)
59 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
60 | #list_modality=list(x_obj,y_obj)
61 | # get shared clean features
62 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
63 | # run harmony in seurat, need to make a new seurat object
64 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
65 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
66 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
67 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # modality label per cell; the second group must be "y", otherwise Harmony sees a single batch
68 | # cbind together, scale within modality is better
69 | xy_obj <- xy_obj %>% RunHarmony("orig")
70 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
71 | name_1 = "full_embed_x0.csv"
72 | name_2 = "full_embed_y0.csv"
73 | # does not directly produce matching info, produce later using knn with embedding distance matrix
74 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
75 | row.names=FALSE) # x cells come first in the combined object
76 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
77 | paste0(out_dir,name_2), row.names=FALSE) # y cells follow
78 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
79 |
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/methods_running/liger_cite.R:
--------------------------------------------------------------------------------
1 | # liger benchmark: integrate CITE-seq RNA (x) and protein (y) with LIGER, save the embeddings and the ids of x cells liger dropped
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/bench_test4/output/"
7 | in_root = "/bench_test4/input/"
8 | out_indx = 15
9 |
10 | out_dir =paste0(out_root,"lg/")
11 | in_dir = in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 | # read
15 | rna = readMM(paste0(in_dir,"rna200.txt"))
16 | rna = as.matrix(rna)
17 | protein = read.csv(paste0(in_dir,"pro200.csv"))
18 | #meta = read.csv(paste0(in_dir,"meta200.csv"))
19 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same
20 | colnames(rna) = rna_names$names
21 | # change name: pair each protein with the RNA feature name(s) listed in the conversion table
22 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
23 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
24 | rna_list = c()
25 | protein_list = c()
26 | for (i in seq_len(nrow(correspondence))){
27 | protein_n = as.character(correspondence[i,1])
28 | rna_n = as.character(correspondence[i,2])
29 | if (grepl("Ignore", rna_n, fixed = TRUE)){
30 | next
31 | }
32 | rna_n = strsplit(rna_n, '/')[[1]] # one protein may map to several '/'-separated gene names
33 | for(r in rna_n){
34 | if (r %in% rna_names$names){
35 | rna_list = c(rna_list, r)
36 | protein_list = c(protein_list, protein_n)
37 | }
38 | }
39 | }
40 | # change name end
41 | # first filtering step should be same as in sp
42 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # RNA columns that have a matching protein
43 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
44 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
45 | # copy sp filtering to produce better output
46 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1]
47 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
48 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
49 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
50 | # then we construct the liger objects
51 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
52 | ###Start integration
53 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality
54 | # default preprocessing
55 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
56 | # do not need to select genes
57 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
58 | ligerobj@var.genes=features # just use all
59 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
60 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = TRUE)
61 | ligerobj <- quantile_norm(ligerobj)
62 | embedding = ligerobj@H.norm[,c(1:out_indx)]
63 | name_1 = "full_embed_x0.csv"
64 | name_2 = "full_embed_y0.csv"
65 | # no available matching information from liger thus not saved out
66 | # will use knn to search matching on embedding in downstream analysis
67 | # check what cell is filtered out
68 | `%notin%` <- Negate(`%in%`)
69 | filtered =
70 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)]
71 | filtered_id = as.integer(sub("^d1", "", filtered[startsWith(filtered, "d1")])) # indices of x cells deleted by liger (1274 in the original run); d2 names are excluded so they cannot become NA
72 | is_x = startsWith(rownames(embedding), "d1") # split rows by the d1/d2 name prefix set above instead of hard-coded 18726/38726 row counts
73 | write.csv(embedding[is_x,],
74 | paste0(out_dir,name_1), row.names=FALSE) # incomplete dataset: x cells that survived liger
75 | write.csv(embedding[!is_x,],
76 | paste0(out_dir,name_2), row.names=FALSE) # y cells
77 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
78 |
79 | # also get the right original pca values for cells for downstream analysis
80 | x_orig = read.csv("/bench_test4/input/orig_x.csv")
81 | x_orig_sub = if (length(filtered_id) > 0) x_orig[-filtered_id,] else x_orig # guard: x_orig[-integer(0),] would drop every row
82 | write.csv(x_orig_sub, "/bench_test4/input/orig_lg_x.csv")
83 | # save out filtered id
84 | write.csv(data.frame(id = filtered_id), "/bench_test4/output/lg/filt_id.csv")
85 |
86 |
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/methods_running/seurat_cite.R:
--------------------------------------------------------------------------------
1 | # seurat benchmark: match protein (y) cells to RNA (x) cells via transfer anchors, and build a joint PCA embedding via Seurat integration
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/bench_test4/output/"
7 | in_root = "/bench_test4/input/"
8 | out_indx = 15
9 |
10 | out_dir =paste0(out_root,"sr/")
11 | in_dir = in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 | # read
15 | rna = readMM(paste0(in_dir,"rna200.txt"))
16 | rna = as.matrix(rna)
17 | protein = read.csv(paste0(in_dir,"pro200.csv"))
18 | meta = read.csv(paste0(in_dir,"meta200.csv"))
19 | rna_names = read.csv("/bench_test4/input/rna_names.csv") # rna names always the same
20 | colnames(rna) = rna_names$names
21 | # change name: pair each protein with the RNA feature name(s) listed in the conversion table
22 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
23 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
24 | rna_list = c()
25 | protein_list = c()
26 | for (i in c(1:dim(correspondence)[1])){
27 | protein_n = as.character(correspondence[i,1])
28 | rna_n = as.character(correspondence[i,2])
29 | if (grepl("Ignore", rna_n, fixed = TRUE)){
30 | next
31 | }
32 | rna_n = strsplit(rna_n, '/')[[1]]
33 | for(r in rna_n){
34 | if (r %in% rna_names$names){
35 | rna_list = c(rna_list, r)
36 | protein_list = c(protein_list, protein_n)
37 | }
38 | }
39 | }
40 | # change name end
41 | # first filtering step should be same as in sp
42 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # RNA columns that have a matching protein
43 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns in the same order
44 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
45 | # copy sp filtering
46 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.1]
47 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
48 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub)))
49 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub)))
50 | # then we construct the seurat objects
51 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
52 | x_obj <- NormalizeData(x_obj)
53 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case
54 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
55 | # add seurat object for data y
56 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
57 | y_obj <- NormalizeData(y_obj)
58 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
59 | list_modality=list(x_obj,y_obj)
60 | # get transfer anchor
61 | features=intersect(rownames(x_obj),rownames(y_obj))
62 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj,
63 | dims = 1:20, features = features)
64 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj),
65 | dims = 1:20)
66 | full_df = data.frame(idx2 = c(1:length(predictions$predicted.id)) -1, idx1 = as.integer(predictions$predicted.id) -1,
67 | score = predictions$prediction.score.max) # the -1 shifts R's 1-based indices to 0-based; idx1 is the predicted x cell for each y cell idx2
68 | # get integration embedding
69 | print("starting seurat integration")
70 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality,
71 | dims = 1:20, anchor.features =features, k.filter = 10)
72 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 10)
73 | #
74 | DefaultAssay(xy_int) <- "integrated"
75 | xy_int <- ScaleData(xy_int, verbose = FALSE)
76 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion
77 | embedding = xy_int@reductions$pca@cell.embeddings
78 | name_1 = "full_embed_x0.csv"
79 | name_2 = "full_embed_y0.csv"
80 | #pathout = out_dir
81 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # first ncol(x_obj) rows are written as the x embedding
82 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
83 | paste0(out_dir,name_2), row.names=FALSE) # remaining rows are written as the y embedding
84 | write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # y-to-x matching table with prediction scores
85 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
86 |
87 |
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## quick code to run all methods in parallel
2 | python maxfuse_cite.py &
3 | /usr/bin/Rscript seurat_cite.R &
4 | /usr/bin/Rscript liger_cite.R &
5 | /usr/bin/Rscript harm_cite.R &
6 | /usr/bin/Rscript bsc_cite.R
7 | wait # block until every background job has finished, so the script only returns once all outputs are written
--------------------------------------------------------------------------------
/Archive/citeseq-bmc/code/benchmark/step2.sh:
--------------------------------------------------------------------------------
1 | # quick code to calc slt ari f1 scores for all methods
2 | # for mf
3 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/mf/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/mf/full_embed' 0 &
4 |
5 | # for sr
6 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/sr/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/sr/full_embed' 0 &
7 |
8 | # for lg -- NOTE(review): reads output/lgunimf/, whereas liger_cite.R writes to output/lg/; confirm this is the intended run
9 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/lgunimf/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/lgunimf/full_embed' 0 &
10 |
11 | # for hm
12 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/hm/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/hm/full_embed' 0
13 |
14 | # for bsc
15 | /usr/bin/Rscript calculate_metrics.R '/bench_test4/output/bsc/metrics.csv' '/bench_test4/input/orig' '/bench_test4/output/bsc/full_embed' 0
16 | wait # do not return until the background metric jobs above have finished
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/analysis/plot.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "plot_cite"
3 | output: html_document
4 | ---
5 | Script to produce the CITE-seq PBMC plots (full antibody panel version).
6 | The metric values plotted here were calculated beforehand; see the ipynb notebooks in this folder for details.
7 |
8 | ```{r}
9 | library(ggplot2)
10 | metrics = read.csv("/bench_test3/output/batch5_resultsV2.csv") # metrics prev calced
11 | metrics$method <- factor(metrics$method,levels = c("mf", "sr", "lg", "hm","bsc"))
12 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
13 | ```
14 |
15 | ```{r}
16 | # slt f1 score + matching annotation lv1; y limits and minor breaks are set on the same scale (a separate ylim() would replace the scale and drop the minor breaks)
17 | p = ggplot(metrics) + geom_point(aes(x = ann1, y = slt_f1, color = method), size =2, alpha = 0.5) +
18 | theme_minimal() + scale_color_manual(values = colorv) +
19 | scale_y_continuous(limits = c(0.35,0.6), minor_breaks = seq(0, 1, 0.05)) + xlim(c(0.35,0.97))
20 | ggsave("/bench_test3/plots/p1V3.svg", plot = p, height = 3, width = 4.5) # save p explicitly instead of relying on last_plot()
21 | p
22 | ```
23 |
24 | ```{r}
25 | # ari f1 score + matching annotation lv2; y limits and minor breaks are set on the same scale (a separate ylim() would replace the scale and drop the minor breaks)
26 | p = ggplot(metrics) + geom_point(aes(x = ann2, y = ari_f1, color = method), size =2, alpha = 0.5) +
27 | theme_minimal() +
28 | scale_color_manual(values = colorv) +
29 | scale_y_continuous(limits = c(0.4,0.65), minor_breaks = seq(0, 1, 0.05)) + xlim(c(0.28,0.87))
30 | ggsave("/bench_test3/plots/p2V3.svg", plot = p, height = 3, width = 4.5) # save p explicitly instead of relying on last_plot()
31 | p
32 | ```
33 |
34 |
35 | ```{r}
36 | # bar plot of the mean foscttm score per method, with +/- one standard deviation error bars
37 |
38 | library(dplyr)
39 | # keep only the method label and the foscttm column
40 | data <- metrics %>% select(method, foscttm)
41 | # per method: n rows, mean, sd, standard error (se) and half-width of the 95% t confidence interval (ic)
42 | my_sum <- data %>%
43 | group_by(method) %>%
44 | dplyr::summarise(
45 | n=n(),
46 | mean=mean(foscttm),
47 | sd=sd(foscttm)
48 | ) %>%
49 | mutate( se=sd/sqrt(n)) %>%
50 | mutate( ic=se * qt((1-0.05)/2 + .5, n-1))
51 |
52 | # error bars use the standard deviation; se and ic are computed above but not plotted
53 | p = ggplot(my_sum) +
54 | geom_bar( aes(x=method, y=mean, fill=method), stat="identity", alpha=0.7, width = 0.4) +
55 | geom_errorbar( aes(x=method, ymin=mean-sd, ymax=mean+sd), width=0.08, colour="black", alpha=0.9, size=0.2) +
56 | ggtitle("using standard deviation") + theme_minimal() + scale_fill_manual(values = colorv) #+ coord_cartesian(ylim=c(0.6,0.97))
57 |
58 | ggsave("/bench_test3/plots/p3V2.svg", height = 3, width = 4.5)
59 | p
60 | ```
61 |
62 |
63 | ```{r}
64 | # plot foscKNN plot along the Ks
65 |
66 | knnsearch = read.csv("/bench_test3/output/batch5_knntmpV2.csv")
67 | knnsearch$step = knnsearch$step+1 # python index dif
68 | knnsearch$method <- factor(knnsearch$method,levels = c("mf", "sr", "lg", "hm", "bsc"))
69 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
70 |
71 | p = ggplot(knnsearch,aes(x=step,y=knn_tmp, colour=method,fill = method)) +
72 | stat_summary(geom = "line", fun.y = mean, size = 0.2) +
73 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.3, colour = NA) +
74 | theme_minimal()+ ggtitle("KNN search true match") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)
75 |
76 | #ggsave("/home/bkzhu/super_mario/bench_test3/plots/p4V2.svg", height = 3, width = 5)
77 | p
78 | ```
79 |
80 |
81 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/analysis/plot_drop.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "plot_drop"
3 | output: html_document
4 | ---
5 |
6 | Script to produce the CITE-seq PBMC plots (dropped antibody panel version).
7 | The metric values plotted here were calculated beforehand; see the ipynb notebooks in this folder for details.
8 |
9 | ```{r}
10 | library(ggplot2)
11 | library(reshape2)
12 |
13 | # read metrics calculated prev
14 | mdrop = read.csv("/bench_test3/output/drop4_batch5_resultsV2.csv", row.names = 1)
15 | mdrop$method <- factor(mdrop$method,levels = c("mf", "sr", "lg", "hm","bsc"))
16 | mdrop$drop <- factor(mdrop$drop,levels = c("dropLv0", "dropLv1", "dropLv2", "dropLv3"))
17 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
18 | ```
19 |
20 |
21 | ```{r}
22 | temp = melt(mdrop, id = c("method","batch","drop"))
23 | temp2 = subset(temp, temp$variable == "ann1")
24 |
25 | temp2$method <- factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc"))
26 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
27 |
28 | # plot matching accu annotation lv1 when dropps
29 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) +
30 | stat_summary(geom = "line", fun.y = mean, size = 0.2) +
31 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) +
32 | theme_minimal()+ ggtitle("ann1") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+
33 | scale_x_discrete(expand=c(0.05, 0.05)) + ylim(c(0.3,1))
34 | ggsave("/bench_test3/plots/drop-p1V2.svg", height = 3, width = 5)
35 | p
36 | ```
37 |
38 | ```{r}
39 | temp = melt(mdrop, id = c("method","batch","drop"))
40 | temp2 = subset(temp, temp$variable == "ann2")
41 |
42 | temp2$method <- factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc"))
43 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
44 |
45 | # plot matching accu annotation lv2 when dropps
46 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) +
47 | stat_summary(geom = "line", fun.y = mean, size = 0.2) +
48 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) +
49 | theme_minimal()+ ggtitle("ann2") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv) +
50 | scale_x_discrete(expand=c(0.05, 0.05)) + ylim(c(0.1,0.85))
51 |
52 | ggsave("/bench_test3/plots/drop-p2V2.svg", height = 3, width = 5)
53 | p
54 | ```
55 |
56 |
57 | ```{r}
58 | temp = melt(mdrop, id = c("method","batch","drop"))
59 | temp2 = subset(temp, temp$variable == "foscttm")
60 |
61 | temp2$method <- factor(temp2$method,levels = c("mf", "sr", "lg", "hm", "bsc"))
62 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
63 |
64 | # plot foscttm when dropps
65 | p = ggplot(temp2,aes(x=drop,y=value, colour=method,fill = method, group = method)) +
66 | stat_summary(geom = "line", fun.y = mean, size = 0.2) +
67 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.2, colour = NA) +
68 | theme_minimal()+ ggtitle("foscttm") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+
69 | scale_x_discrete(expand=c(0.05, 0.05))+ ylim(c(0,0.35))
70 |
71 | ggsave("/bench_test3/plots/drop-p3V2.svg", height = 3, width = 5)
72 | p
73 | ```
74 |
75 |
76 | ```{r}
77 | knn_drop = read.csv("/bench_test3/output/drop4_batch5_knntmpV2.csv")
78 | knn_drop_50 = subset(knn_drop, knn_drop$step == 99) # NOTE(review): the `_50` suffix does not match step == 99; presumably steps are 0-based so this is k = 100 — confirm
79 | knn_drop_50$method <- factor(knn_drop_50$method,levels = c("mf", "sr", "lg", "hm","bsc"))
80 | knn_drop_50$X <- NULL
81 | knn_drop_50$step <- NULL
82 | knn_drop_50$batch <- NULL
83 | temp3 = melt(knn_drop_50, id = c("method","drop")) # long format: one row per (method, drop, remaining column)
84 |
85 | colorv = c("#f6511d","#ffb400","#00a6ed","#7fb800","#A149FA")
86 |
87 | # plot foscKNN against the drop level (rows with step == 99 only); line = mean, ribbon = mean_cl_normal interval with conf.int = 0.85
88 | p = ggplot(temp3,aes(x=drop,y=value, colour=method,fill = method, group = method)) +
89 | stat_summary(geom = "line", fun.y = mean, size = 0.2) +
90 | stat_summary(geom = "ribbon", fun.data = mean_cl_normal, fun.args=list(conf.int=0.85), alpha = 0.3, colour = NA) +
91 | theme_minimal()+ ggtitle("foscknn") + scale_fill_manual(values = colorv) + scale_color_manual(values = colorv)+
92 | scale_x_discrete(expand=c(0.05, 0.05))+ ylim(c(0,0.25))
93 |
94 | ggsave("/bench_test3/plots/drop-p4V2.svg", height = 3, width = 5)
95 | p
96 | ```
97 |
98 |
99 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # script used for benchmark calculation: sam, slt-f1 and ari-f1
3 | #
4 | # usage: Rscript calculate_metrics.R <metrics_fname> <orig_fname> <embed_fname> <n_idx>
5 | #   metrics_fname: csv of existing metrics; it is read, extended with new columns and overwritten
6 | #   orig_fname, embed_fname, n_idx: passed unchanged to sam(), slt() and ari() from metrics.R
7 | args = commandArgs(trailingOnly=TRUE)
8 | if (length(args) < 4) {
9 |   # fail early with a clear message instead of propagating NA arguments into the metric functions
10 |   stop("usage: Rscript calculate_metrics.R <metrics_fname> <orig_fname> <embed_fname> <n_idx>")
11 | }
12 |
13 | metrics_fname = args[1]
14 | orig_fname = args[2]
15 | embed_fname = args[3]
16 | n_idx = as.integer(args[4])
17 |
18 | # Metrics added to the csv by this script:
19 | # - sam_x: structure alignment metric for x data (the larger, the better)
20 | # - sam_y: structure alignment metric for y data (the larger, the better)
21 | # - slt_mix: mixing via Silhouette width (the larger, the better)
22 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
23 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
24 | # - ari_mix: mixing via adjusted Rand index (the larger, the better)
25 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index (the larger, the better)
26 | # - ari_f1: third column returned by ari(); presumably integrates ari_mix and ari_clust like slt_f1 — confirm in metrics.R
27 | # Metrics described for this benchmark but NOT computed in this script:
28 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
29 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
30 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
31 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
32 | setwd("./") # no-op: metrics.R is resolved relative to the current working directory
33 | source("metrics.R")# metric calc code (sam, slt, ari; presumably also loads read_csv / add_column / %>%)
34 |
35 | # print a progress message prefixed with the current date AND time
36 | # (Sys.Date() carries no time-of-day, so "%c" formatting of it always shows midnight)
37 | log_step = function(msg) {
38 |   print(paste0(format(Sys.time(), "%c"), ': ', msg))
39 | }
40 |
41 | # load existing metrics
42 | metrics = read_csv(metrics_fname, col_types=cols())
43 |
44 | # calculate structure alignment metrics
45 | log_step('calculating structure alignment metrics...')
46 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
47 |             n_idx=n_idx, data_idx='x')
48 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
49 |            n_idx=n_idx, data_idx='y')
50 | print(sam_x)
51 | print(sam_y)
52 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
53 |
54 | # calculate Silhouette width; columns of slt_res are used as (mix, clust, f1)
55 | log_step('calculating Silhouette width...')
56 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
57 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
58 |
59 | # calculate ARI; columns of ari_res are used as (mix, clust, f1)
60 | log_step('calculating adjusted random index...')
61 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
62 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
63 |
64 | # overwrite the input csv with the extended metrics table
65 | write_csv(metrics, metrics_fname)
66 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite.R:
--------------------------------------------------------------------------------
1 | # harmony benchmark, full antibody panel
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # read in files
7 | out_root = "/bench_test3/output/"
8 | in_root = "/bench_test3/input/"
9 | batch = 5
10 | out_indx = 15
11 |
12 | for(i in c(1:batch)){
13 | batch_name = paste0("b",as.character(i),"/")
14 | out_dir =paste0(out_root,batch_name,"hm/")
15 | in_dir = paste0(in_root,batch_name)
16 | dir.create(paste0(out_root,batch_name))
17 | dir.create(out_dir)
18 | # read
19 | rna = readMM(paste0(in_dir,"rna.txt"))
20 | protein = read.csv(paste0(in_dir,"pro.csv"))
21 | meta = read.csv(paste0(in_dir,"meta.csv"))
22 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
23 | colnames(rna) = rna_names$names
24 | # change name
25 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
26 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
27 | rna_list = c()
28 | protein_list = c()
29 | for (i in c(1:dim(correspondence)[1])){
30 | protein_n = as.character(correspondence[i,1])
31 | rna_n = as.character(correspondence[i,2])
32 | if (grepl("Ignore", rna_n, fixed = TRUE)){
33 | next
34 | }
35 | rna_n = strsplit(rna_n, '/')[[1]]
36 | for(r in rna_n){
37 | if (r %in% rna_names$names){
38 | rna_list = c(rna_list, r)
39 | protein_list = c(protein_list, protein_n)
40 | }
41 | }
42 | }
43 | # change name end
44 | # first filtering step should be same as in sp: keep only RNA/protein pairs whose protein is a column of the protein table
45 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (genes paired with an available protein)
46 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same pairs, same order)
47 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # rename protein columns to the paired gene names so feature names match
48 | # copy sp filtering
49 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
50 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
51 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
52 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
53 | # then we construct the seurat objects
54 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
55 | x_obj <- NormalizeData(x_obj)
56 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
57 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
58 | # add suerat object datay
59 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
60 | y_obj <- NormalizeData(y_obj)
61 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
62 | #list_modality=list(x_obj,y_obj)
63 | # get shared clean features
64 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
65 | # run harmony in seurat, need to make a new seurat object holding both modalities (rna cells first, then protein cells)
66 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
67 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
68 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
69 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # label cells by modality; both were "x" before, leaving harmony a single group and nothing to integrate
70 | # cbind together, scale within modality is better
71 | xy_obj <- xy_obj %>% RunHarmony("orig")
72 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
73 | name_1 = "full_embed_x0.csv"
74 | name_2 = "full_embed_y0.csv"
75 | # does not directly produce matching info, produce later using knn with embeddning distance matrix
76 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
77 | row.names=FALSE) # need to decide output pca cell
78 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
79 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
80 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
81 | }
82 |
83 | ##
84 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite_reduc-drop.R:
--------------------------------------------------------------------------------
1 | #harmony for umap viz and cf matrix, drop antibody version
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # read in files
7 | out_root = "/bench_test3/output/reduction-drop/"
8 | in_root = "/bench_test3/input/reduction/"
9 |
10 | out_dir =paste0(out_root,"hm/")
11 | in_dir = in_root
12 | out_indx = 15
13 |
14 | # read
15 | dropped_pro = read.csv("/bench_test3/input/rank30.csv")
16 | target = as.character(dropped_pro$target)
17 |
18 | rna = readMM(paste0(in_dir,"rna.txt"))
19 | protein = read.csv(paste0(in_dir,"pro.csv"))
20 | protein = protein[,target]
21 |
22 | meta = read.csv(paste0(in_dir,"meta.csv"))
23 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
24 | colnames(rna) = rna_names$names
25 | # change name
26 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
27 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
28 | rna_list = c()
29 | protein_list = c()
30 | for (i in c(1:dim(correspondence)[1])){
31 | protein_n = as.character(correspondence[i,1])
32 | rna_n = as.character(correspondence[i,2])
33 | if (grepl("Ignore", rna_n, fixed = TRUE)){
34 | next
35 | }
36 | rna_n = strsplit(rna_n, '/')[[1]]
37 | for(r in rna_n){
38 | if (r %in% rna_names$names){
39 | rna_list = c(rna_list, r)
40 | protein_list = c(protein_list, protein_n)
41 | }
42 | }
43 | }
44 | # change name end
45 | # first filtering step should be same as in sp: keep only RNA/protein pairs whose protein is a column of the protein table
46 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (genes paired with an available protein)
47 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same pairs, same order)
48 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # rename protein columns to the paired gene names so feature names match
49 | dim(protein.shared)
50 | dim(rna.shared)
51 | # copy sp filtering
52 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
53 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.05]
54 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub)))
55 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub)))
56 | #
57 | dim(protein.shared.sub)
58 | dim(rna.shared.sub)
59 | #
60 |
61 | # then we construct the seurat objects
62 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
63 | x_obj <- NormalizeData(x_obj)
64 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
65 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
66 | # add suerat object datay
67 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
68 | y_obj <- NormalizeData(y_obj)
69 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
70 | #list_modality=list(x_obj,y_obj)
71 | # get shared clean features
72 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
73 | # run harmony in seurat, need to make a new seurat object holding both modalities (rna cells first, then protein cells)
74 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
75 | #xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
76 | xy_obj <- NormalizeData(xy_obj)
77 | xy_obj <- ScaleData(xy_obj, features = rownames(xy_obj))
78 | xy_obj <- RunPCA(xy_obj, features = rownames(xy_obj), npcs = length(features)) # spell out `npcs` instead of relying on partial matching of `npc`
79 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # label cells by modality; both were "x" before, leaving harmony a single group and nothing to integrate
80 | # here the combined object is normalized and scaled jointly (the within-modality scale.data variant is commented out above)
81 | xy_obj <- xy_obj %>% RunHarmony("orig")
82 |
83 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
84 | name_1 = "full_embed_x0.csv"
85 | name_2 = "full_embed_y0.csv"
86 |
87 | # does not directly produce matching info, produce later using knn with embeddning distance matrix
88 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
89 | row.names=FALSE) # need to decide output pca cell
90 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
91 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
92 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
93 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/harm_cite_reduc.R:
--------------------------------------------------------------------------------
1 | #harmony for umap viz and cf matrix, full antibody version
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # read in files
7 | out_root = "/bench_test3/output/reduction/"
8 | in_root = "/bench_test3/input/reduction/"
9 | out_indx = 15
10 |
11 | out_dir =paste0(out_root,"hm/")
12 | in_dir = in_root
13 | # read
14 | rna = readMM(paste0(in_dir,"rna.txt"))
15 | protein = read.csv(paste0(in_dir,"pro.csv"))
16 | meta = read.csv(paste0(in_dir,"meta.csv"))
17 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
18 | colnames(rna) = rna_names$names
19 | # change name
20 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
21 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
22 | rna_list = c()
23 | protein_list = c()
24 | for (i in c(1:dim(correspondence)[1])){
25 | protein_n = as.character(correspondence[i,1])
26 | rna_n = as.character(correspondence[i,2])
27 | if (grepl("Ignore", rna_n, fixed = TRUE)){
28 | next
29 | }
30 | rna_n = strsplit(rna_n, '/')[[1]]
31 | for(r in rna_n){
32 | if (r %in% rna_names$names){
33 | rna_list = c(rna_list, r)
34 | protein_list = c(protein_list, protein_n)
35 | }
36 | }
37 | }
38 | # change name end
39 | # first filtering step should be same as in sp: keep only RNA/protein pairs whose protein is a column of the protein table
40 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (genes paired with an available protein)
41 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same pairs, same order)
42 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # rename protein columns to the paired gene names so feature names match
43 | # copy sp filtering
44 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
45 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
46 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
47 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
48 | # then we construct the seurat objects
49 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
50 | x_obj <- NormalizeData(x_obj)
51 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
52 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
53 | # add suerat object datay
54 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
55 | y_obj <- NormalizeData(y_obj)
56 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
57 | #list_modality=list(x_obj,y_obj)
58 | # get shared clean features
59 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
60 | # run harmony in seurat, need to make a new seurat object holding both modalities (rna cells first, then protein cells)
61 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
62 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
63 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
64 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # label cells by modality; both were "x" before, leaving harmony a single group and nothing to integrate
65 | # cbind together, scale within modality is better
66 | xy_obj <- xy_obj %>% RunHarmony("orig")
67 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
68 | name_1 = "full_embed_x0.csv"
69 | name_2 = "full_embed_y0.csv"
70 | # does not directly produce matching info, produce later using knn with embeddning distance matrix
71 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
72 | row.names=FALSE) # need to decide output pca cell
73 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
74 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
75 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
76 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite.R:
--------------------------------------------------------------------------------
1 | # liger benchmark, full antibody panel
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/bench_test3/output/"
7 | in_root = "/bench_test3/input/"
8 | batch = 5
9 | out_indx = 15
10 |
11 | for(i in c(1:batch)){
12 | batch_name = paste0("b",as.character(i),"/")
13 | out_dir =paste0(out_root,batch_name,"lg/")
14 | in_dir = paste0(in_root,batch_name)
15 | dir.create(paste0(out_root,batch_name))
16 | dir.create(out_dir)
17 | # read
18 | rna = readMM(paste0(in_dir,"rna.txt"))
19 | protein = read.csv(paste0(in_dir,"pro.csv"))
20 | meta = read.csv(paste0(in_dir,"meta.csv"))
21 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
22 | colnames(rna) = rna_names$names
23 | # change name
24 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
25 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
26 | rna_list = c()
27 | protein_list = c()
28 | for (i in c(1:dim(correspondence)[1])){
29 | protein_n = as.character(correspondence[i,1])
30 | rna_n = as.character(correspondence[i,2])
31 | if (grepl("Ignore", rna_n, fixed = TRUE)){
32 | next
33 | }
34 | rna_n = strsplit(rna_n, '/')[[1]]
35 | for(r in rna_n){
36 | if (r %in% rna_names$names){
37 | rna_list = c(rna_list, r)
38 | protein_list = c(protein_list, protein_n)
39 | }
40 | }
41 | }
42 | # change name end
43 | # first filtering step should be same as in sp: keep only RNA/protein pairs whose protein is a column of the protein table
44 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (genes paired with an available protein)
45 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same pairs, same order)
46 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # rename protein columns to the paired gene names so feature names match
47 | # copy sp filtering to produce better output
48 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
49 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
50 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
51 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
52 | # then we construct the liger objects
53 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
54 | ###Start integration
55 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features accross datasets with good quality
56 | # default preprocessing
57 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
58 | # do not need to select genes
59 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
60 | ligerobj@var.genes=features # just use all
61 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
62 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE)
63 | ligerobj <- quantile_norm(ligerobj)
64 | embedding = ligerobj@H.norm[,c(1:out_indx)]
65 | if (dim(embedding)[1] != 20000) {
66 | break
67 | }
68 | name_1 = "full_embed_x0.csv"
69 | name_2 = "full_embed_y0.csv"
70 | # no avaliable matching information from liger thus not saved out
71 | # will use knn to serach matching on embedding in downstreatm analysis
72 | write.csv(embedding[c(1:nrow(rna.shared.sub)),c(1:out_indx)],
73 | paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell
74 | write.csv(embedding[c((nrow(rna.shared.sub) + 1):(nrow(rna.shared.sub) + nrow(protein.shared.sub))),c(1:out_indx)],
75 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
76 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
77 | }
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite_reduction-drop.R:
--------------------------------------------------------------------------------
1 | #liger for umap viz and cf matrix, drop antibody version
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/bench_test3/output/reduction-drop/"
7 | in_root = "/bench_test3/input/reduction/"
8 |
9 | out_dir =paste0(out_root,"lg/")
10 | in_dir = in_root
11 | out_indx = 15
12 |
13 | # read the list of proteins for the dropped-antibody panel
14 | dropped_pro = read.csv("/bench_test3/input/rank30.csv")
15 | target = as.character(dropped_pro$target) # NOTE(review): `target` is never used below; harm_cite_reduc-drop.R subsets with protein[,target] — confirm whether the subset is missing here
16 |
17 | rna = readMM(paste0(in_dir,"rna.txt"))
18 | protein = read.csv(paste0(in_dir,"pro.csv"))
19 | meta = read.csv(paste0(in_dir,"meta.csv"))
20 | rna_names = read.csv("/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
21 | colnames(rna) = rna_names$names
22 | # change name
23 | correspondence = read.csv('protein_rna_name_conversionV11.csv')
24 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
25 | rna_list = c()
26 | protein_list = c()
27 | for (i in c(1:dim(correspondence)[1])){
28 | protein_n = as.character(correspondence[i,1])
29 | rna_n = as.character(correspondence[i,2])
30 | if (grepl("Ignore", rna_n, fixed = TRUE)){
31 | next
32 | }
33 | rna_n = strsplit(rna_n, '/')[[1]]
34 | for(r in rna_n){
35 | if (r %in% rna_names$names){
36 | rna_list = c(rna_list, r)
37 | protein_list = c(protein_list, protein_n)
38 | }
39 | }
40 | }
41 | # change name end
42 | # first filtering step should be same as in sp: keep only RNA/protein pairs whose protein is a column of the protein table
43 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # rna object (genes paired with an available protein)
44 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein object (same pairs, same order)
45 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # rename protein columns to the paired gene names so feature names match
46 | # copy sp filtering to produce better output
47 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
48 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
49 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
50 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
51 |
52 | # then we construct the liger objects (features x cells per modality); R's logical constant is FALSE, `False` is an undefined object
53 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
54 | ###Start integration
55 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features accross datasets with good quality
56 | max_comp = length(features)
57 | # default preprocessing
58 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
59 | # do not need to select genes
60 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
61 | ligerobj@var.genes=features # just use all
62 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
63 | ligerobj <- optimizeALS(ligerobj, k = (max_comp-1),remove.missing = FALSE)
64 | ligerobj <- quantile_norm(ligerobj)
65 |
66 | if (max_comp <= out_indx) {
67 | out_indx = max_comp - 1
68 | }
69 | embedding = ligerobj@H.norm[,c(1:out_indx)]
70 | name_1 = "full_embed_x0.csv"
71 | name_2 = "full_embed_y0.csv"
72 |
73 | #
74 | `%notin%` <- Negate(`%in%`)
75 | filtered =
76 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)]
77 | filtered_id = as.integer(gsub("d1", "", filtered)) # d119958, one cell got deleted
78 |
79 | # no avaliable matching information from liger thus not saved out
80 | # will use knn to serach matching on embedding in downstreatm analysis
81 | write.csv(embedding[c(1:19999),c(1:out_indx)],
82 | paste0(out_dir,name_1), row.names=FALSE) # one cell got deleted during liger process, keep track during downstream analysis
83 | write.csv(embedding[20000:39999,c(1:out_indx)],
84 | paste0(out_dir,name_2), row.names=FALSE)
85 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
86 |
87 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/method_running/liger_cite_reduction.R:
--------------------------------------------------------------------------------
1 | #liger for umap viz and cf matrix, full antibody version
2 | library(rliger)
3 | library(Matrix)
4 | library(matrixStats)
5 | # read in files
6 | out_root = "/home/bkzhu/super_mario/bench_test3/output/reduction/"
7 | in_root = "/home/bkzhu/super_mario/bench_test3/input/reduction/"
8 |
9 | out_dir =paste0(out_root,"lg/")
10 | in_dir = in_root
11 | out_indx = 10
12 |
13 | # read
14 | rna = readMM(paste0(in_dir,"rna.txt"))
15 | protein = read.csv(paste0(in_dir,"pro.csv"))
16 | meta = read.csv(paste0(in_dir,"meta.csv"))
17 | rna_names = read.csv("/home/bkzhu/super_mario/bench_test3/input/citeseq_rna_names.csv") # rna names always the same
18 | colnames(rna) = rna_names$names
19 | # change name
20 | correspondence = read.csv('/home/bkzhu/super_mario/production/hubmap/protein_rna_name_conversionV7.csv')
21 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
22 | rna_list = c()
23 | protein_list = c()
24 | for (i in c(1:dim(correspondence)[1])){
25 | protein_n = as.character(correspondence[i,1])
26 | rna_n = as.character(correspondence[i,2])
27 | if (grepl("Ignore", rna_n, fixed = TRUE)){
28 | next
29 | }
30 | rna_n = strsplit(rna_n, '/')[[1]]
31 | for(r in rna_n){
32 | if (r %in% rna_names$names){
33 | rna_list = c(rna_list, r)
34 | protein_list = c(protein_list, protein_n)
35 | }
36 | }
37 | }
38 | # change name end
39 | # first filtering step should be same as in sp
40 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object
41 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object
42 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
43 | # copy sp filtering to produce better output
44 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
45 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
46 | rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
47 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
48 | # then we construct the liger objects
49 | ligerobj=createLiger( list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = F)
50 | ###Start integration
51 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub)) # shared features accross datasets with good quality
52 | # default preprocessing
53 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
54 | # do not need to select genes
55 | #ligerobj <- selectGenes(ifnb_liger, var.thresh = 0, alpha.thresh=1)
56 | ligerobj@var.genes=features # just use all
57 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
58 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE)
59 | ligerobj <- quantile_norm(ligerobj)
60 | embedding = ligerobj@H.norm[,c(1:out_indx)]
61 | name_1 = "full_embed_x0.csv"
62 | name_2 = "full_embed_y0.csv"
63 |
64 | #
65 | `%notin%` <- Negate(`%in%`)
66 | filtered =
67 | c(rownames(rna.shared.sub), rownames(protein.shared.sub))[c(rownames(rna.shared.sub), rownames(protein.shared.sub)) %notin% rownames(ligerobj@H.norm)]
68 | filtered_id = as.integer(gsub("d1", "", filtered)) #d119958
69 | #
70 | # no avaliable matching information from liger thus not saved out
71 | # will use knn to serach matching on embedding in downstreatm analysis
72 | write.csv(embedding[c(1:19999),c(1:out_indx)],
73 | paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell
74 | write.csv(embedding[20000:39999,c(1:out_indx)],
75 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
76 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
77 |
78 |
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/reduction.sh:
--------------------------------------------------------------------------------
1 | ## run all methods, result used for umap viz and confusion matrix plotting
2 | python cite_mf_reduction.py &
3 | /usr/bin/Rscript seurat_cite_reduc.R &
4 | /usr/bin/Rscript liger_cite_reduction.R &
5 | /usr/bin/Rscript harm_cite_reduc.R &
6 | /usr/bin/Rscript bsc_cite_reduction.R &
7 | wait # do not return until every method has finished
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/step1-drop.sh:
--------------------------------------------------------------------------------
1 | ## script to run all methods at the same time:
2 | ## step one of benchmarking, dropping antibody panel version
3 | python maxfuse_cite-drop.py &
4 | /usr/bin/Rscript seurat_cite_drop.R &
5 | /usr/bin/Rscript liger_cite_drop.R &
6 | /usr/bin/Rscript harm_cite_drop.R &
7 | /usr/bin/Rscript bsc_cite_drop.R &
8 | wait # do not return until every method has finished
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## script to run all methods at the same time:
2 | ## step one of benchmarking, full antibody panel version
3 | python maxfuse_cite.py &
4 | /usr/bin/Rscript seurat_cite.R &
5 | /usr/bin/Rscript liger_cite.R &
6 | /usr/bin/Rscript harm_cite.R &
7 | /usr/bin/Rscript bsc_cite.R &
8 | wait # do not return until every method has finished
--------------------------------------------------------------------------------
/Archive/citeseq-pbmc/code/benchmark/step2.sh:
--------------------------------------------------------------------------------
1 | # script to produce ARI f1 score and SLT f1 score
2 | # only done on the full panel, not done one the dropping verions
3 |
4 | # b1-5 for mf
5 | /usr/bin/Rscript calculate_metrics.R '//bench_test3/output/b1/mf/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/mf/full_embed' 0 &
6 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/mf/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/mf/full_embed' 0 &
7 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/mf/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/mf/full_embed' 0 &
8 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/mf/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/mf/full_embed' 0 &
9 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/mf/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/mf/full_embed' 0 &
10 | wait
11 | # b1-5 for sr
12 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/sr/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/sr/full_embed' 0 &
13 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/sr/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/sr/full_embed' 0 &
14 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/sr/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/sr/full_embed' 0 &
15 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/sr/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/sr/full_embed' 0 &
16 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/sr/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/sr/full_embed' 0 &
17 | wait
18 | # b1-5 for lg
19 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/lgunimf/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/lgunimf/full_embed' 0 &
20 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/lgunimf/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/lgunimf/full_embed' 0 &
21 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/lgunimf/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/lgunimf/full_embed' 0 &
22 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/lgunimf/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/lgunimf/full_embed' 0 &
23 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/lgunimf/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/lgunimf/full_embed' 0 &
24 | wait
25 | # b1-5 for hm
26 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/hm/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/hm/full_embed' 0 &
27 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/hm/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/hm/full_embed' 0 &
28 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/hm/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/hm/full_embed' 0 &
29 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/hm/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/hm/full_embed' 0 &
30 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/hm/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/hm/full_embed' 0
31 | wait
32 | # b1-5 for bsc
33 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b1/bsc/metrics.csv' '/bench_test3/input/b1/orig' '/bench_test3/output/b1/bsc/full_embed' 0 &
34 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b2/bsc/metrics.csv' '/bench_test3/input/b2/orig' '/bench_test3/output/b2/bsc/full_embed' 0 &
35 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b3/bsc/metrics.csv' '/bench_test3/input/b3/orig' '/bench_test3/output/b3/bsc/full_embed' 0 &
36 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b4/bsc/metrics.csv' '/bench_test3/input/b4/orig' '/bench_test3/output/b4/bsc/full_embed' 0 &
37 | /usr/bin/Rscript calculate_metrics.R '/bench_test3/output/b5/bsc/metrics.csv' '/bench_test3/input/b5/orig' '/bench_test3/output/b5/bsc/full_embed' 0
--------------------------------------------------------------------------------
/Archive/hubmap_nature/readme.md:
--------------------------------------------------------------------------------
1 | This folder contains the analysis performed related to MaxFuse in the paper **"High resolution single cell maps reveals distinct cell organization and function across different regions of the human intestine"**.
2 |
3 | Note that this set of analyses used the development version of MaxFuse, which is also deposited in this [folder](https://github.com/shuxiaoc/maxfuse/tree/main/Archive/MaxFuse_devo/09302022V).
4 |
5 | Because the MaxFuse paper used similar data and the preprocessing was identical, please refer to the code related to ```codex``` and ```rna``` in these [folders](https://github.com/shuxiaoc/maxfuse/tree/main/Archive/hubmap/code/preparation) for the preprocessing steps.
6 |
7 | The scripts in this folder cover only the running of MaxFuse and the relevant downstream analysis presented in the paper "High resolution single cell maps reveals distinct cell organization and function across different regions of the human intestine".
8 |
--------------------------------------------------------------------------------
/Archive/strong-link/10xe18/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 |
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted random index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET); currently disabled, see the end of this script
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/") # metrics.R is sourced from this hard-coded directory
22 | source("metrics.R")
23 |
24 | # load existing metrics
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # progress messages use Sys.time(): Sys.Date() carries no time of day, so "%c" always printed 00:00:00
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 |             n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 |            n_idx=n_idx, data_idx='y')
34 | #print(sam_x)
35 | #print(sam_y)
36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
37 | #print(metrics)
38 | # calculate Silhouette width
39 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | #print(slt_res)
42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
43 | #print(metrics)
44 | # calculate ARI
45 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
48 |
49 | # calculate LISI
50 | print(paste0(format(Sys.time(), "%c"), ': calculating Local Inverse Simpson’s Index...'))
51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])
53 |
54 | # calculate mixing averaged over clusters
55 | print(paste0(format(Sys.time(), "%c"), ': calculating mixing quality...'))
56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
57 |               n_idx=n_idx)
58 | metrics = metrics %>% add_column(avg_mix=avg_mix)
59 |
60 | # save metrics, because the calculation of kBET is substantially slower.
61 | write_csv(metrics, metrics_fname)
62 | #print(paste0(format(Sys.time(), "%c"), ': nearly done...'))
63 |
64 | #### not calculating kBet here because too slow for this stage
65 | # calculate kBET
66 | #print(paste0(format(Sys.time(), "%c"), ': calculating kBET...'))
67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
68 | #metrics = metrics %>% add_column(kBET=kbet_res)
69 |
70 | #write_csv(metrics, metrics_fname)
71 | #print(paste0(format(Sys.time(), "%c"), ': done!'))
--------------------------------------------------------------------------------
/Archive/strong-link/10xe18/analysis/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # calculate ari and slt f1 scores
3 |
4 | # for mf
5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/mf/full_embed' 0 &
6 |
7 | # for scjoint
8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/scjoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/scjoint/raw_labels/full_embed' 0 &
9 |
10 | # for maestro
11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/maestro/full_embed' 0 &
12 |
13 | # for glue
14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/glue/full_embed' 0 &
15 | wait # do not return until all four background jobs have finished
--------------------------------------------------------------------------------
/Archive/strong-link/10xe18/method_running/glue_e18_prepare_data_h5.R:
--------------------------------------------------------------------------------
1 | ### prepare data for scglue ###
2 | #setwd("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/glue/")
3 | # shell environment setup -- run in the shell before starting R (not R code):
4 | #   module unload python/python-3.6.2
5 | #   module load python/python-3.8.2
6 | #   conda activate scglue2
7 | #   R
8 | library(BiocGenerics, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library")
9 | library(S4Vectors, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library")
10 | library(IRanges, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library")
11 | library(GenomeInfoDb, lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library")
12 |
13 | library(anndata)
14 |
15 | library(Seurat)
16 | library(Signac,lib.loc="/home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2/envs/MAESTRO/lib/R/library")
17 | library(tables)
18 | library(reticulate)
19 |
20 | library(SingleCellExperiment)
21 | library(DropletUtils)
22 | library(scater)
23 | library(ggplot2)
24 |
25 | #### e18 mouse data ####
26 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/data/10x_RNA_ATAC_EmbryonicMouseBrain/")
27 | #setwd("/Users/sijia_work/Dropbox/SingleCellAlignment/data/10x_RNA_ATAC_EmbryonicMouseBrain/")
28 |
29 | e18=readRDS("e18.4.20210917.rds")
30 | table(e18$celltype)
31 |
32 | # raw count matrices of the two modalities
33 | e18mouseRNA = e18@assays[["RNA"]]@counts
34 | e18mouseATAC = e18@assays[["ATAC"]]@counts
35 | # one Seurat object per modality
36 | e18.obj.rna <- CreateSeuratObject(
37 |   counts = e18mouseRNA,
38 |   assay = "RNA"
39 | )
40 | e18.obj.atac <- CreateSeuratObject(
41 |   counts = e18mouseATAC,
42 |   assay = "RNA" # NOTE(review): ATAC counts are stored under an assay named "RNA" -- presumably for the h5ad conversion; confirm
43 | )
44 | # both objects carry the same cell type labels
45 | e18.obj.rna$celltype <- e18$celltype
46 | e18.obj.atac$celltype <- e18$celltype
47 |
48 | e18.obj.rna@meta.data$domain <- "scRNA-seq"
49 | e18.obj.atac@meta.data$domain <- "scATAC-seq"
50 |
51 | # write each object as h5Seurat, then convert it to h5ad
52 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/e18mouse")
53 | library(SeuratDisk)
54 | SaveH5Seurat(e18.obj.rna, filename = "e18mouse_RNA_v2.h5Seurat")
55 | Convert("e18mouse_RNA_v2.h5Seurat", dest = "h5ad")
56 | SaveH5Seurat(e18.obj.atac, filename = "e18mouse_ATAC_v2.h5Seurat")
57 | Convert("e18mouse_ATAC_v2.h5Seurat", dest = "h5ad")
57 |
--------------------------------------------------------------------------------
/Archive/strong-link/10xe18/method_running/scj_add_prep_10xe18mouse.R:
--------------------------------------------------------------------------------
1 | # shell environment setup -- run in the shell before starting R (not R code):
2 | #   qlogin -now no
3 | #   module unload python/python-3.6.2
4 | #   module load python/python-3.8.2
5 | #   conda activate MAESTRO
6 | #   R
7 |
8 | ### prepare scJoint input of h5 file ###
9 |
10 | library(Seurat)
11 | library(Signac)
12 | library(tables)
13 | library(reticulate)
14 |
15 | library(SingleCellExperiment)
16 | library(DropletUtils)
17 | library(scater)
18 | library(ggplot2)
19 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/data/10x_RNA_ATAC_EmbryonicMouseBrain/")
20 |
21 | e18=readRDS("e18.4.20210917.rds")
22 |
23 | table(e18$celltype)
24 |
25 | e18mouseRNA = e18@assays[["RNA"]]@counts
26 |
27 | e18mouseATAC = e18@assays[["ATAC"]]@counts
28 |
29 | DefaultAssay(e18)='ATAC'
30 | Annotation(e18)
31 | frags=UpdatePath(Fragments(e18)[[1]], new.path = 'e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz')
32 | Fragments(e18)=NULL
33 | e18=SetAssayData(e18, slot = "fragments", new.data = frags)
34 |
35 | e18gene.activities <- GeneActivity(e18)
36 |
37 | library(Seurat)
38 | library(Signac)
39 | library(EnsDb.Hsapiens.v86)
40 | library(GenomeInfoDb)
41 | library(dplyr)
42 | library(ggplot2)
43 |
44 |
45 | e18[["ACTIVITY"]] <- CreateAssayObject(counts = e18gene.activities)
46 | DefaultAssay(e18) <- "ACTIVITY"
47 | # SCTransform normalization and PCA dimensional reduction on gene activity
48 | e18<- SCTransform(e18, assay="ACTIVITY", verbose = FALSE, new.assay.name = 'SCT.ACTIVITY') %>% RunPCA(verbose=F, reduction.name = 'pca.activity') %>% RunUMAP(verbose=F, dims = 1:20, reduction='pca.activity', reduction.name='umap.activity')
49 |
50 |
51 | e18$celltype -> e18_celltype
52 |
53 | e18.obj.rna <- CreateSeuratObject(
54 | counts = e18mouseRNA,
55 | assay = "RNA"
56 | )
57 |
58 | # Only keep common genes between two dataset
59 | common_genes <- intersect(rownames(e18.obj.rna),
60 | rownames(e18gene.activities))
61 | length(common_genes)
62 |
63 | e18.obj.activity <- CreateSeuratObject(
64 | counts = e18gene.activities,
65 | assay = "RNA"
66 | )
67 |
68 | ### create logcounts ###
69 | activity.sce <- as.SingleCellExperiment(e18.obj.activity)
70 | rna.sce <- as.SingleCellExperiment(e18.obj.rna)
71 |
72 | # Extract the logcounts data from sce object
73 | exprs_atac <- logcounts(activity.sce[common_genes, ])
74 | exprs_rna <- logcounts(rna.sce[common_genes, ])
75 |
76 | source("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/data_to_h5.R")
77 | #source("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/data_to_h5.R")
78 | write_h5_scJoint(exprs_list = list(rna = exprs_rna,
79 | atac = exprs_atac),
80 | h5file_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/exprs_10xe18_rna.h5",
81 | "/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/exprs_10xe18_atac.h5"))
82 |
83 | write_csv_scJoint(cellType_list = list(names(e18_celltype)),
84 | csv_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/cellname_cellType_10xe18.csv"))
85 | write_csv_scJoint(cellType_list = list(e18_celltype),
86 | csv_list = c("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scJoint/e18mouse/cellType_10xe18.csv"))
87 |
88 |
89 |
90 | ### final output ###
91 | e18_predict_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/e18_predictlabel_celltype.csv")
92 | e18_rna_pred <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_rna_predictions.txt",header=F,sep=" ")
93 | e18_atac_pred_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_knn_predictions.txt",header=F)
94 | e18_atac_pred <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_predictions.txt",header=F,sep=" ")
95 | e18_idx_label <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/label_to_idx.txt",header=F,sep=" ")
96 |
97 | e18_atac_embed <-read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_atac_embeddings.txt",header=F,sep=" ")
98 | e18_rna_embed <- read.csv("/Users/sijia_work/Documents/nancy_projects/scATAC_RNA/maestro/scJoint/e18mouse/barcode_labels/exprs_10xe18_rna_embeddings.txt",header=F,sep=" ")
99 |
--------------------------------------------------------------------------------
/Archive/strong-link/10xe18/prep_e18.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "prep_e18"
3 | output: html_document
4 | ---
5 |
6 | code to prep 10x e18 (mouse embryonic brain scATAC/scRNA multiome dataset from 10x genomics):
7 | rna information from "e18.4.20210917.rds"
8 | atac information from "e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz"
9 |
10 | Contact @Nancy Zhang and @Sijia Huang regarding the source of these two files or any preprocessing related to the original data.
11 |
12 | ```{r}
13 | # calculate gene activity scores with Signac
14 | # gene activity score is used by MaxFuse. Other methods directly use Fragments
15 |
16 | library(Signac)
17 |
18 | e18 = readRDS("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/e18.4.20210917.rds")
19 | DefaultAssay(e18)='ATAC'
20 | Annotation(e18)
21 | frags=UpdatePath(Fragments(e18)[[1]],
22 | new.path = '/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/e18_mouse_brain_fresh_5k_atac_fragments.tsv.gz')
23 | Fragments(e18)=NULL
24 | e18=SetAssayData(e18, slot = "fragments", new.data = frags)
25 | gene.activities <- GeneActivity(e18)
26 |
27 | # Csnk2a1 duplication problem: rows 14703 and 14704 are summed, both rows are removed, and the sum is appended and named 'Csnk2a1' at row 21977. NOTE(review): all three indices are hard-coded for this dataset -- verify before reuse
28 | temp = gene.activities[14703,] + gene.activities[14704,]
29 | gene.activities = gene.activities[-c(14703, 14704),]
30 | gene.activities = rbind(gene.activities,temp)
31 | rownames(gene.activities)[21977] = 'Csnk2a1'
32 | ```
33 |
34 |
35 | ```{r}
36 | ## okay start saving out
37 | e18_rna_sct = as.data.frame(t(e18@assays$SCT@data))
38 | e18_rna_sct_names = colnames(e18_rna_sct)
39 |
40 | library(Matrix)
41 | ## rna
42 | e18_rna_sct = as(as.matrix(e18_rna_sct), "dgCMatrix")
43 | writeMM(e18_rna_sct, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna.txt")
44 | write.csv(data.frame(names = e18_rna_sct_names), "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna_names.csv")
45 |
46 | ## atac_GAS
47 | e18_gas = as.data.frame(t(gene.activities))
48 | e18_gas_names = colnames(e18_gas)
49 | e18_gas = as(as.matrix(e18_gas), "dgCMatrix")
50 | writeMM(e18_gas, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_GAS.txt")
51 | write.csv(data.frame(names = e18_gas_names), "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_GAS_names.csv")
52 |
53 | ## atac_lsi
54 | e18_lsi = e18@reductions$lsi@cell.embeddings[,c(2:50)]
55 | write.csv(e18_lsi, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_LSI49.csv")
56 |
57 | ## meta
58 | meta_data = e18@meta.data
59 | write.csv(meta_data, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_meta.csv")
60 | ```
61 |
62 |
63 | ############## produce RNA and ATAC embeddings for slt and ari calculation
64 | ## ATAC embedding can directly use the LSI scores; RNA uses the PCA embedding; both 15 dimensions
65 |
66 | ```{r}
67 | ## rna
68 | rna = readMM("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna.txt")
69 | rna = as.matrix(rna)
70 | rna_names = read.csv('/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_rna_names.csv') # file name as written by the saving chunk of this notebook
71 | colnames(rna) = rna_names$names
72 | rownames(rna) = paste0("cell",c(1:nrow(rna)))
73 |
74 | library(Seurat)
75 | temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna")
76 | temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm
77 | temp_obj1 = ScaleData(temp_obj1)
78 | temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000)
79 | temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1))
80 | meta = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_meta.csv")
81 | pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)]) # first 15 principal components
82 | pca$label = meta$annotation
83 | write.csv(pca, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig_x.csv", row.names = F)
84 |
85 | # lsi
86 | lsi = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/10x_e18_LSI49.csv", row.names = 1) # first csv column holds the cell names written by write.csv, not an LSI component
87 | lsi_orig = lsi[,c(1:15)] # 15 LSI components, matching the 15 PCs used for rna
88 | lsi_orig$label = meta$annotation
89 | write.csv(lsi_orig, "/home/bkzhu/super_mario/atac_bench_nrz/10x_e18/data/orig_y.csv", row.names = F)
90 | ```
91 |
92 |
93 |
--------------------------------------------------------------------------------
/Archive/strong-link/10xpbmc/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 |
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted random index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET); currently disabled, see the end of this script
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/") # metrics.R is sourced from this hard-coded directory
22 | source("metrics.R")
23 |
24 | # load existing metrics
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # progress messages use Sys.time(): Sys.Date() carries no time of day, so "%c" always printed 00:00:00
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 |             n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 |            n_idx=n_idx, data_idx='y')
34 | #print(sam_x)
35 | #print(sam_y)
36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
37 | #print(metrics)
38 | # calculate Silhouette width
39 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | #print(slt_res)
42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
43 | #print(metrics)
44 | # calculate ARI
45 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
48 |
49 | # calculate LISI
50 | print(paste0(format(Sys.time(), "%c"), ': calculating Local Inverse Simpson’s Index...'))
51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])
53 |
54 | # calculate mixing averaged over clusters
55 | print(paste0(format(Sys.time(), "%c"), ': calculating mixing quality...'))
56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
57 |               n_idx=n_idx)
58 | metrics = metrics %>% add_column(avg_mix=avg_mix)
59 |
60 | # save metrics, because the calculation of kBET is substantially slower.
61 | write_csv(metrics, metrics_fname)
62 | #print(paste0(format(Sys.time(), "%c"), ': nearly done...'))
63 |
64 | #### not calculating kBet here because too slow for this stage
65 | # calculate kBET
66 | #print(paste0(format(Sys.time(), "%c"), ': calculating kBET...'))
67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
68 | #metrics = metrics %>% add_column(kBET=kbet_res)
69 |
70 | #write_csv(metrics, metrics_fname)
71 | #print(paste0(format(Sys.time(), "%c"), ': done!'))
--------------------------------------------------------------------------------
/Archive/strong-link/10xpbmc/analysis/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # calculate ari and slt f1 scores
3 |
4 | # for mf
5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/mf/full_embed' 0 &
6 |
7 | # for scjoint
8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/scJoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/scJoint/raw_labels/full_embed' 0 &
9 |
10 | # for maestro
11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/maestro/full_embed' 0 &
12 |
13 | # for glue
14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/glue/full_embed' 0 &
15 | wait # do not return until all four background jobs have finished
--------------------------------------------------------------------------------
/Archive/strong-link/10xpbmc/method_running/glue_pbmc_prepare_data_h5.R:
--------------------------------------------------------------------------------
1 | # Shell setup -- run these in the terminal before starting R (they are not R code):
2 | #   module unload python/python-3.6.2; module load python/python-3.8.2
3 | #   conda activate scglue2
4 | #   R
5 |
6 | library(anndata)
7 | #### pbmc data ####
8 | library(Seurat)
9 | library(Signac)
10 | library(EnsDb.Hsapiens.v86)
11 | library(GenomeInfoDb)
12 | library(dplyr)
13 | library(ggplot2)
14 |
15 |
16 | setwd("/home/mnt/nzh/nzhanglab/project/SingleCellAlignment/data/10x_RNA_ATAC_PBMC")
17 | load("pbmc_chromvar_annotated.rda") # presumably supplies the `pbmc` object used below -- confirm
18 | pbmc.rna = pbmc@assays[["RNA"]]@counts
19 | pbmc.obj.rna <- CreateSeuratObject(
20 | counts = pbmc.rna,
21 | assay = "RNA"
22 | )
23 |
24 | pbmc.atac = pbmc@assays[["ATAC"]]@counts
25 | pbmc.obj.atac <- CreateSeuratObject(
26 | counts = pbmc.atac,
27 | assay = "RNA" # NOTE(review): ATAC counts are stored under an assay named "RNA"; looks deliberate for the h5ad export -- confirm
28 | )
29 |
30 | pbmc_celltype <- pbmc$citeseq.celltype
31 |
32 | # both single-modality objects carry the same cell-type labels
33 | pbmc.obj.rna$celltype <- pbmc_celltype
34 | pbmc.obj.atac$celltype <- pbmc_celltype
35 | # tag each object with its modality
36 | pbmc.obj.rna@meta.data$domain <- "scRNA-seq"
37 | pbmc.obj.atac@meta.data$domain <- "scATAC-seq"
38 |
39 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/pbmc")
40 | library(SeuratDisk)
41 | # write each object as h5Seurat, then convert that file to h5ad
42 | SaveH5Seurat(pbmc.obj.rna, filename = "pbmc_RNA_v2.h5Seurat")
43 | Convert("pbmc_RNA_v2.h5Seurat", dest = "h5ad")
44 | SaveH5Seurat(pbmc.obj.atac, filename = "pbmc_ATAC_v2.h5Seurat")
45 | Convert("pbmc_ATAC_v2.h5Seurat", dest = "h5ad")
44 |
--------------------------------------------------------------------------------
/Archive/strong-link/10xpbmc/prep_pbmc.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "prep_pbmc"
3 | output: html_document
4 | ---
5 |
6 | code to prep 10x pbmc (pbmc scATAC/scRNA multiome dataset from 10x genomics):
7 | rna information from "pbmc_chromvar_annotated.rda"
8 | atac information from "pbmc_chromvar_annotated.rda" # gene activity scores pre-calculated by Signac are already available in the Seurat object
9 |
10 | @Nancy Zhang and @Sijia Huang for source of this file or any preprocessing related to the original data.
11 |
12 |
13 | ```{r}
14 | load("/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/pbmc_chromvar_annotated.rda") # presumably supplies the `pbmc` object used below -- confirm
15 |
16 | DefaultAssay(object = pbmc) <- "RNA"
17 | # re-run SCTransform on the RNA assay (zmm requirement), regressing out percent.mt
18 | pbmc <- PercentageFeatureSet(pbmc, pattern = "^MT-", col.name = "percent.mt") # percentage of features matching "^MT-", stored as percent.mt
19 | pbmc[["SCT"]] <- NULL # clear the existing SCT entry before recomputing it
20 | pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) # NOTE: original author recorded "reached iteration limit" at this step
21 | ```
22 |
23 | ```{r}
24 | ## export the prepared matrices, feature names, LSI scores and metadata to disk
25 | pbmc_rna_sct = as.data.frame(t(pbmc@assays$SCT@data)) # transpose of the SCT data slot (presumably cells x genes after t() -- confirm)
26 | pbmc_rna_sct_names = colnames(pbmc_rna_sct)
27 |
28 | library(Matrix)
29 | ## rna: SCT matrix in MatrixMarket format plus its column names
30 | pbmc_rna_sct = as(as.matrix(pbmc_rna_sct), "dgCMatrix")
31 | writeMM(pbmc_rna_sct, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_rna.txt")
32 | write.csv(data.frame(names = pbmc_rna_sct_names), "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_rna_names.csv")
33 | ## atac_GAS: same treatment for the ACTIVITY assay
34 | pbmc_gas = as.data.frame(t(pbmc@assays$ACTIVITY@data))
35 | pbmc_gas_names = colnames(pbmc_gas)
36 | pbmc_gas = as(as.matrix(pbmc_gas), "dgCMatrix")
37 | writeMM(pbmc_gas, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_GAS.txt")
38 | write.csv(data.frame(names = pbmc_gas_names), "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_GAS_names.csv")
39 | ## atac_lsi: keep columns 2-50 of the LSI embedding (49 columns; the first is dropped)
40 | pbmc_lsi = pbmc@reductions$lsi@cell.embeddings[,c(2:50)]
41 | write.csv(pbmc_lsi, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_LSI49.csv") # row names are written as the first csv column
42 | ## meta: per-cell metadata table
43 | meta_data = pbmc@meta.data
44 | write.csv(meta_data, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_meta.csv")
45 | ```
46 |
47 |
48 | ############## produce RNA and ATAC embedding for slt and ari calculation
49 | ## ATAC embedding can directly use the LSI scores; RNA uses the PCA embedding; both 15 dimensions
50 |
51 | ```{r}
52 | ## rna: rebuild the SCT matrix written by the export chunk and reduce it to 15 PCs
53 | rna = readMM("/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_rna.txt")
54 | rna = as.matrix(rna)
55 | rna_names = read.csv('/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_rna_names.csv') # same file name the export chunk writes
56 | colnames(rna) = rna_names$names
57 | rownames(rna) = paste0("cell",c(1:nrow(rna)))
58 |
59 | library(Seurat)
60 | temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna")
61 | temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm
62 | temp_obj1 = ScaleData(temp_obj1)
63 | temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000)
64 | temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1)) # PCA on all features, not only the variable ones
65 | meta = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_meta.csv")
66 | pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)])
67 | pca$label = meta$annotation
68 | write.csv(pca, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig_x.csv", row.names = F)
69 |
70 | # lsi: the csv was written with row names, so read them back as row names;
71 | lsi = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/10x_pbmc_LSI49.csv", row.names = 1) # otherwise column 1 is the barcode and only 14 LSI dims are kept
72 | lsi_orig = lsi[,c(1:15)]
73 | lsi_orig$label = meta$annotation
74 | write.csv(lsi_orig, "/home/bkzhu/super_mario/atac_bench_nrz/10xpbmc/data/orig_y.csv", row.names = F)
75 | ```
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 | if (length(args) < 4) stop("usage: calculate_metrics.R <metrics_fname> <orig_fname> <embed_fname> <n_idx>")
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted random index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/")
22 | source("metrics.R")
23 |
24 | # load existing metrics; the columns computed below are appended to this table
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # progress messages use Sys.time(): Sys.Date() has no time of day, so "%c" always printed 00:00:00
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 | n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 | n_idx=n_idx, data_idx='y')
34 | #print(sam_x)
35 | #print(sam_y)
36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
37 | #print(metrics)
38 | # calculate Silhouette width
39 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | #print(slt_res)
42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
43 | #print(metrics)
44 | # calculate ARI
45 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
48 |
49 | # calculate LISI
50 | print(paste0(format(Sys.time(), "%c"), ': calculating Local Inverse Simpson’s Index...'))
51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])
53 |
54 | # calculate mixing averaged over clusters
55 | print(paste0(format(Sys.time(), "%c"), ': calculating mixing quality...'))
56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
57 | n_idx=n_idx)
58 | metrics = metrics %>% add_column(avg_mix=avg_mix)
59 |
60 | # save metrics, because the calculation of kBET is substantially slower.
61 | write_csv(metrics, metrics_fname)
62 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...'))
63 |
64 | #### not calculating kBet here because too slow for this stage
65 | # calculate kBET
66 | #print(paste0(format(Sys.Date(), "%c"), ': calculating kBET...'))
67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
68 | #metrics = metrics %>% add_column(kBET=kbet_res)
69 |
70 | #write_csv(metrics, metrics_fname)
71 | #print(paste0(format(Sys.Date(), "%c"), ': done!'))
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/analysis/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # calculate ari and slt f1 scores; each method is scored by its own Rscript job
3 |
4 | # for mf
5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/mf/full_embed' 0 &
6 |
7 | # for scjoint
8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/scjoint/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/scjoint/raw_labels/full_embed' 0 &
9 |
10 | # for maestro
11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/maestro/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/maestro/full_embed' 0 &
12 |
13 | # for glue (runs in the foreground)
14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/glue/full_embed' 0
15 |
16 | # the first three jobs run in the background; block until they have all exited
17 | # so that the script does not return while metrics are still being written
18 | wait
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/method_running/glue_greenleaf_prepare_data_h5.R:
--------------------------------------------------------------------------------
1 | # Shell setup -- run these in the terminal before starting R (they are not R code):
2 | #   module unload python/python-3.6.2; module load python/python-3.8.2
3 | #   conda activate scglue2
4 | #   R
5 |
6 | library(anndata)
7 | #### greenleaf data ####
8 | library(Seurat)
9 | library(Signac)
10 | library(EnsDb.Hsapiens.v86)
11 | library(GenomeInfoDb)
12 | library(dplyr)
13 | library(ggplot2)
14 |
15 |
16 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/data/10x_RNA_ATAC_GreenleafCortical")
17 |
18 | load("Writeup14n_10x_greenleaf.RData") # presumably supplies the `greenleaf` object used below -- confirm
19 |
20 | greenleaf.rna = greenleaf@assays$RNA@counts
21 | greenleaf.atac = greenleaf@assays$ATAC@counts
22 |
23 | greenleaf.obj.rna <- CreateSeuratObject(
24 | counts = greenleaf.rna,
25 | assay = "RNA"
26 | )
27 |
28 | greenleaf.obj.atac <- CreateSeuratObject(
29 | counts = greenleaf.atac,
30 | assay = "RNA" # NOTE(review): ATAC counts are stored under an assay named "RNA"; looks deliberate for the h5ad export -- confirm
31 | )
32 |
33 | greenleaf_celltype <- greenleaf$celltype
34 | # both single-modality objects carry the same cell-type labels
35 | greenleaf.obj.rna$celltype <- greenleaf_celltype
36 | greenleaf.obj.atac$celltype <- greenleaf_celltype
37 | # tag each object with its modality
38 | greenleaf.obj.rna@meta.data$domain <- "scRNA-seq"
39 | greenleaf.obj.atac@meta.data$domain <- "scATAC-seq"
40 |
41 |
42 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/greenleaf")
43 | library(SeuratDisk)
44 | SaveH5Seurat(greenleaf.obj.rna, filename = "greenleaf_RNA_v2.h5Seurat")
45 | Convert("greenleaf_RNA_v2.h5Seurat", dest = "h5ad")
46 | SaveH5Seurat(greenleaf.obj.atac, filename = "greenleaf_ATAC_v2.h5Seurat")
47 | Convert("greenleaf_ATAC_v2.h5Seurat", dest = "h5ad")
48 |
--------------------------------------------------------------------------------
/Archive/strong-link/cortical/prep_cortical.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "prep_cortical"
3 | output: html_document
4 | ---
5 |
6 | code to prep the human cerebral cortical dataset (scATAC/scRNA multiome dataset from https://pubmed.ncbi.nlm.nih.gov/34390642/ by the Greenleaf lab):
7 | rna information from "Writeup14n_10x_greenleaf.RData"
8 | atac information from "Writeup14n_10x_greenleaf.RData" # gene activity scores pre-calculated by Signac are already available in the Seurat object
9 |
10 | @Nancy Zhang and @Sijia Huang for source of this file.
11 |
12 | This processing code written by Zongming Ma
13 |
14 | ```{r}
15 | load("Writeup14n_10x_greenleaf.RData")
16 |
17 | dat1=t(greenleaf@assays$SCT@data)
18 | dat2=t(greenleaf@assays$geneActivity@data)
19 | names1= colnames(dat1)
20 | names2= colnames(dat2)
21 |
22 | # strip the "ATAC-" prefix from the gene-activity feature names so they can be
23 | # matched against the SCT names; sub() leaves a name without the prefix untouched,
24 | # whereas splitting and taking every second piece would misalign all later names
25 | names2 = sub("^ATAC-", "", names2)
26 |
27 | sum(names1 %in% names2)
28 | sum(names2 %in% names1)
29 |
30 | # make sure column names match
31 | mm = match(names1, names2)
32 | dat1=dat1[,!is.na(mm)]
33 | dat2=dat2[,mm[!is.na(mm)]]
34 |
35 | dat0=t(greenleaf@assays$RNA@data)
36 | names0 = colnames(dat0)
37 | names1 = colnames(dat1)
38 | sum(names0 %in% names1)
39 | sum(names1 %in% names0)
40 | mm0 = match(names1, names0)
41 | dat0=dat0[,mm0[!is.na(mm0)]]
42 |
43 | dim(dat0)
44 | # dim(dat1)
45 | dim(dat2)
46 | # make sure row names match
47 | # rnacells=rownames(dat1)
48 | rnacells=rownames(dat0)
49 | ataccells=rownames(dat2)
50 | plot(match(rnacells, ataccells))
51 |
52 | # write out the metadata table (no row names).
53 | write.table(greenleaf@meta.data, file="greenleaf_cortical_meta.csv", sep=",", col.names=TRUE, row.names=FALSE)
54 | # write out names of genes for RNA.
55 | rna_names = as.data.frame(colnames(dat1))
56 | colnames(rna_names)="names"
57 | write.table(rna_names, file="greenleaf_cortical_rna_names.csv", sep=",", col.names=TRUE, row.names=TRUE)
58 |
59 | # write out the sparse matrices.
60 | writeMM(dat0, "greenleaf_cortical_RNAcount_data.mtx")
61 | writeMM(dat1, "greenleaf_cortical_SCT_data.mtx")
62 | writeMM(dat2, "greenleaf_cortical_GENEACTIVITY_data.mtx")
63 |
64 | # calculate additional LSI scores
65 | DefaultAssay(greenleaf) <- "ATAC"
66 | greenleaf <- RunTFIDF(greenleaf)
67 | greenleaf <- FindTopFeatures(greenleaf, min.cutoff = 'q0')
68 | greenleaf <- RunSVD(greenleaf, n = 200)
69 |
70 | dat3 = greenleaf@reductions$lsi@cell.embeddings
71 | dim(dat3)
72 | peakcells = rownames(dat3)
73 | plot(match(rnacells, peakcells)) # make sure row names match.
74 | write.table(dat3[,-1], file="greenleaf_cortical_peak_lsi.csv", sep=",", col.names=TRUE, row.names=TRUE) # first LSI column is dropped
75 | ```
76 |
77 |
78 | ############## produce RNA and ATAC embedding for slt and ari calculation
79 | ## ATAC embedding can directly use the LSI scores; RNA uses the PCA embedding; both 15 dimensions
80 |
81 | ```{r}
82 | ## rna: rebuild the exported RNA matrix and reduce it to 15 PCs
83 | rna = readMM("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_RNAcount_data.mtx")
84 | rna = as.matrix(rna)
85 | rna_names = read.csv('/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_rna_names.csv')
86 | colnames(rna) = rna_names$names
87 | rownames(rna) = paste0("cell",c(1:nrow(rna)))
88 |
89 | library(Seurat)
90 | temp_obj1 = CreateSeuratObject(counts=t(rna),assay="rna")
91 | temp_obj1 = SetAssayData(object = temp_obj1, slot = "data", new.data = t(rna), assay="rna") # input data already sctnorm -- NOTE(review): the file read above is the RNAcount matrix, not the SCT one; confirm which is intended
92 | temp_obj1 = ScaleData(temp_obj1)
93 | temp_obj1 <- FindVariableFeatures(temp_obj1, selection.method = "vst", nfeatures = 2000)
94 | temp_obj1 = RunPCA(temp_obj1, features = rownames(temp_obj1)) # PCA on all features, not only the variable ones
95 | meta = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_meta.csv")
96 | pca = as.data.frame(temp_obj1@reductions$pca@cell.embeddings[,c(1:15)]) # first 15 PCs
97 | pca$label = meta$annotation
98 | write.csv(pca, "/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig_x.csv", row.names = F)
99 |
100 | # lsi: first 15 columns of the exported LSI table, labelled like the RNA embedding
101 | lsi = read.csv("/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/greenleaf_cortical_peak_lsi.csv")
102 | lsi_orig = lsi[,c(1:15)]
103 | lsi_orig$label = meta$annotation
104 | write.csv(lsi_orig, "/home/bkzhu/super_mario/atac_bench_nrz/greanleaf_cortical/data/orig_y.csv", row.names = F)
105 | ```
106 |
107 |
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/Archive/strong-link/retina/analysis/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 | if (length(args) < 4) stop("usage: calculate_metrics.R <metrics_fname> <orig_fname> <embed_fname> <n_idx>")
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted random index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
21 | setwd("/home/bkzhu/super_mario/abseq/scripts_benchmark/")
22 | source("metrics.R")
23 |
24 | # load existing metrics; the columns computed below are appended to this table
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # progress messages use Sys.time(): Sys.Date() has no time of day, so "%c" always printed 00:00:00
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 | n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 | n_idx=n_idx, data_idx='y')
34 | #print(sam_x)
35 | #print(sam_y)
36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
37 | #print(metrics)
38 | # calculate Silhouette width
39 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | #print(slt_res)
42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
43 | #print(metrics)
44 | # calculate ARI
45 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
48 |
49 | # calculate LISI
50 | print(paste0(format(Sys.time(), "%c"), ': calculating Local Inverse Simpson’s Index...'))
51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])
53 |
54 | # calculate mixing averaged over clusters
55 | print(paste0(format(Sys.time(), "%c"), ': calculating mixing quality...'))
56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
57 | n_idx=n_idx)
58 | metrics = metrics %>% add_column(avg_mix=avg_mix)
59 |
60 | # save metrics, because the calculation of kBET is substantially slower.
61 | write_csv(metrics, metrics_fname)
62 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...'))
63 |
64 | #### not calculating kBet here because too slow for this stage
65 | # calculate kBET
66 | #print(paste0(format(Sys.Date(), "%c"), ': calculating kBET...'))
67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
68 | #metrics = metrics %>% add_column(kBET=kbet_res)
69 |
70 | #write_csv(metrics, metrics_fname)
71 | #print(paste0(format(Sys.Date(), "%c"), ': done!'))
--------------------------------------------------------------------------------
/Archive/strong-link/retina/analysis/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # calculate ari and slt f1 scores; each method is scored by its own Rscript job
3 |
4 | # for mf
5 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/mf/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/mf/full_embed' 0 &
6 |
7 | # for scjoint
8 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/scj/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/scj/full_embed' 0 &
9 |
10 | # for maestro
11 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/ms/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/ms/full_embed' 0 &
12 |
13 | # for glue (runs in the foreground)
14 | /usr/bin/Rscript calculate_metrics.R '/home/bkzhu/super_mario/atac_bench_nrz/retina/glue/metrics.csv' '/home/bkzhu/super_mario/atac_bench_nrz/retina/data/orig' '/home/bkzhu/super_mario/atac_bench_nrz/retina/glue/full_embed' 0
15 |
16 | # the first three jobs run in the background; block until they have all exited
17 | # so that the script does not return while metrics are still being written
18 | wait
--------------------------------------------------------------------------------
/Archive/strong-link/retina/method_running/retina_prepare_data_h5.R:
--------------------------------------------------------------------------------
1 | # Shell setup -- run these in the terminal before starting R (they are not R code):
2 | #   module unload python/python-3.6.2; module load python/python-3.8.2
3 | #   conda activate /home/mnt/nzh/nzhanglab/project/shuang/miniconda3/envs/scglue2
4 | #   R
5 |
6 | #### retina data ####
7 | library(Seurat)
8 | library(Signac)
9 | library(GenomeInfoDb)
10 | library(dplyr)
11 | library(ggplot2)
12 |
13 |
14 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/data/Retina/")
15 | retina<-readRDS("data/retina_peak.rds")
16 |
17 | meta <- read.csv("data/meta20k.csv")
18 | colnames(meta)[1]<-c("barcode")
19 | retina.rna = retina@assays$RNA@counts
20 |
21 | meta1 <- meta[match(colnames(retina.rna), meta$barcode),]
22 |
23 | meta1$annotation ->retina_celltype # NOTE: overwritten below by the labels from meta_20k.csv
24 |
25 | meta_subset <- read.csv("data/meta_20k.csv") # NOTE(review): differs from "meta20k.csv" above only by an underscore -- confirm both files are intended
26 | colnames(meta_subset)[1]<-c("barcode")
27 | subset_retina.rna <- retina.rna[,meta_subset$barcode] # keep only the barcodes listed in meta_subset
28 | meta_subset$annotation ->retina_celltype
29 |
30 |
31 | retina.obj.rna <- CreateSeuratObject(
32 | counts = subset_retina.rna,
33 | assay = "RNA"
34 | )
35 | retina.obj.rna$celltype <- retina_celltype
36 |
37 |
38 | retina.atac = retina@assays$peak@counts
39 | subset_retina.atac <- retina.atac[,meta_subset$barcode]
40 |
41 |
42 | retina.obj.atac <- CreateSeuratObject(
43 | counts = subset_retina.atac,
44 | assay = "RNA" # NOTE(review): peak counts are stored under an assay named "RNA"; looks deliberate for the h5ad export -- confirm
45 | )
46 |
47 | retina.obj.atac$celltype <- retina_celltype
48 | # tag each object with its modality
49 | retina.obj.rna@meta.data$domain <- "scRNA-seq"
50 | retina.obj.atac@meta.data$domain <- "scATAC-seq"
51 |
52 |
53 | setwd("/home/mnt/nzh/nzhanglab/project/shuang/scATAC/comparison_methods/scglue/retina")
54 | library(SeuratDisk)
55 | SaveH5Seurat(retina.obj.rna, filename = "retina_RNA.h5Seurat")
56 | Convert("retina_RNA.h5Seurat", dest = "h5ad")
57 | SaveH5Seurat(retina.obj.atac, filename = "retina_ATAC.h5Seurat")
58 | Convert("retina_ATAC.h5Seurat", dest = "h5ad")
59 |
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | args = commandArgs(trailingOnly=TRUE)
3 | if (length(args) < 4) stop("usage: calculate_metrics.R <metrics_fname> <orig_fname> <embed_fname> <n_idx>")
4 | metrics_fname = args[1]
5 | orig_fname = args[2]
6 | embed_fname = args[3]
7 | n_idx = as.integer(args[4])
8 |
9 | # Compute the following metrics:
10 | # - sam_x: structure alignment metric for x data (the larger, the better)
11 | # - sam_y: structure alignment metric for y data (the larger, the better)
12 | # - slt_mix: mixing via Silhouette width (the larger, the better)
13 | # - slt_clust: quality of embeddings for clustering via Silhouette width (the larger, the better)
14 | # - slt_f1: an integrated metric using both slt_mix and slt_clust (the larger, the better)
15 | # - ari_mix: mixing via adjusted random index (the larger, the better)
16 | # - ari_clust: quality of embeddings for clustering via adjusted random index (the larger, the better)
17 | # - lisi_mix: mixing via Local Inverse Simpson’s Index (LISI) (the larger, the better)
18 | # - lisi_clust: quality of embeddings for clustering via LISI (the larger, the better)
19 | # - kbet: mixing via k-nearest neighbour batch effect test (kBET) (the larger, the better)
20 | # - avg_mix: mixing metric via two sample test, averaged over all clusters (the larger, the better)
21 | setwd("./")
22 | source("metrics.R")
23 |
24 | # load existing metrics; the columns computed below are appended to this table
25 | metrics = read_csv(metrics_fname, col_types=cols())
26 |
27 | # progress messages use Sys.time(): Sys.Date() has no time of day, so "%c" always printed 00:00:00
28 | # calculate structure alignment metrics
29 | print(paste0(format(Sys.time(), "%c"), ': calculating structure alignment metrics...'))
30 | sam_x = sam(orig_fname=orig_fname, embed_fname=embed_fname,
31 | n_idx=n_idx, data_idx='x')
32 | sam_y= sam(orig_fname=orig_fname, embed_fname=embed_fname,
33 | n_idx=n_idx, data_idx='y')
34 | #print(sam_x)
35 | #print(sam_y)
36 | metrics = metrics %>% add_column(sam_x=sam_x) %>% add_column(sam_y=sam_y)
37 | #print(metrics)
38 | # calculate Silhouette width
39 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
40 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | #print(slt_res)
42 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
43 | #print(metrics)
44 | # calculate ARI
45 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
46 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
47 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
48 |
49 | # calculate LISI
50 | print(paste0(format(Sys.time(), "%c"), ': calculating Local Inverse Simpson’s Index...'))
51 | lisi_res = lisi(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
52 | metrics = metrics %>% add_column(lisi_mix=lisi_res[, 1]) %>% add_column(lisi_clust=lisi_res[, 2])
53 |
54 | # calculate mixing averaged over clusters
55 | print(paste0(format(Sys.time(), "%c"), ': calculating mixing quality...'))
56 | avg_mix = mix(orig_fname=orig_fname, embed_fname=embed_fname,
57 | n_idx=n_idx)
58 | metrics = metrics %>% add_column(avg_mix=avg_mix)
59 |
60 | # save metrics, because the calculation of kBET is substantially slower.
61 | write_csv(metrics, metrics_fname)
62 | #print(paste0(format(Sys.Date(), "%c"), ': nearly done...'))
63 |
64 | #### not calculating kBet here because too slow for this stage
65 | # calculate kBET
66 | #print(paste0(format(Sys.Date(), "%c"), ': calculating kBET...'))
67 | #kbet_res = kbet(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
68 | #metrics = metrics %>% add_column(kBET=kbet_res)
69 |
70 | #write_csv(metrics, metrics_fname)
71 | #print(paste0(format(Sys.Date(), "%c"), ': done!'))
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/harm_cite.R:
--------------------------------------------------------------------------------
1 | # script for Harmony integration of ATAC gene-activity scores (x) with protein/ADT data (y)
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # input/output locations and number of embedding dimensions to keep
7 | out_root = "/ICICLE/output/"
8 | in_root = "/ICICLE/data/"
9 | out_indx = 15
10 |
11 | out_dir =paste0(out_root,"hm/")
12 | in_dir = in_root
13 | dir.create(out_root)
14 | dir.create(out_dir)
15 | # read
16 |
17 | protein = read.csv(paste0(in_dir,"adt.csv"))
18 | colnames(protein) = gsub('\\.','-', colnames(protein))
19 | colnames(protein) = gsub('-$','', colnames(protein))
20 | protein$cell_barcode <- NULL
21 | protein$total <- NULL
22 |
23 | meta = read.csv(paste0(in_dir,"atac_meta.csv"))
24 |
25 | atacactivity = readMM(paste0(in_dir,"genescore_tea.txt"))
26 | atacactivity = as.matrix(atacactivity)
27 | gas_names = read.csv(paste0(in_dir ,'genescore_names_tea.csv'))
28 | colnames(atacactivity) = gas_names$names
29 |
30 | # map protein names to gene names using the correspondence table
31 | correspondence = read.csv('/conversion_v12.csv')
32 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
33 | rna_list = c()
34 | protein_list = c()
35 | for (i in c(1:dim(correspondence)[1])){
36 | protein_n = as.character(correspondence[i,1])
37 | rna_n = as.character(correspondence[i,2])
38 | if (grepl("Ignore", rna_n, fixed = TRUE)){
39 | next
40 | }
41 | rna_n = strsplit(rna_n, '/')[[1]]
42 | for(r in rna_n){
43 | if (r %in% gas_names$names){
44 | rna_list = c(rna_list, r)
45 | protein_list = c(protein_list, protein_n)
46 | }
47 | }
48 | }
49 |
50 | act.shared = as.matrix(atacactivity[,rna_list[protein_list %in% colnames(protein)]]) # ATAC gene-activity columns for the matched genes
51 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # protein columns for the matched proteins
52 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
53 |
54 | # copy sp filtering to produce better output
55 | act.shared.sub = act.shared[,colSds(act.shared)>0.36]
56 | protein.shared.sub = protein.shared[,colSds(protein.shared)>3.6]
57 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
58 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
59 | # then we construct the seurat objects
60 | x_obj=CreateSeuratObject(counts=t(act.shared.sub),assay="x")
61 | #x_obj <- NormalizeData(x_obj)
62 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
63 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
64 | # add seurat object for data y
65 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
66 | y_obj <- NormalizeData(y_obj)
67 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
68 | #list_modality=list(x_obj,y_obj)
69 | # get shared clean features
70 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub))
71 | # run harmony in seurat, need to make a new seurat object
72 | xy_obj = CreateSeuratObject(counts=cbind(t(act.shared.sub[,features]), t(protein.shared.sub[,features])))
73 | xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
74 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
75 | xy_obj@meta.data$orig = c(rep("x",dim(act.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1])) # x cells first, then y cells; the two labels must differ or Harmony has nothing to integrate
76 | # cbind together, scale within modality is better
77 | xy_obj <- xy_obj %>% RunHarmony("orig")
78 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
79 | name_1 = "full_embed_x0.csv"
80 | name_2 = "full_embed_y0.csv"
81 | # does not directly produce matching info, produce later using knn with embedding distance matrix
82 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
83 | row.names=FALSE) # need to decide output pca cell
84 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
85 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
86 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
87 | ##
88 |
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/liger_cite.R:
--------------------------------------------------------------------------------
1 | # LIGER benchmark (TEA-seq PBMC): ATAC gene-activity scores (x) vs. ADT protein (y).
2 | # Outputs, per modality, the first `out_indx` columns of the quantile-normalised LIGER
3 | # factors, plus copies of the reference embeddings / metadata restricted to the cells
4 | # that LIGER kept, so downstream slt/ari scoring sees matching rows.
5 | library(rliger)
6 | library(Matrix)
7 | library(matrixStats)
8 | # input / output roots and number of embedding columns to export
9 | out_root = "/ICICLE/output/"
10 | in_root = "/ICICLE/data/"
11 | out_indx = 15
12 |
13 | out_dir = paste0(out_root,"lg/")
14 | in_dir = in_root
15 | # recursive + silent: also creates out_root and does not warn when the script is re-run
16 | dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
17 |
18 | # ADT table; '.' -> '-' and trailing '-' removal presumably undo read.csv's name mangling
19 | protein = read.csv(paste0(in_dir,"adt.csv"))
20 | colnames(protein) = gsub('\\.','-', colnames(protein))
21 | colnames(protein) = gsub('-$','', colnames(protein))
22 | protein$cell_barcode <- NULL
23 | protein$total <- NULL
24 |
25 | meta = read.csv(paste0(in_dir,"atac_meta.csv"))
26 |
27 | # gene-activity matrix, columns named from the companion names file
28 | atacactivity = readMM(paste0(in_dir,"genescore_tea.txt"))
29 | atacactivity = as.matrix(atacactivity)
30 | gas_names = read.csv(paste0(in_dir ,'genescore_names_tea.csv'))
31 | colnames(atacactivity) = gas_names$names
32 |
33 | # build parallel vectors of matching (gene, protein) names from the conversion table
34 | correspondence = read.csv('conversion_v12.csv')
35 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop fully empty rows
36 | rna_list = c()
37 | protein_list = c()
38 | for (i in c(1:dim(correspondence)[1])){
39 |   protein_n = as.character(correspondence[i,1])
40 |   rna_n = as.character(correspondence[i,2])
41 |   if (grepl("Ignore", rna_n, fixed = TRUE)){
42 |     next
43 |   }
44 |   # one protein may map to several genes, separated by '/'
45 |   rna_n = strsplit(rna_n, '/')[[1]]
46 |   for(r in rna_n){
47 |     if (r %in% gas_names$names){
48 |       rna_list = c(rna_list, r)
49 |       protein_list = c(protein_list, protein_n)
50 |     }
51 |   }
52 | }
53 |
54 | # keep only the pairs whose protein is actually present in the ADT table
55 | in_adt = protein_list %in% colnames(protein)
56 | act.shared = as.matrix(atacactivity[,rna_list[in_adt]]) # gene-activity side
57 | protein.shared = as.matrix(protein[,protein_list[in_adt]]) # protein side
58 | colnames(protein.shared) = rna_list[in_adt] # make sure feature names same
59 |
60 | # copy sp filtering to produce better output
61 | act.shared.sub = act.shared[,colSds(act.shared)>0.36]
62 | protein.shared.sub = protein.shared[,colSds(protein.shared)>3.6]
63 | # "d1"/"d2" prefixes identify the modality of every cell after LIGER stacks them
64 | rownames(act.shared.sub) = paste0("d1",as.character(c(1:nrow(act.shared.sub))))
65 | rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
66 | # then we construct the liger objects
67 | ligerobj=createLiger( list(x = t(act.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
68 | ###Start integration
69 | features=intersect(colnames(act.shared.sub),colnames(protein.shared.sub)) # shared features across datasets with good quality
70 | # default preprocessing
71 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE)
72 | # no gene selection: use every shared feature
73 | ligerobj@var.genes=features
74 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
75 | ligerobj <- optimizeALS(ligerobj, k = 20,remove.missing = FALSE)
76 | ligerobj <- quantile_norm(ligerobj)
77 | embedding = ligerobj@H.norm[,c(1:out_indx)]
78 | name_1 = "full_embed_x0.csv"
79 | name_2 = "full_embed_y0.csv"
80 | # liger gives no matching information; matching is searched later with knn on the embedding
81 |
82 | # Some cells can get deleted during the liger process. Work out which ones by name
83 | # (for BOTH modalities) instead of hard-coding the surviving row counts.
84 | kept_cells = rownames(ligerobj@H.norm)
85 | drop_x = which(!(rownames(act.shared.sub) %in% kept_cells))
86 | drop_y = which(!(rownames(protein.shared.sub) %in% kept_cells))
87 |
88 | write.csv(embedding[grepl("^d1", kept_cells), , drop = FALSE],
89 |           paste0(out_dir,name_1), row.names=FALSE)
90 | write.csv(embedding[grepl("^d2", kept_cells), , drop = FALSE],
91 |           paste0(out_dir,name_2), row.names=FALSE)
92 | write.csv(data.frame(method = "lg"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
93 |
94 | #### cells got filtered out, remake the pca lsi embedding files for downstream calc of slt and ari scores
95 | # assumes rows of orig_x / atac_meta (and orig_y) follow the same cell order as the input matrices -- TODO confirm
96 | orig_x = read.csv(paste0(in_dir,"orig_x.csv"))
97 | orig_y = read.csv(paste0(in_dir,"orig_y.csv"))
98 | meta_x = meta
99 | # df[-integer(0), ] selects ZERO rows, so only subset when something was really dropped
100 | if (length(drop_x) > 0){
101 |   orig_x = orig_x[-drop_x, , drop = FALSE]
102 |   meta_x = meta[-drop_x, , drop = FALSE]
103 | }
104 | if (length(drop_y) > 0){
105 |   orig_y = orig_y[-drop_y, , drop = FALSE]
106 | }
107 |
108 | write.csv(orig_x, paste0(in_dir,"orig_lg_x.csv"), row.names=FALSE)
109 | write.csv(orig_y, paste0(in_dir,"orig_lg_y.csv"), row.names=FALSE)
110 | write.csv(meta_x, paste0(in_dir,"atac_meta_lgdrop.csv"), row.names=FALSE)
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/methods_running/seurat_cite.R:
--------------------------------------------------------------------------------
1 | # Seurat benchmark (TEA-seq PBMC): ATAC gene-activity scores (x) against ADT protein (y)
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 |
6 | # ---- paths and settings ----
7 | out_root <- "/ICICLE/output/"
8 | in_root <- "/ICICLE/data/"
9 | out_indx <- 15 # number of PCs computed and exported
10 | out_dir <- paste0(out_root, "sr/")
11 | in_dir <- in_root
12 | dir.create(out_root)
13 | dir.create(out_dir)
14 |
15 | # ---- load inputs ----
16 | protein <- read.csv(paste0(in_dir, "adt.csv"))
17 | # '.' becomes '-', then a trailing '-' is removed
18 | colnames(protein) <- gsub('-$', '', gsub('\\.', '-', colnames(protein)))
19 | protein$cell_barcode <- NULL
20 | protein$total <- NULL
21 |
22 | meta <- read.csv(paste0(in_dir, "atac_meta.csv"))
23 |
24 | atacactivity <- as.matrix(readMM(paste0(in_dir, "genescore_tea.txt")))
25 | gas_names <- read.csv(paste0(in_dir, 'genescore_names_tea.csv'))
26 | colnames(atacactivity) <- gas_names$names
27 |
28 | # ---- protein <-> gene name pairs from the conversion table ----
29 | correspondence <- read.csv('conversion_v12.csv')
30 | all_blank <- apply(correspondence == "", 1, all)
31 | correspondence <- correspondence[!all_blank, ]
32 | rna_list <- c()
33 | protein_list <- c()
34 | for (row_i in 1:nrow(correspondence)) {
35 |   adt_name <- as.character(correspondence[row_i, 1])
36 |   gene_field <- as.character(correspondence[row_i, 2])
37 |   if (!grepl("Ignore", gene_field, fixed = TRUE)) {
38 |     # a field may list several candidate genes separated by '/'
39 |     for (gene in strsplit(gene_field, '/')[[1]]) {
40 |       if (gene %in% gas_names$names) {
41 |         rna_list <- c(rna_list, gene)
42 |         protein_list <- c(protein_list, adt_name)
43 |       }
44 |     }
45 |   }
46 | }
47 |
48 | # keep only the pairs whose protein exists in the ADT table
49 | in_adt <- protein_list %in% colnames(protein)
50 | act.shared <- as.matrix(atacactivity[, rna_list[in_adt]])
51 | protein.shared <- as.matrix(protein[, protein_list[in_adt]])
52 | colnames(protein.shared) <- rna_list[in_adt] # identical feature names on both sides
53 |
54 | # copy sp filtering: drop low-variance features, then number the cells
55 | act.shared.sub <- act.shared[, colSds(act.shared) > 0.36]
56 | protein.shared.sub <- protein.shared[, colSds(protein.shared) > 3.6]
57 | rownames(act.shared.sub) <- as.character(1:nrow(act.shared.sub))
58 | rownames(protein.shared.sub) <- as.character(1:nrow(protein.shared.sub))
59 |
60 | # ---- per-modality Seurat objects ----
61 | # the ATAC side is only scaled (no NormalizeData, no variable-feature selection)
62 | x_obj <- CreateSeuratObject(counts = t(act.shared.sub), assay = "x")
63 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
64 | # the protein side is normalised and then scaled
65 | y_obj <- CreateSeuratObject(counts = t(protein.shared.sub), assay = "y")
66 | y_obj <- ScaleData(NormalizeData(y_obj), features = rownames(y_obj))
67 | list_modality <- list(x_obj, y_obj)
68 |
69 | # ---- transfer anchors: predict an x cell id for every y cell ----
70 | features <- intersect(rownames(x_obj), rownames(y_obj))
71 | pre.anchors <- FindTransferAnchors(reference = x_obj, query = y_obj,
72 |                                    dims = 1:20, features = features)
73 | predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(x_obj),
74 |                             dims = 1:20)
75 | # both index columns are shifted by -1 (mind the R index difference)
76 | n_query <- length(predictions$predicted.id)
77 | full_df <- data.frame(idx2 = (1:n_query) - 1,
78 |                       idx1 = as.integer(predictions$predicted.id) - 1,
79 |                       score = predictions$prediction.score.max)
80 |
81 | # ---- integration embedding ----
82 | print("starting seurat integration")
83 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality,
84 |                                       dims = 1:20, anchor.features = features, k.filter = 10)
85 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 20)
86 | DefaultAssay(xy_int) <- "integrated"
87 | xy_int <- ScaleData(xy_int, verbose = FALSE)
88 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion
89 | embedding <- xy_int@reductions$pca@cell.embeddings
90 |
91 | # ---- write results: x cells are the first ncol(x_obj) rows, y cells follow ----
92 | n_x <- ncol(x_obj)
93 | n_y <- ncol(y_obj)
94 | write.csv(embedding[1:n_x, 1:out_indx], paste0(out_dir, "full_embed_x0.csv"), row.names = FALSE)
95 | write.csv(embedding[(n_x + 1):(n_x + n_y), 1:out_indx], paste0(out_dir, "full_embed_y0.csv"), row.names = FALSE)
96 | write.csv(full_df, paste0(out_dir, "full_idx.csv"), row.names = FALSE)
97 | write.csv(data.frame(method = "sr"), paste0(out_dir, "metrics.csv"), row.names = FALSE)
96 |
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## run this in algo python conda env; launches all benchmark methods in parallel
2 | python maxfuse_cite.py &
3 | /usr/bin/Rscript seurat_cite.R &
4 | /usr/bin/Rscript liger_cite.R &
5 | /usr/bin/Rscript harm_cite.R &
6 | /usr/bin/Rscript bsc_cite.R
7 | wait # do not return until every backgrounded method has finished writing its output
--------------------------------------------------------------------------------
/Archive/teaseq-pbmc/code/benchmark/step2.sh:
--------------------------------------------------------------------------------
1 | # no conda env requirement
2 | # args: <metrics csv> <reference embedding prefix> <method embedding prefix> <index>
3 | # for mf
4 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/mf/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/mf/full_embed' 0 &
5 |
6 | # for sr
7 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/sr/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/sr/full_embed' 0 &
8 |
9 | # for lg (uses the orig_lg_* files, i.e. without the cells liger dropped)
10 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/lg/metrics.csv' '/ICICLE/data/orig_lg' '/ICICLE/output/lg/full_embed' 0 &
11 |
12 | # for hm
13 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/hm/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/hm/full_embed' 0 &
14 |
15 | # for bsc
16 | /usr/bin/Rscript calculate_metrics.R '/ICICLE/output/bsc/metrics.csv' '/ICICLE/data/orig' '/ICICLE/output/bsc/full_embed' 0
17 | wait # block until the backgrounded metric jobs have finished as well
--------------------------------------------------------------------------------
/Archive/tonsil/code/analysis/plot_tonsil_met.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "plot_tonsil_met"
3 | output: html_document
4 | ---
5 |
6 | Script to plot the benchmark metrics computed on the batched data (10k/30k cells, 5 batches in total).
7 |
8 |
9 | ```{r}
10 | library(ggplot2)
11 | # load the per-batch metrics and fix the method order used on every axis / legend
12 | metrics <- within(read.csv("tonsil_v2/match/bench_out/batch_metrics_resultV2.csv"), method <- factor(method, levels = c("mf", "sr", "lg", "hm", "bsc")))
13 | colorv <- c("#f6511d", "#ffb400", "#00a6ed", "#7fb800", "#A149FA") # one colour per method level, in order
14 | ```
15 |
16 |
17 | ```{r}
18 | library(dplyr)
19 | # one ann1 value per method and batch
20 | acc_df <- select(metrics, method, ann1)
21 | #acc_df$foscttm = 1 - acc_df$foscttm
22 | # per method: n, mean, sd, standard error and 95% t-interval half-width (ic)
23 | acc_summary <- acc_df %>%
24 |   group_by(method) %>%
25 |   dplyr::summarise(
26 |     n = n(),
27 |     mean = mean(ann1),
28 |     sd = sd(ann1)
29 |   ) %>%
30 |   mutate(se = sd / sqrt(n),
31 |          ic = se * qt((1-0.05)/2 + .5, n-1))
32 |
33 | # bars: per-method mean; error bars: mean +/- one standard deviation;
34 | # dots: the individual batch values
35 |
36 | ### this is batched matching accuracy
37 | p <- ggplot() +
38 |   geom_col(data = acc_summary, aes(x = method, y = mean, fill = method), alpha = 0.7, width = 0.4) +
39 |   geom_errorbar(data = acc_summary, aes(x = method, ymin = mean - sd, ymax = mean + sd), width = 0.08, colour = "black", alpha = 0.9, size = 0.2) +
40 |   geom_point(data = acc_df, aes(y = ann1, x = method, fill = method), alpha = 0.5, size = 0.5) +
41 |   ggtitle("using standard deviation") + theme_minimal() + scale_fill_manual(values = colorv) + coord_cartesian(ylim = c(0.25, 0.97))
42 | #ggsave("tonsil_v2/plots/batch_ann.svg", height = 3, width = 4.5)
43 | p
44 | ```
45 |
46 | ```{r}
47 | # slt_f1 against ari_f1, one point per method and batch
48 |
49 | p <- ggplot(metrics, aes(x = slt_f1, y = ari_f1, color = method)) +
50 |   geom_point(size = 2, alpha = 0.5) + scale_color_manual(values = colorv) +
51 |   xlim(0.35, 0.55) + ylim(0.35, 0.62) + theme_minimal()
52 | #ggsave("tonsil_v2/plots/slt_ari.svg", height = 3, width = 4.5)
53 | p
54 | ```
55 |
56 | ```{r}
57 | # close-up of the same scatter (points outside the axis limits are dropped)
58 | p <- ggplot(metrics, aes(x = slt_f1, y = ari_f1, color = method)) +
59 |   geom_point(size = 2, alpha = 0.5) + scale_color_manual(values = colorv) +
60 |   xlim(0.51, 0.55) + ylim(0.56, 0.62) + theme_minimal()
61 | ggsave("tonsil_v2/plots/slt_ari_small.svg", plot = p, height = 3, width = 4.5)
62 | p
63 | ```
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/calculate_metrics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Usage: Rscript calculate_metrics.R <metrics_csv> <orig_fname> <embed_fname> <n_idx>
3 | #   metrics_csv : existing metrics table; it is read, extended and overwritten in place
4 | #   orig_fname  : passed through to slt()/ari() as `orig_fname`
5 | #   embed_fname : passed through to slt()/ari() as `embed_fname`
6 | #   n_idx       : integer, passed through to slt()/ari() as `n_idx`
7 | # slt(), ari() and the tidyverse helpers used below are expected to come from metrics.R.
8 | args = commandArgs(trailingOnly=TRUE)
9 | if (length(args) < 4) {
10 |   stop("usage: calculate_metrics.R <metrics_csv> <orig_fname> <embed_fname> <n_idx>")
11 | }
12 |
13 | metrics_fname = args[1]
14 | orig_fname = args[2]
15 | embed_fname = args[3]
16 | n_idx = as.integer(args[4])
17 |
18 | # Metrics added by this script (the larger, the better):
19 | # - slt_mix: mixing via Silhouette width
20 | # - slt_clust: quality of embeddings for clustering via Silhouette width
21 | # - slt_f1: an integrated metric using both slt_mix and slt_clust
22 | # - ari_mix: mixing via adjusted Rand index
23 | # - ari_clust: quality of embeddings for clustering via adjusted Rand index
24 | # - ari_f1: third column returned by ari(); presumably combines ari_mix and ari_clust like slt_f1
25 | # NOTE: sam_x / sam_y, lisi_*, kbet and avg_mix were listed here previously but are
26 | # not computed by this script.
27 | setwd("./")
28 | source("metrics.R")
29 |
30 | # load existing metrics
31 | metrics = read_csv(metrics_fname, col_types=cols())
32 |
33 | # calculate Silhouette width
34 | # Sys.time() (not Sys.Date()) so that the logged timestamp carries the actual time of day
35 | print(paste0(format(Sys.time(), "%c"), ': calculating Silhouette width...'))
36 | slt_res = slt(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
37 | metrics = metrics %>% add_column(slt_mix=slt_res[, 1]) %>% add_column(slt_clust=slt_res[, 2]) %>% add_column(slt_f1=slt_res[, 3])
38 | # calculate ARI
39 | print(paste0(format(Sys.time(), "%c"), ': calculating adjusted random index...'))
40 | ari_res = ari(orig_fname=orig_fname, embed_fname=embed_fname, n_idx=n_idx)
41 | metrics = metrics %>% add_column(ari_mix=ari_res[, 1]) %>% add_column(ari_clust=ari_res[, 2]) %>% add_column(ari_f1=ari_res[, 3])
42 | write_csv(metrics, metrics_fname)
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/method_running/harm_batch.R:
--------------------------------------------------------------------------------
1 | # Harmony benchmark on the 10k30k, 5-batch cells; results are used for matching accuracy and the slt / ari repeats
2 | library(Seurat)
3 | library(Matrix)
4 | library(matrixStats)
5 | library(harmony)
6 | # input / output roots
7 | out_root = "/tonsil_v2/match/bench_out/"
8 | in_root = "/tonsil_v2/match/bench_input/"
9 | batch = 5 # number of batches, stored in sub-directories b1 ... b<batch>
10 | out_indx = 15 # number of PCs computed and of harmony dimensions exported
11 |
12 | # ---- batch-independent inputs: read once instead of once per batch ----
13 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same
14 | # build parallel vectors of matching (gene, protein) names from the conversion table
15 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV8.csv')
16 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop fully empty rows
17 | rna_list = c()
18 | protein_list = c()
19 | for (j in c(1:dim(correspondence)[1])){
20 |   protein_n = as.character(correspondence[j,1])
21 |   rna_n = as.character(correspondence[j,2])
22 |   if (grepl("Ignore", rna_n, fixed = TRUE)){
23 |     next
24 |   }
25 |   # one protein may map to several genes, separated by '/'
26 |   rna_n = strsplit(rna_n, '/')[[1]]
27 |   for(r in rna_n){
28 |     if (r %in% rna_names$names){
29 |       rna_list = c(rna_list, r)
30 |       protein_list = c(protein_list, protein_n)
31 |     }
32 |   }
33 | }
34 |
35 | for(i in c(1:batch)){
36 |   batch_name = paste0("b",as.character(i),"/")
37 |   out_dir =paste0(out_root,batch_name,"hm/")
38 |   in_dir = paste0(in_root,batch_name)
39 |   # recursive + silent: creates the batch directory too and does not warn on re-runs
40 |   dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
41 |   # read
42 |   rna = readMM(paste0(in_dir,"rna.txt"))
43 |   protein = read.csv(paste0(in_dir,"pro.csv"))
44 |   meta_rna = read.csv(paste0(in_dir,"meta_rna.csv"))
45 |   meta_pro = read.csv(paste0(in_dir,"meta_pro.csv"))
46 |
47 |   # read.csv replaced the space in these two names with '.', correct back
48 |   names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV'
49 |   names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR'
50 |
51 |   colnames(rna) = rna_names$names
52 |
53 |   # first filtering step should be same as in sp: keep pairs whose protein is in this batch
54 |   in_pro = protein_list %in% colnames(protein)
55 |   rna.shared = as.matrix(rna[,rna_list[in_pro]]) # rna side
56 |   protein.shared = as.matrix(protein[,protein_list[in_pro]]) # protein side
57 |   colnames(protein.shared) = rna_list[in_pro] # make sure feature names same
58 |   # copy sp filtering
59 |   rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
60 |   protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
61 |   rownames(rna.shared.sub) = paste0("d1",as.character(c(1:nrow(rna.shared.sub))))
62 |   rownames(protein.shared.sub) = paste0("d2",as.character(c(1:nrow(protein.shared.sub))))
63 |   # then we construct the seurat objects (normalise + scale within each modality)
64 |   x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
65 |   x_obj <- NormalizeData(x_obj)
66 |   x_obj <- ScaleData(x_obj, features = rownames(x_obj))
67 |   y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
68 |   y_obj <- NormalizeData(y_obj)
69 |   y_obj <- ScaleData(y_obj, features = rownames(y_obj))
70 |   # get shared clean features
71 |   features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
72 |   # run harmony in seurat, need to make a new seurat object holding both modalities
73 |   xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
74 |   # cbind together, scale within modality is better: reuse the per-modality scaled data
75 |   xy_obj = SetAssayData(xy_obj, slot = "scale.data", cbind(x_obj@assays$x@scale.data[features,], y_obj@assays$y@scale.data[features,])) # takes very long
76 |   xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
77 |   # modality label harmony integrates over: the two groups MUST carry different labels,
78 |   # otherwise harmony sees a single batch and has nothing to correct
79 |   xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1]))
80 |   xy_obj <- RunHarmony(xy_obj, "orig")
81 |   embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
82 |   name_1 = "full_embed_x0.csv"
83 |   name_2 = "full_embed_y0.csv"
84 |   # does not directly produce matching info, produced later using knn on the embedding distance matrix
85 |   # rows are ordered as cbind-ed above: x cells first, then y cells
86 |   write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
87 |             row.names=FALSE)
88 |   write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
89 |             paste0(out_dir,name_2), row.names=FALSE)
90 |   write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
91 | }
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/method_running/hm_full.R:
--------------------------------------------------------------------------------
1 | # full data set run for related spatial analysis
2 | # for harmony
3 |
4 | library(Seurat)
5 | library(harmony)
6 | library(Matrix)
7 | library(matrixStats)
8 |
9 | root_dir = '/tonsil_v2/'
10 | out_dir = '/tonsil_v2/match/match_output/full/'
11 | out_indx = 15 # number of harmony dimensions exported
12 |
13 | ##
14 | out_dir =paste0(out_dir,"hm/")
15 | dir.create(out_dir)
16 |
17 | rna = readMM(paste0(root_dir,"/RNA/tonsil_rna_0510.txt"))
18 | protein = read.csv(paste0(root_dir,"/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv"))
19 |
20 | meta_rna = read.csv(paste0(root_dir,"/RNA/tonsil_rna_0510_meta.csv"))
21 |
22 | # read.csv replaced the space in these two names with '.', correct back
23 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV'
24 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR'
25 |
26 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same
27 | colnames(rna) = rna_names$names
28 |
29 | # cell names on the raw inputs (the filtered matrices below get their own row names)
30 | rownames(rna) = paste0("rna", c(1:nrow(rna)))
31 | rownames(protein) = paste0("pro", c(1:nrow(protein)))
32 |
33 | # build parallel vectors of matching (gene, protein) names from the conversion table
34 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv')
35 | correspondence = correspondence[!apply(correspondence == "", 1, all),] # drop fully empty rows
36 | rna_list = c()
37 | protein_list = c()
38 | for (j in c(1:dim(correspondence)[1])){
39 |   protein_n = as.character(correspondence[j,1])
40 |   rna_n = as.character(correspondence[j,2])
41 |   if (grepl("Ignore", rna_n, fixed = TRUE)){
42 |     next
43 |   }
44 |   # one protein may map to several genes, separated by '/'
45 |   rna_n = strsplit(rna_n, '/')[[1]]
46 |   for(r in rna_n){
47 |     if (r %in% rna_names$names){
48 |       rna_list = c(rna_list, r)
49 |       protein_list = c(protein_list, protein_n)
50 |     }
51 |   }
52 | }
53 | # get clean shared features: keep pairs whose protein is present in the protein table
54 | in_pro = protein_list %in% colnames(protein)
55 | rna.shared = as.matrix(rna[,rna_list[in_pro]]) # rna side
56 | protein.shared = as.matrix(protein[,protein_list[in_pro]]) # protein side
57 | colnames(protein.shared) = rna_list[in_pro] # make sure feature names same
58 |
59 | # copy sp filtering
60 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
61 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
62 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub)))
63 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub)))
64 |
65 | # then we construct the seurat objects; below only their cell counts are used
66 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
67 | x_obj <- NormalizeData(x_obj)
68 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
69 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
70 | y_obj <- NormalizeData(y_obj)
71 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
72 | features=intersect(colnames(rna.shared.sub),colnames(protein.shared.sub))
73 | # run harmony in seurat, need to make a new seurat object holding both modalities
74 | xy_obj = CreateSeuratObject(counts=cbind(t(rna.shared.sub[,features]), t(protein.shared.sub[,features])))
75 | # full run: normalise / scale the combined object (per-modality scale.data is not reused, it takes very long)
76 | xy_obj <- NormalizeData(xy_obj)
77 | xy_obj = ScaleData(xy_obj)
78 | xy_obj = RunPCA(xy_obj, features = rownames(xy_obj), npcs = out_indx, verbose = FALSE)
79 | # modality label harmony integrates over: the two groups MUST carry different labels,
80 | # otherwise harmony sees a single batch and has nothing to correct
81 | xy_obj@meta.data$orig = c(rep("x",dim(rna.shared.sub)[1]), rep("y",dim(protein.shared.sub)[1]))
82 | xy_obj <- RunHarmony(xy_obj, "orig")
83 | embedding = Embeddings(xy_obj, 'harmony')[,c(1:out_indx)]
84 | name_1 = "full_embed_x0.csv"
85 | name_2 = "full_embed_y0.csv"
86 | # does not directly produce matching info, produced later using knn on the embedding distance matrix
87 | # rows are ordered as cbind-ed above: x cells first, then y cells
88 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1),
89 |           row.names=FALSE)
90 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
91 |           paste0(out_dir,name_2), row.names=FALSE)
92 | write.csv(data.frame(method = "hm"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
91 |
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/method_running/lg_full.R:
--------------------------------------------------------------------------------
1 | # full data set run for related spatial analysis
2 | # for liger
3 |
4 | library(rliger)
5 | library(Matrix)
6 | library(matrixStats)
7 |
8 | # ---- paths and settings ----
9 | root_dir <- '/tonsil_v2/'
10 | out_indx <- 15 # number of liger factors exported
11 | `%notin%` <- Negate(`%in%`)
12 | out_dir <- paste0('/tonsil_v2/match/match_output/full/', "lg/")
13 | dir.create(out_dir)
14 |
15 | # ---- load inputs ----
16 | rna <- readMM(paste0(root_dir, "/RNA/tonsil_rna_0510.txt"))
17 | protein <- read.csv(paste0(root_dir, "/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv"))
18 | meta_rna <- read.csv(paste0(root_dir, "/RNA/tonsil_rna_0510_meta.csv"))
19 |
20 | # restore the two protein names whose space was read in as '.'
21 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV'
22 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR'
23 |
24 | rna_names <- read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same
25 | colnames(rna) <- rna_names$names
26 |
27 | # ---- protein <-> gene name pairs from the conversion table ----
28 | correspondence <- read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv')
29 | all_blank <- apply(correspondence == "", 1, all)
30 | correspondence <- correspondence[!all_blank, ]
31 | rna_list <- c()
32 | protein_list <- c()
33 | for (row_i in 1:nrow(correspondence)) {
34 |   pro_name <- as.character(correspondence[row_i, 1])
35 |   gene_field <- as.character(correspondence[row_i, 2])
36 |   if (!grepl("Ignore", gene_field, fixed = TRUE)) {
37 |     # a field may list several candidate genes separated by '/'
38 |     for (gene in strsplit(gene_field, '/')[[1]]) {
39 |       if (gene %in% rna_names$names) {
40 |         rna_list <- c(rna_list, gene)
41 |         protein_list <- c(protein_list, pro_name)
42 |       }
43 |     }
44 |   }
45 | }
46 |
47 | # ---- shared features: keep pairs whose protein exists in the protein table ----
48 | in_pro <- protein_list %in% colnames(protein)
49 | rna.shared <- as.matrix(rna[, rna_list[in_pro]])
50 | protein.shared <- as.matrix(protein[, protein_list[in_pro]])
51 | colnames(protein.shared) <- rna_list[in_pro] # identical feature names on both sides
52 |
53 | # copy sp filtering, then tag every cell with its modality ("d1" = rna, "d2" = protein)
54 | rna.shared.sub <- rna.shared[, colSds(rna.shared) > 0.5]
55 | protein.shared.sub <- protein.shared[, colSds(protein.shared) > 0.1]
56 | rownames(rna.shared.sub) <- paste0("d1", as.character(1:nrow(rna.shared.sub)))
57 | rownames(protein.shared.sub) <- paste0("d2", as.character(1:nrow(protein.shared.sub)))
58 |
59 | # ---- liger integration ----
60 | ligerobj <- createLiger(list(x = t(rna.shared.sub), y = t(protein.shared.sub)), remove.missing = FALSE)
61 | features <- intersect(colnames(rna.shared.sub), colnames(protein.shared.sub)) # shared good-quality features
62 | ligerobj <- rliger::normalize(ligerobj, remove.missing = FALSE) # default preprocessing
63 | # no selectGenes() call: the shared features are set as the variable genes directly
64 | ligerobj@var.genes <- features
65 | ligerobj <- scaleNotCenter(ligerobj, remove.missing = FALSE)
66 | ligerobj <- optimizeALS(ligerobj, k = 20, remove.missing = FALSE)
67 | ligerobj <- quantile_norm(ligerobj)
68 | embedding <- ligerobj@H.norm[, 1:out_indx]
69 |
70 | # liger gives no matching information; matching is searched later with knn on the embedding
71 |
72 | # ---- which cells did liger delete? ----
73 | lost_rna <- rownames(rna.shared.sub)[rownames(rna.shared.sub) %notin% rownames(embedding)]
74 | lost_pro <- rownames(protein.shared.sub)[rownames(protein.shared.sub) %notin% rownames(embedding)]
75 | # (original run: 39 rna cells and 8 cdx cells got removed)
76 | n_rna_kept <- nrow(rna.shared.sub) - length(lost_rna)
77 | n_pro_kept <- nrow(protein.shared.sub) - length(lost_pro)
78 |
79 | # ---- write results: the surviving rna cells come first, the protein cells follow ----
80 | write.csv(embedding[1:n_rna_kept, 1:out_indx],
81 |           paste0(out_dir, "full_embed_x0.csv"), row.names = FALSE)
82 | write.csv(embedding[(n_rna_kept + 1):(n_rna_kept + n_pro_kept), 1:out_indx],
83 |           paste0(out_dir, "full_embed_y0.csv"), row.names = FALSE)
84 | write.csv(data.frame(method = "lg"), paste0(out_dir, "metrics.csv"), row.names = FALSE)
85 |
86 | ## numeric ids of the missing cells (modality prefix stripped)
87 | write.csv(data.frame(id = as.integer(gsub("d1", "", lost_rna))), paste0(out_dir, 'd1_id.csv'))
88 | write.csv(data.frame(id = as.integer(gsub("d2", "", lost_pro))), paste0(out_dir, 'd2_id.csv'))
102 |
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/method_running/sr_full.R:
--------------------------------------------------------------------------------
1 | # full data set run for related spatial analysis
2 | # for seurat
3 |
4 | library(Seurat)
5 | library(Matrix)
6 | library(matrixStats)
7 |
8 | root_dir = '/tonsil_v2/'
9 | out_dir = '/tonsil_v2/match/match_output/full/'
10 | out_indx = 15
11 | ##
12 | out_dir =paste0(out_dir,"sr/")
13 | dir.create(out_dir)
14 |
15 | rna = readMM(paste0(root_dir,"/RNA/tonsil_rna_0510.txt"))
16 | protein = read.csv(paste0(root_dir,"/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715V2.csv"))
17 |
18 | meta_rna = read.csv(paste0(root_dir,"/RNA/tonsil_rna_0510_meta.csv"))
19 |
20 | names(protein)[names(protein) == 'collagen.IV'] <- 'collagen IV'
21 | names(protein)[names(protein) == 'HLA.DR'] <- 'HLA DR'
22 |
23 | rna_names = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_names.csv") # rna names always the same
24 | colnames(rna) = rna_names$names
25 |
26 | #### for bsc
27 | rownames(rna) = paste0("rna", c(1:nrow(rna)))
28 | rownames(protein) = paste0("pro", c(1:nrow(protein)))
29 |
30 | # change name
31 | correspondence = read.csv('/tonsil_v2/match/protein_rna_name_conversionV11.csv')
32 | correspondence = correspondence[!apply(correspondence == "", 1, all),]
33 | rna_list = c()
34 | protein_list = c()
35 | for (j in c(1:dim(correspondence)[1])){
36 | protein_n = as.character(correspondence[j,1])
37 | rna_n = as.character(correspondence[j,2])
38 | if (grepl("Ignore", rna_n, fixed = TRUE)){
39 | next
40 | }
41 | rna_n = strsplit(rna_n, '/')[[1]]
42 | for(r in rna_n){
43 | if (r %in% rna_names$names){
44 | rna_list = c(rna_list, r)
45 | protein_list = c(protein_list, protein_n)
46 | }
47 | }
48 | }
49 | # get clean shared features
50 | rna.shared = as.matrix(rna[,rna_list[protein_list %in% colnames(protein)]]) # protein object
51 | protein.shared = as.matrix(protein[,protein_list[protein_list %in% colnames(protein)]]) # rna object
52 | colnames(protein.shared) = rna_list[protein_list %in% colnames(protein)] # make sure feature names same
53 |
54 | # copy sp filtering
55 | rna.shared.sub = rna.shared[,colSds(rna.shared)>0.5]
56 | protein.shared.sub = protein.shared[,colSds(protein.shared)>0.1]
57 | rownames(rna.shared.sub) = as.character(c(1:nrow(rna.shared.sub)))
58 | rownames(protein.shared.sub) = as.character(c(1:nrow(protein.shared.sub)))
59 |
60 | # then we construct the seurat objects
61 | x_obj=CreateSeuratObject(counts=t(rna.shared.sub),assay="x")
62 | x_obj <- NormalizeData(x_obj)
63 | #x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000) # no need to select variable genes in this case
64 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
65 | # add Seurat object for data y
66 | y_obj=CreateSeuratObject(counts=t(protein.shared.sub),assay="y")
67 | y_obj <- NormalizeData(y_obj)
68 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
69 | list_modality=list(x_obj,y_obj)
70 | # get transfer anchor
71 | features=intersect(rownames(x_obj),rownames(y_obj))
72 | #pre.anchors <- FindTransferAnchors(reference = y_obj, query = x_obj,
73 | # dims = 1:20, features = features)
74 | #predictions <- TransferData(anchorset = pre.anchors, refdata = colnames(y_obj),
75 | # dims = 1:20)
76 | #full_df = data.frame(idx1 = c(1:length(predictions$predicted.id)) -1, idx2 = as.integer(predictions$predicted.id) -1,
77 | # score = predictions$prediction.score.max) # mind the r index difference
78 | # get integration embedding
79 | print("starting seurat integration")
80 | Int.anchors <- FindIntegrationAnchors(object.list = list_modality,
81 | dims = 1:20, anchor.features =features, k.filter = 10)
82 | xy_int <- IntegrateData(anchorset = Int.anchors, dims = 1:20, k.weight = 10)
83 | #
84 | DefaultAssay(xy_int) <- "integrated"
85 | xy_int <- ScaleData(xy_int, verbose = FALSE)
86 | xy_int <- RunPCA(xy_int, npcs = out_indx, verbose = FALSE) # index of pca, 15 as fusion
87 | embedding = xy_int@reductions$pca@cell.embeddings
88 | name_1 = "full_embed_x0.csv"
89 | name_2 = "full_embed_y0.csv"
90 | #pathout = out_dir
91 | write.csv(embedding[c(1:ncol(x_obj)),c(1:out_indx)], paste0(out_dir,name_1), row.names=FALSE) # need to decide output pca cell
92 | write.csv(embedding[c((ncol(x_obj) + 1):(ncol(x_obj) + ncol(y_obj))),c(1:out_indx)],
93 | paste0(out_dir,name_2), row.names=FALSE) # need to decide
94 | #write.csv(full_df, paste0(out_dir,"full_idx.csv"), row.names=FALSE) # need to decide
95 | write.csv(data.frame(method = "sr"), paste0(out_dir,"metrics.csv"), row.names=FALSE)
96 |
--------------------------------------------------------------------------------
/Archive/tonsil/code/benchmark/step1.sh:
--------------------------------------------------------------------------------
1 | ## run this in the algo python conda env; launches all five methods in parallel
2 | python mf_batch.py &
3 | /usr/bin/Rscript seurat_batch.R &
4 | /usr/bin/Rscript liger_batch.R &
5 | /usr/bin/Rscript harm_batch.R &
6 | /usr/bin/Rscript bsc_batch.R
7 | wait # do not return until the four background jobs above have finished as well
8 |
--------------------------------------------------------------------------------
/Archive/tonsil/code/preparation_code/prep_subsetting_andMore.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "subset"
3 | output: html_document
4 | ---
5 |
6 | Script to prepare data (subsetting, etc.) for the different benchmarking tests in the CODEX-RNA tonsil scenario.
7 |
8 | ################### first: cells used to benchmark matching accu, slt and ari F1 scores ##########################
9 | For this task we cannot include too many cells (slt and ari cannot be run on > 40k cells due to CPU limitations),
10 | so we subsampled 10k scRNA-seq cells and 30k CODEX cells for this benchmarking process; there are 5 batches in total.
11 |
12 | ```{r}
13 | # Script to produce the benchmark test batches for CODEX-RNA matching:
14 | # each batch gets subsampled inputs, metadata, and per-modality PCA files
15 | library(Matrix)
16 | library(Seurat)
17 |
18 | rna_full = readMM("/tonsil_v2/RNA/tonsil_rna_0510.txt")
19 | protei_full = read.csv("/tonsil_v2/Codex/FCS_output_DeepCell_extOnly/formatch_clusters_x28_y715_wstepV2.csv")
20 | meta_full = read.csv("/tonsil_v2/RNA/tonsil_rna_0510_meta.csv")
21 | root = "/tonsil_v2/match/bench_input/"
22 | batch = 5 # NOTE(review): not referenced in this chunk; the loop below hard-codes 1:5
23 |
24 | c2u = colnames(protei_full)[6:51] # just protein columns
25 | for (i in c(1:5)){ # range is hard-coded ("locked") in case of an accidental key press
26 | batch_name = paste0("b",as.character(i),"/")
27 | out_dir =paste0(root,batch_name)
28 | dir.create(out_dir)
29 | # create files
30 | set.seed(i) # seed with the batch index so each batch's subsample is reproducible
31 | randix1 = sample(dim(rna_full)[1], 10000) # each batch uses 10k randomly sampled RNA cells
32 | randix2 = sample(dim(protei_full)[1], 30000) # each batch uses 30k randomly sampled protein cells
33 |
34 | rna = rna_full[randix1,]
35 | pro = protei_full[randix2,c2u]
36 | meta1 = meta_full[randix1,] # rna meta
37 | meta2 = protei_full[randix2,c(2:5,52:57)] # pro meta info
38 |
39 | write.csv(meta1, paste0(out_dir,"meta_rna.csv"),row.names = FALSE)
40 | write.csv(meta2, paste0(out_dir,"meta_pro.csv"),row.names = FALSE)
41 | write.csv(pro, paste0(out_dir,"pro.csv"),row.names = FALSE)
42 | writeMM(rna, paste0(out_dir,"rna.txt"))
43 | # create the original PCA reduction files (orig_x.csv / orig_y.csv)
44 | rna_names = read.csv('/tonsil_v2/RNA/tonsil_rna_0510_names.csv') # NOTE(review): loop-invariant read; could be hoisted above the loop
45 | colnames(rna) = rna_names$names
46 | rownames(rna) = as.character(c(1:nrow(rna)))
47 | # pro: row names are 1..n as strings, same as for rna
48 | rownames(pro) = as.character(c(1:nrow(pro)))
49 | # meta
50 | # use Seurat as the standard pipeline to produce the reductions
51 | x_obj=CreateSeuratObject(counts=t(rna),assay="x")
52 | x_obj <- NormalizeData(x_obj)
53 | x_obj <- FindVariableFeatures(x_obj, selection.method = "vst", nfeatures = 3000)
54 | x_obj <- ScaleData(x_obj, features = rownames(x_obj))
55 | x_obj <- RunPCA(x_obj, features = VariableFeatures(object = x_obj))
56 | pca_rna = as.data.frame(x_obj@reductions$pca@cell.embeddings[,c(1:15)]) # keep the first 15 PCs
57 | pca_rna$label = meta1$cluster.info
58 | write.csv(pca_rna, paste0(out_dir,"orig_x.csv"), row.names=FALSE)
59 |
60 | # produce protein reduction (PCA on all features, no variable-feature selection)
61 | y_obj=CreateSeuratObject(counts=t(pro),assay="y")
62 | y_obj <- NormalizeData(y_obj)
63 | y_obj <- ScaleData(y_obj, features = rownames(y_obj))
64 | y_obj <- RunPCA(y_obj, features = rownames(y_obj))
65 | pca_pro = as.data.frame(y_obj@reductions$pca@cell.embeddings[,c(1:15)]) # keep the first 15 PCs
66 | pca_pro$label = meta2$cluster.term #### could change if we want different labels
67 | write.csv(pca_pro, paste0(out_dir,"orig_y.csv"), row.names=FALSE)
68 |
69 | }
70 | ```
71 |
72 |
73 | ############### the second case for full analysis, all the cells in the codex tonsil subregion and all the rna cells were used ############
74 | In this case we just use the original dataset; these cells were used for all methods in the GC-related analysis and confusion-matrix plotting.
75 |
76 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Academic Software License Agreement (12/2/2021)
2 |
3 | STANFORD ACADEMIC SOFTWARE LICENSE AGREEMENT FOR "Integration of spatial and single-cell data across modalities with weak linkage (MaxFuse)"
4 |
5 |
6 | 1. This is a legal agreement (“Agreement”) between ______________________ (“RECIPIENT” or “you”), and THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY (“STANFORD”) and THE TRUSTEES OF THE UNIVERSITY OF PENNSYLVANIA (“PENN”). Stanford and Penn have assignments to “Integrative Matching and Analysis of Cells across single-cell multi-omics datasets with overlapping and non-overlapping features” (“Software”) which was developed in the laboratory of Professor Garry Nolan at Stanford and Zongming Ma at Penn.
7 | 2. By accepting, receiving, and using Software, including any accompanying information, materials or manuals you are agreeing to be bound by the terms of this Agreement. If you do not agree to the terms of this Agreement, promptly return the Software to STANFORD OR PENN.
8 | 3. STANFORD AND PENN grant to RECIPIENT a royalty-free, nonexclusive, and nontransferable license to use the Software furnished hereunder, upon the terms and conditions set out below.
9 | 4. RECIPIENT acknowledges that the Software is a research tool still in the development stage and that it is being supplied as is, without any accompanying services, support or improvements from STANFORD OR PENN. STANFORD AND PENN MAKE NO REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED OTHER THAN SET OUT IN THIS AGREEMENT.
10 | 5. STANFORD AND PENN do not grant any licenses under any STANFORD AND PENN patent or patent application by this Agreement.
11 | 6. RECIPIENT agrees to use the Software solely for internal academic non-commercial purposes and shall not distribute or transfer it to another location or to any other person without prior written permission from STANFORD AND PENN. In particular, no article in this license grants commercial use rights to RECIPIENT.
12 | 7. RECIPIENT agrees not to reverse engineer, reverse assemble, reverse compile decompile, disassemble, or otherwise attempt to re-create the source code for the Software. RECIPIENT acknowledges that any programs created based on the Software will be considered a derivative of Software and owned by STANFORD AND PENN.
13 | 8. RECIPIENT may NOT make modifications to the Software or integrate Software into RECIPIENT’s own software.
14 | 9. RECIPIENT may not further distribute Software without express written permission of STANFORD AND PENN. If permission to transfer the Software is given, RECIPIENT warrants that RECIPIENT will not remove or export any part of the Software from the United States except in full compliance with all United States export regulations and other applicable laws.
15 | 10. RECIPIENT will use the Software in compliance with all applicable laws, policies and regulations including, but not limited to, any approvals, informed consent and patient confidentiality principles.
16 | 11. RECIPIENT will indemnify, hold harmless, and defend STANFORD AND PENN against any claim of any kind arising out of or related to the exercise of any rights granted under this Agreement or the breach of this Agreement by RECIPIENT.
17 | 12. Title and copyright to the Software and any derivatives and any associated documentation shall at all times remain with STANFORD AND PENN, and RECIPIENT agrees to preserve same.
18 | 13. If RECIPIENT plans to publish any peer reviewed papers, abstracts, or similar publications, RECIPIENT agrees to acknowledge Software and its creators in a manner consistent with academic (industry) practice.
19 | 14. This agreement may be terminated by either party upon thirty (30) days written notice to the other party. In the event of termination, RECIPIENT shall destroy or return immediately all Software and all copies thereof to STANFORD OR PENN upon STANFORD’s OR PENN’S request.
20 | 15. The parties to this document agree that a copy of the original signature (including an electronic copy) may be used for any and all purposes for which the original signature may have been used. The parties further waive any right to challenge the admissibility or authenticity of this document in a court of law based solely on the absence of an original signature.
21 |
22 | RECIPIENT
23 | Signature _____________________________________________________
24 | Name ________________________________________________________
25 | Title _________________________________________________________
26 | Date _________________________________________________________
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MaxFuse: MAtching X-modality via FUzzy Smoothed Embedding
2 |
3 |
4 |
5 |
6 | ## Description
7 |
8 | MaxFuse is a Python package for integrating single-cell datasets from different modalities with no overlapping features and/or under low signal-to-noise ratio regimes. For most single-cell cross-modality integration methods, the feasibility of cross-modal integration relies on the existence of highly correlated, a priori 'linked' features. When such linked features are few or uninformative, a scenario that we call 'weak linkage', existing methods fail. We developed MaxFuse, a cross-modal data integration method that, through iterative co-embedding, data smoothing, and cell matching, leverages all information in each modality to obtain high-quality integration. A prototypical example of weak linkage is the integration of **spatial proteomic data** with **single-cell sequencing data**. For details, please refer to the [paper](https://www.nature.com/articles/s41587-023-01935-0).
9 |
10 | This work has been led by Shuxiao Chen from [Ma Lab](http://www-stat.wharton.upenn.edu/~zongming/) @Upenn and Bokai Zhu from [Nolan lab](https://web.stanford.edu/group/nolan/) @Stanford.
11 |
12 |
13 |
14 | ## Installation
15 | MaxFuse is hosted on `pypi` and can be installed via `pip`. We recommend working with a fresh virtual environment. In the following example we use conda.
16 |
17 | ```
18 | conda create -n maxfuse python=3.8
19 | conda activate maxfuse
20 | python -m pip install maxfuse
21 | ```
22 |
23 | ## Vignettes
24 |
25 |
26 |
27 |
28 | Example1: Protein -- RNA test run on ground-truth CITE-seq [here](https://github.com/shuxiaoc/maxfuse/blob/main/docs/citeseq_pbmc_evaluate.ipynb).
29 |
30 | Example2: Protein -- RNA test run on tissue [here](https://github.com/shuxiaoc/maxfuse/blob/main/docs/tonsil_codex_rnaseq.ipynb).
31 |
32 | Note that when integrating single-cell data across **protein** and **RNA** modalities, the nomenclature of features often differs (e.g., mRNA ```ITGAM``` could be named ```CD11b-1``` when used as an antibody). We gathered a [.csv](https://github.com/shuxiaoc/maxfuse/blob/main/docs/protein_gene_conversion.csv) file that covers many such naming conversions and is used during the ```MaxFuse``` process. Of course, this is not a complete conversion table, and users should manually add new naming conversions if they are not included in this .csv file.
33 |
34 | ## API documentation
35 |
36 | For detailed documentation of ```MaxFuse``` API, you can visit our [readthedocs](https://maxfuse.readthedocs.io/en/latest/) page.
37 |
38 | ## Code archive
39 |
40 | The analysis presented in the [manuscript](https://www.biorxiv.org/content/10.1101/2023.01.12.523851) is also deposited in this GitHub repository, under this [folder](https://github.com/shuxiaoc/maxfuse/tree/main/Archive). Note that in the manuscript we used a development version of ```MaxFuse``` with slightly different syntax, which can also be found there. If you require additional information on the analysis/data, please contact Zongming Ma (zongming.ma@yale.edu).
41 |
42 | ## License
43 |
44 | ```MaxFuse``` is under the [Academic Software License Agreement](https://github.com/shuxiaoc/maxfuse/blob/main/LICENSE), please use accordingly.
45 |
--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/docs/.DS_Store
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | # resolved from PATH by default; override SPHINXBUILD to pin a specific sphinx-build binary
8 | SPHINXBUILD ?= sphinx-build
9 | SOURCEDIR = .
10 | BUILDDIR = _build
11 |
12 | # Put it first so that "make" without argument is like "make help".
13 | help:
14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
15 |
16 | .PHONY: help Makefile
17 |
18 | # Catch-all target: route all unknown targets to Sphinx using the new
19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
20 | %: Makefile
21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 |
--------------------------------------------------------------------------------
/docs/_static/.Rhistory:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/docs/_static/.Rhistory
--------------------------------------------------------------------------------
/docs/_templates/README.md:
--------------------------------------------------------------------------------
1 | # Templates
2 |
3 | These templates are adapted from [JamesALeedham/Sphinx-Autosummary-Recursion](https://github.com/JamesALeedham/Sphinx-Autosummary-Recursion).
4 |
--------------------------------------------------------------------------------
/docs/_templates/class.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. currentmodule:: {{ module }}
4 |
5 | .. autoclass:: {{ objname }}
6 | :show-inheritance:
7 |
8 | {% block methods %}
9 | {% if methods %}
10 | .. rubric:: {{ _('Methods') }}
11 |
12 | .. autosummary::
13 | :toctree:
14 | :nosignatures:
15 | {% for item in methods %}
16 | {%- if item in members and item not in inherited_members and not item.startswith('_') %}
17 | ~{{ name }}.{{ item }}
18 | {%- endif -%}
19 | {%- endfor %}
20 | {% endif %}
21 | {% endblock %}
22 |
23 | {% block attributes %}
24 | {% if attributes %}
25 | .. rubric:: {{ _('Attributes') }}
26 |
27 | .. autosummary::
28 | {% for item in attributes %}
29 | {%- if item in members and item not in inherited_members and not item.startswith('_') %}
30 | ~{{ name }}.{{ item }}
31 | {%- endif -%}
32 | {%- endfor %}
33 | {% endif %}
34 | {% endblock %}
35 |
--------------------------------------------------------------------------------
/docs/_templates/module.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. automodule:: {{ fullname }}
4 |
5 | {% block attributes %}
6 | {% if attributes %}
7 | .. rubric:: Module attributes
8 |
9 | .. autosummary::
10 | :toctree:
11 | {% for item in attributes %}
12 | {{ item }}
13 | {%- endfor %}
14 | {% endif %}
15 | {% endblock %}
16 |
17 | {% block functions %}
18 | {% if functions %}
19 | .. rubric:: {{ _('Functions') }}
20 |
21 | .. autosummary::
22 | :toctree:
23 | :nosignatures:
24 | {% for item in functions %}
25 | {{ item }}
26 | {%- endfor %}
27 | {% endif %}
28 | {% endblock %}
29 |
30 | {% block classes %}
31 | {% if classes %}
32 | .. rubric:: {{ _('Classes') }}
33 |
34 | .. autosummary::
35 | :toctree:
36 | :template: class.rst
37 | :nosignatures:
38 | {% for item in classes %}
39 | {{ item }}
40 | {%- endfor %}
41 | {% endif %}
42 | {% endblock %}
43 |
44 | {% block exceptions %}
45 | {% if exceptions %}
46 | .. rubric:: {{ _('Exceptions') }}
47 |
48 | .. autosummary::
49 | :toctree:
50 | {% for item in exceptions %}
51 | {{ item }}
52 | {%- endfor %}
53 | {% endif %}
54 | {% endblock %}
55 |
56 | {% block modules %}
57 | {% if modules %}
58 | .. rubric:: Submodules
59 |
60 | .. autosummary::
61 | :toctree:
62 | :template: module.rst
63 | :recursive:
64 | {% for item in modules %}
65 | {{ item }}
66 | {%- endfor %}
67 | {% endif %}
68 | {% endblock %}
69 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API documentation
2 | =================
3 |
4 | This section provides detailed API documentation for all public functions
5 | and classes in the ``MaxFuse`` package.
6 |
7 | .. autosummary::
8 | :toctree: api
9 | :template: module.rst
10 | :recursive:
11 |
12 | maxfuse
13 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.construct_graph.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph.construct\_graph
2 | ==============================
3 |
4 | .. currentmodule:: maxfuse.graph
5 |
6 | .. autofunction:: construct_graph
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.get_nearest_neighbors.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph.get\_nearest\_neighbors
2 | =====================================
3 |
4 | .. currentmodule:: maxfuse.graph
5 |
6 | .. autofunction:: get_nearest_neighbors
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.get_umap_embeddings.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph.get\_umap\_embeddings
2 | ===================================
3 |
4 | .. currentmodule:: maxfuse.graph
5 |
6 | .. autofunction:: get_umap_embeddings
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.graph_clustering.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph.graph\_clustering
2 | ===============================
3 |
4 | .. currentmodule:: maxfuse.graph
5 |
6 | .. autofunction:: graph_clustering
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.leiden_clustering.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph.leiden\_clustering
2 | ================================
3 |
4 | .. currentmodule:: maxfuse.graph
5 |
6 | .. autofunction:: leiden_clustering
--------------------------------------------------------------------------------
/docs/api/maxfuse.graph.rst:
--------------------------------------------------------------------------------
1 | maxfuse.graph
2 | =============
3 |
4 | .. automodule:: maxfuse.graph
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | .. rubric:: Functions
13 |
14 | .. autosummary::
15 | :toctree:
16 | :nosignatures:
17 |
18 | construct_graph
19 | get_nearest_neighbors
20 | get_umap_embeddings
21 | graph_clustering
22 | leiden_clustering
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.address_matching_redundancy.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils.address\_matching\_redundancy
2 | ==================================================
3 |
4 | .. currentmodule:: maxfuse.match_utils
5 |
6 | .. autofunction:: address_matching_redundancy
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.get_initial_matching.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils.get\_initial\_matching
2 | ===========================================
3 |
4 | .. currentmodule:: maxfuse.match_utils
5 |
6 | .. autofunction:: get_initial_matching
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.get_refined_matching.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils.get\_refined\_matching
2 | ===========================================
3 |
4 | .. currentmodule:: maxfuse.match_utils
5 |
6 | .. autofunction:: get_refined_matching
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.get_refined_matching_one_iter.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils.get\_refined\_matching\_one\_iter
2 | ======================================================
3 |
4 | .. currentmodule:: maxfuse.match_utils
5 |
6 | .. autofunction:: get_refined_matching_one_iter
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.match_cells.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils.match\_cells
2 | =================================
3 |
4 | .. currentmodule:: maxfuse.match_utils
5 |
6 | .. autofunction:: match_cells
--------------------------------------------------------------------------------
/docs/api/maxfuse.match_utils.rst:
--------------------------------------------------------------------------------
1 | maxfuse.match\_utils
2 | ====================
3 |
4 | .. automodule:: maxfuse.match_utils
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | .. rubric:: Functions
13 |
14 | .. autosummary::
15 | :toctree:
16 | :nosignatures:
17 |
18 | address_matching_redundancy
19 | get_initial_matching
20 | get_refined_matching
21 | get_refined_matching_one_iter
22 | match_cells
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.metrics.get_foscttm.rst:
--------------------------------------------------------------------------------
1 | maxfuse.metrics.get\_foscttm
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.metrics
5 |
6 | .. autofunction:: get_foscttm
--------------------------------------------------------------------------------
/docs/api/maxfuse.metrics.get_knn_alignment_score.rst:
--------------------------------------------------------------------------------
1 | maxfuse.metrics.get\_knn\_alignment\_score
2 | ==========================================
3 |
4 | .. currentmodule:: maxfuse.metrics
5 |
6 | .. autofunction:: get_knn_alignment_score
--------------------------------------------------------------------------------
/docs/api/maxfuse.metrics.get_matching_acc.rst:
--------------------------------------------------------------------------------
1 | maxfuse.metrics.get\_matching\_acc
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.metrics
5 |
6 | .. autofunction:: get_matching_acc
--------------------------------------------------------------------------------
/docs/api/maxfuse.metrics.get_matching_alignment_score.rst:
--------------------------------------------------------------------------------
1 | maxfuse.metrics.get\_matching\_alignment\_score
2 | ===============================================
3 |
4 | .. currentmodule:: maxfuse.metrics
5 |
6 | .. autofunction:: get_matching_alignment_score
--------------------------------------------------------------------------------
/docs/api/maxfuse.metrics.rst:
--------------------------------------------------------------------------------
1 | maxfuse.metrics
2 | ===============
3 |
4 | .. automodule:: maxfuse.metrics
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | .. rubric:: Functions
13 |
14 | .. autosummary::
15 | :toctree:
16 | :nosignatures:
17 |
18 | get_foscttm
19 | get_knn_alignment_score
20 | get_matching_acc
21 | get_matching_alignment_score
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.construct_graphs.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.construct\_graphs
2 | =====================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.construct_graphs
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.filter_bad_matches.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.filter\_bad\_matches
2 | ========================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.filter_bad_matches
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.find_initial_pivots.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.find\_initial\_pivots
2 | =========================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.find_initial_pivots
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.get_embedding.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.get\_embedding
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.get_embedding
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.get_matching.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.get\_matching
2 | =================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.get_matching
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.plot_canonical_correlations.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.plot\_canonical\_correlations
2 | =================================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.plot_canonical_correlations
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.plot_matching_scores.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.plot\_matching\_scores
2 | ==========================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.plot_matching_scores
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.plot_singular_values.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.plot\_singular\_values
2 | ==========================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.plot_singular_values
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.propagate.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.propagate
2 | =============================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.propagate
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.refine_pivots.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.refine\_pivots
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.refine_pivots
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor
2 | ===================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. autoclass:: Fusor
7 | :show-inheritance:
8 |
9 |
10 |
11 | .. rubric:: Methods
12 |
13 | .. autosummary::
14 | :toctree:
15 | :nosignatures:
16 |
17 | ~Fusor.construct_graphs
18 | ~Fusor.filter_bad_matches
19 | ~Fusor.find_initial_pivots
20 | ~Fusor.get_embedding
21 | ~Fusor.get_matching
22 | ~Fusor.plot_canonical_correlations
23 | ~Fusor.plot_matching_scores
24 | ~Fusor.plot_singular_values
25 | ~Fusor.propagate
26 | ~Fusor.refine_pivots
27 | ~Fusor.split_into_batches
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.Fusor.split_into_batches.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model.Fusor.split\_into\_batches
2 | ========================================
3 |
4 | .. currentmodule:: maxfuse.model
5 |
6 | .. automethod:: Fusor.split_into_batches
--------------------------------------------------------------------------------
/docs/api/maxfuse.model.rst:
--------------------------------------------------------------------------------
1 | maxfuse.model
2 | =============
3 |
4 | .. automodule:: maxfuse.model
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | .. rubric:: Classes
17 |
18 | .. autosummary::
19 | :toctree:
20 | :template: class.rst
21 | :nosignatures:
22 |
23 | Fusor
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.rst:
--------------------------------------------------------------------------------
1 | maxfuse
2 | =======
3 |
4 | .. automodule:: maxfuse
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | .. rubric:: Submodules
25 |
26 | .. autosummary::
27 | :toctree:
28 | :template: module.rst
29 | :recursive:
30 |
31 | maxfuse.graph
32 | maxfuse.match_utils
33 | maxfuse.metrics
34 | maxfuse.model
35 | maxfuse.spatial_utils
36 | maxfuse.utils
37 |
38 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.spatial_utils.bind_spatial.rst:
--------------------------------------------------------------------------------
1 | maxfuse.spatial\_utils.bind\_spatial
2 | ====================================
3 |
4 | .. currentmodule:: maxfuse.spatial_utils
5 |
6 | .. autofunction:: bind_spatial
--------------------------------------------------------------------------------
/docs/api/maxfuse.spatial_utils.get_neighborhood_composition.rst:
--------------------------------------------------------------------------------
1 | maxfuse.spatial\_utils.get\_neighborhood\_composition
2 | =====================================================
3 |
4 | .. currentmodule:: maxfuse.spatial_utils
5 |
6 | .. autofunction:: get_neighborhood_composition
--------------------------------------------------------------------------------
/docs/api/maxfuse.spatial_utils.get_spatial_knn_indices.rst:
--------------------------------------------------------------------------------
1 | maxfuse.spatial\_utils.get\_spatial\_knn\_indices
2 | =================================================
3 |
4 | .. currentmodule:: maxfuse.spatial_utils
5 |
6 | .. autofunction:: get_spatial_knn_indices
--------------------------------------------------------------------------------
/docs/api/maxfuse.spatial_utils.rst:
--------------------------------------------------------------------------------
1 | maxfuse.spatial\_utils
2 | ======================
3 |
4 | .. automodule:: maxfuse.spatial_utils
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | .. rubric:: Functions
13 |
14 | .. autosummary::
15 | :toctree:
16 | :nosignatures:
17 |
18 | bind_spatial
19 | get_neighborhood_composition
20 | get_spatial_knn_indices
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.cca_embedding.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.cca\_embedding
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: cca_embedding
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.cdist_correlation.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.cdist\_correlation
2 | ================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: cdist_correlation
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.center_scale.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.center\_scale
2 | ===========================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: center_scale
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.dict_to_list.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.dict\_to\_list
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: dict_to_list
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.drop_zero_variability_columns.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.drop\_zero\_variability\_columns
2 | ==============================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: drop_zero_variability_columns
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.filter_bad_matches.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.filter\_bad\_matches
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: filter_bad_matches
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.get_centroids.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.get\_centroids
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: get_centroids
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.graph_smoothing.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.graph\_smoothing
2 | ==============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: graph_smoothing
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.list_to_dict.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.list\_to\_dict
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: list_to_dict
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.pearson_correlation.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.pearson\_correlation
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: pearson_correlation
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.process_count_data.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.process\_count\_data
2 | ==================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: process_count_data
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.recode.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.recode
2 | ====================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: recode
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.robust_svd.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.robust\_svd
2 | =========================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: robust_svd
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils
2 | =============
3 |
4 | .. automodule:: maxfuse.utils
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | .. rubric:: Functions
13 |
14 | .. autosummary::
15 | :toctree:
16 | :nosignatures:
17 |
18 | cca_embedding
19 | cdist_correlation
20 | center_scale
21 | dict_to_list
22 | drop_zero_variability_columns
23 | filter_bad_matches
24 | get_centroids
25 | graph_smoothing
26 | list_to_dict
27 | pearson_correlation
28 | process_count_data
29 | recode
30 | robust_svd
31 | shrink_towards_centroids
32 | sort_dict
33 | summarize_clustering
34 | svd_denoise
35 | svd_embedding
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.shrink_towards_centroids.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.shrink\_towards\_centroids
2 | ========================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: shrink_towards_centroids
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.sort_dict.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.sort\_dict
2 | ========================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: sort_dict
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.summarize_clustering.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.summarize\_clustering
2 | ===================================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: summarize_clustering
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.svd_denoise.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.svd\_denoise
2 | ==========================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: svd_denoise
--------------------------------------------------------------------------------
/docs/api/maxfuse.utils.svd_embedding.rst:
--------------------------------------------------------------------------------
1 | maxfuse.utils.svd\_embedding
2 | ============================
3 |
4 | .. currentmodule:: maxfuse.utils
5 |
6 | .. autofunction:: svd_embedding
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | r"""
2 | Sphinx configuration
3 | """
4 | # maxfuse must be importable for autodoc; it is installed via docs/requirements.txt
5 | import inspect
6 | import sphinx_autodoc_typehints
7 |
8 | project = 'MaxFuse'
9 | version = '0.0.1'
10 | release = '0.0.1'
11 | author = "Shuxiao Chen, Bokai Zhu"
12 |
13 | extensions = [
14 | 'sphinx.ext.autodoc',
15 | 'sphinx.ext.autosummary',
16 | 'sphinx.ext.intersphinx',
17 | 'sphinx.ext.napoleon',
18 | 'sphinx.ext.viewcode',
19 | 'sphinx.ext.mathjax',
20 | 'sphinx_autodoc_typehints',
21 | 'sphinx_copybutton',
22 | 'nbsphinx'
23 | ]
24 |
25 | templates_path = ['_templates']
26 | html_static_path = ['_static']
27 | html_css_files = ['custom.css']
28 | source_suffix = '.rst'
29 | master_doc = 'index'
30 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
31 |
32 | html_show_sourcelink = True
33 | set_type_checking_flag = True
34 | typehints_fully_qualified = True
35 | napoleon_use_rtype = False
36 | autosummary_generate = True
37 | autosummary_generate_overwrite = True
38 | autodoc_preserve_defaults = True
39 | autodoc_inherit_docstrings = True
40 | autodoc_default_options = {
41 | 'autosummary': True
42 | }
43 |
44 | # html_favicon = '_static/favicon.ico'
45 | html_theme = 'sphinx_rtd_theme'
46 |
47 | intersphinx_mapping = dict(
48 | python=('https://docs.python.org/3/', None),
49 | numpy=('https://numpy.org/doc/stable/', None),
50 | scipy=('https://docs.scipy.org/doc/scipy/', None),
51 | pandas=('https://pandas.pydata.org/pandas-docs/stable/', None),
52 | sklearn=('https://scikit-learn.org/stable/', None),
53 | matplotlib=('https://matplotlib.org/stable/', None),
54 | seaborn=('https://seaborn.pydata.org/', None),
55 | # networkx=('https://networkx.org/documentation/stable/', None),
56 | anndata=('https://anndata.readthedocs.io/en/stable/', None),
57 | scanpy=('https://scanpy.readthedocs.io/en/stable/', None),
58 | # torch=('https://pytorch.org/docs/stable/', None),
59 | ignite=('https://pytorch.org/ignite/', None),
60 | # plotly=('https://plotly.com/python-api-reference/', None)
61 | )
62 |
63 | qualname_overrides = {
64 | 'anndata._core.anndata.AnnData': 'anndata.AnnData',
65 | 'matplotlib.axes._axes.Axes': 'matplotlib.axes.Axes',
66 | 'numpy.random.mtrand.RandomState': 'numpy.random.RandomState',
67 | 'pandas.core.frame.DataFrame': 'pandas.DataFrame',
68 | 'scipy.sparse.base.spmatrix': 'scipy.sparse.spmatrix',
69 | 'seaborn.axisgrid.JointGrid': 'seaborn.JointGrid',
70 | }
71 |
72 | fa_orig = sphinx_autodoc_typehints.format_annotation
73 |
def format_annotation(annotation, config, fully_qualified=True):  # pylint: disable=unused-argument
    r"""
    Render a type annotation, preferring the public alias of known classes.

    Classes whose ``module.qualname`` appears in ``qualname_overrides`` are
    rendered as a ``:py:class:`` cross-reference to the overriding public
    name. Everything else is delegated to the original
    ``sphinx_autodoc_typehints.format_annotation`` (saved as ``fa_orig``).

    Adapted from https://github.com/agronholm/sphinx-autodoc-typehints/issues/38#issuecomment-448517805
    """
    # Non-class annotations (e.g. typing constructs) are never overridden.
    if not inspect.isclass(annotation):
        return fa_orig(annotation, config)

    qualified_name = f'{annotation.__module__}.{annotation.__qualname__}'
    public_name = qualname_overrides.get(qualified_name)
    if public_name is None:
        # No override registered for this class: use the stock formatter.
        return fa_orig(annotation, config)
    return f':py:class:`~{public_name}`'
84 | sphinx_autodoc_typehints.format_annotation = format_annotation
85 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | MaxFuse Documentation
2 | ==================================
3 |
4 | ``MaxFuse`` (``Ma``\ tching ``x``\ cross modalities via ``Fu``\ zzy ``s``\ moothed ``e``\ mbeddings)
5 | is a python package for integrating single-cell datasets from different modalities with no overlapping features and/or under low signal-to-noise ratio regimes.
6 | For most single-cell cross modality integration methods, the feasibility of cross-modal integration relies on the existence of highly correlated, a priori 'linked' features.
7 | When such linked features are few or uninformative, a scenario that we call 'weak linkage', existing methods fail.
8 | We developed MaxFuse, a cross-modal data integration method that, through iterative co-embedding, data smoothing, and cell matching, leverages all information in each modality to obtain high-quality integration.
9 | A prototypical example of weak linkage is the integration of spatial proteomic data with single-cell sequencing data.
10 | For details, please refer to `the manuscript `__.
11 |
12 |
13 | ***************
14 | Getting started
15 | ***************
16 |
17 | The ``MaxFuse`` package can be installed via pip:
18 |
19 | .. code-block:: bash
20 | :linenos:
21 |
22 | conda create -n maxfuse python=3.8
23 | conda activate maxfuse
24 | python -m pip install maxfuse
25 |
26 | .. note::
27 | To avoid potential dependency conflicts, we recommend
28 | installing within a Python virtual environment, such as a conda environment.
29 |
30 | Now you are all set! Please proceed to `tutorials `__
31 | for a list of examples.
32 |
33 | Note that when integrating single-cell data across protein and RNA modalities,
34 | the nomenclature of corresponding features often differs (e.g., the mRNA ITGAM may be named CD11b-1 when measured with an antibody).
35 | We gathered a `.csv `__ file that covers many such naming conversions and is used during the MaxFuse process.
36 | Of course, this list is not complete, and users should manually add any naming conversions that are missing from this .csv file.
37 |
38 |
39 | ***************
40 | Code archive
41 | ***************
42 | The analysis presented in `the manuscript `__ was also
43 | deposited in `this `__ GitHub repository, under `this `__ folder.
44 | Note that the manuscript used a development version of MaxFuse with a slightly different syntax; that version can also be found there.
45 | If you require additional information on the analysis/data, please contact Zongming Ma (zongming@wharton.upenn.edu).
46 |
47 |
48 | ***************
49 | License
50 | ***************
51 | MaxFuse is released under the `Academic Software License Agreement `__; please use it accordingly.
52 |
53 |
54 |
55 | .. toctree::
56 | :maxdepth: 2
57 | :caption: Contents
58 |
59 | tutorials
60 | api
61 |
62 | ******************
63 | Indices and tables
64 | ******************
65 |
66 | * :ref:`genindex`
67 | * :ref:`modindex`
68 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | maxfuse
2 | sphinx_autodoc_typehints
3 | sphinx_copybutton
4 | nbsphinx
5 | sphinx_rtd_theme
6 |
--------------------------------------------------------------------------------
/docs/tutorials.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
4 | .. toctree::
5 |
6 | citeseq_pbmc_evaluate.ipynb
7 | tonsil_codex_rnaseq.ipynb
8 |
--------------------------------------------------------------------------------
/maxfuse/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | MaxFuse
3 | """
4 | from . import graph, match_utils, metrics, model, spatial_utils, utils
--------------------------------------------------------------------------------
/maxfuse/spatial_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions for dealing with spatial data
3 | """
4 |
5 | import numpy as np
6 | from sklearn.neighbors import NearestNeighbors
7 |
8 |
def bind_spatial(features, nbhd, wt_on_features=0.7):
    """
    Return a new array of form
    [wt_on_features * features / feature_norm, (1-wt_on_features) * nbhd / nbhd_norm]

    Each input is divided by its own overall norm (``np.linalg.norm`` of the
    whole matrix) so that the two kinds of information are on a comparable
    scale before the weights are applied.

    Parameters
    ----------
    features: array-like of shape (n_samples, n_features)
        Feature matrix
    nbhd: array-like of shape (n_samples, n_clusters)
        Cell neighborhood composition matrix
    wt_on_features: float, default=0.7
        Weight to put on the feature matrix; the neighborhood matrix
        receives weight ``1 - wt_on_features``.

    Returns
    -------
    res: np.ndarray of shape (n_samples, n_features+n_clusters)
        Column-wise concatenation of the two weighted, normalized matrices.
    """
    # Accept any array-like (lists, DataFrames, ...) rather than only ndarrays.
    features = np.asarray(features)
    nbhd = np.asarray(nbhd)

    # normalize two kinds of info for easier tuning of weight
    feature_norm = np.linalg.norm(features)
    nbhd_norm = np.linalg.norm(nbhd)
    # An all-zero matrix has norm 0; dividing by it would silently fill the
    # output with NaN. Such a block stays all-zero instead.
    if feature_norm == 0:
        feature_norm = 1.0
    if nbhd_norm == 0:
        nbhd_norm = 1.0

    return np.concatenate((
        wt_on_features * features / feature_norm,
        (1 - wt_on_features) * nbhd / nbhd_norm
    ), axis=1)
35 |
36 |
def get_spatial_knn_indices(locations, n_neighbors=15, method='kd_tree'):
    """
    Find the k nearest neighbors of every location among the locations themselves.

    Parameters
    ----------
    locations: np.ndarray of shape (n_samples, 2)
        Spatial coordinates, one row per sample
    n_neighbors: int
        Number of nearest neighbors; must not exceed n_samples
    method: str, default='kd_tree'
        Algorithm passed to sklearn's NearestNeighbors,
        one of ['ball_tree', 'kd_tree', 'brute']

    Returns
    -------
    knn_indices: np.ndarray of shape (n_samples, n_neighbors)
        Row i holds the indices of the nearest neighbors of sample i
    """
    coords = np.array(locations)
    assert n_neighbors <= coords.shape[0]

    # The neighbor relation may be asymmetric: j being among the neighbors
    # of i does not imply that i is among the neighbors of j.
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm=method)
    knn_model.fit(coords)
    # The fitted points are also the query points; only indices are kept.
    _distances, knn_indices = knn_model.kneighbors(coords)
    return knn_indices
62 |
63 |
def get_neighborhood_composition(knn_indices, labels, log1p=False):
    """
    Count, for each sample, how many of its neighbors fall in each cluster.

    Parameters
    ----------
    knn_indices: np.ndarray of shape (n_samples, n_neighbors)
        Each row represents the knn of that sample; an entry of -1 marks a
        missing neighbor and is ignored
    labels: array-like of shape (n_samples, )
        Cluster labels, indexed by the entries of knn_indices
    log1p: bool, default=False
        Whether to apply log1p transformation to the counts

    Returns
    -------
    comp: np.ndarray of shape (n_samples, n_clusters)
        comp[i, c] is the number of neighbors of sample i (raw counts, not
        proportions) whose label is the c-th entry of the sorted unique
        labels; log1p of that count if log1p is True.
    """
    knn_indices = np.asarray(knn_indices)
    n_samples, _ = knn_indices.shape

    # label_codes[j] is the column (position among sorted unique labels)
    # that sample j contributes to.
    unique_clusters, label_codes = np.unique(list(labels), return_inverse=True)
    label_codes = np.ravel(label_codes)

    comp = np.zeros((n_samples, len(unique_clusters)))
    # Skip the -1 placeholders, then accumulate one count per (row, cluster)
    # pair; np.add.at handles repeated pairs correctly.
    is_real_neighbor = knn_indices != -1
    row_idx = np.nonzero(is_real_neighbor)[0]
    col_idx = label_codes[knn_indices[is_real_neighbor]]
    np.add.at(comp, (row_idx, col_idx), 1)

    if log1p:
        comp = np.log1p(comp)
    return comp
97 |
--------------------------------------------------------------------------------
/media/ai_generated_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/media/ai_generated_icon.png
--------------------------------------------------------------------------------
/media/fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shuxiaoc/maxfuse/7ccf6b4a32e01d013265b9c72ade8878d3172aa4/media/fig1.png
--------------------------------------------------------------------------------
/media/temp.md:
--------------------------------------------------------------------------------
1 | temp.md
2 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "maxfuse"
7 | version = "0.0.2"
8 | authors = [
9 | { name="Shuxiao Chen", email="shuxiaoc@gmail.com" },
10 | { name="Bokai Zhu", email="bkzhu@stanford.edu" },
11 | ]
12 | description = "Cross-modality matching of single cells via iterative fuzzy smoothed embedding"
13 | readme = "README.md"
14 | requires-python = ">=3.8"
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 | "igraph",
21 | "leidenalg",
22 | "numpy",
23 | "pandas",
24 | "scanpy",
25 | "scipy",
26 | "scikit-learn",
27 | "matplotlib",
28 | "requests",
29 | ]
30 |
31 | [project.urls]
32 | "Homepage" = "https://github.com/shuxiaoc/maxfuse"
33 | "Bug Tracker" = "https://github.com/shuxiaoc/maxfuse/issues"
34 |
35 | [options.packages.find]
36 | where = '.'
37 |
--------------------------------------------------------------------------------