├── .gitattributes ├── .gitignore ├── BrainAlign ├── .gitattributes ├── README.md ├── SR_RSC │ ├── .ipynb_checkpoints │ │ ├── embedder-checkpoint.py │ │ ├── evaluation-checkpoint.py │ │ └── main_sr_rsc-checkpoint.py │ ├── LICENSE │ ├── README.md │ ├── embedder.py │ ├── evaluation.py │ ├── layers │ │ ├── GCN.py │ │ ├── GCN2.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── discriminator.py │ │ └── fc.py │ ├── main.py │ ├── main_sr_rsc.py │ ├── models │ │ ├── .ipynb_checkpoints │ │ │ └── SubHIN-checkpoint.py │ │ ├── SubHIN.py │ │ └── __init__.py │ ├── test_input_data.py │ └── utils │ │ └── process.py ├── __init__.py ├── brain_analysis │ ├── .ipynb_checkpoints │ │ ├── analysis-checkpoint.py │ │ ├── analysis_anatomical-checkpoint.py │ │ ├── analysis_general-checkpoint.py │ │ ├── analysis_genomic-checkpoint.py │ │ ├── analysis_main-checkpoint.py │ │ ├── analysis_spatial-checkpoint.py │ │ ├── analysis_utils-checkpoint.py │ │ ├── data_utils-checkpoint.py │ │ ├── pipeline-checkpoint.py │ │ ├── process-checkpoint.py │ │ └── utils-checkpoint.py │ ├── __init__.py │ ├── analysis.py │ ├── analysis_anatomical.py │ ├── analysis_general.py │ ├── analysis_genomic.py │ ├── analysis_main.py │ ├── analysis_spatial.py │ ├── analysis_utils.py │ ├── configs │ │ ├── .ipynb_checkpoints │ │ │ └── sr_rsc_config_binary-checkpoint.py │ │ ├── __init__.py │ │ ├── came_config_binary.py │ │ ├── heco_config.py │ │ ├── heco_config_all.py │ │ ├── heco_config_binary.py │ │ ├── heco_config_binary_2020sa.py │ │ ├── heco_config_three_2020sa.py │ │ ├── sr_rsc_config.py │ │ └── sr_rsc_config_binary.py │ ├── data_utils.py │ ├── logger.py │ ├── metrics.py │ ├── pipeline.py │ ├── pipline_analysis_alignment.py │ ├── process.py │ ├── r_analysis │ │ ├── .Rhistory │ │ ├── genomic_findmarkers.R │ │ ├── genomic_functions.R │ │ ├── includes.R │ │ ├── test.R │ │ ├── transform2seurat.R │ │ ├── transform2seurat_cluster.R │ │ ├── transform_adata.py │ │ └── transform_adata_cluster.py │ ├── r_gene_comparison │ │ ├── barplot_degs.R │ │ └── fanPlotScript-compact.R │ ├── typehint.py │ └── utils.py ├── came │ ├── PARAMETERS.py │ ├── __init__.py │ ├── datapair │ │ ├── __init__.py │ │ ├── aligned.py │ │ └── unaligned.py │ ├── model │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ ├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ ├── loss.py │ │ └── v0 │ │ │ ├── __init__.py │ │ │ ├── _minibatch.py │ │ │ ├── _predict.py │ │ │ ├── _utils.py │ │ │ ├── base_layers.py │ │ │ ├── cgc.py │ │ │ ├── cggc.py │ │ │ ├── heteroframe.py │ │ │ ├── hidden.py │ │ │ └── loss.py │ ├── pipeline.py │ └── utils │ │ ├── __init__.py │ │ ├── _alluvial.py │ │ ├── _base_trainer.py │ │ ├── _get_example_data.py │ │ ├── _io_h5py.py │ │ ├── analyze.py │ │ ├── base.py │ │ ├── downsample_counts.py │ │ ├── evaluation.py │ │ ├── plot.py │ │ ├── preprocess.py │ │ ├── train.py │ │ └── train_v0.py ├── code │ ├── __init__.py │ ├── embeds │ │ ├── acm │ │ │ └── README.md │ │ ├── aminer │ │ │ └── README.md │ │ ├── dblp │ │ │ └── README.md │ │ └── freebase │ │ │ └── README.md │ ├── main.py │ ├── main_heco.py │ ├── main_parallel.py │ ├── module │ │ ├── __init__.py │ │ ├── contrast.py │ │ ├── heco.py │ │ ├── mp_encoder.py │ │ └── sc_encoder.py │ ├── predict.py │ └── utils │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── load_data.py │ │ ├── logger.py │ │ ├── logreg.py │ │ └── params.py ├── data │ ├── Demo_mouse_human_wholebrain │ │ ├── Data │ │ │ └── load_one2one.ipynb │ │ ├── Demo_run-region.ipynb │ │ └── 
Demo_run.ipynb │ ├── SlideseqV2_mouse_macaque_hippocampus │ │ ├── Data │ │ │ └── Untitled.ipynb │ │ └── Hmerfish_run.ipynb │ ├── __init__.py │ ├── data_utils.py │ ├── load_node_feature_mouse_human.py │ ├── mouse_human_wholebrain │ │ ├── Data │ │ │ └── load_one2one.ipynb │ │ └── W_run.ipynb │ ├── mouse_macaque_hippocampus │ │ ├── Data │ │ │ └── Untitled.ipynb │ │ ├── H_run.ipynb │ │ ├── H_run.py │ │ └── H_run_region.ipynb │ ├── mp_gen.py │ ├── mp_gen_mouse_human.py │ ├── neibor.py │ ├── pos.py │ └── script_labels.py └── demo │ └── subsample.py ├── LICENSE ├── README.md ├── readme_figs ├── alldatasets │ ├── all_dataset_seurate_alignment_score.png │ └── all_dataset_umap_integration.png └── subsampled │ ├── subsampled_seurate_alignment_score.png │ └── subsampled_umap.png ├── requirements_pip.txt └── run_came ├── __init__.py ├── analysis_script ├── .ipynb_checkpoints │ ├── H_run_came-checkpoint.ipynb │ ├── H_run_came-checkpoint.py │ └── run_came-checkpoint.py ├── H_run_came.ipynb ├── H_run_came.py ├── load_human_region_tree_v2.py ├── load_mouse_2020sa.py ├── load_mouse_region_tree.R ├── load_mouse_region_tree.py ├── load_mouse_region_tree_v2.py ├── load_part_expression.py ├── load_part_expression_6regions.py ├── read_rhesus_2018s.R ├── read_rhesus_2018s.rmd ├── run_came.py ├── run_came_demo.py └── test_tree.py ├── analysis_utils ├── .ipynb_checkpoints │ ├── homo_random_config-checkpoint.py │ ├── ttest_plot_utils-checkpoint.py │ └── ttest_plot_utils_origin-checkpoint.py ├── __init__.py ├── homo_random_config.py ├── logger.py ├── tree_tools.R ├── ttest_plot_utils.py └── ttest_plot_utils_origin.py ├── brain_human_mouse └── get_human_acronym_color.py ├── brain_mouse_2020sa └── human_gene_palette │ └── 2011-12-16203C-Supplementary_Table8.xls ├── brain_voxel_sample_mouse_human_pipline.py ├── came ├── .ipynb_checkpoints │ ├── PARAMETERS-checkpoint.py │ ├── __init__-checkpoint.py │ └── pipeline-checkpoint.py ├── PARAMETERS.py ├── __init__.py ├── datapair │ ├── .ipynb_checkpoints │ │ └── unaligned-checkpoint.py │ ├── __init__.py │ ├── aligned.py │ └── unaligned.py ├── model │ ├── .ipynb_checkpoints │ │ └── _utils-checkpoint.py │ ├── __init__.py │ ├── _minibatch.py │ ├── _predict.py │ ├── _utils.py │ ├── base_layers.py │ ├── cgc.py │ ├── cggc.py │ ├── heteroframe.py │ ├── hidden.py │ ├── loss.py │ └── v0 │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ ├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ └── loss.py ├── pipeline.py └── utils │ ├── .ipynb_checkpoints │ ├── _get_example_data-checkpoint.py │ ├── preprocess-checkpoint.py │ └── train-checkpoint.py │ ├── __init__.py │ ├── _alluvial.py │ ├── _base_trainer.py │ ├── _get_example_data.py │ ├── _io_h5py.py │ ├── analyze.py │ ├── base.py │ ├── downsample_counts.py │ ├── evaluation.py │ ├── plot.py │ ├── preprocess.py │ ├── train.py │ └── train_v0.py ├── came_origin ├── .ipynb_checkpoints │ ├── PARAMETERS-checkpoint.py │ ├── __init__-checkpoint.py │ └── pipeline-checkpoint.py ├── PARAMETERS.py ├── __init__.py ├── datapair │ ├── .ipynb_checkpoints │ │ └── unaligned-checkpoint.py │ ├── __init__.py │ ├── aligned.py │ └── unaligned.py ├── model │ ├── .ipynb_checkpoints │ │ └── _utils-checkpoint.py │ ├── __init__.py │ ├── _minibatch.py │ ├── _predict.py │ ├── _utils.py │ ├── base_layers.py │ ├── cgc.py │ ├── cggc.py │ ├── heteroframe.py │ ├── hidden.py │ ├── loss.py │ └── v0 │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ 
├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ └── loss.py ├── pipeline.py └── utils │ ├── .ipynb_checkpoints │ ├── preprocess-checkpoint.py │ └── train-checkpoint.py │ ├── __init__.py │ ├── _alluvial.py │ ├── _base_trainer.py │ ├── _get_example_data.py │ ├── _io_h5py.py │ ├── analyze.py │ ├── base.py │ ├── downsample_counts.py │ ├── evaluation.py │ ├── plot.py │ ├── preprocess.py │ ├── train.py │ └── train_v0.py ├── heco_utils.py └── load_brain_voxel_sample_mouse_human.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea/ 3 | *.txt 4 | *.bin 5 | *.json 6 | *.log 7 | *.pkl 8 | *.svg 9 | #*.png 10 | *.pt 11 | *.h5 12 | 13 | *.obj 14 | *.gz 15 | *.h5ad 16 | *.pickle 17 | *.tsv 18 | *.csv 19 | *.zip 20 | *.tiff 21 | *.Rdata 22 | 23 | *.mtx 24 | *.rds 25 | 26 | *.svg 27 | 28 | *.npz 29 | *.npy 30 | 31 | ./BrainAlign/**/.npz 32 | __pycache__/ 33 | 34 | .Rproj.user 35 | -------------------------------------------------------------------------------- /BrainAlign/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /BrainAlign/README.md: -------------------------------------------------------------------------------- 1 | # BrainAlign 2 | This repo is for source code of "Whole Brain Alignment of Spatial Transcriptomics between Humans and Mice with BrainAlign". 
3 | 4 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/.ipynb_checkpoints/evaluation-checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from scipy.cluster.hierarchy import dendrogram, linkage, fcluster 4 | from scipy.spatial.distance import pdist 5 | from sklearn.cluster import KMeans 6 | from sklearn import metrics 7 | from sklearn.linear_model import LogisticRegression 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.svm import SVC 12 | from sklearn.preprocessing import StandardScaler 13 | from scipy.spatial.distance import cdist 14 | import logging 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def purity_score(y_true, y_pred): 19 | contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) 20 | return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 21 | 22 | def sigmoid(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | class evaluation_metrics(): 26 | def __init__(self, embs, labels, logger): 27 | 28 | self.embs = embs 29 | train, val, test = labels 30 | 31 | self.logger = logger 32 | 33 | self.trX, self.trY = self.embs[np.array(train)[:,0]], np.array(train)[:,1] 34 | self.valX, self.valY = self.embs[np.array(val)[:,0]], np.array(val)[:,1] 35 | self.tsX, self.tsY = self.embs[np.array(test)[:,0]], np.array(test)[:,1] 36 | self.n_label = len(set(self.tsY)) 37 | 38 | self.val_acc = self.evaluate_cluster() 39 | 40 | 41 | def evaluation_lp(self, node1, node2, label): 42 | 43 | X1, X2 = [], [] 44 | cnt = 0 45 | error = 0 46 | prob = [] 47 | preds = [] 48 | 49 | meanvec = np.mean(self.embs, 0) 50 | for i in range(len(node1)): 51 | n1 = int(node1[i]) 52 | n2 = int(node2[i]) 53 | X1 = self.embs[n1] 54 | X2 = self.embs[n2] 55 | 56 | if X1.sum() == 0: 57 | cnt+= 1 58 | X1 = meanvec 59 | if X2.sum() == 0: 60 | cnt+= 1 61 | X2 = meanvec 62 | r = X1.dot(X2) 63 | prob.append(r) 64 | if r >= 0.5: 65 | r = 1 66 | else: 67 | r = 0 68 | preds.append(r) 69 | if r != label[i]: 70 | error += 1 71 | 72 | auc = metrics.roc_auc_score(label, prob) 73 | precision, recall, thresholds = metrics.precision_recall_curve(label, prob) 74 | pr = metrics.auc(recall, precision) 75 | ap = metrics.average_precision_score(label, prob, average=None) 76 | acc = metrics.accuracy_score(label, preds) 77 | f1_micro = metrics.f1_score(label, preds, average='micro') 78 | f1_macro = metrics.f1_score(label, preds, average='macro') 79 | self.logger.info('AUC: %.5f, AP: %.5f, PR: %.5f, ACC: %.5f, F1_micro: %.5f, F1_macro: %.5f'%(auc, ap, pr, acc, f1_micro, f1_macro)) 80 | 81 | def evalutation(self): 82 | 83 | nmis, adjscores, puritys, fis, fas = 0,0,0,0,0 84 | # for rs in [0,123,432,6543,8478643]: 85 | for rs in [0]: 86 | kmeans = KMeans(n_clusters=self.n_label, random_state=rs).fit(self.tsX) 87 | preds = kmeans.predict(self.tsX) 88 | nmi = metrics.normalized_mutual_info_score(labels_true=self.tsY, labels_pred=np.array(preds)) 89 | adjscore = metrics.adjusted_rand_score(self.tsY, np.array(preds)) 90 | purity = purity_score(self.tsY, np.array(preds)) 91 | nmis += nmi 92 | adjscores += adjscore 93 | puritys+=purity 94 | 95 | lr = LogisticRegression(max_iter=500, random_state=rs, solver='sag') 96 | lr.fit(self.trX, self.trY) 97 | Y_pred = lr.predict(self.tsX) 98 | f1_micro = 
metrics.f1_score(self.tsY, Y_pred, average='micro') 99 | f1_macro = metrics.f1_score(self.tsY, Y_pred, average='macro') 100 | fis+=f1_micro 101 | fas+=f1_macro 102 | self.logger.info('NMI=%.5f, ARI: %.5f, f1_micro=%.5f, f1_macro=%.5f' % (nmis, adjscores, fis, fas)) 103 | 104 | 105 | def evaluate_cluster(self): 106 | 107 | kmeans = KMeans(n_clusters=self.n_label, random_state=0).fit(self.valX) 108 | preds = kmeans.predict(self.trX) 109 | nmi = metrics.normalized_mutual_info_score( labels_true=self.trY, labels_pred=np.array(preds)) 110 | return nmi 111 | 112 | def evaluate_clf(self): 113 | r"""Evaluates latent space quality via a logistic regression downstream task.""" 114 | clf = LogisticRegression(max_iter=500, random_state=0, solver='lbfgs').fit(self.trX, self.trY) 115 | val_acc = clf.score(self.valX, self.valY) 116 | return val_acc 117 | 118 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 RuixZh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/README.md: -------------------------------------------------------------------------------- 1 | # SR-RSC 2 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from scipy.cluster.hierarchy import dendrogram, linkage, fcluster 4 | from scipy.spatial.distance import pdist 5 | from sklearn.cluster import KMeans 6 | from sklearn import metrics 7 | from sklearn.linear_model import LogisticRegression 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.svm import SVC 12 | from sklearn.preprocessing import StandardScaler 13 | from scipy.spatial.distance import cdist 14 | import logging 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def purity_score(y_true, y_pred): 19 | contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) 20 | return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 21 | 22 | def sigmoid(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | class evaluation_metrics(): 26 | def __init__(self, embs, labels, logger): 27 | 28 | self.embs = embs 29 | train, val, test = labels 30 | 31 | self.logger = logger 32 | 33 | self.trX, self.trY = self.embs[np.array(train)[:,0]], np.array(train)[:,1] 34 | self.valX, self.valY = self.embs[np.array(val)[:,0]], np.array(val)[:,1] 35 | self.tsX, self.tsY = self.embs[np.array(test)[:,0]], np.array(test)[:,1] 36 | self.n_label = len(set(self.tsY)) 37 | 38 | self.val_acc = self.evaluate_cluster() 39 | 40 | 41 | def evaluation_lp(self, node1, node2, label): 42 | 43 | X1, X2 = [], [] 44 | cnt = 0 45 | error = 0 46 | prob = [] 47 | preds = [] 48 | 49 | meanvec = np.mean(self.embs, 0) 50 | for i in range(len(node1)): 51 | n1 = int(node1[i]) 52 | n2 = int(node2[i]) 53 | X1 = self.embs[n1] 54 | X2 = self.embs[n2] 55 | 56 | if X1.sum() == 0: 57 | cnt+= 1 58 | X1 = meanvec 59 | if X2.sum() == 0: 60 | cnt+= 1 61 | X2 = meanvec 62 | r = X1.dot(X2) 63 | prob.append(r) 64 | if r >= 0.5: 65 | r = 1 66 | else: 67 | r = 0 68 | preds.append(r) 69 | if r != label[i]: 70 | error += 1 71 | 72 | auc = metrics.roc_auc_score(label, prob) 73 | precision, recall, thresholds = metrics.precision_recall_curve(label, prob) 74 | pr = metrics.auc(recall, precision) 75 | ap = metrics.average_precision_score(label, prob, average=None) 76 | acc = metrics.accuracy_score(label, preds) 77 | f1_micro = metrics.f1_score(label, preds, average='micro') 78 | f1_macro = metrics.f1_score(label, preds, average='macro') 79 | self.logger.info('AUC: %.5f, AP: %.5f, PR: %.5f, ACC: %.5f, F1_micro: %.5f, F1_macro: %.5f'%(auc, ap, pr, acc, f1_micro, f1_macro)) 80 | 81 | def evalutation(self): 82 | 83 | nmis, adjscores, puritys, fis, fas = 0,0,0,0,0 84 | # for rs in [0,123,432,6543,8478643]: 85 | for rs in [0]: 86 | kmeans = KMeans(n_clusters=self.n_label, random_state=rs).fit(self.tsX) 87 | preds = kmeans.predict(self.tsX) 88 | nmi = metrics.normalized_mutual_info_score(labels_true=self.tsY, labels_pred=np.array(preds)) 89 | adjscore = metrics.adjusted_rand_score(self.tsY, np.array(preds)) 90 | purity = purity_score(self.tsY, np.array(preds)) 91 | nmis += nmi 92 | adjscores += adjscore 93 | 
puritys+=purity 94 | 95 | lr = LogisticRegression(max_iter=500, random_state=rs, solver='sag') 96 | lr.fit(self.trX, self.trY) 97 | Y_pred = lr.predict(self.tsX) 98 | f1_micro = metrics.f1_score(self.tsY, Y_pred, average='micro') 99 | f1_macro = metrics.f1_score(self.tsY, Y_pred, average='macro') 100 | fis+=f1_micro 101 | fas+=f1_macro 102 | self.logger.info('NMI=%.5f, ARI: %.5f, f1_micro=%.5f, f1_macro=%.5f' % (nmis, adjscores, fis, fas)) 103 | 104 | 105 | def evaluate_cluster(self): 106 | 107 | kmeans = KMeans(n_clusters=self.n_label, random_state=0).fit(self.valX) 108 | preds = kmeans.predict(self.trX) 109 | nmi = metrics.normalized_mutual_info_score( labels_true=self.trY, labels_pred=np.array(preds)) 110 | return nmi 111 | 112 | def evaluate_clf(self): 113 | r"""Evaluates latent space quality via a logistic regression downstream task.""" 114 | clf = LogisticRegression(max_iter=500, random_state=0, solver='lbfgs').fit(self.trX, self.trY) 115 | val_acc = clf.score(self.valX, self.valY) 116 | return val_acc 117 | 118 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/GCN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class GCN(nn.Module): 7 | def __init__(self, in_ft, out_ft, act=nn.PReLU(), drop_prob=0.0, isBias=False): 8 | super().__init__() 9 | self.linear = nn.Linear(in_ft, out_ft, bias=False) 10 | 11 | # if isBias: 12 | # self.bias = nn.Parameter(torch.empty(out_ft)) 13 | # self.bias.data.fill_(0.0) 14 | # else: 15 | # self.register_parameter('bias', None) 16 | 17 | self.act = act 18 | self.isBias = isBias 19 | self.drop_prob = drop_prob 20 | 21 | for m in self.modules(): 22 | self.weights_init(m) 23 | 24 | def weights_init(self, m): 25 | if isinstance(m, nn.Linear): 26 | torch.nn.init.xavier_uniform_(m.weight.data) 27 | if m.bias is not None: 28 | m.bias.data.fill_(0.0) 29 | 30 | def forward(self, emb): 31 | # emb (batch_size, ft) 32 | # emb = F.dropout(emb, self.drop_prob, training=self.training) 33 | e = self.linear(emb) # (batch_size, d) 34 | # if self.isBias: 35 | # e += self.bias 36 | e_out = self.act(e) 37 | return e_out 38 | 39 | 40 | class GNN(nn.Module): 41 | def __init__(self, nb_rel, in_ft, out_ft, act=nn.PReLU(), drop_prob=0.5, isBias=False): 42 | super().__init__() 43 | self.encoder = nn.ModuleList() 44 | for i in range(nb_rel): 45 | self.encoder.append(GCN(in_ft, out_ft, act=act, isBias=isBias)) 46 | 47 | def forward(self, embs): 48 | outs = [] 49 | for emb in embs: # emb (batch_size, ft) 50 | outs.append(self.encoder(emb)) 51 | outs = torch.stack(outs, 0) # outs (nb_rel, batch_size, ft) 52 | return outs 53 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/GCN2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class GCN(nn.Module): 8 | def __init__(self, nfeat, nhid, dropout=0.5, isBias=False): 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.empty(nfeat, nhid)) 11 | nn.init.xavier_uniform_(self.weight) 12 | if isBias: 13 | self.bias = nn.Parameter(torch.empty(nhid)) 14 | self.bias.data.fill_(0.0) 15 | else: 16 | self.register_parameter('bias', None) 17 | self.dropout = dropout 18 | self.act = nn.ReLU() 19 | 20 | 21 | def forward(self, adj, x): 22 | 
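        # Descriptive note on the propagation below: project the node features with the
        # learned weight matrix, aggregate over the (sparse) adjacency via spmm, add the
        # optional bias, and apply the ReLU set in __init__.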
support = torch.mm(x, self.weight) 23 | output = torch.spmm(adj, support) 24 | if self.bias is not None: 25 | output = output + self.bias 26 | return self.act(output) 27 | 28 | 29 | class DGCN(nn.Module): 30 | def __init__(self, v_in_ft, u_in_ft, out_ft, act=nn.PReLU(), drop_prob=0.5, isBias=False): 31 | super().__init__() 32 | 33 | self.v_gc1 = GCN(nfeat=v_in_ft, 34 | nhid=out_ft, 35 | dropout=drop_prob) 36 | self.v_gc2 = GCN(nfeat=out_ft, 37 | nhid=out_ft, 38 | dropout=drop_prob) 39 | 40 | self.u_gc1 = GCN(nfeat=u_in_ft, 41 | nhid=out_ft, 42 | dropout=drop_prob) 43 | self.u_gc2 = GCN(nfeat=out_ft, 44 | nhid=out_ft, 45 | dropout=drop_prob) 46 | self.u_fc = nn.Linear(out_ft + u_in_ft, out_ft) 47 | nn.init.xavier_uniform_(self.u_fc.weight.data) 48 | self.v_fc = nn.Linear(out_ft + v_in_ft, out_ft) 49 | nn.init.xavier_uniform_(self.v_fc.weight.data) 50 | self.u_fc2 = nn.Linear(out_ft , out_ft) 51 | nn.init.xavier_uniform_(self.u_fc.weight.data) 52 | self.v_fc2 = nn.Linear(out_ft , out_ft) 53 | nn.init.xavier_uniform_(self.v_fc.weight.data) 54 | 55 | self.act = act 56 | self.drop_prob = drop_prob 57 | self.isBias = isBias 58 | 59 | def forward(self, uv_adj, vu_adj, ufea, vfea): 60 | # emb (batch_size, ft) 61 | # u = F.dropout(ufea, self.drop_prob, training=self.training) 62 | # v = F.dropout(vfea, self.drop_prob, training=self.training) 63 | 64 | vu = self.u_gc1(vu_adj, ufea) 65 | uv = self.v_gc1(uv_adj, vfea) 66 | 67 | uv2 = self.v_gc2(uv_adj, vu) 68 | vu2 = self.u_gc2(vu_adj, uv) 69 | 70 | Hv = torch.cat((vu2, vfea), dim=1) 71 | Hu = torch.cat((uv2, ufea), dim=1) 72 | 73 | Hv = nn.ReLU()(self.v_fc(Hv)) # (batch_size, d) 74 | Hu = nn.ReLU()(self.u_fc(Hu)) # (batch_size, d) 75 | Hv = self.v_fc2(Hv) 76 | Hu = self.u_fc2(Hu) 77 | 78 | return self.act(Hu), self.act(Hv) 79 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .GCN import GCN, GNN 2 | from .discriminator import Discriminator 3 | from .attention import Attention, NodeAttention, SemanticAttention, LocalAttention 4 | from .fc import FullyConnect, FullyConnect2 5 | from .GCN2 import DGCN 6 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class Discriminator(nn.Module): 7 | def __init__(self, v_ft, u_ft): 8 | super().__init__() 9 | self.bilinear = nn.Bilinear(v_ft, u_ft, 1) 10 | self.act = nn.Sigmoid() 11 | 12 | for m in self.modules(): 13 | self.weights_init(m) 14 | 15 | def weights_init(self, m): 16 | if isinstance(m, nn.Bilinear): 17 | torch.nn.init.xavier_uniform_(m.weight.data) 18 | if m.bias is not None: 19 | m.bias.data.fill_(0.0) 20 | 21 | def forward(self, v_h, c): 22 | 23 | # c = self.act(c) 24 | # v_h = self.act(v_h) 25 | 26 | # c = c.expand_as(v_h) 27 | sc_1 = self.bilinear(v_h, c) 28 | 29 | return sc_1.squeeze() 30 | 31 | 32 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class FullyConnect(nn.Module): 7 | def __init__(self, in_ft, out_ft, 
act=nn.Identity(), drop_prob=0.0, isBias=False): 8 | super().__init__() 9 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 10 | if self.fc.bias is not None: 11 | self.fc.bias.data.fill_(0.0) 12 | 13 | if isBias: 14 | self.bias = nn.Parameter(torch.empty(out_ft)) 15 | self.bias.data.fill_(0.0) 16 | else: 17 | self.register_parameter('bias', None) 18 | 19 | self.act = act 20 | self.drop_prob = drop_prob 21 | self.isBias = isBias 22 | 23 | for m in self.modules(): 24 | self.weights_init(m) 25 | 26 | def weights_init(self, m): 27 | if isinstance(m, nn.Linear): 28 | torch.nn.init.xavier_uniform_(m.weight.data) 29 | if m.bias is not None: 30 | m.bias.data.fill_(0.0) 31 | 32 | 33 | def forward(self, emb): 34 | # emb (batch_size, ft) 35 | emb = F.dropout(emb, self.drop_prob, training=self.training) 36 | e = self.fc(emb) # (batch_size, d) 37 | if self.isBias: 38 | e += self.bias 39 | return self.act(e) 40 | 41 | 42 | class FullyConnect2(nn.Module): 43 | def __init__(self, in_ft, hid_unit, out_ft, drop_prob=0.0, isBias=False): 44 | super().__init__() 45 | self.fc = FullyConnect(in_ft, hid_unit, act=nn.PReLU(), drop_prob=drop_prob,isBias=isBias) 46 | self.fc2 = FullyConnect(hid_unit, out_ft, act=nn.PReLU(), drop_prob=drop_prob,isBias=isBias) 47 | self.dense = FullyConnect(out_ft, 1, act=nn.Identity(), drop_prob=drop_prob,isBias=isBias) 48 | 49 | 50 | def forward(self, emb): 51 | # emb (batch_size, ft) 52 | e = self.fc(emb) # (batch_size, d) 53 | e2 = self.fc2(e) 54 | out = self.dense(e2) 55 | return out -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | seed = 268945 5 | torch.autograd.set_detect_anomaly(True) 6 | np.random.seed(seed) 7 | torch.manual_seed(seed) 8 | torch.cuda.manual_seed_all(seed) 9 | torch.backends.cudnn.deterministic = True 10 | torch.backends.cudnn.benchmark = False 11 | import argparse 12 | 13 | 14 | def parse_args(): 15 | # input arguments 16 | parser = argparse.ArgumentParser(description='BiHIN') 17 | parser.add_argument('--gpu_num', nargs='?', default='0') 18 | parser.add_argument('--model', nargs='?', default='SubHIN') 19 | parser.add_argument('--dataset', nargs='?', default='dblp') 20 | parser.add_argument('--save_path', nargs='?', default='./results') 21 | 22 | parser.add_argument('--nb_epochs', type=int, default=10000) 23 | parser.add_argument('--lr', type = float, default = 0.001) 24 | parser.add_argument('--patience', type=int, default=50) 25 | 26 | # parser.add_argument('--att_hid_units', type=int, default=64) 27 | parser.add_argument('--hid_units', type=int, default=256)# 128 best for dblp and yelp, larger datasets 28 | parser.add_argument('--hid_units2', type=int, default=128) 29 | parser.add_argument('--out_ft', type=int, default=64) 30 | 31 | parser.add_argument('--drop_prob', type=float, default=0.0) 32 | parser.add_argument('--lamb', type=float, default=0.5, 33 | help='coefficient for the losses in node task') 34 | parser.add_argument('--lamb_lp', type=float, default=1.0, 35 | help='coefficient for the losses in link task') 36 | parser.add_argument('--margin', type=float, default=0.8, 37 | help='coefficient for the margin loss') 38 | parser.add_argument('--isBias', action='store_true', default=False) 39 | parser.add_argument('--isAtt', action='store_true', default=False) 40 | parser.add_argument('--isLP', action='store_true', default=False)# link prediction 41 | 
parser.add_argument('--isSemi', action='store_true', default=False)# semi-supervised learning 42 | 43 | return parser.parse_known_args() 44 | 45 | def printConfig(args): 46 | args_names = [] 47 | args_vals = [] 48 | for arg in vars(args): 49 | args_names.append(arg) 50 | args_vals.append(getattr(args, arg)) 51 | print(args_names) 52 | print(args_vals) 53 | 54 | def main(): 55 | args, unknown = parse_args() 56 | # printConfig(args) 57 | if args.model == 'SubHIN': 58 | from models import SubHIN 59 | embedder = SubHIN(args) 60 | start = time.time() 61 | embedder.training() 62 | print('time (s):%.2f'%(time.time()-start)) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .SubHIN import SubHIN 2 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/test_input_data.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/14 13:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : test_input_data.py 6 | import pickle 7 | import numpy as np 8 | 9 | if __name__ == '__main__': 10 | ''' 11 | edges_path = './dataset/acm/edges.pkl' 12 | file = open(edges_path, 'rb') 13 | load_data = pickle.load(file) 14 | print('edges type: ', type(load_data)) 15 | #print('edges shape: ', np.shape(load_data)) 16 | print('edges: ', load_data) 17 | 18 | 19 | labels_path = './dataset/acm/labels.pkl' 20 | file = open(labels_path, 'rb') 21 | load_data = pickle.load(file) 22 | print('labels type: ', type(load_data)) 23 | print('labels: ', load_data) 24 | print('labels 0', load_data[0].shape) 25 | print('labels 1', load_data[1].shape) 26 | print('labels 2', load_data[2].shape) 27 | 28 | 29 | 30 | meta_data_path = './dataset/acm/meta_data.pkl' 31 | file = open(meta_data_path, 'rb') 32 | load_data = pickle.load(file) 33 | print('meta_data type: ', type(load_data)) 34 | print('meta_data keys: ', load_data.keys()) 35 | print('meta_data: ', load_data) 36 | ''' 37 | 38 | node_features_path = './dataset/acm/node_features.pkl' 39 | file = open(node_features_path, 'rb') 40 | load_data = pickle.load(file) 41 | print('node_features type: ', type(load_data)) 42 | print('node_features shape: ', np.shape(load_data)) 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/utils/process.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import normalize 2 | import scipy.sparse as sp 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | 7 | def indices_to_one_hot(data, nb_classes): 8 | """Convert an iterable of indices to one-hot encoded labels.""" 9 | targets = np.array(data).reshape(-1) 10 | labels = np.eye(nb_classes)[targets] 11 | return torch.LongTensor(labels) 12 | 13 | 14 | def normalize_adj(mx): 15 | """Row-normalize sparse matrix""" 16 | rowsum = np.array(mx.sum(1)) 17 | r_inv = np.power(rowsum, -1.0).flatten() 18 | r_inv[np.isinf(r_inv)] = 0. 
19 | r_mat_inv = sp.diags(r_inv) 20 | mx = r_mat_inv.dot(mx) 21 | return mx 22 | 23 | 24 | def sparse_to_tuple(mx): 25 | # mx = normalize_adj(mx) 26 | if not sp.isspmatrix_coo(mx): 27 | mx = mx.tocoo() 28 | coords = np.vstack((mx.row, mx.col)) 29 | values = mx.data 30 | shape = mx.shape 31 | return coords, values, shape 32 | 33 | 34 | def preprocess_features(features, norm=True): 35 | """Row-normalize feature matrix and convert to tuple representation""" 36 | if sp.issparse(features): 37 | features = features.toarray() 38 | if norm: 39 | features[features>0] = 1 40 | # rowsum = np.array(features.sum(1)) 41 | # r_inv = np.power(rowsum, -1.0).flatten() 42 | # r_inv[np.isinf(r_inv)] = 0. 43 | # r_mat_inv = sp.diags(r_inv) 44 | # features = r_mat_inv.dot(features) 45 | return torch.FloatTensor(features) 46 | 47 | 48 | def normalize_mx(mx, diagonal=True): 49 | if diagonal: 50 | size = mx.shape[0] 51 | return normalize(mx+sp.eye(size), norm='l1', axis=1) 52 | else: 53 | return normalize(mx, norm='l1', axis=1) 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /BrainAlign/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:12 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:04 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 17:15 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/heco_config.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : heco_config.py 6 | 7 | from yacs.config import CfgNode as CN 8 | import time 9 | # -------------------------------------------------------------- 10 | # Config of model 11 | # -------------------------------------------------------------- 12 | _C = CN() 13 | 14 | _C.CAME = CN() 15 | _C.CAME.path_rawdata1 = '../../../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 16 | _C.CAME.path_rawdata2 = '../../../../CAME/brain_human_mouse/human_brain_region_88_sparse_with3d.h5ad' 17 | _C.CAME.ROOT = '../../../../CAME/brain_mouse_human_sagittal/Baron_mouse-Baron_human-(10-13_15.26.12)/' 18 | _C.CAME.figdir = '../../../../CAME/analysis_results/figs/' # 19 | _C.CAME.embedding_dim = 128 20 | 21 | _C.CAME.homo_region_file_path = '../../../../CAME/brain_human_mouse/MouseHumanMatches_H88M67.csv' 22 | _C.CAME.labels_dir = '../../../../CAME/brain_human_mouse/' 23 | 24 | _C.HECO = CN() 25 | _C.HECO.dsnames = ['Mouse', 'Human'] 26 | 27 | # Could be pca or came 28 | _C.HECO.normalize_before_pca = None#'default' # None represent no normalization 29 | _C.HECO.normalize_before_pca_target_sum 
= None 30 | _C.HECO.embedding_type = 'pca' 31 | _C.HECO.embedding_pca_dim = 30 32 | 33 | 34 | _C.HECO.dataset = 'mouse_human_sagittal' 35 | _C.HECO.result_save_folder = './result/' 36 | _C.HECO.experiment_time = time.strftime("%Y-%m-%d_%H-%M-%S") 37 | _C.HECO.result_save_path = _C.HECO.result_save_folder + _C.HECO.experiment_time 38 | _C.HECO.embeddings_file_path = _C.HECO.result_save_path + "/embeds/" 39 | _C.HECO.DATA_PATH = _C.HECO.result_save_path + '/data/' 40 | 41 | _C.HECO.normalize_scale = True 42 | 43 | _C.HECO.normalize_before_pruning_method = 'default' 44 | _C.HECO.pruning_target_sum = None # None 45 | _C.HECO.pruning_normalize_axis = 0 46 | _C.HECO.if_threshold = True 47 | _C.HECO.pruning_method = 'std' # top, std, quantile 48 | _C.HECO.pruning_std_times_sm = 3#3.3#2.9 49 | _C.HECO.pruning_std_times_vh = 2.3#2.5#2.4 50 | 51 | _C.HECO.sm_gene_top = 2 52 | _C.HECO.vh_gene_top = 2 53 | _C.HECO.sm_sample_top = 5 54 | _C.HECO.vh_sample_top = 5 55 | 56 | _C.HECO.target_sum = None # None 57 | 58 | _C.HECO.NODE_TYPE_NUM = 4 59 | _C.HECO.S = 21749 # 60 | _C.HECO.S_sample_rate = [0.2] 61 | _C.HECO.M = 4035 62 | _C.HECO.M_sample_rate = [5, 2] 63 | _C.HECO.H = 6507 64 | _C.HECO.H_sample_rate = [0.5, 0.5] 65 | _C.HECO.V = 3682 66 | _C.HECO.V_sample_rate = [2] 67 | 68 | _C.HECO.DEG_batch_key = None 69 | _C.HECO.DEG_n_top_genes = 2000 70 | 71 | _C.HECO.positive_sample_number = 5000 72 | 73 | 74 | _C.HECO.fig_format = 'png' 75 | 76 | _C.ANALYSIS = CN() 77 | _C.ANALYSIS.cut_ov = 0 78 | _C.ANALYSIS.umap_neighbor = 20 79 | _C.ANALYSIS.mouse_umap_neighbor = 20 80 | _C.ANALYSIS.human_umap_neighbor = 20 81 | 82 | 83 | # Paramaters of BrainAlign 84 | _C.HECO_args = CN() 85 | _C.HECO_args.save_emb = True 86 | _C.HECO_args.turn = 0 87 | _C.HECO_args.dataset = _C.HECO.dataset 88 | _C.HECO_args.target_node = "S" # S, M, H, V 89 | _C.HECO_args.if_pretrained = False 90 | _C.HECO_args.pretrained_model_path = None 91 | _C.HECO_args.save_path = "./results/" + _C.HECO.experiment_time+'/'#"../data/{}/results/".format(_C.HECO_args.dataset)+_C.HECO.experiment_time+'/' 92 | _C.HECO_args.data_path = "./results/"+_C.HECO.experiment_time+'/data/'#"../data/{}/results/".format(_C.HECO_args.dataset)+_C.HECO.experiment_time+'/data/' 93 | _C.HECO_args.ratio = [20, 40, 60] 94 | _C.HECO_args.gpu = 0 95 | _C.HECO_args.seed = 53 96 | _C.HECO_args.hidden_dim = 128 97 | _C.HECO_args.nb_epochs = 1000 98 | # The parameters of evaluation 99 | _C.HECO_args.eva_lr = 0.01 100 | _C.HECO_args.eva_wd = 0 101 | # The parameters of learning process 102 | _C.HECO_args.patience = 30 103 | _C.HECO_args.lr = 0.0005 104 | _C.HECO_args.l2_coef = 0 105 | # model-specific parameters 106 | _C.HECO_args.tau = 0.9 107 | _C.HECO_args.feat_drop = 0.4 108 | _C.HECO_args.attn_drop = 0.35 109 | _C.HECO_args.sample_rate = [6] 110 | _C.HECO_args.lam = 0.5 111 | 112 | _C.HECO_args.type_num = [21749, 4035, 6507, 3682] 113 | _C.HECO_args.nei_num = 1 114 | 115 | # -------------------------------------------------------------- 116 | # Config of INPUT 117 | # -------------------------------------------------------------- 118 | _C.HOMO_RANDOM = CN() 119 | 120 | # if use all the non-homogeneous regions as back ground, default 'all', else will only use 35 regions in each species 121 | _C.HOMO_RANDOM.random_field = 'all' # or all 122 | # config if plot all the cross species correlation heatmap, default false; Require large memory if True. 
123 | _C.HOMO_RANDOM.random_plot = False # config 124 | 125 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/heco_config_all.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : heco_config.py 6 | 7 | from yacs.config import CfgNode as CN 8 | 9 | # -------------------------------------------------------------- 10 | # Config of model 11 | # -------------------------------------------------------------- 12 | _C = CN() 13 | 14 | _C.CAME = CN() 15 | _C.CAME.path_rawdata1 = '../../../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 16 | _C.CAME.path_rawdata2 = '../../../../CAME/brain_human_mouse/human_brain_region_88_sparse.h5ad' 17 | _C.CAME.ROOT = '../../../../CAME/analysis_results/Dense_Baron_mouse-Baron_human-10-24_11.37.58/' 18 | _C.CAME.figdir = '../../../../CAME/analysis_results/Dense_Baron_mouse-Baron_human-10-24_11.37.58/figs/' # 19 | _C.CAME.embedding_dim = 128 20 | 21 | _C.CAME.homo_region_file_path = '../../../../CAME/brain_human_mouse/MouseHumanMatches_H88M67.csv' 22 | _C.CAME.labels_dir = '../../../../CAME/brain_human_mouse/' 23 | 24 | _C.HECO = CN() 25 | # Could be pca or came 26 | _C.HECO.embedding_type = 'pca' 27 | _C.HECO.embedding_pca_dim = 30 28 | _C.HECO.DATA_PATH = './data/' 29 | _C.HECO.dataset = 'mouse_human' 30 | _C.HECO.result_save_path = './results/2022-11-04_22-37-26' 31 | _C.HECO.embeddings_file_path = _C.HECO.result_save_path + "/embeds/" 32 | 33 | _C.HECO.if_threshold = True 34 | _C.HECO.pruning_method = 'std' # top, std, quantile 35 | _C.HECO.pruning_std_times_sm = 3.5 36 | _C.HECO.pruning_std_times_vh = 3.2 37 | 38 | _C.HECO.sm_gene_top = 100 39 | _C.HECO.vh_gene_top = 20 40 | _C.HECO.sm_sample_top = 5 41 | _C.HECO.vh_sample_top = 5 42 | 43 | _C.HECO.target_sum = 1 # None 44 | 45 | _C.HECO.NODE_TYPE_NUM = 4 46 | _C.HECO.S = 72968 47 | _C.HECO.S_sample_rate = [0.2] 48 | _C.HECO.M = 2578 49 | _C.HECO.M_sample_rate = [5, 2] 50 | _C.HECO.H = 3326 51 | _C.HECO.H_sample_rate = [0.5, 0.5] 52 | _C.HECO.V = 3682 53 | _C.HECO.V_sample_rate = [2] 54 | 55 | _C.HECO.DEG_batch_key = None 56 | _C.HECO.DEG_n_top_genes = 2000 57 | 58 | _C.HECO.positive_sample_number = 5000 59 | 60 | 61 | _C.HECO.fig_format = 'png' 62 | # -------------------------------------------------------------- 63 | # Config of INPUT 64 | # -------------------------------------------------------------- 65 | _C.HOMO_RANDOM = CN() 66 | 67 | # if use all the non-homogeneous regions as back ground, default 'all', else will only use 35 regions in each species 68 | _C.HOMO_RANDOM.random_field = 'all' # or all 69 | # config if plot all the cross species correlation heatmap, default false; Require large memory if True. 
70 | _C.HOMO_RANDOM.random_plot = False # config 71 | 72 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | elif if_train == False: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_analysis_log.txt"), mode='w') 23 | elif if_train == None: 24 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S") + "_process_log.txt"), 25 | mode='w') 26 | fh.setLevel(logging.DEBUG) 27 | fh.setFormatter(formatter) 28 | logger.addHandler(fh) 29 | 30 | return logger -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/.Rhistory: -------------------------------------------------------------------------------- 1 | install.packages("sceasy") 2 | install.packages("r-sceasy") 3 | install.packages("sceasy") 4 | q() 5 | q 6 | q() 7 | devtools::install_github("cellgeni/sceasy") 8 | install.packages("devtools") 9 | devtools::install_github("cellgeni/sceasy") 10 | q() 11 | install.packages("anndata") 12 | if (!requireNamespace("BiocManager", quietly=TRUE)) { 13 | install.packages("BiocManager") 14 | } 15 | BiocManager::install("zellkonverter") 16 | BiocManager::install("basilisk") 17 | install.packages("basilisk") 18 | BiocManager::install("basilisk") 19 | if (!require("BiocManager", quietly = TRUE)) 20 | install.packages("BiocManager") 21 | BiocManager::install("basilisk") 22 | install.packages("anndata") 23 | library(anndata) 24 | adata_mouse_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 25 | adata_mouse_path_isocortex 26 | ad <- read_h5ad(adata_path) 27 | library(anndata) 28 | ad <- read_h5ad(adata_path) 29 | install.packages("SeuratDisk") 30 | if (!requireNamespace("remotes", quietly = TRUE)) { 31 | install.packages("remotes") 32 | } 33 | remotes::install_github("mojaveazure/seurat-disk") 34 | library('SeuratDisk') 35 | library('Seuratdisk') 36 | if (!requireNamespace("remotes", quietly = TRUE)) { 37 | install.packages("remotes") 38 | } 39 | remotes::install_github("mojaveazure/seurat-disk") 40 | reticulate::install_miniconda() 41 | anndata::install_anndata() 42 | install.packages("anndata") 43 | install.packages("magrittr") 44 | install.packages("dplyr") 45 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/genomic_findmarkers.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/22 20:38 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : genomic_findmarkers.r 5 | #@Description: 
This file is used to find marker genes with Seurat findMarker 6 | #source('./genomic_functions.R', local = TRUE) 7 | 8 | library(sceasy) 9 | library(anndata) 10 | library(Seurat) 11 | library(Matrix) 12 | 13 | library(reticulate) 14 | #use_condaenv('pad') 15 | #loompy <- reticulate::import('loompy') 16 | 17 | print("No error") 18 | 19 | # convert_scanpy_seurat 20 | convert_scanpy2seurat <- function(adata_path, save_path){ 21 | #ad <- zellkonverter::readH5AD(adata_path) 22 | #ad <- zellkonverter::readH5AD(adata_path) 23 | #ad <- read_h5ad(adata_path) 24 | # end with .rds file 25 | #print(ad) 26 | sceasy::convertFormat(adata_path, from="anndata", to="seurat", outFile=save_path) 27 | #if_success <- TRUE 28 | #return(if_success) 29 | } 30 | 31 | seurat_findmarker <- function(input_data, groupby) 32 | { 33 | adata_markers <- FindAllMarkers(input_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 34 | adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 35 | #return(NULL) 36 | } 37 | 38 | 39 | # isocortex 40 | adata_mouse_path_isocortex <- "D:/Research_programs/HeCo/BrainAlign/data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 41 | save_path <- "D:/Research_programs/HeCo/BrainAlign/data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/" 42 | mouse_data_path <- paste0(save_path, 'adata_mouse_path_isocortex.rds') 43 | print(mouse_data_path) 44 | adata <- read_h5ad(adata_mouse_path_isocortex) 45 | print(adata) 46 | 47 | #adata$X <- as(adata$X, "dgCMatrix") 48 | 49 | #print(adata$X) 50 | #seurat_object <- Seurat::Convert(adata) 51 | # Create a Seurat object 52 | seurat_object <- CreateSeuratObject(counts = adata$X) 53 | 54 | # Transfer feature names (genes) 55 | rownames(seurat_object) <- rownames(adata$X) 56 | 57 | # Transfer cell names and metadata 58 | colnames(seurat_object) <- colnames(adata$X) 59 | seurat_object$meta.data <- adata$obs 60 | # Transfer additional metadata columns (obs) 61 | seurat_object$meta.data$additional_metadata_column <- adata$obs$additional_metadata_column 62 | 63 | # Transfer additional metadata columns (var) 64 | seurat_object$var$additional_metadata_column <- adata$var$additional_metadata_column 65 | 66 | 67 | saveRDS(seurat_object, mouse_data_path) 68 | 69 | #convert_scanpy2seurat(adata_mouse_path_isocortex, mouse_data_path) #mouse_data_path 70 | mouse_isocortex_data <- readRDS(file = mouse_data_path) 71 | Idents(mouse_isocortex_dat) <- "region_name" 72 | seurat_findmarker(mouse_isocortex_data, groupby="region_name") 73 | 74 | print("Mouse finished") 75 | 76 | # adata_human_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 77 | # human_data_path <- paste0(save_path, 'adata_human_path_isocortex.rds') 78 | # convert_scanpy2seurat(adata_human_path_isocortex, paste(save_path, human_data_path)) 79 | # human_isocortex_data <- readRDS(file = human_data_path) 80 | # Idents(human_isocortex_dat) <- "region_name" 81 | # seurat_findmarker(human_isocortex_data, groupby="region_name") 82 | # 83 | # print("Human finished.") 84 | 85 | -------------------------------------------------------------------------------- 
/BrainAlign/brain_analysis/r_analysis/genomic_functions.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 11:09 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : genomic_functions.r 5 | #@Description: This file is used to ... 6 | #source('./genomic_functions.R', local = TRUE) 7 | 8 | library(Seurat) 9 | library(anndata) 10 | library(sceasy) 11 | 12 | # convert_scanpy_seurat 13 | if_success <- convert_scanpy2seurat(adata_path, save_path) 14 | { 15 | ad <- anndata::read_h5ad(adata_path) 16 | # end with .rds file 17 | sceasy::convertFormat(ad, from="anndata", to="seurat", outFile=save_path) 18 | if_success <- TRUE 19 | #return(if_success) 20 | } 21 | 22 | 23 | NULL <- seurat_findmarker(input_data, groupby) 24 | { 25 | adata_markers <- FindAllMarkers(input_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 26 | adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 27 | #return(NULL) 28 | } 29 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/includes.R: -------------------------------------------------------------------------------- 1 | path.bin <- 'D:/Research_programs/single_cell/molecular-atlas-master/bin' 2 | path.matrices <-'D:/Research_programs/single_cell/figures' 3 | 4 | load(paste(path.matrices ,'atlasspots.RData',sep='/')) 5 | load(paste(path.matrices , 'vivid-colors.RData', sep='/')) 6 | 7 | #Only loading if not existing, takes some time 8 | if(!exists('aligned.atlas')) 9 | load(paste(path.matrices ,'alignedAtlas.RData',sep='/')) 10 | 11 | if(!exists('atlas.stereo')) 12 | load(paste(path.matrices ,'atlasStereo.RData',sep='/')) 13 | 14 | source(paste(path.bin,'plotFunctions.R',sep='/')) 15 | source(paste(path.bin,'execFunctions.R',sep='/')) 16 | source(paste(path.bin,'araAtlasFunctions.R',sep='/')) 17 | source(paste(path.bin,'smoothingFunctions.R',sep='/')) 18 | source(paste(path.bin,'layerFunctions.R',sep='/')) 19 | source(paste(path.bin,'icFunctions.R',sep='/')) 20 | source(paste(path.bin,'tsneFunctions.R',sep='/')) 21 | source(paste(path.bin,'similarityIndexFunctions.R',sep='/')) 22 | source(paste(path.bin,'scRemapFunctions.R',sep='/')) 23 | source(paste(path.bin,'constantParameters.R',sep='/')) 24 | source(paste(path.bin,'allenAnnotationsFunctions.R',sep='/')) 25 | source(paste(path.bin,'meshCutsFunctions.R',sep='/')) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/test.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 11:58 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : test.r 5 | #@Description: This file is used to ... 6 | 7 | 8 | sumfunc <- function(x, y) 9 | { 10 | sum <- x+y 11 | return(sum) 12 | } 13 | 14 | print(sumfunc(1, 2)) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform2seurat.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 21:23 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : transform2seurat.r 5 | #@Description: This file is used to ... 
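# Overview (descriptive, inferred from the code below): this script reads the counts.mtx,
# counts_cellMeta.csv and counts_geneMeta.csv files exported by r_analysis/transform_adata.py
# into the R_isocortex_mouse / R_isocortex_human folders, rebuilds Seurat objects with
# CreateSeuratObject(t(counts)), sets region_name as the identity class, runs
# FindAllMarkers(only.pos = TRUE), and saves the objects as mouse_isocortex.rds /
# human_isocortex.rds.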
6 | 7 | 8 | library(Matrix) 9 | library(Seurat) 10 | 11 | 12 | 13 | adata_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/" 14 | 15 | output_for_R_path <- "R_isocortex_mouse" 16 | 17 | save_dir<- paste0(adata_path_isocortex, output_for_R_path) 18 | 19 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 20 | dim(counts) 21 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 22 | head(cellMeta) 23 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 24 | dim(geneMeta) 25 | head(geneMeta) 26 | ### Set the rownames and colnames 27 | rownames(counts)<-cellMeta$Barcode 28 | colnames(counts)<-geneMeta$GeneName 29 | 30 | seo <- CreateSeuratObject(counts = t(counts), project = "min", min.cells = 2, min.features = 5) 31 | ### Set the meta data 32 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 33 | rownames(seo@meta.data)<-colnames(seo) 34 | ### Normalize the data 35 | #seo <- NormalizeData(seo) 36 | groupby <- "region_name" 37 | 38 | Idents(seo) <- "region_name" 39 | 40 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct = 0.25, logfc.threshold = 0.25 41 | #adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 42 | print(adata_markers) 43 | 44 | saveRDS(seo, file.path(save_dir,'mouse_isocortex.rds')) 45 | 46 | 47 | 48 | # ------------------------------------------------------------ 49 | # human---------------- 50 | output_for_R_path <- "R_isocortex_human" 51 | 52 | save_dir<- paste0(adata_path_isocortex, output_for_R_path) 53 | 54 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 55 | dim(counts) 56 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 57 | head(cellMeta) 58 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 59 | dim(geneMeta) 60 | head(geneMeta) 61 | ### Set the rownames and colnames 62 | rownames(counts)<-cellMeta$Barcode 63 | colnames(counts)<-geneMeta$GeneName 64 | 65 | seo <- CreateSeuratObject(counts = t(counts), project = "min", min.cells = 2, min.features = 5) 66 | ### Set the meta data 67 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 68 | rownames(seo@meta.data)<-colnames(seo) 69 | ### Normalize the data 70 | #seo <- NormalizeData(seo) 71 | groupby <- "region_name" 72 | 73 | Idents(seo) <- "region_name" 74 | 75 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct = 0.25, logfc.threshold = 0.25 76 | #adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 77 | print(adata_markers) 78 | 79 | saveRDS(seo, file.path(save_dir,'human_isocortex.rds')) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform2seurat_cluster.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 21:23 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : transform2seurat.r 5 | #@Description: This file is used to ... 
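# Overview (descriptive, inferred from the code below): cluster-level counterpart of
# transform2seurat.R — it loads the R_TH_mouse / R_TH_human exports written by
# r_analysis/transform_adata_cluster.py, rebuilds Seurat objects, sets cluster_name_acronym
# as the identity class, runs FindAllMarkers, and saves mouse_th.rds / human_th.rds.
#
# A minimal sketch of the marker-filtering step that is commented out further below,
# assuming dplyr/magrittr are attached as they are in this script; note that the
# FindAllMarkers() result stores its grouping in a column named "cluster":
#   top_markers <- adata_markers %>%
#     dplyr::group_by(cluster) %>%
#     dplyr::slice_max(n = 1, order_by = avg_log2FC)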
6 | 7 | 8 | library(Matrix) 9 | library(Seurat) 10 | 11 | library(magrittr) # needs to be run every time you start R and want to use %>% 12 | library(dplyr) # alternatively, this also loads %>% 13 | 14 | 15 | adata_path_th <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/" 16 | 17 | output_for_R_path <- "R_TH_mouse" 18 | 19 | save_dir<- paste0(adata_path_th, output_for_R_path) 20 | 21 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 22 | dim(counts) 23 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 24 | head(cellMeta) 25 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 26 | dim(geneMeta) 27 | head(geneMeta) 28 | ### Set the rownames and colnames 29 | rownames(counts)<-cellMeta$Barcode 30 | colnames(counts)<-geneMeta$GeneName 31 | 32 | seo <- CreateSeuratObject(counts = t(counts)) 33 | ### Set the meta data 34 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 35 | rownames(seo@meta.data)<-colnames(seo) 36 | ### Normalize the data 37 | #seo <- NormalizeData(seo) 38 | groupby <- "cluster_name_acronym" 39 | 40 | Idents(seo) <- "cluster_name_acronym" 41 | 42 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.11) #, min.pct = 0.25, logfc.threshold = 0.25 43 | #adata_markers %>% group_by(cluster_name_acronym) %>% slice_max(n = 1, order_by = avg_log2FC) 44 | #adata_markers %>% group_by("cluster_name_acronym") %>% top_n(2, avg_logFC) 45 | 46 | print(adata_markers) 47 | 48 | saveRDS(seo, file.path(save_dir,"mouse_th.rds")) 49 | 50 | 51 | 52 | # ------------------------------------------------------------ 53 | # human---------------- 54 | print('----------------------------------human----------------------------------------------') 55 | output_for_R_path <- "R_TH_human" 56 | 57 | save_dir<- paste0(adata_path_th, output_for_R_path) 58 | 59 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 60 | dim(counts) 61 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 62 | head(cellMeta) 63 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 64 | dim(geneMeta) 65 | head(geneMeta) 66 | ### Set the rownames and colnames 67 | rownames(counts)<-cellMeta$Barcode 68 | colnames(counts)<-geneMeta$GeneName 69 | 70 | seo <- CreateSeuratObject(counts = t(counts)) 71 | ### Set the meta data 72 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 73 | rownames(seo@meta.data)<-colnames(seo) 74 | ### Normalize the data 75 | #seo <- NormalizeData(seo) 76 | groupby <- "cluster_name_acronym" 77 | 78 | Idents(seo) <- "cluster_name_acronym" 79 | 80 | 81 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct=0.1, return.thresh=0.05, thresh.use = 0.15, 82 | #adata_markers %>% group_by(cluster_name_acronym) %>% slice_max(n = 1, order_by = avg_log2FC) 83 | #adata_markers %>% group_by("cluster_name_acronym") %>% top_n(2, avg_logFC) 84 | print(adata_markers) 85 | 86 | saveRDS(seo, file.path(save_dir,"human_th.rds")) 87 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform_adata.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/7/23 17:57 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : transform_adata.py 6 | # @Description: This file is used to ... 
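# Overview (descriptive, inferred from the code below): this helper loads the mouse and human
# isocortex AnnData files (adata_mouse_exp_isocortex.h5ad / adata_human_exp_isocortex.h5ad),
# writes adata.X to counts.mtx via scipy.io.mmwrite, and exports the cell metadata (Barcode,
# region_name, cluster_name_acronym) and gene metadata (GeneName) as CSVs into the
# R_isocortex_mouse / R_isocortex_human folders; transform2seurat.R then reads these files
# (transposing the count matrix) to build Seurat objects.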
7 | 8 | import anndata 9 | import scanpy as sc 10 | from pathlib import Path 11 | from scipy import io 12 | import os 13 | 14 | if __name__ == '__main__': 15 | 16 | output_for_R_path = "R_isocortex_mouse" 17 | 18 | adata_mouse_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 19 | adata = sc.read_h5ad(adata_mouse_path_isocortex) 20 | ### Set the directory for saving files 21 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/' 22 | 23 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 24 | print(save_dir + output_for_R_path) 25 | 26 | io.mmwrite(save_dir + output_for_R_path+'/counts.mtx', adata.X) 27 | cell_meta = adata.obs.copy() 28 | cell_meta['Barcode'] = cell_meta.index 29 | #cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 30 | #cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 31 | cell_meta['region_name'] = adata.obs['region_name'] 32 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 33 | 34 | gene_meta = adata.var.copy() 35 | gene_meta['GeneName'] = gene_meta.index 36 | 37 | if not os.path.exists(save_dir + output_for_R_path): 38 | os.makedirs(save_dir + output_for_R_path) 39 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 40 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) 41 | 42 | 43 | #------------------------------------------------------------------------------ 44 | # human------------------------------------ 45 | output_for_R_path = "R_isocortex_human" 46 | 47 | adata_human_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_human_exp_isocortex.h5ad" 48 | adata = sc.read_h5ad(adata_human_path_isocortex) 49 | ### Set the directory for saving files 50 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/' 51 | 52 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 53 | print(save_dir + output_for_R_path) 54 | 55 | io.mmwrite(save_dir + output_for_R_path + '/counts.mtx', adata.X) 56 | cell_meta = adata.obs.copy() 57 | cell_meta['Barcode'] = cell_meta.index 58 | # cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 59 | # cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 60 | cell_meta['region_name'] = adata.obs['region_name'] 61 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 62 | 63 | gene_meta = adata.var.copy() 64 | gene_meta['GeneName'] = gene_meta.index 65 | 66 | if not os.path.exists(save_dir + output_for_R_path): 67 | os.makedirs(save_dir + output_for_R_path) 68 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 69 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform_adata_cluster.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/7/23 17:57 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : 
transform_adata.py 6 | # @Description: This file is used to ... 7 | 8 | import anndata 9 | import scanpy as sc 10 | from pathlib import Path 11 | from scipy import io 12 | import os 13 | 14 | if __name__ == '__main__': 15 | 16 | output_for_R_path = "R_TH_mouse" 17 | 18 | adata_mouse_path_th = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/adata_exp_TH_mouse.h5ad" 19 | adata = sc.read_h5ad(adata_mouse_path_th) 20 | ### Set the directory for saving files 21 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/' 22 | 23 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 24 | print(save_dir + output_for_R_path) 25 | 26 | io.mmwrite(save_dir + output_for_R_path+'/counts.mtx', adata.X) 27 | cell_meta = adata.obs.copy() 28 | cell_meta['Barcode'] = cell_meta.index 29 | #cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 30 | #cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 31 | cell_meta['region_name'] = adata.obs['region_name'] 32 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 33 | 34 | gene_meta = adata.var.copy() 35 | gene_meta['GeneName'] = gene_meta.index 36 | 37 | if not os.path.exists(save_dir + output_for_R_path): 38 | os.makedirs(save_dir + output_for_R_path) 39 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 40 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) 41 | 42 | 43 | #------------------------------------------------------------------------------ 44 | # human------------------------------------ 45 | output_for_R_path = "R_TH_human" 46 | 47 | adata_human_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/adata_exp_TH_human.h5ad" 48 | adata = sc.read_h5ad(adata_human_path_isocortex) 49 | ### Set the directory for saving files 50 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/' 51 | 52 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 53 | print(save_dir + output_for_R_path) 54 | 55 | io.mmwrite(save_dir + output_for_R_path + '/counts.mtx', adata.X) 56 | cell_meta = adata.obs.copy() 57 | cell_meta['Barcode'] = cell_meta.index 58 | # cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 59 | # cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 60 | cell_meta['region_name'] = adata.obs['region_name'] 61 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 62 | 63 | gene_meta = adata.var.copy() 64 | gene_meta['GeneName'] = gene_meta.index 65 | 66 | if not os.path.exists(save_dir + output_for_R_path): 67 | os.makedirs(save_dir + output_for_R_path) 68 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 69 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_gene_comparison/barplot_degs.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/3/11 20:17 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : barplot_degs.r 5 | #@Description: 
This file is used to ... 6 | 7 | # library 8 | library(tidyverse) 9 | library(viridis) 10 | 11 | # Create dataset 12 | data <- data.frame( 13 | individual=paste( "Mister ", seq(1,60), sep=""), 14 | group=c( rep('A', 60), rep('B', 30), rep('C', 14), rep('D', 6)) , 15 | value1=sample( seq(10,100), 60, replace=T), 16 | value2=sample( seq(10,100), 60, replace=T), 17 | value3=sample( seq(10,100), 60, replace=T) 18 | ) 19 | 20 | # Transform data in a tidy format (long format) 21 | data <- data %>% gather(key = "observation", value="value", -c(1,2)) 22 | 23 | # Set a number of 'empty bar' to add at the end of each group 24 | empty_bar <- 2 25 | nObsType <- nlevels(as.factor(data$observation)) 26 | to_add <- data.frame( matrix(NA, empty_bar*nlevels(data$group)*nObsType, ncol(data)) ) 27 | colnames(to_add) <- colnames(data) 28 | to_add$group <- rep(levels(data$group), each=empty_bar*nObsType ) 29 | data <- rbind(data, to_add) 30 | data <- data %>% arrange(group, individual) 31 | data$id <- rep( seq(1, nrow(data)/nObsType) , each=nObsType) 32 | 33 | # Get the name and the y position of each label 34 | label_data <- data %>% group_by(id, individual) %>% summarize(tot=sum(value)) 35 | number_of_bar <- nrow(label_data) 36 | angle <- 90 - 360 * (label_data$id-0.5) /number_of_bar # I substract 0.5 because the letter must have the angle of the center of the bars. Not extreme right(1) or extreme left (0) 37 | label_data$hjust <- ifelse( angle < -90, 1, 0) 38 | label_data$angle <- ifelse(angle < -90, angle+180, angle) 39 | 40 | # prepare a data frame for base lines 41 | base_data <- data %>% 42 | group_by(group) %>% 43 | summarize(start=min(id), end=max(id) - empty_bar) %>% 44 | rowwise() %>% 45 | mutate(title=mean(c(start, end))) 46 | 47 | # prepare a data frame for grid (scales) 48 | grid_data <- base_data 49 | grid_data$end <- grid_data$end[ c( nrow(grid_data), 1:nrow(grid_data)-1)] + 1 50 | grid_data$start <- grid_data$start - 1 51 | grid_data <- grid_data[-1,] 52 | 53 | # Make the plot 54 | p <- ggplot(data) + 55 | 56 | # Add the stacked bar 57 | geom_bar(aes(x=as.factor(id), y=value, fill=observation), stat="identity", alpha=0.5) + 58 | scale_fill_viridis(discrete=TRUE) + 59 | 60 | # Add a val=100/75/50/25 lines. I do it at the beginning to make sur barplots are OVER it. 
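# The five geom_segment() calls below draw the circular grid lines at
# y = 0, 50, 100, 150 and 200, and the annotate() call that follows prints the
# matching labels at the position of the last bar (x = max(data$id)).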
61 | geom_segment(data=grid_data, aes(x = end, y = 0, xend = start, yend = 0), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 62 | geom_segment(data=grid_data, aes(x = end, y = 50, xend = start, yend = 50), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 63 | geom_segment(data=grid_data, aes(x = end, y = 100, xend = start, yend = 100), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 64 | geom_segment(data=grid_data, aes(x = end, y = 150, xend = start, yend = 150), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 65 | geom_segment(data=grid_data, aes(x = end, y = 200, xend = start, yend = 200), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 66 | 67 | # Add text showing the value of each 100/75/50/25 lines 68 | ggplot2::annotate("text", x = rep(max(data$id),5), y = c(0, 50, 100, 150, 200), label = c("0", "50", "100", "150", "200") , color="grey", size=6 , angle=0, fontface="bold", hjust=1) + 69 | 70 | ylim(-150,max(label_data$tot, na.rm=T)) + 71 | theme_minimal() + 72 | theme( 73 | legend.position = "none", 74 | axis.text = element_blank(), 75 | axis.title = element_blank(), 76 | panel.grid = element_blank(), 77 | plot.margin = unit(rep(-1,4), "cm") 78 | ) + 79 | coord_polar() + 80 | 81 | # Add labels on top of each bar 82 | geom_text(data=label_data, aes(x=id, y=tot+10, label=individual, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=5, angle= label_data$angle, inherit.aes = FALSE ) + 83 | 84 | # Add base line information 85 | geom_segment(data=base_data, aes(x = start, y = -5, xend = end, yend = -5), colour = "black", alpha=0.8, size=0.6 , inherit.aes = FALSE ) + 86 | geom_text(data=base_data, aes(x = title, y = -18, label=group), hjust=c(1,1,0,0), colour = "black", alpha=0.8, size=4, fontface="bold", inherit.aes = FALSE) 87 | 88 | 89 | # Save at png 90 | ggsave(p, file="../r_gene_comparison/output.png", width=10, height=10) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/typehint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/4/2 20:51 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : typehint.py 6 | # @Description: This file is used to ... 7 | 8 | r""" 9 | Type hint definitions 10 | """ 11 | 12 | import numbers 13 | from typing import Any, Mapping, Optional, TypeVar, Union 14 | 15 | import anndata as ad 16 | import h5py 17 | import numpy as np 18 | import scipy.sparse 19 | 20 | Array = Union[np.ndarray, scipy.sparse.spmatrix] 21 | BackedArray = Union[h5py.Dataset, ad._core.sparse_dataset.SparseDataset] 22 | AnyArray = Union[Array, BackedArray] 23 | ArrayOrScalar = Union[np.ndarray, numbers.Number] 24 | Kws = Optional[Mapping[str, Any]] 25 | RandomState = Optional[Union[np.random.RandomState, int]] 26 | 27 | T = TypeVar("T") # Generic type var -------------------------------------------------------------------------------- /BrainAlign/came/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 
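* In this copy, the preprocessing defaults (_params_pre / get_preprocess_params)
  are commented out below, so only get_model_params and get_loss_params are
  defined; the params_pre line in the Examples below is kept for reference only.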
12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /BrainAlign/came/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | 
AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /BrainAlign/came/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /BrainAlign/came/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /BrainAlign/came/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /BrainAlign/came/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /BrainAlign/came/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 
3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | 
batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
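    Examples
    --------
    A minimal round trip (the file name is illustrative):

    >>> hidden = [{'cell': np.random.randn(10, 8), 'gene': np.random.randn(12, 8)}]
    >>> save_hidden_states(hidden, '_tmp_hidden.h5')
    >>> loaded = load_hidden_states('_tmp_hidden.h5')
    >>> sorted(loaded[0].keys())
    ['cell', 'gene']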
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = metrics.f1_score(y_true, 
y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /BrainAlign/code/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/18 17:14 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/acm/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of ACM. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/aminer/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of AMiner. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/dblp/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings from DBLP. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/freebase/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of Freebase. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/main_parallel.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch 3 | from utils import load_data, set_params, evaluate 4 | from module import HeCo 5 | import warnings 6 | import datetime 7 | import pickle as pkl 8 | import os 9 | import random 10 | 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | os.environ['CUDA_VISIBLE_DEVICES'] = '1,2' 15 | 16 | args = set_params() 17 | 18 | if torch.cuda.is_available(): 19 | device = torch.device("cuda") 20 | #torch.cuda.set_device(device) 21 | else: 22 | device = torch.device("cpu") 23 | 24 | ## name of intermediate document ## 25 | own_str = args.dataset 26 | 27 | ## random seed ## 28 | seed = args.seed 29 | numpy.random.seed(seed) 30 | random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | 34 | 35 | def train(): 36 | nei_index, feats, mps, pos, label, idx_train, idx_val, idx_test = \ 37 | load_data(args.dataset, args.ratio, args.type_num) 38 | nb_classes = label.shape[-1] 39 | feats_dim_list = [i.shape[1] for i in feats] 40 | P = int(len(mps)) 41 | print("seed ",args.seed) 42 | print("Dataset: ", args.dataset) 43 | print("The number of meta-paths: ", P) 44 | 45 | model = HeCo(args.hidden_dim, feats_dim_list, args.feat_drop, args.attn_drop, 46 | P, args.sample_rate, args.nei_num, args.tau, args.lam) 47 | model = torch.nn.DataParallel(model, device_ids=[0, 1]) 48 | 49 | optimiser = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2_coef) 50 | 51 | if torch.cuda.is_available(): 52 | print('Using CUDA') 53 | #model.cuda() 54 | model.to(device) 55 | feats = [feat.to(device) for feat in feats] 56 | mps = [mp.to(device) for mp in mps] 57 | pos = pos.to(device) 58 | label = label.to(device) 59 | idx_train = [i.to(device) for i in idx_train] 60 | idx_val = [i.to(device) for i in idx_val] 61 | idx_test = [i.to(device) for i in idx_test] 62 | 63 | cnt_wait = 0 64 | best = 1e9 65 | best_t = 0 66 | 67 | starttime = datetime.datetime.now() 68 
| for epoch in range(args.nb_epochs): 69 | model.train() 70 | optimiser.zero_grad() 71 | loss = model(feats, pos, mps, nei_index) 72 | print("loss ", loss.data.cpu()) 73 | if loss < best: 74 | best = loss 75 | best_t = epoch 76 | cnt_wait = 0 77 | torch.save(model.state_dict(), 'HeCo_'+own_str+'.pkl') 78 | else: 79 | cnt_wait += 1 80 | 81 | if cnt_wait == args.patience: 82 | print('Early stopping!') 83 | break 84 | loss.backward() 85 | optimiser.step() 86 | 87 | print('Loading {}th epoch'.format(best_t)) 88 | model.load_state_dict(torch.load('HeCo_'+own_str+'.pkl')) 89 | model.eval() 90 | os.remove('HeCo_'+own_str+'.pkl') 91 | embeds = model.get_embeds(feats, mps) 92 | for i in range(len(idx_train)): 93 | evaluate(embeds, args.ratio[i], idx_train[i], idx_val[i], idx_test[i], label, nb_classes, device, args.dataset, 94 | args.eva_lr, args.eva_wd) 95 | endtime = datetime.datetime.now() 96 | time = (endtime - starttime).seconds 97 | print("Total time: ", time, "s") 98 | 99 | if args.save_emb: 100 | f = open("./embeds/"+args.dataset+"/"+str(args.turn)+".pkl", "wb") 101 | pkl.dump(embeds.cpu().data.numpy(), f) 102 | f.close() 103 | 104 | 105 | if __name__ == '__main__': 106 | train() 107 | -------------------------------------------------------------------------------- /BrainAlign/code/module/__init__.py: -------------------------------------------------------------------------------- 1 | from .heco import HeCo 2 | -------------------------------------------------------------------------------- /BrainAlign/code/module/contrast.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Contrast(nn.Module): 6 | def __init__(self, hidden_dim, tau, lam): 7 | super(Contrast, self).__init__() 8 | self.proj = nn.Sequential( 9 | nn.Linear(hidden_dim, hidden_dim), 10 | nn.ELU(), 11 | nn.Linear(hidden_dim, hidden_dim) 12 | ) 13 | self.tau = tau 14 | self.lam = lam 15 | for model in self.proj: 16 | if isinstance(model, nn.Linear): 17 | nn.init.xavier_normal_(model.weight, gain=1.414) 18 | 19 | def sim(self, z1, z2): 20 | z1_norm = torch.norm(z1, dim=-1, keepdim=True) 21 | z2_norm = torch.norm(z2, dim=-1, keepdim=True) 22 | dot_numerator = torch.mm(z1, z2.t()) 23 | dot_denominator = torch.mm(z1_norm, z2_norm.t()) 24 | sim_matrix = torch.exp(dot_numerator / dot_denominator / self.tau) 25 | return sim_matrix 26 | 27 | def forward(self, z_mp, z_sc, pos): 28 | z_proj_mp = self.proj(z_mp) 29 | z_proj_sc = self.proj(z_sc) 30 | matrix_mp2sc = self.sim(z_proj_mp, z_proj_sc) 31 | matrix_sc2mp = matrix_mp2sc.t() 32 | 33 | matrix_mp2sc = matrix_mp2sc/(torch.sum(matrix_mp2sc, dim=1).view(-1, 1) + 1e-8) 34 | lori_mp = -torch.log(matrix_mp2sc.mul(pos.to_dense()).sum(dim=-1)).mean() 35 | 36 | matrix_sc2mp = matrix_sc2mp / (torch.sum(matrix_sc2mp, dim=1).view(-1, 1) + 1e-8) 37 | lori_sc = -torch.log(matrix_sc2mp.mul(pos.to_dense()).sum(dim=-1)).mean() 38 | return self.lam * lori_mp + (1 - self.lam) * lori_sc 39 | -------------------------------------------------------------------------------- /BrainAlign/code/module/heco.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from .mp_encoder import Mp_encoder 4 | from .sc_encoder import Sc_encoder 5 | from .contrast import Contrast 6 | 7 | 8 | class HeCo(nn.Module): 9 | def __init__(self, hidden_dim, feats_dim_list, feat_drop, attn_drop, P, sample_rate, 10 | nei_num, tau, lam): 11 | 
super(HeCo, self).__init__() 12 | self.hidden_dim = hidden_dim 13 | self.fc_list = nn.ModuleList([nn.Linear(feats_dim, hidden_dim, bias=True) 14 | for feats_dim in feats_dim_list]) 15 | for fc in self.fc_list: 16 | nn.init.xavier_normal_(fc.weight, gain=1.414) 17 | 18 | if feat_drop > 0: 19 | self.feat_drop = nn.Dropout(feat_drop) 20 | else: 21 | self.feat_drop = lambda x: x 22 | self.mp = Mp_encoder(P, hidden_dim, attn_drop) 23 | self.sc = Sc_encoder(hidden_dim, sample_rate, nei_num, attn_drop) 24 | self.contrast = Contrast(hidden_dim, tau, lam) 25 | 26 | def forward(self, feats, pos, mps, nei_index): # p a s 27 | h_all = [] 28 | for i in range(len(feats)): 29 | h_all.append(F.elu(self.feat_drop(self.fc_list[i](feats[i])))) 30 | z_mp = self.mp(h_all[0], mps) 31 | z_sc = self.sc(h_all, nei_index) 32 | loss = self.contrast(z_mp, z_sc, pos) 33 | return loss 34 | 35 | def get_embeds(self, feats, mps): 36 | z_mp = F.elu(self.fc_list[0](feats[0])) 37 | z_mp = self.mp(z_mp, mps) 38 | return z_mp.detach() 39 | 40 | ''' 41 | def get_embeds(self, feats, mps): 42 | z_mp = F.elu(self.fc_list[0](feats[0])) 43 | z_mp = self.mp(z_mp, mps) 44 | embeds_list = [z_mp] 45 | for i in range(len(feats)-1): 46 | embeds_list.append(F.elu(self.fc_list[i+1](feats[i+1]))) 47 | return [x.detach() for x in embeds_list] 48 | ''' 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /BrainAlign/code/module/mp_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class GCN(nn.Module): 6 | def __init__(self, in_ft, out_ft, bias=True): 7 | super(GCN, self).__init__() 8 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 9 | self.act = nn.PReLU() 10 | 11 | if bias: 12 | self.bias = nn.Parameter(torch.FloatTensor(out_ft)) 13 | self.bias.data.fill_(0.0) 14 | else: 15 | self.register_parameter('bias', None) 16 | 17 | for m in self.modules(): 18 | self.weights_init(m) 19 | 20 | def weights_init(self, m): 21 | if isinstance(m, nn.Linear): 22 | nn.init.xavier_normal_(m.weight, gain=1.414) 23 | if m.bias is not None: 24 | m.bias.data.fill_(0.0) 25 | 26 | def forward(self, seq, adj): 27 | seq_fts = self.fc(seq) 28 | out = torch.spmm(adj, seq_fts) 29 | if self.bias is not None: 30 | out += self.bias 31 | return self.act(out) 32 | 33 | 34 | class Attention(nn.Module): 35 | def __init__(self, hidden_dim, attn_drop): 36 | super(Attention, self).__init__() 37 | self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True) 38 | nn.init.xavier_normal_(self.fc.weight, gain=1.414) 39 | 40 | self.tanh = nn.Tanh() 41 | self.att = nn.Parameter(torch.empty(size=(1, hidden_dim)), requires_grad=True) 42 | nn.init.xavier_normal_(self.att.data, gain=1.414) 43 | 44 | self.softmax = nn.Softmax() 45 | if attn_drop: 46 | self.attn_drop = nn.Dropout(attn_drop) 47 | else: 48 | self.attn_drop = lambda x: x 49 | 50 | def forward(self, embeds): 51 | beta = [] 52 | attn_curr = self.attn_drop(self.att) 53 | for embed in embeds: 54 | sp = self.tanh(self.fc(embed)).mean(dim=0) 55 | beta.append(attn_curr.matmul(sp.t())) 56 | beta = torch.cat(beta, dim=-1).view(-1) 57 | beta = self.softmax(beta) 58 | print("mp ", beta.data.cpu().numpy()) # semantic attention 59 | z_mp = 0 60 | for i in range(len(embeds)): 61 | z_mp += embeds[i]*beta[i] 62 | return z_mp 63 | 64 | 65 | class Mp_encoder(nn.Module): 66 | def __init__(self, P, hidden_dim, attn_drop): 67 | super(Mp_encoder, self).__init__() 68 | self.P = P 69 | self.node_level = 
nn.ModuleList([GCN(hidden_dim, hidden_dim) for _ in range(P)]) 70 | self.att = Attention(hidden_dim, attn_drop) 71 | 72 | def forward(self, h, mps): 73 | embeds = [] 74 | for i in range(self.P): 75 | embeds.append(self.node_level[i](h, mps[i])) 76 | z_mp = self.att(embeds) 77 | return z_mp 78 | -------------------------------------------------------------------------------- /BrainAlign/code/module/sc_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class inter_att(nn.Module): 8 | def __init__(self, hidden_dim, attn_drop): 9 | super(inter_att, self).__init__() 10 | self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True) 11 | nn.init.xavier_normal_(self.fc.weight, gain=1.414) 12 | 13 | self.tanh = nn.Tanh() 14 | self.att = nn.Parameter(torch.empty(size=(1, hidden_dim)), requires_grad=True) 15 | nn.init.xavier_normal_(self.att.data, gain=1.414) 16 | 17 | self.softmax = nn.Softmax() 18 | if attn_drop: 19 | self.attn_drop = nn.Dropout(attn_drop) 20 | else: 21 | self.attn_drop = lambda x: x 22 | 23 | def forward(self, embeds): 24 | beta = [] 25 | attn_curr = self.attn_drop(self.att) 26 | for embed in embeds: 27 | sp = self.tanh(self.fc(embed)).mean(dim=0) 28 | beta.append(attn_curr.matmul(sp.t())) 29 | beta = torch.cat(beta, dim=-1).view(-1) 30 | beta = self.softmax(beta) 31 | print("sc ", beta.data.cpu().numpy()) # type-level attention 32 | z_mc = 0 33 | for i in range(len(embeds)): 34 | z_mc += embeds[i] * beta[i] 35 | return z_mc 36 | 37 | 38 | class intra_att(nn.Module): 39 | def __init__(self, hidden_dim, attn_drop): 40 | super(intra_att, self).__init__() 41 | self.att = nn.Parameter(torch.empty(size=(1, 2*hidden_dim)), requires_grad=True) 42 | nn.init.xavier_normal_(self.att.data, gain=1.414) 43 | if attn_drop: 44 | self.attn_drop = nn.Dropout(attn_drop) 45 | else: 46 | self.attn_drop = lambda x: x 47 | 48 | self.softmax = nn.Softmax(dim=1) 49 | self.leakyrelu = nn.LeakyReLU() 50 | 51 | def forward(self, nei, h, h_refer): 52 | #print('nei.shape', nei.shape) 53 | #print('h.shape', h.shape) 54 | nei_emb = F.embedding(nei, h) 55 | #print('nei_emb.shape', nei_emb.shape) 56 | #print('h_refer.shape', h_refer.shape) 57 | h_refer = torch.unsqueeze(h_refer, 1) 58 | h_refer = h_refer.expand_as(nei_emb) 59 | all_emb = torch.cat([h_refer, nei_emb], dim=-1) 60 | attn_curr = self.attn_drop(self.att) 61 | att = self.leakyrelu(all_emb.matmul(attn_curr.t())) 62 | att = self.softmax(att) 63 | nei_emb = (att*nei_emb).sum(dim=1) 64 | return nei_emb 65 | 66 | 67 | class Sc_encoder(nn.Module): 68 | def __init__(self, hidden_dim, sample_rate, nei_num, attn_drop): 69 | super(Sc_encoder, self).__init__() 70 | self.intra = nn.ModuleList([intra_att(hidden_dim, attn_drop) for _ in range(nei_num)]) 71 | self.inter = inter_att(hidden_dim, attn_drop) 72 | self.sample_rate = sample_rate 73 | self.nei_num = nei_num 74 | 75 | def forward(self, nei_h, nei_index): 76 | embeds = [] 77 | for i in range(self.nei_num): 78 | sele_nei = [] 79 | sample_num = self.sample_rate[i] 80 | for per_node_nei in nei_index[i]: 81 | if len(per_node_nei) >= sample_num: 82 | select_one = torch.tensor(np.random.choice(per_node_nei, sample_num, 83 | replace=False))[np.newaxis] 84 | else: 85 | select_one = torch.tensor(np.random.choice(per_node_nei, sample_num, 86 | replace=True))[np.newaxis] 87 | sele_nei.append(select_one) 88 | sele_nei = torch.cat(sele_nei, dim=0)#.cuda() 89 | 
#print('sele_nei.shape', sele_nei.shape) 90 | one_type_emb = F.elu(self.intra[i](sele_nei, nei_h[i + 1], nei_h[0])) 91 | embeds.append(one_type_emb) 92 | z_mc = self.inter(embeds) 93 | return z_mc 94 | -------------------------------------------------------------------------------- /BrainAlign/code/predict.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/16 9:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : predict.py 6 | import numpy 7 | import torch 8 | from utils import load_data, set_params, evaluate 9 | from module import HeCo 10 | import warnings 11 | import datetime 12 | import pickle as pkl 13 | import os 14 | import random 15 | from utils.logger import setup_logger 16 | warnings.filterwarnings('ignore') 17 | args = set_params() 18 | if torch.cuda.is_available(): 19 | device = torch.device("cuda:" + str(args.gpu)) 20 | torch.cuda.set_device(args.gpu) 21 | else: 22 | device = torch.device("cpu") 23 | 24 | ## name of intermediate document ## 25 | own_str = args.dataset 26 | 27 | ## random seed ## 28 | seed = args.seed 29 | numpy.random.seed(seed) 30 | random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | 34 | def predict(save_path): 35 | 36 | logger = setup_logger("Build logging...", save_path, if_train=False) 37 | 38 | logger.info('Configs {}\n'.format(args)) 39 | nei_index, feats, mps, pos, label, idx_train, idx_val, idx_test = \ 40 | load_data(args.dataset, args.ratio, args.type_num) 41 | nb_classes = label.shape[-1] 42 | logger.info('number of classes = {}'.format(nb_classes)) 43 | feats_dim_list = [i.shape[1] for i in feats] 44 | P = int(len(mps)) 45 | logger.info("seed {}".format(args.seed)) 46 | logger.info("Dataset: {}".format(args.dataset)) 47 | logger.info("The number of meta-paths: {}".format(P)) 48 | model = HeCo(args.hidden_dim, feats_dim_list, args.feat_drop, args.attn_drop, 49 | P, args.sample_rate, args.nei_num, args.tau, args.lam) 50 | if torch.cuda.is_available(): 51 | logger.info('Using CUDA') 52 | model.cuda() 53 | feats = [feat.cuda() for feat in feats] 54 | mps = [mp.cuda() for mp in mps] 55 | 56 | model.load_state_dict(torch.load(save_path + 'HeCo_' + own_str + '.pkl')) 57 | model.eval() 58 | # os.remove('HeCo_'+own_str+'.pkl') 59 | embeds = model.get_embeds(feats, mps) 60 | 61 | if args.save_emb: 62 | if not os.path.exists(args.save_path + "./embeds/" + args.dataset + "/"): 63 | os.makedirs(args.save_path + "./embeds/" + args.dataset + "/") 64 | f = open(args.save_path + "./embeds/" + args.dataset + "/" + str(args.turn) + ".pkl", "wb") 65 | pkl.dump(embeds.cpu().data.numpy(), f) 66 | f.close() 67 | 68 | if __name__ == '__main__': 69 | save_path = '../data/mouse_human_sagittal/results/2022-10-14_11-40-11/' 70 | predict(save_path) -------------------------------------------------------------------------------- /BrainAlign/code/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import evaluate 2 | from .load_data import load_data 3 | from .params import set_params 4 | from .logreg import LogReg 5 | -------------------------------------------------------------------------------- /BrainAlign/code/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from .logreg import LogReg 4 | import torch.nn as nn 5 | from sklearn.metrics import f1_score 6 | from 
torch.nn.functional import softmax 7 | from sklearn.metrics import roc_auc_score 8 | 9 | 10 | ################################################## 11 | # This section of code adapted from pcy1302/DMGI # 12 | ################################################## 13 | 14 | def evaluate(embeds, ratio, idx_train, idx_val, idx_test, label, nb_classes, device, dataset, lr, wd 15 | , isTest=True): 16 | hid_units = embeds.shape[1] 17 | xent = nn.CrossEntropyLoss() 18 | 19 | train_embs = embeds[idx_train] 20 | val_embs = embeds[idx_val] 21 | test_embs = embeds[idx_test] 22 | 23 | train_lbls = torch.argmax(label[idx_train], dim=-1) 24 | val_lbls = torch.argmax(label[idx_val], dim=-1) 25 | test_lbls = torch.argmax(label[idx_test], dim=-1) 26 | accs = [] 27 | micro_f1s = [] 28 | macro_f1s = [] 29 | macro_f1s_val = [] 30 | auc_score_list = [] 31 | 32 | for _ in range(50): 33 | log = LogReg(hid_units, nb_classes) 34 | opt = torch.optim.Adam(log.parameters(), lr=lr, weight_decay=wd) 35 | log.to(device) 36 | 37 | val_accs = [] 38 | test_accs = [] 39 | val_micro_f1s = [] 40 | test_micro_f1s = [] 41 | val_macro_f1s = [] 42 | test_macro_f1s = [] 43 | 44 | logits_list = [] 45 | for iter_ in range(200): 46 | # train 47 | log.train() 48 | opt.zero_grad() 49 | 50 | logits = log(train_embs) 51 | loss = xent(logits, train_lbls) 52 | 53 | loss.backward() 54 | opt.step() 55 | 56 | # val 57 | logits = log(val_embs) 58 | preds = torch.argmax(logits, dim=1) 59 | 60 | val_acc = torch.sum(preds == val_lbls).float() / val_lbls.shape[0] 61 | val_f1_macro = f1_score(val_lbls.cpu(), preds.cpu(), average='macro') 62 | val_f1_micro = f1_score(val_lbls.cpu(), preds.cpu(), average='micro') 63 | 64 | val_accs.append(val_acc.item()) 65 | val_macro_f1s.append(val_f1_macro) 66 | val_micro_f1s.append(val_f1_micro) 67 | 68 | # test 69 | logits = log(test_embs) 70 | preds = torch.argmax(logits, dim=1) 71 | 72 | test_acc = torch.sum(preds == test_lbls).float() / test_lbls.shape[0] 73 | test_f1_macro = f1_score(test_lbls.cpu(), preds.cpu(), average='macro') 74 | test_f1_micro = f1_score(test_lbls.cpu(), preds.cpu(), average='micro') 75 | 76 | test_accs.append(test_acc.item()) 77 | test_macro_f1s.append(test_f1_macro) 78 | test_micro_f1s.append(test_f1_micro) 79 | logits_list.append(logits) 80 | 81 | max_iter = val_accs.index(max(val_accs)) 82 | accs.append(test_accs[max_iter]) 83 | max_iter = val_macro_f1s.index(max(val_macro_f1s)) 84 | macro_f1s.append(test_macro_f1s[max_iter]) 85 | macro_f1s_val.append(val_macro_f1s[max_iter]) 86 | 87 | max_iter = val_micro_f1s.index(max(val_micro_f1s)) 88 | micro_f1s.append(test_micro_f1s[max_iter]) 89 | 90 | # auc 91 | best_logits = logits_list[max_iter] 92 | best_proba = softmax(best_logits, dim=1) 93 | auc_score_list.append(roc_auc_score(y_true=test_lbls.detach().cpu().numpy(), 94 | y_score=best_proba.detach().cpu().numpy(), 95 | multi_class='ovr' 96 | )) 97 | 98 | if isTest: 99 | print("\t[Classification] Macro-F1_mean: {:.4f} var: {:.4f} Micro-F1_mean: {:.4f} var: {:.4f} auc {:.4f}" 100 | .format(np.mean(macro_f1s), 101 | np.std(macro_f1s), 102 | np.mean(micro_f1s), 103 | np.std(micro_f1s), 104 | np.mean(auc_score_list), 105 | np.std(auc_score_list) 106 | ) 107 | ) 108 | else: 109 | return np.mean(macro_f1s_val), np.mean(macro_f1s) 110 | 111 | f = open("result_"+dataset+str(ratio)+".txt", "a") 112 | f.write(str(np.mean(macro_f1s))+"\t"+str(np.mean(micro_f1s))+"\t"+str(np.mean(auc_score_list))+"\n") 113 | f.close() 114 | -------------------------------------------------------------------------------- 
/BrainAlign/code/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | elif if_train == False: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_analysis_log.txt"), mode='w') 23 | elif if_train == None: 24 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S") + "_process_log.txt"), 25 | mode='w') 26 | fh.setLevel(logging.DEBUG) 27 | fh.setFormatter(formatter) 28 | logger.addHandler(fh) 29 | 30 | return logger -------------------------------------------------------------------------------- /BrainAlign/code/utils/logreg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LogReg(nn.Module): 6 | def __init__(self, ft_in, nb_classes): 7 | super(LogReg, self).__init__() 8 | self.fc = nn.Linear(ft_in, nb_classes) 9 | 10 | for m in self.modules(): 11 | self.weights_init(m) 12 | 13 | def weights_init(self, m): 14 | if isinstance(m, nn.Linear): 15 | torch.nn.init.xavier_uniform_(m.weight.data) 16 | if m.bias is not None: 17 | m.bias.data.fill_(0.0) 18 | 19 | def forward(self, seq): 20 | ret = self.fc(seq) 21 | return ret 22 | -------------------------------------------------------------------------------- /BrainAlign/data/SlideseqV2_mouse_macaque_hippocampus/Data/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "91f07b93-f89e-41c8-b310-799ee1869995", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Gene name Mouse gene name Mouse homology type\n", 14 | "0 ZNF692 Zfp692 ortholog_one2one\n", 15 | "1 ZNF672 Zfp672 ortholog_one2one\n", 16 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 17 | "3 NaN Bend2 ortholog_many2many\n", 18 | "4 LYPD8 Lypd8 ortholog_one2one\n", 19 | "... ... ... ...\n", 20 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 21 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 22 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 23 | "24964 ND6 mt-Nd6 ortholog_one2many\n", 24 | "24965 CYTB mt-Cytb ortholog_one2one\n", 25 | "\n", 26 | "[24966 rows x 3 columns]\n", 27 | " Gene name Mouse gene name Mouse homology type\n", 28 | "0 ZNF692 Zfp692 ortholog_one2one\n", 29 | "1 ZNF672 Zfp672 ortholog_one2one\n", 30 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 31 | "4 LYPD8 Lypd8 ortholog_one2one\n", 32 | "8 NaN Lypd9 ortholog_one2one\n", 33 | "... ... ... 
...\n", 34 | "24960 ND3 mt-Nd3 ortholog_one2one\n", 35 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 36 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 37 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 38 | "24965 CYTB mt-Cytb ortholog_one2one\n", 39 | "\n", 40 | "[16080 rows x 3 columns]\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import pandas as pd\n", 46 | "\n", 47 | "file_path = './Macaque_Mouse.tsv'\n", 48 | "df = pd.read_csv(file_path, sep='\\t')\n", 49 | "\n", 50 | "print(df)\n", 51 | "df.to_csv('./Macaque_Mouse_multi2multi.csv', index=False)\n", 52 | "\n", 53 | "df = df[df['Mouse homology type'].isin(['ortholog_one2one'])]\n", 54 | "print(df)\n", 55 | "\n", 56 | "df.to_csv('./Macaque_Mouse_one2one.csv', index=False)\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "d42fc7ae-1696-4ed4-a779-884f08b58ee0", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.18" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /BrainAlign/data/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/5/14 10:05 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | # @Description: This file is used to ... 
7 | -------------------------------------------------------------------------------- /BrainAlign/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold_array(X): 5 | ''' 6 | input: array row: sample, column: gene vector 7 | output: a binary matrix 8 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 9 | ''' 10 | return X > np.mean(X, axis=0) 11 | 12 | def threshold_quantile(X, quantile=0.8): 13 | ''' 14 | input: array row: sample, column: gene vector 15 | output: a binary matrix 16 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 17 | ''' 18 | return X > np.quantile(X, quantile, axis=0) 19 | 20 | 21 | def threshold_top(X, percent=1): 22 | ''' 23 | input: array row: sample, column: gene vector 24 | output: a binary matrix 25 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 26 | ''' 27 | #topk = int(round(X.shape[0] * percent)) 28 | topk = percent 29 | #print(topk) 30 | #topk_pos = X.shape[0] - topk 31 | X_sort = np.sort(X, axis=0) 32 | return X >= X_sort[-topk, :] 33 | 34 | 35 | def threshold_array_nonzero(X): 36 | ''' 37 | input: array row: sample, column: gene vector 38 | output: a binary matrix 39 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 40 | ''' 41 | return X > 0 42 | 43 | 44 | 45 | if __name__ == '__main__': 46 | X = np.array([[1,2,3],[2,3,4], [2,3,4], [4,5,2], [7,26,10]]) 47 | print(X) 48 | print(threshold_top(X, percent=0.4)) 49 | #print(threshold_array(X)) -------------------------------------------------------------------------------- /BrainAlign/data/load_node_feature_mouse_human.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import pickle 4 | 5 | ''' 6 | The 1st kind of method to get initial embeddings: use original sample/voxel expression data. 7 | ''' 8 | 9 | def extract_expression_embedding(): 10 | return 0 11 | 12 | 13 | ''' 14 | The 2nd kind of method to get initial embeddings: use embeddings output by CAME. 
15 | ''' 16 | def extract_came_embedding(): 17 | 18 | return 0 19 | 20 | 21 | 22 | def init_embedding(method='CAME'): 23 | if method == 'CAME': 24 | extract_came_embedding() 25 | elif method == 'Expression': 26 | extract_expression_embedding() 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 | path_datapiar = '../../../CAME/brain_human_mouse/(\'Baron_mouse\', \'Baron_human\')-(06-19 16.19.17)/datapair_init.pickle' 32 | path_datapiar_file = open(path_datapiar, 'rb') 33 | datapair = pickle.load(path_datapiar_file) 34 | print(datapair) 35 | print(datapair['features'][0].shape) 36 | print(datapair['features'][1].shape) 37 | 38 | print(datapair['varnames_feat']) 39 | 40 | 41 | 42 | ''' 43 | nei = np.load('./dblp/nei_p.npy', allow_pickle=True) 44 | print(nei) 45 | print(nei.shape) 46 | print(nei[0].shape) 47 | for arr in nei: 48 | print(arr.shape) 49 | ''' 50 | 51 | ''' 52 | p_feat = sp.load_npz('./dblp/p_feat.npz') 53 | print(p_feat.shape) 54 | a_feat = sp.load_npz('./dblp/a_feat.npz') 55 | print(a_feat.shape) 56 | t_feat = np.load('./dblp/t_feat.npz') 57 | print(t_feat.shape) 58 | ''' 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /BrainAlign/data/mouse_macaque_hippocampus/Data/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "91f07b93-f89e-41c8-b310-799ee1869995", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Gene name Mouse gene name Mouse homology type\n", 14 | "0 ZNF692 Zfp692 ortholog_one2one\n", 15 | "1 ZNF672 Zfp672 ortholog_one2one\n", 16 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 17 | "3 NaN Bend2 ortholog_many2many\n", 18 | "4 LYPD8 Lypd8 ortholog_one2one\n", 19 | "... ... ... ...\n", 20 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 21 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 22 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 23 | "24964 ND6 mt-Nd6 ortholog_one2many\n", 24 | "24965 CYTB mt-Cytb ortholog_one2one\n", 25 | "\n", 26 | "[24966 rows x 3 columns]\n", 27 | " Gene name Mouse gene name Mouse homology type\n", 28 | "0 ZNF692 Zfp692 ortholog_one2one\n", 29 | "1 ZNF672 Zfp672 ortholog_one2one\n", 30 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 31 | "4 LYPD8 Lypd8 ortholog_one2one\n", 32 | "8 NaN Lypd9 ortholog_one2one\n", 33 | "... ... ... 
...\n", 34 | "24960 ND3 mt-Nd3 ortholog_one2one\n", 35 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 36 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 37 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 38 | "24965 CYTB mt-Cytb ortholog_one2one\n", 39 | "\n", 40 | "[16080 rows x 3 columns]\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import pandas as pd\n", 46 | "\n", 47 | "file_path = './Macaque_Mouse.tsv'\n", 48 | "df = pd.read_csv(file_path, sep='\\t')\n", 49 | "\n", 50 | "print(df)\n", 51 | "df.to_csv('./Macaque_Mouse_multi2multi.csv', index=False)\n", 52 | "\n", 53 | "df = df[df['Mouse homology type'].isin(['ortholog_one2one'])]\n", 54 | "print(df)\n", 55 | "\n", 56 | "df.to_csv('./Macaque_Mouse_one2one.csv', index=False)\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "d42fc7ae-1696-4ed4-a779-884f08b58ee0", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.18" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /BrainAlign/data/mp_gen.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | #################################################### 5 | # This tool is to generate meta-path based adjacency 6 | # matrix given original links. 7 | #################################################### 8 | 9 | pa = np.genfromtxt("./dblp/pa.txt") 10 | pc = np.genfromtxt("./dblp/pc.txt") 11 | pt = np.genfromtxt("./dblp/pt.txt") 12 | 13 | A = 4057 14 | P = 14328 15 | C = 20 16 | T = 7723 17 | 18 | pa_ = sp.coo_matrix((np.ones(pa.shape[0]),(pa[:,0], pa[:, 1])),shape=(P,A)).toarray() 19 | pc_ = sp.coo_matrix((np.ones(pc.shape[0]),(pc[:,0], pc[:, 1])),shape=(P,C)).toarray() 20 | pt_ = sp.coo_matrix((np.ones(pt.shape[0]),(pt[:,0], pt[:, 1])),shape=(P,T)).toarray() 21 | 22 | apa = np.matmul(pa_.T, pa_) > 0 23 | apa = sp.coo_matrix(apa) 24 | sp.save_npz("./dblp/apa.npz", apa) 25 | 26 | apc = np.matmul(pa_.T, pc_) > 0 27 | apcpa = np.matmul(apc, apc.T) > 0 28 | apcpa = sp.coo_matrix(apcpa) 29 | sp.save_npz("./dblp/apcpa.npz", apcpa) 30 | 31 | apt = np.matmul(pa_.T, pt_) > 0 32 | aptpa = np.matmul(apt, apt.T) > 0 33 | aptpa = sp.coo_matrix(aptpa) 34 | sp.save_npz("./dblp/aptpa.npz", aptpa) 35 | -------------------------------------------------------------------------------- /BrainAlign/data/mp_gen_mouse_human.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import pickle 4 | #################################################### 5 | # This tool is to generate meta-path based adjacency 6 | # matrix given original links. 
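# Meta-path adjacencies are built by chaining the bipartite link matrices loaded
# from the CAME datapair below: np.matmul(sm_, sm_.T) (saved as sms) links two mouse
# samples that express a common mouse gene, while smhvhms chains
# sample -> mouse gene -> homologous human gene -> human voxel and back, linking two
# samples that reach a common human voxel. The chained products, optionally binarised
# with "> 0" and saved as sparse .npz files, are kept in the commented-out blocks at
# the end of this script; mp_gen.py applies the same pattern to the DBLP data.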
7 | #################################################### 8 | 9 | 10 | path_datapiar = '../../../CAME/brain_mouse_human_no_threshold/datapair_init.pickle' 11 | path_datapiar_file = open(path_datapiar, 'rb') 12 | datapair = pickle.load(path_datapiar_file) 13 | print(datapair) 14 | print(len(datapair['varnames_node'][0])) 15 | np.save('./mouse_human/mouse_gene_names.npy', datapair['varnames_node'][0]) 16 | print(len(datapair['varnames_node'][1])) 17 | np.save('./mouse_human/human_gene_names.npy', datapair['varnames_node'][1]) 18 | 19 | 20 | 21 | S = 72968 22 | M = 2578 23 | H = 3326 24 | V = 3682 25 | 26 | sm_ = datapair['ov_adjs'][0].toarray() 27 | print(sm_) 28 | print('sm_', sm_.shape) 29 | 30 | vh_ = datapair['ov_adjs'][1].toarray() 31 | print('vh_', vh_.shape) 32 | mm_ = datapair['vv_adj'].toarray()[0:M, 0:M] 33 | print('mm_', mm_.shape) 34 | print('mm_ sum', np.sum(mm_)) 35 | hh_ = datapair['vv_adj'].toarray()[M:, M:] 36 | print('hh_', hh_.shape) 37 | print('hh_ sum', np.sum(hh_)) # == 0 38 | mh_ = datapair['vv_adj'].toarray()[0:M, M:] 39 | print('mh_', mh_.shape) 40 | #ss_ = datapair['oo_adjs'].toarray()[0:S, 0:S] 41 | #print('ss_', ss_.shape) 42 | #print('ss_ sum', np.sum(ss_)) # == 0 43 | vv_ = datapair['oo_adjs'].toarray()[S:, S:] 44 | print('vv_', vv_.shape) 45 | print(np.sum(vv_)) 46 | print('vv_ sum', np.sum(vv_)) 47 | sv_ = datapair['oo_adjs'].toarray()[0:S, S:] 48 | print('sv_', sv_.shape) 49 | 50 | ''' 51 | sms = np.matmul(sm_, sm_.T) # > 0 52 | print(sms) 53 | sms = sp.coo_matrix(sms) 54 | sp.save_npz("./mouse_human/sms.npz", sms) 55 | 56 | 57 | smh = np.matmul(sm_, mh_) #> 0 58 | smhv = np.matmul(smh, vh_.T) #> 0 59 | smhvhms = np.matmul(smhv, smhv.T) #> 0 60 | print(smhvhms) 61 | smhvhms = sp.coo_matrix(smhvhms) 62 | sp.save_npz("./mouse_human/smhvhms.npz", smhvhms) 63 | 64 | smh = np.matmul(sm_, mh_) #> 0 65 | smhv = np.matmul(smh, vh_.T) #> 0 66 | smhvv = np.matmul(smhv, vv_) #> 0 67 | smhvvhms = np.matmul(smhv, smhvv.T) #> 0 68 | print(smhvvhms) 69 | smhvvhms = sp.coo_matrix(smhvvhms) 70 | sp.save_npz("./mouse_human/smhvvhms.npz", smhvvhms) 71 | 72 | 73 | ''' 74 | 75 | 76 | 77 | ''' 78 | sms = sp.csr_matrix(sm_).dot( sp.csr_matrix(sm_.T) ).toarray() > 0 79 | sms = sp.csr_matrix(sms) 80 | sp.save_npz("./mouse_human/sms.npz", sms) 81 | 82 | smh = sp.csr_matrix(sm_).dot(sp.csr_matrix(mh_)) > 0 83 | smhv = smh.dot(sp.csr_matrix(vh_.T)) > 0 84 | smhvhms = smhv.dot(smh.T) > 0 85 | smhvhms = sp.coo_matrix(smhvhms) 86 | sp.save_npz("./mouse_human/smhvhms.npz", smhvhms) 87 | 88 | #smh = np.matmul(sm_, mh_) > 0 89 | #smhv = np.matmul(smh, vh_.T) > 0 90 | smhvvhms = smhv.dot(smhv.T) > 0 91 | smhvvhms = sp.coo_matrix(smhvvhms) 92 | sp.save_npz("./mouse_human/smhvvhms.npz", smhvvhms) 93 | ''' 94 | 95 | 96 | 97 | 98 | 99 | ''' 100 | pa = np.genfromtxt("./dblp/pa.txt") 101 | pc = np.genfromtxt("./dblp/pc.txt") 102 | pt = np.genfromtxt("./dblp/pt.txt") 103 | 104 | A = 4057 105 | P = 14328 106 | C = 20 107 | T = 7723 108 | 109 | pa_ = sp.coo_matrix((np.ones(pa.shape[0]),(pa[:,0], pa[:, 1])),shape=(P,A)).toarray() 110 | print(pa_.shape) 111 | pc_ = sp.coo_matrix((np.ones(pc.shape[0]),(pc[:,0], pc[:, 1])),shape=(P,C)).toarray() 112 | print(pc_.shape) 113 | pt_ = sp.coo_matrix((np.ones(pt.shape[0]),(pt[:,0], pt[:, 1])),shape=(P,T)).toarray() 114 | print(pt_.shape) 115 | 116 | 117 | apa = np.matmul(pa_.T, pa_) > 0 118 | apa = sp.coo_matrix(apa) 119 | sp.save_npz("./dblp/apa.npz", apa) 120 | 121 | apc = np.matmul(pa_.T, pc_) > 0 122 | apcpa = np.matmul(apc, apc.T) > 0 123 | apcpa = 
sp.coo_matrix(apcpa) 124 | sp.save_npz("./dblp/apcpa.npz", apcpa) 125 | 126 | apt = np.matmul(pa_.T, pt_) > 0 127 | aptpa = np.matmul(apt, apt.T) > 0 128 | aptpa = sp.coo_matrix(aptpa) 129 | sp.save_npz("./dblp/aptpa.npz", aptpa) 130 | ''' -------------------------------------------------------------------------------- /BrainAlign/data/neibor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | #################################################### 5 | # This tool is to collect neighbors, and reform them 6 | # as numpy.array style for futher usage. 7 | #################################################### 8 | 9 | # This is for DBLP 10 | pa = np.genfromtxt("./dblp/pa.txt") 11 | a_n = {} 12 | for i in pa: 13 | if i[1] not in a_n: 14 | a_n[int(i[1])]=[] 15 | a_n[int(i[1])].append(int(i[0])) 16 | else: 17 | a_n[int(i[1])].append(int(i[0])) 18 | 19 | keys = sorted(a_n.keys()) 20 | a_n = [a_n[i] for i in keys] 21 | a_n = np.array([np.array(i) for i in a_n]) 22 | np.save("nei_p.npy", a_n) 23 | print(a_n.shape) 24 | 25 | # give some basic statistics about neighbors 26 | l = [len(i) for i in a_n] 27 | print(max(l),min(l),np.mean(l)) 28 | 29 | 30 | 31 | 32 | # This is for ACM, Freebase, AMiner 33 | pa = np.genfromtxt("./aminer/pa.txt") 34 | p_n = {} 35 | for i in pa: 36 | if i[0] not in p_n: 37 | p_n[int(i[0])]=[] 38 | p_n[int(i[0])].append(int(i[1])) 39 | else: 40 | p_n[int(i[0])].append(int(i[1])) 41 | 42 | keys = sorted(p_n.keys()) 43 | p_n = [p_n[i] for i in keys] 44 | p_n = np.array([np.array(i) for i in p_n]) 45 | np.save("nei_a.npy", p_n) 46 | print(p_n.shape) 47 | # give some basic statistics about neighbors 48 | l = [len(i) for i in p_n] 49 | print(max(l),min(l),np.mean(l)) 50 | -------------------------------------------------------------------------------- /BrainAlign/data/pos.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from collections import Counter 4 | 5 | #################################################### 6 | # This tool is to generate positive set with a thre- 7 | # shold "pos_num". 8 | # dataset pos_num 9 | # acm 5 10 | # dblp 1000 11 | # aminer 15 12 | # freebase 80 13 | # 14 | # 15 | # Notice: The best pos_num of acm is 7 reported in 16 | # paper, but we find there is no much difference 17 | # between 5 and 7 in practice. 
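# Construction used below: each meta-path adjacency (pap, psp) is row-normalised,
# the normalised matrices are summed, and for every node the pos_num neighbours with
# the largest combined weight are kept as positives (a node with at most pos_num
# non-zero neighbours keeps all of them). Example with pos_num = 5: a paper whose
# combined row has 8 non-zero entries keeps its 5 strongest neighbours, while a
# paper with only 3 non-zero entries keeps all 3.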
18 | #################################################### 19 | 20 | pos_num = 5 21 | p = 4019 22 | pap = sp.load_npz("./acm/pap.npz") 23 | pap = pap / pap.sum(axis=-1).reshape(-1,1) 24 | print(pap) 25 | psp = sp.load_npz("./acm/psp.npz") 26 | psp = psp / psp.sum(axis=-1).reshape(-1,1) 27 | print(psp) 28 | all = (pap + psp).A.astype("float32") 29 | print(all) 30 | all_ = (all>0).sum(-1) 31 | print(all_.max(),all_.min(),all_.mean()) 32 | 33 | pos = np.zeros((p,p)) 34 | k=0 35 | for i in range(len(all)): 36 | one = all[i].nonzero()[0] 37 | if len(one) > pos_num: 38 | oo = np.argsort(-all[i, one]) 39 | sele = one[oo[:pos_num]] 40 | pos[i, sele] = 1 41 | k+=1 42 | else: 43 | pos[i, one] = 1 44 | pos = sp.coo_matrix(pos) 45 | print(pos) 46 | print(type(pos)) 47 | sp.save_npz("pos.npz", pos) 48 | -------------------------------------------------------------------------------- /BrainAlign/data/script_labels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | if __name__ == '__main__': 5 | ''' 6 | path = './acm/labels.npy' 7 | x = np.load(path).astype('int32') 8 | print(x) 9 | print(len(x)) 10 | print('max = ', max(x), 'min = ', min(x)) 11 | ''' 12 | database = 'aminer' 13 | 14 | path = './{}/train_20.npy'.format(database) 15 | x = np.load(path) 16 | print(x) 17 | print('train:', x.shape) 18 | path = './{}/train_40.npy'.format(database) 19 | x = np.load(path) 20 | #print(x) 21 | print('train:', x.shape) 22 | path = './{}/train_60.npy'.format(database) 23 | x = np.load(path) 24 | #print(x) 25 | print('train:', x.shape) 26 | 27 | path = './{}/test_20.npy'.format(database) 28 | x = np.load(path) 29 | #print(x) 30 | print('test:', x.shape) 31 | path = './{}/test_40.npy'.format(database) 32 | x = np.load(path) 33 | #print(x) 34 | print('test:', x.shape) 35 | path = './{}/test_60.npy'.format(database) 36 | x = np.load(path) 37 | #print(x) 38 | print('test:', x.shape) 39 | 40 | path = './{}/val_20.npy'.format(database) 41 | x = np.load(path) 42 | #print(x) 43 | print('val:', x.shape) 44 | path = './{}/val_40.npy'.format(database) 45 | x = np.load(path) 46 | #print(x) 47 | print('val:', x.shape) 48 | path = './{}/val_60.npy'.format(database) 49 | x = np.load(path) 50 | #print(x) 51 | print('val:', x.shape) 52 | 53 | database = 'freebase' 54 | 55 | path = './{}/train_20.npy'.format(database) 56 | x = np.load(path) 57 | print(x) 58 | print('train:', x.shape) 59 | path = './{}/train_40.npy'.format(database) 60 | x = np.load(path) 61 | # print(x) 62 | print('train:', x.shape) 63 | path = './{}/train_60.npy'.format(database) 64 | x = np.load(path) 65 | # print(x) 66 | print('train:', x.shape) 67 | 68 | path = './{}/test_20.npy'.format(database) 69 | x = np.load(path) 70 | # print(x) 71 | print('test:', x.shape) 72 | path = './{}/test_40.npy'.format(database) 73 | x = np.load(path) 74 | # print(x) 75 | print('test:', x.shape) 76 | path = './{}/test_60.npy'.format(database) 77 | x = np.load(path) 78 | # print(x) 79 | print('test:', x.shape) 80 | 81 | path = './{}/val_20.npy'.format(database) 82 | x = np.load(path) 83 | # print(x) 84 | print('val:', x.shape) 85 | path = './{}/val_40.npy'.format(database) 86 | x = np.load(path) 87 | # print(x) 88 | print('val:', x.shape) 89 | path = './{}/val_60.npy'.format(database) 90 | x = np.load(path) 91 | # print(x) 92 | print('val:', x.shape) -------------------------------------------------------------------------------- /BrainAlign/demo/subsample.py: 
-------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/2/1 18:13 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : subsample.py 6 | # @Description: This file is subsampling the origin data 7 | import scanpy as sc 8 | import numpy as np 9 | 10 | def obs_key_wise_subsampling(adata, obs_key, N): 11 | ''' 12 | Subsample each class to same cell numbers (N). Classes are given by obs_key pointing to categorical in adata.obs. 13 | ''' 14 | counts = adata.obs[obs_key].value_counts() 15 | # subsample indices per group defined by obs_key 16 | indices = [np.random.choice(adata.obs_names[adata.obs[obs_key]==group], size=N, replace=True).unique() for group in counts.index] 17 | selection = np.hstack(np.array(indices)) 18 | return adata[selection].copy() 19 | 20 | if __name__ == '__main__': 21 | mouse_h5ad_file = 'G:/backup/CAME/brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 22 | mouse_adata = sc.read_h5ad(mouse_h5ad_file) 23 | #mouse_adata = sc.pp.subsample(mouse_adata, fraction=0.1, copy=True) 24 | 25 | 26 | target_cells = 20 27 | 28 | adatas = [mouse_adata[mouse_adata.obs['region_name'].isin([clust])] for clust in mouse_adata.obs['region_name'].cat.categories] 29 | 30 | for dat in adatas: 31 | if dat.n_obs > target_cells: 32 | sc.pp.subsample(dat, fraction=0.1) 33 | 34 | adata_downsampled = adatas[0].concatenate(*adatas[1:]) 35 | 36 | print(adata_downsampled) 37 | print(adata_downsampled.obs['region_name'].value_counts()) 38 | 39 | adata_downsampled.write_h5ad("./mouse_2020sa_64regions_demo.h5ad") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Saul Goodenough 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /readme_figs/alldatasets/all_dataset_seurate_alignment_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/alldatasets/all_dataset_seurate_alignment_score.png -------------------------------------------------------------------------------- /readme_figs/alldatasets/all_dataset_umap_integration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/alldatasets/all_dataset_umap_integration.png -------------------------------------------------------------------------------- /readme_figs/subsampled/subsampled_seurate_alignment_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/subsampled/subsampled_seurate_alignment_score.png -------------------------------------------------------------------------------- /readme_figs/subsampled/subsampled_umap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/subsampled/subsampled_umap.png -------------------------------------------------------------------------------- /requirements_pip.txt: -------------------------------------------------------------------------------- 1 | adjustText==0.8 2 | anndata==0.8.0 3 | brotlipy==0.7.0 4 | click==8.1.3 5 | colorama==0.4.6 6 | colorcet==3.0.1 7 | colorlog==6.7.0 8 | colormap==1.0.4 9 | contourpy==1.0.7 10 | cycler==0.11.0 11 | dgl==1.0.1 12 | easydev==0.12.1 13 | fitter==1.5.2 14 | fonttools==4.38.0 15 | gseapy==1.0.4 16 | h5py==3.8.0 17 | igraph==0.10.4 18 | imbalanced-learn==0.10.1 19 | imblearn==0.0 20 | importlib-metadata==6.0.0 21 | importlib-resources==5.12.0 22 | joblib==1.2.0 23 | kaleido==0.2.1 24 | kiwisolver==1.4.4 25 | leidenalg==0.9.1 26 | llvmlite==0.39.1 27 | matplotlib==3.7.0 28 | matplotlib-venn==0.11.9 29 | mkl-fft==1.3.1 30 | mkl-service==2.4.0 31 | natsort==8.2.0 32 | networkx==3.0 33 | numba==0.56.4 34 | packaging==23.0 35 | pandas==1.5.3 36 | param==1.12.3 37 | patsy==0.5.3 38 | pexpect==4.8.0 39 | Pillow==9.4.0 40 | plotly==5.14.0 41 | psutil==5.9.4 42 | ptyprocess==0.7.0 43 | pyct==0.5.0 44 | pynndescent==0.5.8 45 | pyparsing==3.0.9 46 | python-dateutil==2.8.2 47 | python-igraph==0.10.4 48 | pytz==2022.7.1 49 | scanpy==1.9.2 50 | scikit-learn==1.2.1 51 | scipy==1.10.1 52 | seaborn==0.12.2 53 | session-info==1.0.0 54 | statannot==0.2.3 55 | statsmodels==0.13.5 56 | stdlib-list==0.8.0 57 | tenacity==8.2.2 58 | texttable==1.6.7 59 | threadpoolctl==3.1.0 60 | torch==1.13.1 61 | torchaudio==0.13.1 62 | torchvision==0.14.1 63 | tqdm==4.64.1 64 | treelib==1.6.4 65 | umap-learn==0.5.3 66 | webcolors==1.13 67 | xgboost==1.7.4 68 | yacs==0.1.8 69 | zipp==3.15.0 70 | ipywidgets==8.1.2 -------------------------------------------------------------------------------- /run_came/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/19 20:43 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | 
-------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/H_run_came-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "befa887d-0cc5-4d4d-a7b1-1a21dfb4e5f1", 7 | "metadata": { 8 | "scrolled": true 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "# -- coding: utf-8 --\n", 13 | "\n", 14 | "\n", 15 | "import warnings\n", 16 | "warnings.filterwarnings(\"ignore\")\n", 17 | "import sys\n", 18 | "sys.path.append('../')\n", 19 | "\n", 20 | "from analysis_utils import ttest_plot_utils\n", 21 | "from analysis_utils import homo_random_config as config\n", 22 | "import os\n", 23 | "\n", 24 | "\n", 25 | "if __name__ == '__main__':\n", 26 | "\n", 27 | " cfg = config._C\n", 28 | " #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device\n", 29 | " #cfg.CAME.n_top_genes = 1000\n", 30 | " cfg.CAME.visible_device = '0'\n", 31 | " n_top_genes_list = [2000]\n", 32 | "\n", 33 | " cfg.CAME.n_top_genes = n_top_genes_list[0]\n", 34 | " cfg.CAME.sparse = False\n", 35 | " cfg.CAME.do_normalize = [False, False]\n", 36 | " cfg.CAME.ROOT = '../analysis_results/macaque_mouse_hippocampus/'\n", 37 | " cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad'\n", 38 | " cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad'\n", 39 | "\n", 40 | " cfg.CAME.path_labels_2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv'\n", 41 | " cfg.CAME.path_labels_1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv'\n", 42 | "\n", 43 | " cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv'\n", 44 | " # ttest_plot_utils.run_came_homo_random(cfg)\n", 45 | "\n", 46 | " cfg.CAME.path_varmap = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque_Mouse_multi2multi.csv'\n", 47 | " cfg.CAME.path_varmap_1v1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque_Mouse_one2one.csv'\n", 48 | "\n", 49 | " cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1\n", 50 | " cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2\n", 51 | "\n", 52 | " ttest_plot_utils.run_came_homo_random(cfg)\n", 53 | "\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "d1071b72-043c-46aa-a9e0-541dc7a15ede", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "env_came", 68 | "language": "python", 69 | "name": "env_came" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.8.18" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 5 86 | } 87 | -------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/H_run_came-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | 
sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [True, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_macaque_hippocampus/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv' 34 | cfg.CAME.path_human_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | ttest_plot_utils.run_came_homo_random(cfg) 48 | 49 | -------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/run_came-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = 
'../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/H_run_came.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [True, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_macaque_hippocampus/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv' 34 | cfg.CAME.path_human_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | ttest_plot_utils.run_came_homo_random(cfg) 48 | 49 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_mouse_region_tree.R: -------------------------------------------------------------------------------- 1 | #@Time : 2022/12/18 22:47 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : 
load_mouse_region_tree.r 5 | #@Description: This file is used to ... 6 | 7 | 8 | # Packages ------------------------------------------------------------------- 9 | 10 | suppressPackageStartupMessages(library(tidyverse)) 11 | suppressPackageStartupMessages(library(data.tree)) 12 | suppressPackageStartupMessages(library(rjson)) 13 | suppressPackageStartupMessages(library(optparse)) 14 | 15 | working_dir <- getwd() 16 | 17 | path_tree_tools <- '../analysis_utils/tree_tools.R' 18 | fileTree <- '../brain_mouse_2020sa/DSURQE_tree.json' 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_part_expression.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/1 18:38 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : load_part_expression.py 6 | import sys 7 | 8 | import pandas as pd 9 | 10 | sys.path.append('..') 11 | from analysis_utils import ttest_plot_utils 12 | from analysis_utils import homo_random_config as config 13 | import os 14 | import scanpy as sc 15 | 16 | if __name__ == "__main__": 17 | cfg = config._C 18 | 19 | mouse_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata1) 20 | mouse_region_list = set(list(pd.read_csv(cfg.PROCESS.path_mouse_labels)['region_name'])) 21 | print(mouse_region_list) 22 | mouse_h5ad_part = mouse_all_h5ad[mouse_all_h5ad.obs['region_name'].isin(mouse_region_list)] 23 | print(mouse_h5ad_part) 24 | mouse_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata1_part) 25 | 26 | human_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata2) 27 | human_region_list = set(list(pd.read_csv(cfg.PROCESS.path_human_labels)['region_name'])) 28 | print(human_region_list) 29 | human_h5ad_part = human_all_h5ad[human_all_h5ad.obs['region_name'].isin(human_region_list)] 30 | print(human_h5ad_part) 31 | human_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata2_part) 32 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_part_expression_6regions.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/1 18:38 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : load_part_expression.py 6 | import sys 7 | 8 | import pandas as pd 9 | 10 | sys.path.append('..') 11 | from analysis_utils import ttest_plot_utils 12 | from analysis_utils import homo_random_config as config 13 | import os 14 | import scanpy as sc 15 | 16 | if __name__ == "__main__": 17 | cfg = config._C 18 | 19 | cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_6regions.csv' 20 | cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_6regions.csv' 21 | 22 | cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/6regions_mouse_brain_region_67_sagittal.h5ad' 23 | cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/6regions_human_brain_region_88_sparse_with3d.h5ad' 24 | 25 | mouse_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata1) 26 | mouse_region_list = set(list(pd.read_csv(cfg.PROCESS.path_mouse_labels)['region_name'])) 27 | print(mouse_region_list) 28 | mouse_h5ad_part = mouse_all_h5ad[mouse_all_h5ad.obs['region_name'].isin(mouse_region_list)] 29 | print(mouse_h5ad_part) 30 | mouse_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata1_part) 31 | 32 | human_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata2) 33 | human_region_list = 
set(list(pd.read_csv(cfg.PROCESS.path_human_labels)['region_name'])) 34 | print(human_region_list) 35 | human_h5ad_part = human_all_h5ad[human_all_h5ad.obs['region_name'].isin(human_region_list)] 36 | print(human_h5ad_part) 37 | human_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata2_part) 38 | -------------------------------------------------------------------------------- /run_came/analysis_script/read_rhesus_2018s.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/1/20 14:28 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : read_rhesus_2018s.R.r 5 | #@Description: This file is used to ... 6 | 7 | # packages 8 | 9 | 10 | # 11 | load('../brain_rhesus_2018s/Sestan.adultMonkeyNuclei.Psychencode.Rdata') 12 | ls() 13 | -------------------------------------------------------------------------------- /run_came/analysis_script/read_rhesus_2018s.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | The [R plugin](https://www.jetbrains.com/help/pycharm/r-plugin-support.html) for IntelliJ-based IDEs provides 7 | handy capabilities to work with the [R Markdown](https://www.jetbrains.com/help/pycharm/r-markdown.html) files. 8 | To [add](https://www.jetbrains.com/help/pycharm/r-markdown.html#add-code-chunk) a new R chunk, 9 | position the caret at any line or the code chunk, then click "+". 10 | 11 | The code chunk appears: 12 | ```{r} 13 | load('./brain_rhesus_2018s/Sestan.adultMonkeyNuclei.Psychencode.Rdata') 14 | 15 | ``` 16 | 17 | Type any R code in the chunk, for example: 18 | ```{r} 19 | mycars <- within(mtcars, { cyl <- ordered(cyl) }) 20 | mycars 21 | ``` 22 | 23 | Now, click the **Run** button on the chunk toolbar to [execute](https://www.jetbrains.com/help/pycharm/r-markdown.html#run-r-code) the chunk code. The result should be placed under the chunk. 24 | Click the **Knit and Open Document** to build and preview an output. 
25 | -------------------------------------------------------------------------------- /run_came/analysis_script/run_came.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = '../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/run_came_demo.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/demo/mouse_2020sa_64regions_demo.h5ad' 31 | 
cfg.CAME.path_rawdata2 = '../../BrainAlign/demo/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = '../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/test_tree.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/19 21:16 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : test_tree.py 6 | # @Description: This file is used to ... 7 | import pandas as pd 8 | from treelib import Node, Tree 9 | 10 | tree = Tree() 11 | tree.create_node('root', 'root') 12 | 13 | def add_node(tree_dict, key_input): 14 | if tree_dict[key_input]['children'] == []: 15 | return 16 | elif isinstance(tree_dict[key_input]['children'], list): 17 | for key in tree_dict[key_input]['children']: 18 | tree.create_node(key, key, parent=key_input) 19 | else: 20 | for key in tree_dict[key_input]['children'].keys(): 21 | tree.create_node(key, key, parent=key_input) 22 | add_node(tree_dict[key_input]['children'], key) 23 | 24 | if __name__ == '__main__': 25 | 26 | 27 | dict_ = {"2": {'parent': "1"}, "1": {'parent': None}, "3": {'parent': "2"}} 28 | tree_dict = {"0": {'name':'n0','children': {'0-1':{'name':'n0-1', 'children':['n0-1-0']}}}, 29 | "1": {'name':'n1','children': []}, 30 | "2": {'name':'n2', 'children': {'n2-0':{'name':'n2-0', 'children':{'n2-0-0':{'name':'n2-0-0', 'children':[]}}}}}} 31 | added = set() 32 | #tree = Tree() 33 | for key in tree_dict.keys(): 34 | tree.create_node(key, key, parent='root') 35 | add_node(tree_dict, key) 36 | 37 | tree.show() 38 | print(tree.depth()) 39 | print(tree.subtree('0-1').depth()) 40 | #new_tree = tree.expand_tree(filter=lambda x:(tree.depth()-tree.subtree(x).depth())!=2) 41 | print([tree[node].tag for node in tree.subtree('0-1').expand_tree(mode=Tree.DEPTH)]) 42 | #new_tree.show() 43 | ''' 44 | region_69_df = pd.read_csv('../brain_mouse_2020sa/mouse_69_label_acronym.csv', sep=',') 45 | region_69_list = region_69_df['region_name'] 46 | 47 | region_new_df = pd.read_csv('../brain_mouse_2020sa/mouse_region_list.csv') 48 | region_list = region_new_df['region_name'] 49 | 50 | for region in region_69_list: 51 | if not region in set(region_list): 52 | print(region) 53 | ''' 54 | 55 | -------------------------------------------------------------------------------- /run_came/analysis_utils/.ipynb_checkpoints/homo_random_config-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 11:20 3 | # @Author : Biao Zhang 4 | # @Email : 
littlebiao@outlook.com 5 | # @File : homo_random_config.py 6 | 7 | 8 | from yacs.config import CfgNode as CN 9 | 10 | # -------------------------------------------------------------- 11 | # Config of model 12 | # -------------------------------------------------------------- 13 | _C = CN() 14 | 15 | _C.CAME = CN() 16 | _C.CAME.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad'#'../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 17 | _C.CAME.path_rawdata2 = '../brain_human_mouse/human_brain_region_88_sparse.h5ad' 18 | 19 | _C.CAME.path_labels_1 = '../brain_human_mouse/mouse_67_label.csv' 20 | _C.CAME.path_labels_2 = '../brain_human_mouse/human_88_label_origin.csv' 21 | 22 | _C.CAME.embedding_path = None 23 | 24 | _C.CAME.path_varmap = '../came/sample_data/gene_matches_mouse2human.csv' 25 | _C.CAME.path_varmap_1v1 = '../came/sample_data/gene_matches_1v1_mouse2human.csv' 26 | 27 | _C.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67.csv' 28 | 29 | _C.CAME.species_name_list = ['Mouse', 'Human'] 30 | _C.CAME.annotation_name = ['region_name', 'region_name'] 31 | 32 | _C.CAME.learning_label = ['region_name', 'region_name'] 33 | 34 | _C.CAME.n_top_genes = 5000 35 | _C.CAME.do_normalize = [True, True] 36 | 37 | _C.CAME.sparse = False 38 | _C.CAME.quantile_gene= 0.5 39 | _C.CAME.quantile_sample = 0.99 40 | 41 | _C.CAME.embedding_size = 128 42 | 43 | _C.CAME.preclustering_resolution = 3 44 | 45 | 46 | _C.TRAINING = CN() 47 | _C.TRAINING.n_epochs = 300 48 | 49 | # The training batch size 50 | # When the GPU memory is limited, set 4096 or more if possible. 51 | _C.TRAINING.batch_size = 2048 52 | # The number of epochs to skip for checkpoint backup 53 | _C.TRAINING.n_pass = 50 54 | # The number of top DEGs to take as the node-features of each cells. 55 | # You set it 70-100 for distant species pairs. 56 | _C.TRAINING.ntop_deg = 70 57 | 58 | # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. 
59 | _C.TRAINING.ntop_deg_nodes = 50 60 | 61 | 62 | _C.ANALYSIS = CN() 63 | _C.ANALYSIS.cut_ov = 0 64 | _C.ANALYSIS.umap_neighbor = 20 #30 65 | _C.ANALYSIS.mouse_umap_neighbor = 20 #40 66 | _C.ANALYSIS.human_umap_neighbor = 20 67 | 68 | 69 | _C.CAME.ROOT = '../analysis_results/' 70 | 71 | _C.CAME.visible_device = '0' 72 | 73 | 74 | _C.PROCESS = CN() 75 | _C.PROCESS.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 76 | _C.PROCESS.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 77 | 78 | _C.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 79 | _C.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 80 | 81 | _C.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 82 | _C.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 83 | 84 | 85 | # -------------------------------------------------------------- 86 | # Config of INPUT 87 | # -------------------------------------------------------------- 88 | _C.HOMO_RANDOM = CN() 89 | 90 | 91 | -------------------------------------------------------------------------------- /run_came/analysis_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 12:12 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | 7 | 8 | -------------------------------------------------------------------------------- /run_came/analysis_utils/homo_random_config.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 11:20 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : homo_random_config.py 6 | 7 | 8 | from yacs.config import CfgNode as CN 9 | 10 | # -------------------------------------------------------------- 11 | # Config of model 12 | # -------------------------------------------------------------- 13 | _C = CN() 14 | 15 | _C.CAME = CN() 16 | _C.CAME.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad'#'../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 17 | _C.CAME.path_rawdata2 = '../brain_human_mouse/human_brain_region_88_sparse.h5ad' 18 | 19 | _C.CAME.path_labels_1 = '../brain_human_mouse/mouse_67_label.csv' 20 | _C.CAME.path_labels_2 = '../brain_human_mouse/human_88_label_origin.csv' 21 | 22 | _C.CAME.embedding_path = None 23 | 24 | _C.CAME.path_varmap = '../came/sample_data/gene_matches_mouse2human.csv' 25 | _C.CAME.path_varmap_1v1 = '../came/sample_data/gene_matches_1v1_mouse2human.csv' 26 | 27 | _C.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67.csv' 28 | 29 | _C.CAME.species_name_list = ['Mouse', 'Human'] 30 | _C.CAME.annotation_name = ['region_name', 'region_name'] 31 | 32 | _C.CAME.learning_label = ['region_name', 'region_name'] 33 | 34 | _C.CAME.n_top_genes = 5000 35 | _C.CAME.do_normalize = [True, True] 36 | 37 | _C.CAME.sparse = False 38 | _C.CAME.quantile_gene= 0.5 39 | _C.CAME.quantile_sample = 0.99 40 | 41 | _C.CAME.embedding_size = 128 42 | 43 | _C.CAME.preclustering_resolution = 3 44 | 45 | 46 | _C.TRAINING = CN() 47 | _C.TRAINING.n_epochs = 300 48 | 49 | # The training batch size 50 | # When the GPU memory is limited, set 4096 or more if possible. 
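# (presumably: switch to minibatch training when whole-graph training exceeds GPU
#  memory, preferring a batch size of 4096 or larger if it still fits)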
51 | _C.TRAINING.batch_size = 2048 52 | # The number of epochs to skip for checkpoint backup 53 | _C.TRAINING.n_pass = 50 54 | # The number of top DEGs to take as the node-features of each cells. 55 | # You set it 70-100 for distant species pairs. 56 | _C.TRAINING.ntop_deg = 70 57 | 58 | # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. 59 | _C.TRAINING.ntop_deg_nodes = 50 60 | 61 | 62 | _C.ANALYSIS = CN() 63 | _C.ANALYSIS.cut_ov = 0 64 | _C.ANALYSIS.umap_neighbor = 20 #30 65 | _C.ANALYSIS.mouse_umap_neighbor = 20 #40 66 | _C.ANALYSIS.human_umap_neighbor = 20 67 | 68 | 69 | _C.CAME.ROOT = '../analysis_results/' 70 | 71 | _C.CAME.visible_device = '0' 72 | 73 | 74 | _C.PROCESS = CN() 75 | _C.PROCESS.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 76 | _C.PROCESS.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 77 | 78 | _C.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 79 | _C.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 80 | 81 | _C.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 82 | _C.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 83 | 84 | 85 | # -------------------------------------------------------------- 86 | # Config of INPUT 87 | # -------------------------------------------------------------- 88 | _C.HOMO_RANDOM = CN() 89 | 90 | 91 | -------------------------------------------------------------------------------- /run_came/analysis_utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | else: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_test_log.txt"), mode='w') 23 | fh.setLevel(logging.DEBUG) 24 | fh.setFormatter(formatter) 25 | logger.addHandler(fh) 26 | 27 | return logger -------------------------------------------------------------------------------- /run_came/brain_human_mouse/get_human_acronym_color.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/3/6 19:55 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : get_human_acronym_color.py 6 | # @Description: This file is used to ... 
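# attach, for every region_name in human_88_label_origin.csv, the acronym and the
# '#'-prefixed colour hex code looked up in human_query.csv, and save the result as
# human_88_labels.csv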
7 | 8 | import pandas as pd 9 | 10 | if __name__ == '__main__': 11 | human_structure_df = pd.read_csv('./human_query.csv') 12 | name_list = human_structure_df['name'] 13 | acronym_list = human_structure_df['acronym'] 14 | acronym_dict = {k:v for k,v in zip(name_list,acronym_list)} 15 | color_list = human_structure_df['color_hex_triplet'] 16 | color_dict = {k:v for k,v in zip(name_list,color_list)} 17 | 18 | 19 | human_88_label_df = pd.read_csv('human_88_label_origin.csv', index_col=0) 20 | 21 | region_name_list = human_88_label_df['region_name'] 22 | 23 | human_88_label_df['acronym'] = [acronym_dict[r] for r in region_name_list] 24 | human_88_label_df['color_hex_triplet'] = ['#'+color_dict[r] for r in region_name_list] 25 | 26 | human_88_label_df.to_csv('./human_88_labels.csv') 27 | 28 | -------------------------------------------------------------------------------- /run_came/brain_mouse_2020sa/human_gene_palette/2011-12-16203C-Supplementary_Table8.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/run_came/brain_mouse_2020sa/human_gene_palette/2011-12-16203C-Supplementary_Table8.xls -------------------------------------------------------------------------------- /run_came/came/.ipynb_checkpoints/PARAMETERS-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | 
# return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 
12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | 
AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /run_came/came/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /run_came/came/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 
@CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | 
batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /run_came/came/utils/.ipynb_checkpoints/_get_example_data-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . 
import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /run_came/came/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /run_came/came/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = metrics.f1_score(y_true, 
y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /run_came/came_origin/.ipynb_checkpoints/PARAMETERS-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came_origin/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 
| load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came_origin/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- 
/run_came/came_origin/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came_origin/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /run_came/came_origin/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came_origin/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> 
input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /run_came/came_origin/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came_origin/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 
46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . 
import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = 
metrics.f1_score(y_true, y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /run_came/heco_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold_array(X): 5 | ''' 6 | input: 2-D array; rows are samples, columns are gene vectors 7 | output: a binary matrix 8 | entry (i, j) is True if M_ij > mean(column_j) 9 | ''' 10 | return X > np.mean(X, axis=0) 11 | 12 | def threshold_quantile(X, quantile_gene=0.9, quantile_sample=0.95): 13 | ''' 14 | input: 2-D array; rows are samples, columns are gene vectors 15 | output: a filtered matrix of the same shape as X (not binary) 16 | M_ij is kept if it exceeds the quantile_gene quantile of column j or the quantile_sample quantile of row i; all other entries are set to zero 17 | ''' 18 | keep_mat_gene = X > np.quantile(X, quantile_gene, axis=0) 19 | keep_mat_sample = (X.T > np.quantile(X.T, quantile_sample, axis=0)).T 20 | keep_mat = keep_mat_sample + keep_mat_gene  # boolean addition acts as element-wise OR 21 | return X * keep_mat 22 | 23 | 24 | def threshold_top(X, percent=1): 25 | ''' 26 | input: 2-D array; rows are samples, columns are gene vectors 27 | output: a binary matrix 28 | entry (i, j) is True if M_ij is among the top `percent` values of column j; `percent` is a row count, not a fraction 29 | ''' 30 | #topk = int(round(X.shape[0] * percent)) 31 | topk = percent  # interpreted as the number of top rows kept per column 32 | #print(topk) 33 | #topk_pos = X.shape[0] - topk 34 | X_sort = np.sort(X, axis=0) 35 | return X >= X_sort[-topk, :] 36 | 37 | 38 | def threshold_array_nonzero(X): 39 | ''' 40 | input: 2-D array; rows are samples, columns are gene vectors 41 | output: a binary matrix 42 | entry (i, j) is True if M_ij > 0 43 | ''' 44 | return X > 0 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | X = np.array([[1,2,3],[2,3,4], [2,3,4], [4,5,2], [7,26,10]]) 50 | print(X) 51 | print(threshold_top(X, percent=2))  # percent must be an integer count; a fraction such as 0.4 would raise an indexing error 52 | #print(threshold_array(X)) --------------------------------------------------------------------------------
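The helpers in heco_utils.py implement column-wise (per-gene) thresholding of a sample-by-gene expression matrix: threshold_array, threshold_top and threshold_array_nonzero return boolean masks, while threshold_quantile keeps the original values and zeroes out the rest. Below is a minimal usage sketch, not part of the repository; the import path and the toy matrix are assumptions, so adjust the import to wherever heco_utils.py sits in your checkout.

import numpy as np
from heco_utils import threshold_array, threshold_top, threshold_quantile  # import path is an assumption

# Toy expression matrix: 4 samples (rows) x 3 genes (columns)
X = np.array([[1., 2., 3.],
              [2., 3., 4.],
              [4., 5., 2.],
              [7., 26., 10.]])

print(threshold_array(X))               # True where a value exceeds its column (gene) mean
print(threshold_top(X, percent=2))      # True for the top-2 values within each column
print(threshold_quantile(X, 0.5, 0.5))  # values above the per-gene/per-sample medians kept, others zeroed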