├── .gitattributes ├── .gitignore ├── BrainAlign ├── .gitattributes ├── README.md ├── SR_RSC │ ├── .ipynb_checkpoints │ │ ├── embedder-checkpoint.py │ │ ├── evaluation-checkpoint.py │ │ └── main_sr_rsc-checkpoint.py │ ├── LICENSE │ ├── README.md │ ├── embedder.py │ ├── evaluation.py │ ├── layers │ │ ├── GCN.py │ │ ├── GCN2.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── discriminator.py │ │ └── fc.py │ ├── main.py │ ├── main_sr_rsc.py │ ├── models │ │ ├── .ipynb_checkpoints │ │ │ └── SubHIN-checkpoint.py │ │ ├── SubHIN.py │ │ └── __init__.py │ ├── test_input_data.py │ └── utils │ │ └── process.py ├── __init__.py ├── brain_analysis │ ├── .ipynb_checkpoints │ │ ├── analysis-checkpoint.py │ │ ├── analysis_anatomical-checkpoint.py │ │ ├── analysis_general-checkpoint.py │ │ ├── analysis_genomic-checkpoint.py │ │ ├── analysis_main-checkpoint.py │ │ ├── analysis_spatial-checkpoint.py │ │ ├── analysis_utils-checkpoint.py │ │ ├── data_utils-checkpoint.py │ │ ├── pipeline-checkpoint.py │ │ ├── process-checkpoint.py │ │ └── utils-checkpoint.py │ ├── __init__.py │ ├── analysis.py │ ├── analysis_anatomical.py │ ├── analysis_general.py │ ├── analysis_genomic.py │ ├── analysis_main.py │ ├── analysis_spatial.py │ ├── analysis_utils.py │ ├── configs │ │ ├── .ipynb_checkpoints │ │ │ └── sr_rsc_config_binary-checkpoint.py │ │ ├── __init__.py │ │ ├── came_config_binary.py │ │ ├── heco_config.py │ │ ├── heco_config_all.py │ │ ├── heco_config_binary.py │ │ ├── heco_config_binary_2020sa.py │ │ ├── heco_config_three_2020sa.py │ │ ├── sr_rsc_config.py │ │ └── sr_rsc_config_binary.py │ ├── data_utils.py │ ├── logger.py │ ├── metrics.py │ ├── pipeline.py │ ├── pipline_analysis_alignment.py │ ├── process.py │ ├── r_analysis │ │ ├── .Rhistory │ │ ├── genomic_findmarkers.R │ │ ├── genomic_functions.R │ │ ├── includes.R │ │ ├── test.R │ │ ├── transform2seurat.R │ │ ├── transform2seurat_cluster.R │ │ ├── transform_adata.py │ │ └── transform_adata_cluster.py │ ├── r_gene_comparison │ │ ├── barplot_degs.R │ │ └── fanPlotScript-compact.R │ ├── typehint.py │ └── utils.py ├── came │ ├── PARAMETERS.py │ ├── __init__.py │ ├── datapair │ │ ├── __init__.py │ │ ├── aligned.py │ │ └── unaligned.py │ ├── model │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ ├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ ├── loss.py │ │ └── v0 │ │ │ ├── __init__.py │ │ │ ├── _minibatch.py │ │ │ ├── _predict.py │ │ │ ├── _utils.py │ │ │ ├── base_layers.py │ │ │ ├── cgc.py │ │ │ ├── cggc.py │ │ │ ├── heteroframe.py │ │ │ ├── hidden.py │ │ │ └── loss.py │ ├── pipeline.py │ └── utils │ │ ├── __init__.py │ │ ├── _alluvial.py │ │ ├── _base_trainer.py │ │ ├── _get_example_data.py │ │ ├── _io_h5py.py │ │ ├── analyze.py │ │ ├── base.py │ │ ├── downsample_counts.py │ │ ├── evaluation.py │ │ ├── plot.py │ │ ├── preprocess.py │ │ ├── train.py │ │ └── train_v0.py ├── code │ ├── __init__.py │ ├── embeds │ │ ├── acm │ │ │ └── README.md │ │ ├── aminer │ │ │ └── README.md │ │ ├── dblp │ │ │ └── README.md │ │ └── freebase │ │ │ └── README.md │ ├── main.py │ ├── main_heco.py │ ├── main_parallel.py │ ├── module │ │ ├── __init__.py │ │ ├── contrast.py │ │ ├── heco.py │ │ ├── mp_encoder.py │ │ └── sc_encoder.py │ ├── predict.py │ └── utils │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── load_data.py │ │ ├── logger.py │ │ ├── logreg.py │ │ └── params.py ├── data │ ├── Demo_mouse_human_wholebrain │ │ ├── Data │ │ │ └── load_one2one.ipynb │ │ ├── Demo_run-region.ipynb │ │ └── 
Demo_run.ipynb │ ├── SlideseqV2_mouse_macaque_hippocampus │ │ ├── Data │ │ │ └── Untitled.ipynb │ │ └── Hmerfish_run.ipynb │ ├── __init__.py │ ├── data_utils.py │ ├── load_node_feature_mouse_human.py │ ├── mouse_human_wholebrain │ │ ├── Data │ │ │ └── load_one2one.ipynb │ │ └── W_run.ipynb │ ├── mouse_macaque_hippocampus │ │ ├── Data │ │ │ └── Untitled.ipynb │ │ ├── H_run.ipynb │ │ ├── H_run.py │ │ └── H_run_region.ipynb │ ├── mp_gen.py │ ├── mp_gen_mouse_human.py │ ├── neibor.py │ ├── pos.py │ └── script_labels.py └── demo │ └── subsample.py ├── LICENSE ├── README.md ├── readme_figs ├── alldatasets │ ├── all_dataset_seurate_alignment_score.png │ └── all_dataset_umap_integration.png └── subsampled │ ├── subsampled_seurate_alignment_score.png │ └── subsampled_umap.png ├── requirements_pip.txt └── run_came ├── __init__.py ├── analysis_script ├── .ipynb_checkpoints │ ├── H_run_came-checkpoint.ipynb │ ├── H_run_came-checkpoint.py │ └── run_came-checkpoint.py ├── H_run_came.ipynb ├── H_run_came.py ├── load_human_region_tree_v2.py ├── load_mouse_2020sa.py ├── load_mouse_region_tree.R ├── load_mouse_region_tree.py ├── load_mouse_region_tree_v2.py ├── load_part_expression.py ├── load_part_expression_6regions.py ├── read_rhesus_2018s.R ├── read_rhesus_2018s.rmd ├── run_came.py ├── run_came_demo.py └── test_tree.py ├── analysis_utils ├── .ipynb_checkpoints │ ├── homo_random_config-checkpoint.py │ ├── ttest_plot_utils-checkpoint.py │ └── ttest_plot_utils_origin-checkpoint.py ├── __init__.py ├── homo_random_config.py ├── logger.py ├── tree_tools.R ├── ttest_plot_utils.py └── ttest_plot_utils_origin.py ├── brain_human_mouse └── get_human_acronym_color.py ├── brain_mouse_2020sa └── human_gene_palette │ └── 2011-12-16203C-Supplementary_Table8.xls ├── brain_voxel_sample_mouse_human_pipline.py ├── came ├── .ipynb_checkpoints │ ├── PARAMETERS-checkpoint.py │ ├── __init__-checkpoint.py │ └── pipeline-checkpoint.py ├── PARAMETERS.py ├── __init__.py ├── datapair │ ├── .ipynb_checkpoints │ │ └── unaligned-checkpoint.py │ ├── __init__.py │ ├── aligned.py │ └── unaligned.py ├── model │ ├── .ipynb_checkpoints │ │ └── _utils-checkpoint.py │ ├── __init__.py │ ├── _minibatch.py │ ├── _predict.py │ ├── _utils.py │ ├── base_layers.py │ ├── cgc.py │ ├── cggc.py │ ├── heteroframe.py │ ├── hidden.py │ ├── loss.py │ └── v0 │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ ├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ └── loss.py ├── pipeline.py └── utils │ ├── .ipynb_checkpoints │ ├── _get_example_data-checkpoint.py │ ├── preprocess-checkpoint.py │ └── train-checkpoint.py │ ├── __init__.py │ ├── _alluvial.py │ ├── _base_trainer.py │ ├── _get_example_data.py │ ├── _io_h5py.py │ ├── analyze.py │ ├── base.py │ ├── downsample_counts.py │ ├── evaluation.py │ ├── plot.py │ ├── preprocess.py │ ├── train.py │ └── train_v0.py ├── came_origin ├── .ipynb_checkpoints │ ├── PARAMETERS-checkpoint.py │ ├── __init__-checkpoint.py │ └── pipeline-checkpoint.py ├── PARAMETERS.py ├── __init__.py ├── datapair │ ├── .ipynb_checkpoints │ │ └── unaligned-checkpoint.py │ ├── __init__.py │ ├── aligned.py │ └── unaligned.py ├── model │ ├── .ipynb_checkpoints │ │ └── _utils-checkpoint.py │ ├── __init__.py │ ├── _minibatch.py │ ├── _predict.py │ ├── _utils.py │ ├── base_layers.py │ ├── cgc.py │ ├── cggc.py │ ├── heteroframe.py │ ├── hidden.py │ ├── loss.py │ └── v0 │ │ ├── __init__.py │ │ ├── _minibatch.py │ │ ├── _predict.py │ │ ├── _utils.py │ │ ├── base_layers.py │ │ 
├── cgc.py │ │ ├── cggc.py │ │ ├── heteroframe.py │ │ ├── hidden.py │ │ └── loss.py ├── pipeline.py └── utils │ ├── .ipynb_checkpoints │ ├── preprocess-checkpoint.py │ └── train-checkpoint.py │ ├── __init__.py │ ├── _alluvial.py │ ├── _base_trainer.py │ ├── _get_example_data.py │ ├── _io_h5py.py │ ├── analyze.py │ ├── base.py │ ├── downsample_counts.py │ ├── evaluation.py │ ├── plot.py │ ├── preprocess.py │ ├── train.py │ └── train_v0.py ├── heco_utils.py └── load_brain_voxel_sample_mouse_human.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea/ 3 | *.txt 4 | *.bin 5 | *.json 6 | *.log 7 | *.pkl 8 | *.svg 9 | #*.png 10 | *.pt 11 | *.h5 12 | 13 | *.obj 14 | *.gz 15 | *.h5ad 16 | *.pickle 17 | *.tsv 18 | *.csv 19 | *.zip 20 | *.tiff 21 | *.Rdata 22 | 23 | *.mtx 24 | *.rds 25 | 26 | *.svg 27 | 28 | *.npz 29 | *.npy 30 | 31 | ./BrainAlign/**/.npz 32 | __pycache__/ 33 | 34 | .Rproj.user 35 | -------------------------------------------------------------------------------- /BrainAlign/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /BrainAlign/README.md: -------------------------------------------------------------------------------- 1 | # BrainAlign 2 | This repo is for source code of "Whole Brain Alignment of Spatial Transcriptomics between Humans and Mice with BrainAlign". 
3 | 4 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/.ipynb_checkpoints/evaluation-checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from scipy.cluster.hierarchy import dendrogram, linkage, fcluster 4 | from scipy.spatial.distance import pdist 5 | from sklearn.cluster import KMeans 6 | from sklearn import metrics 7 | from sklearn.linear_model import LogisticRegression 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.svm import SVC 12 | from sklearn.preprocessing import StandardScaler 13 | from scipy.spatial.distance import cdist 14 | import logging 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def purity_score(y_true, y_pred): 19 | contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) 20 | return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 21 | 22 | def sigmoid(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | class evaluation_metrics(): 26 | def __init__(self, embs, labels, logger): 27 | 28 | self.embs = embs 29 | train, val, test = labels 30 | 31 | self.logger = logger 32 | 33 | self.trX, self.trY = self.embs[np.array(train)[:,0]], np.array(train)[:,1] 34 | self.valX, self.valY = self.embs[np.array(val)[:,0]], np.array(val)[:,1] 35 | self.tsX, self.tsY = self.embs[np.array(test)[:,0]], np.array(test)[:,1] 36 | self.n_label = len(set(self.tsY)) 37 | 38 | self.val_acc = self.evaluate_cluster() 39 | 40 | 41 | def evaluation_lp(self, node1, node2, label): 42 | 43 | X1, X2 = [], [] 44 | cnt = 0 45 | error = 0 46 | prob = [] 47 | preds = [] 48 | 49 | meanvec = np.mean(self.embs, 0) 50 | for i in range(len(node1)): 51 | n1 = int(node1[i]) 52 | n2 = int(node2[i]) 53 | X1 = self.embs[n1] 54 | X2 = self.embs[n2] 55 | 56 | if X1.sum() == 0: 57 | cnt+= 1 58 | X1 = meanvec 59 | if X2.sum() == 0: 60 | cnt+= 1 61 | X2 = meanvec 62 | r = X1.dot(X2) 63 | prob.append(r) 64 | if r >= 0.5: 65 | r = 1 66 | else: 67 | r = 0 68 | preds.append(r) 69 | if r != label[i]: 70 | error += 1 71 | 72 | auc = metrics.roc_auc_score(label, prob) 73 | precision, recall, thresholds = metrics.precision_recall_curve(label, prob) 74 | pr = metrics.auc(recall, precision) 75 | ap = metrics.average_precision_score(label, prob, average=None) 76 | acc = metrics.accuracy_score(label, preds) 77 | f1_micro = metrics.f1_score(label, preds, average='micro') 78 | f1_macro = metrics.f1_score(label, preds, average='macro') 79 | self.logger.info('AUC: %.5f, AP: %.5f, PR: %.5f, ACC: %.5f, F1_micro: %.5f, F1_macro: %.5f'%(auc, ap, pr, acc, f1_micro, f1_macro)) 80 | 81 | def evalutation(self): 82 | 83 | nmis, adjscores, puritys, fis, fas = 0,0,0,0,0 84 | # for rs in [0,123,432,6543,8478643]: 85 | for rs in [0]: 86 | kmeans = KMeans(n_clusters=self.n_label, random_state=rs).fit(self.tsX) 87 | preds = kmeans.predict(self.tsX) 88 | nmi = metrics.normalized_mutual_info_score(labels_true=self.tsY, labels_pred=np.array(preds)) 89 | adjscore = metrics.adjusted_rand_score(self.tsY, np.array(preds)) 90 | purity = purity_score(self.tsY, np.array(preds)) 91 | nmis += nmi 92 | adjscores += adjscore 93 | puritys+=purity 94 | 95 | lr = LogisticRegression(max_iter=500, random_state=rs, solver='sag') 96 | lr.fit(self.trX, self.trY) 97 | Y_pred = lr.predict(self.tsX) 98 | f1_micro = 
metrics.f1_score(self.tsY, Y_pred, average='micro') 99 | f1_macro = metrics.f1_score(self.tsY, Y_pred, average='macro') 100 | fis+=f1_micro 101 | fas+=f1_macro 102 | self.logger.info('NMI=%.5f, ARI: %.5f, f1_micro=%.5f, f1_macro=%.5f' % (nmis, adjscores, fis, fas)) 103 | 104 | 105 | def evaluate_cluster(self): 106 | 107 | kmeans = KMeans(n_clusters=self.n_label, random_state=0).fit(self.valX) 108 | preds = kmeans.predict(self.trX) 109 | nmi = metrics.normalized_mutual_info_score( labels_true=self.trY, labels_pred=np.array(preds)) 110 | return nmi 111 | 112 | def evaluate_clf(self): 113 | r"""Evaluates latent space quality via a logistic regression downstream task.""" 114 | clf = LogisticRegression(max_iter=500, random_state=0, solver='lbfgs').fit(self.trX, self.trY) 115 | val_acc = clf.score(self.valX, self.valY) 116 | return val_acc 117 | 118 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 RuixZh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/README.md: -------------------------------------------------------------------------------- 1 | # SR-RSC 2 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from scipy.cluster.hierarchy import dendrogram, linkage, fcluster 4 | from scipy.spatial.distance import pdist 5 | from sklearn.cluster import KMeans 6 | from sklearn import metrics 7 | from sklearn.linear_model import LogisticRegression 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.svm import SVC 12 | from sklearn.preprocessing import StandardScaler 13 | from scipy.spatial.distance import cdist 14 | import logging 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def purity_score(y_true, y_pred): 19 | contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) 20 | return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 21 | 22 | def sigmoid(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | class evaluation_metrics(): 26 | def __init__(self, embs, labels, logger): 27 | 28 | self.embs = embs 29 | train, val, test = labels 30 | 31 | self.logger = logger 32 | 33 | self.trX, self.trY = self.embs[np.array(train)[:,0]], np.array(train)[:,1] 34 | self.valX, self.valY = self.embs[np.array(val)[:,0]], np.array(val)[:,1] 35 | self.tsX, self.tsY = self.embs[np.array(test)[:,0]], np.array(test)[:,1] 36 | self.n_label = len(set(self.tsY)) 37 | 38 | self.val_acc = self.evaluate_cluster() 39 | 40 | 41 | def evaluation_lp(self, node1, node2, label): 42 | 43 | X1, X2 = [], [] 44 | cnt = 0 45 | error = 0 46 | prob = [] 47 | preds = [] 48 | 49 | meanvec = np.mean(self.embs, 0) 50 | for i in range(len(node1)): 51 | n1 = int(node1[i]) 52 | n2 = int(node2[i]) 53 | X1 = self.embs[n1] 54 | X2 = self.embs[n2] 55 | 56 | if X1.sum() == 0: 57 | cnt+= 1 58 | X1 = meanvec 59 | if X2.sum() == 0: 60 | cnt+= 1 61 | X2 = meanvec 62 | r = X1.dot(X2) 63 | prob.append(r) 64 | if r >= 0.5: 65 | r = 1 66 | else: 67 | r = 0 68 | preds.append(r) 69 | if r != label[i]: 70 | error += 1 71 | 72 | auc = metrics.roc_auc_score(label, prob) 73 | precision, recall, thresholds = metrics.precision_recall_curve(label, prob) 74 | pr = metrics.auc(recall, precision) 75 | ap = metrics.average_precision_score(label, prob, average=None) 76 | acc = metrics.accuracy_score(label, preds) 77 | f1_micro = metrics.f1_score(label, preds, average='micro') 78 | f1_macro = metrics.f1_score(label, preds, average='macro') 79 | self.logger.info('AUC: %.5f, AP: %.5f, PR: %.5f, ACC: %.5f, F1_micro: %.5f, F1_macro: %.5f'%(auc, ap, pr, acc, f1_micro, f1_macro)) 80 | 81 | def evalutation(self): 82 | 83 | nmis, adjscores, puritys, fis, fas = 0,0,0,0,0 84 | # for rs in [0,123,432,6543,8478643]: 85 | for rs in [0]: 86 | kmeans = KMeans(n_clusters=self.n_label, random_state=rs).fit(self.tsX) 87 | preds = kmeans.predict(self.tsX) 88 | nmi = metrics.normalized_mutual_info_score(labels_true=self.tsY, labels_pred=np.array(preds)) 89 | adjscore = metrics.adjusted_rand_score(self.tsY, np.array(preds)) 90 | purity = purity_score(self.tsY, np.array(preds)) 91 | nmis += nmi 92 | adjscores += adjscore 93 | 
puritys+=purity 94 | 95 | lr = LogisticRegression(max_iter=500, random_state=rs, solver='sag') 96 | lr.fit(self.trX, self.trY) 97 | Y_pred = lr.predict(self.tsX) 98 | f1_micro = metrics.f1_score(self.tsY, Y_pred, average='micro') 99 | f1_macro = metrics.f1_score(self.tsY, Y_pred, average='macro') 100 | fis+=f1_micro 101 | fas+=f1_macro 102 | self.logger.info('NMI=%.5f, ARI: %.5f, f1_micro=%.5f, f1_macro=%.5f' % (nmis, adjscores, fis, fas)) 103 | 104 | 105 | def evaluate_cluster(self): 106 | 107 | kmeans = KMeans(n_clusters=self.n_label, random_state=0).fit(self.valX) 108 | preds = kmeans.predict(self.trX) 109 | nmi = metrics.normalized_mutual_info_score( labels_true=self.trY, labels_pred=np.array(preds)) 110 | return nmi 111 | 112 | def evaluate_clf(self): 113 | r"""Evaluates latent space quality via a logistic regression downstream task.""" 114 | clf = LogisticRegression(max_iter=500, random_state=0, solver='lbfgs').fit(self.trX, self.trY) 115 | val_acc = clf.score(self.valX, self.valY) 116 | return val_acc 117 | 118 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/GCN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class GCN(nn.Module): 7 | def __init__(self, in_ft, out_ft, act=nn.PReLU(), drop_prob=0.0, isBias=False): 8 | super().__init__() 9 | self.linear = nn.Linear(in_ft, out_ft, bias=False) 10 | 11 | # if isBias: 12 | # self.bias = nn.Parameter(torch.empty(out_ft)) 13 | # self.bias.data.fill_(0.0) 14 | # else: 15 | # self.register_parameter('bias', None) 16 | 17 | self.act = act 18 | self.isBias = isBias 19 | self.drop_prob = drop_prob 20 | 21 | for m in self.modules(): 22 | self.weights_init(m) 23 | 24 | def weights_init(self, m): 25 | if isinstance(m, nn.Linear): 26 | torch.nn.init.xavier_uniform_(m.weight.data) 27 | if m.bias is not None: 28 | m.bias.data.fill_(0.0) 29 | 30 | def forward(self, emb): 31 | # emb (batch_size, ft) 32 | # emb = F.dropout(emb, self.drop_prob, training=self.training) 33 | e = self.linear(emb) # (batch_size, d) 34 | # if self.isBias: 35 | # e += self.bias 36 | e_out = self.act(e) 37 | return e_out 38 | 39 | 40 | class GNN(nn.Module): 41 | def __init__(self, nb_rel, in_ft, out_ft, act=nn.PReLU(), drop_prob=0.5, isBias=False): 42 | super().__init__() 43 | self.encoder = nn.ModuleList() 44 | for i in range(nb_rel): 45 | self.encoder.append(GCN(in_ft, out_ft, act=act, isBias=isBias)) 46 | 47 | def forward(self, embs): 48 | outs = [] 49 | for emb in embs: # emb (batch_size, ft) 50 | outs.append(self.encoder(emb)) 51 | outs = torch.stack(outs, 0) # outs (nb_rel, batch_size, ft) 52 | return outs 53 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/GCN2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class GCN(nn.Module): 8 | def __init__(self, nfeat, nhid, dropout=0.5, isBias=False): 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.empty(nfeat, nhid)) 11 | nn.init.xavier_uniform_(self.weight) 12 | if isBias: 13 | self.bias = nn.Parameter(torch.empty(nhid)) 14 | self.bias.data.fill_(0.0) 15 | else: 16 | self.register_parameter('bias', None) 17 | self.dropout = dropout 18 | self.act = nn.ReLU() 19 | 20 | 21 | def forward(self, adj, x): 22 | 
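        # Descriptive note on the propagation below: project the node features with the
        # learned weight matrix, aggregate over the (sparse) adjacency via spmm, add the
        # optional bias, and apply the ReLU set in __init__.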
support = torch.mm(x, self.weight) 23 | output = torch.spmm(adj, support) 24 | if self.bias is not None: 25 | output = output + self.bias 26 | return self.act(output) 27 | 28 | 29 | class DGCN(nn.Module): 30 | def __init__(self, v_in_ft, u_in_ft, out_ft, act=nn.PReLU(), drop_prob=0.5, isBias=False): 31 | super().__init__() 32 | 33 | self.v_gc1 = GCN(nfeat=v_in_ft, 34 | nhid=out_ft, 35 | dropout=drop_prob) 36 | self.v_gc2 = GCN(nfeat=out_ft, 37 | nhid=out_ft, 38 | dropout=drop_prob) 39 | 40 | self.u_gc1 = GCN(nfeat=u_in_ft, 41 | nhid=out_ft, 42 | dropout=drop_prob) 43 | self.u_gc2 = GCN(nfeat=out_ft, 44 | nhid=out_ft, 45 | dropout=drop_prob) 46 | self.u_fc = nn.Linear(out_ft + u_in_ft, out_ft) 47 | nn.init.xavier_uniform_(self.u_fc.weight.data) 48 | self.v_fc = nn.Linear(out_ft + v_in_ft, out_ft) 49 | nn.init.xavier_uniform_(self.v_fc.weight.data) 50 | self.u_fc2 = nn.Linear(out_ft , out_ft) 51 | nn.init.xavier_uniform_(self.u_fc.weight.data) 52 | self.v_fc2 = nn.Linear(out_ft , out_ft) 53 | nn.init.xavier_uniform_(self.v_fc.weight.data) 54 | 55 | self.act = act 56 | self.drop_prob = drop_prob 57 | self.isBias = isBias 58 | 59 | def forward(self, uv_adj, vu_adj, ufea, vfea): 60 | # emb (batch_size, ft) 61 | # u = F.dropout(ufea, self.drop_prob, training=self.training) 62 | # v = F.dropout(vfea, self.drop_prob, training=self.training) 63 | 64 | vu = self.u_gc1(vu_adj, ufea) 65 | uv = self.v_gc1(uv_adj, vfea) 66 | 67 | uv2 = self.v_gc2(uv_adj, vu) 68 | vu2 = self.u_gc2(vu_adj, uv) 69 | 70 | Hv = torch.cat((vu2, vfea), dim=1) 71 | Hu = torch.cat((uv2, ufea), dim=1) 72 | 73 | Hv = nn.ReLU()(self.v_fc(Hv)) # (batch_size, d) 74 | Hu = nn.ReLU()(self.u_fc(Hu)) # (batch_size, d) 75 | Hv = self.v_fc2(Hv) 76 | Hu = self.u_fc2(Hu) 77 | 78 | return self.act(Hu), self.act(Hv) 79 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .GCN import GCN, GNN 2 | from .discriminator import Discriminator 3 | from .attention import Attention, NodeAttention, SemanticAttention, LocalAttention 4 | from .fc import FullyConnect, FullyConnect2 5 | from .GCN2 import DGCN 6 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class Discriminator(nn.Module): 7 | def __init__(self, v_ft, u_ft): 8 | super().__init__() 9 | self.bilinear = nn.Bilinear(v_ft, u_ft, 1) 10 | self.act = nn.Sigmoid() 11 | 12 | for m in self.modules(): 13 | self.weights_init(m) 14 | 15 | def weights_init(self, m): 16 | if isinstance(m, nn.Bilinear): 17 | torch.nn.init.xavier_uniform_(m.weight.data) 18 | if m.bias is not None: 19 | m.bias.data.fill_(0.0) 20 | 21 | def forward(self, v_h, c): 22 | 23 | # c = self.act(c) 24 | # v_h = self.act(v_h) 25 | 26 | # c = c.expand_as(v_h) 27 | sc_1 = self.bilinear(v_h, c) 28 | 29 | return sc_1.squeeze() 30 | 31 | 32 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/layers/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class FullyConnect(nn.Module): 7 | def __init__(self, in_ft, out_ft, 
act=nn.Identity(), drop_prob=0.0, isBias=False): 8 | super().__init__() 9 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 10 | if self.fc.bias is not None: 11 | self.fc.bias.data.fill_(0.0) 12 | 13 | if isBias: 14 | self.bias = nn.Parameter(torch.empty(out_ft)) 15 | self.bias.data.fill_(0.0) 16 | else: 17 | self.register_parameter('bias', None) 18 | 19 | self.act = act 20 | self.drop_prob = drop_prob 21 | self.isBias = isBias 22 | 23 | for m in self.modules(): 24 | self.weights_init(m) 25 | 26 | def weights_init(self, m): 27 | if isinstance(m, nn.Linear): 28 | torch.nn.init.xavier_uniform_(m.weight.data) 29 | if m.bias is not None: 30 | m.bias.data.fill_(0.0) 31 | 32 | 33 | def forward(self, emb): 34 | # emb (batch_size, ft) 35 | emb = F.dropout(emb, self.drop_prob, training=self.training) 36 | e = self.fc(emb) # (batch_size, d) 37 | if self.isBias: 38 | e += self.bias 39 | return self.act(e) 40 | 41 | 42 | class FullyConnect2(nn.Module): 43 | def __init__(self, in_ft, hid_unit, out_ft, drop_prob=0.0, isBias=False): 44 | super().__init__() 45 | self.fc = FullyConnect(in_ft, hid_unit, act=nn.PReLU(), drop_prob=drop_prob,isBias=isBias) 46 | self.fc2 = FullyConnect(hid_unit, out_ft, act=nn.PReLU(), drop_prob=drop_prob,isBias=isBias) 47 | self.dense = FullyConnect(out_ft, 1, act=nn.Identity(), drop_prob=drop_prob,isBias=isBias) 48 | 49 | 50 | def forward(self, emb): 51 | # emb (batch_size, ft) 52 | e = self.fc(emb) # (batch_size, d) 53 | e2 = self.fc2(e) 54 | out = self.dense(e2) 55 | return out -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | seed = 268945 5 | torch.autograd.set_detect_anomaly(True) 6 | np.random.seed(seed) 7 | torch.manual_seed(seed) 8 | torch.cuda.manual_seed_all(seed) 9 | torch.backends.cudnn.deterministic = True 10 | torch.backends.cudnn.benchmark = False 11 | import argparse 12 | 13 | 14 | def parse_args(): 15 | # input arguments 16 | parser = argparse.ArgumentParser(description='BiHIN') 17 | parser.add_argument('--gpu_num', nargs='?', default='0') 18 | parser.add_argument('--model', nargs='?', default='SubHIN') 19 | parser.add_argument('--dataset', nargs='?', default='dblp') 20 | parser.add_argument('--save_path', nargs='?', default='./results') 21 | 22 | parser.add_argument('--nb_epochs', type=int, default=10000) 23 | parser.add_argument('--lr', type = float, default = 0.001) 24 | parser.add_argument('--patience', type=int, default=50) 25 | 26 | # parser.add_argument('--att_hid_units', type=int, default=64) 27 | parser.add_argument('--hid_units', type=int, default=256)# 128 best for dblp and yelp, larger datasets 28 | parser.add_argument('--hid_units2', type=int, default=128) 29 | parser.add_argument('--out_ft', type=int, default=64) 30 | 31 | parser.add_argument('--drop_prob', type=float, default=0.0) 32 | parser.add_argument('--lamb', type=float, default=0.5, 33 | help='coefficient for the losses in node task') 34 | parser.add_argument('--lamb_lp', type=float, default=1.0, 35 | help='coefficient for the losses in link task') 36 | parser.add_argument('--margin', type=float, default=0.8, 37 | help='coefficient for the margin loss') 38 | parser.add_argument('--isBias', action='store_true', default=False) 39 | parser.add_argument('--isAtt', action='store_true', default=False) 40 | parser.add_argument('--isLP', action='store_true', default=False)# link prediction 41 | 
parser.add_argument('--isSemi', action='store_true', default=False)# semi-supervised learning 42 | 43 | return parser.parse_known_args() 44 | 45 | def printConfig(args): 46 | args_names = [] 47 | args_vals = [] 48 | for arg in vars(args): 49 | args_names.append(arg) 50 | args_vals.append(getattr(args, arg)) 51 | print(args_names) 52 | print(args_vals) 53 | 54 | def main(): 55 | args, unknown = parse_args() 56 | # printConfig(args) 57 | if args.model == 'SubHIN': 58 | from models import SubHIN 59 | embedder = SubHIN(args) 60 | start = time.time() 61 | embedder.training() 62 | print('time (s):%.2f'%(time.time()-start)) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .SubHIN import SubHIN 2 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/test_input_data.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/14 13:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : test_input_data.py 6 | import pickle 7 | import numpy as np 8 | 9 | if __name__ == '__main__': 10 | ''' 11 | edges_path = './dataset/acm/edges.pkl' 12 | file = open(edges_path, 'rb') 13 | load_data = pickle.load(file) 14 | print('edges type: ', type(load_data)) 15 | #print('edges shape: ', np.shape(load_data)) 16 | print('edges: ', load_data) 17 | 18 | 19 | labels_path = './dataset/acm/labels.pkl' 20 | file = open(labels_path, 'rb') 21 | load_data = pickle.load(file) 22 | print('labels type: ', type(load_data)) 23 | print('labels: ', load_data) 24 | print('labels 0', load_data[0].shape) 25 | print('labels 1', load_data[1].shape) 26 | print('labels 2', load_data[2].shape) 27 | 28 | 29 | 30 | meta_data_path = './dataset/acm/meta_data.pkl' 31 | file = open(meta_data_path, 'rb') 32 | load_data = pickle.load(file) 33 | print('meta_data type: ', type(load_data)) 34 | print('meta_data keys: ', load_data.keys()) 35 | print('meta_data: ', load_data) 36 | ''' 37 | 38 | node_features_path = './dataset/acm/node_features.pkl' 39 | file = open(node_features_path, 'rb') 40 | load_data = pickle.load(file) 41 | print('node_features type: ', type(load_data)) 42 | print('node_features shape: ', np.shape(load_data)) 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /BrainAlign/SR_RSC/utils/process.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import normalize 2 | import scipy.sparse as sp 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | 7 | def indices_to_one_hot(data, nb_classes): 8 | """Convert an iterable of indices to one-hot encoded labels.""" 9 | targets = np.array(data).reshape(-1) 10 | labels = np.eye(nb_classes)[targets] 11 | return torch.LongTensor(labels) 12 | 13 | 14 | def normalize_adj(mx): 15 | """Row-normalize sparse matrix""" 16 | rowsum = np.array(mx.sum(1)) 17 | r_inv = np.power(rowsum, -1.0).flatten() 18 | r_inv[np.isinf(r_inv)] = 0. 
19 | r_mat_inv = sp.diags(r_inv) 20 | mx = r_mat_inv.dot(mx) 21 | return mx 22 | 23 | 24 | def sparse_to_tuple(mx): 25 | # mx = normalize_adj(mx) 26 | if not sp.isspmatrix_coo(mx): 27 | mx = mx.tocoo() 28 | coords = np.vstack((mx.row, mx.col)) 29 | values = mx.data 30 | shape = mx.shape 31 | return coords, values, shape 32 | 33 | 34 | def preprocess_features(features, norm=True): 35 | """Row-normalize feature matrix and convert to tuple representation""" 36 | if sp.issparse(features): 37 | features = features.toarray() 38 | if norm: 39 | features[features>0] = 1 40 | # rowsum = np.array(features.sum(1)) 41 | # r_inv = np.power(rowsum, -1.0).flatten() 42 | # r_inv[np.isinf(r_inv)] = 0. 43 | # r_mat_inv = sp.diags(r_inv) 44 | # features = r_mat_inv.dot(features) 45 | return torch.FloatTensor(features) 46 | 47 | 48 | def normalize_mx(mx, diagonal=True): 49 | if diagonal: 50 | size = mx.shape[0] 51 | return normalize(mx+sp.eye(size), norm='l1', axis=1) 52 | else: 53 | return normalize(mx, norm='l1', axis=1) 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /BrainAlign/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:12 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:04 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 17:15 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/heco_config.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : heco_config.py 6 | 7 | from yacs.config import CfgNode as CN 8 | import time 9 | # -------------------------------------------------------------- 10 | # Config of model 11 | # -------------------------------------------------------------- 12 | _C = CN() 13 | 14 | _C.CAME = CN() 15 | _C.CAME.path_rawdata1 = '../../../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 16 | _C.CAME.path_rawdata2 = '../../../../CAME/brain_human_mouse/human_brain_region_88_sparse_with3d.h5ad' 17 | _C.CAME.ROOT = '../../../../CAME/brain_mouse_human_sagittal/Baron_mouse-Baron_human-(10-13_15.26.12)/' 18 | _C.CAME.figdir = '../../../../CAME/analysis_results/figs/' # 19 | _C.CAME.embedding_dim = 128 20 | 21 | _C.CAME.homo_region_file_path = '../../../../CAME/brain_human_mouse/MouseHumanMatches_H88M67.csv' 22 | _C.CAME.labels_dir = '../../../../CAME/brain_human_mouse/' 23 | 24 | _C.HECO = CN() 25 | _C.HECO.dsnames = ['Mouse', 'Human'] 26 | 27 | # Could be pca or came 28 | _C.HECO.normalize_before_pca = None#'default' # None represent no normalization 29 | _C.HECO.normalize_before_pca_target_sum 
= None 30 | _C.HECO.embedding_type = 'pca' 31 | _C.HECO.embedding_pca_dim = 30 32 | 33 | 34 | _C.HECO.dataset = 'mouse_human_sagittal' 35 | _C.HECO.result_save_folder = './result/' 36 | _C.HECO.experiment_time = time.strftime("%Y-%m-%d_%H-%M-%S") 37 | _C.HECO.result_save_path = _C.HECO.result_save_folder + _C.HECO.experiment_time 38 | _C.HECO.embeddings_file_path = _C.HECO.result_save_path + "/embeds/" 39 | _C.HECO.DATA_PATH = _C.HECO.result_save_path + '/data/' 40 | 41 | _C.HECO.normalize_scale = True 42 | 43 | _C.HECO.normalize_before_pruning_method = 'default' 44 | _C.HECO.pruning_target_sum = None # None 45 | _C.HECO.pruning_normalize_axis = 0 46 | _C.HECO.if_threshold = True 47 | _C.HECO.pruning_method = 'std' # top, std, quantile 48 | _C.HECO.pruning_std_times_sm = 3#3.3#2.9 49 | _C.HECO.pruning_std_times_vh = 2.3#2.5#2.4 50 | 51 | _C.HECO.sm_gene_top = 2 52 | _C.HECO.vh_gene_top = 2 53 | _C.HECO.sm_sample_top = 5 54 | _C.HECO.vh_sample_top = 5 55 | 56 | _C.HECO.target_sum = None # None 57 | 58 | _C.HECO.NODE_TYPE_NUM = 4 59 | _C.HECO.S = 21749 # 60 | _C.HECO.S_sample_rate = [0.2] 61 | _C.HECO.M = 4035 62 | _C.HECO.M_sample_rate = [5, 2] 63 | _C.HECO.H = 6507 64 | _C.HECO.H_sample_rate = [0.5, 0.5] 65 | _C.HECO.V = 3682 66 | _C.HECO.V_sample_rate = [2] 67 | 68 | _C.HECO.DEG_batch_key = None 69 | _C.HECO.DEG_n_top_genes = 2000 70 | 71 | _C.HECO.positive_sample_number = 5000 72 | 73 | 74 | _C.HECO.fig_format = 'png' 75 | 76 | _C.ANALYSIS = CN() 77 | _C.ANALYSIS.cut_ov = 0 78 | _C.ANALYSIS.umap_neighbor = 20 79 | _C.ANALYSIS.mouse_umap_neighbor = 20 80 | _C.ANALYSIS.human_umap_neighbor = 20 81 | 82 | 83 | # Paramaters of BrainAlign 84 | _C.HECO_args = CN() 85 | _C.HECO_args.save_emb = True 86 | _C.HECO_args.turn = 0 87 | _C.HECO_args.dataset = _C.HECO.dataset 88 | _C.HECO_args.target_node = "S" # S, M, H, V 89 | _C.HECO_args.if_pretrained = False 90 | _C.HECO_args.pretrained_model_path = None 91 | _C.HECO_args.save_path = "./results/" + _C.HECO.experiment_time+'/'#"../data/{}/results/".format(_C.HECO_args.dataset)+_C.HECO.experiment_time+'/' 92 | _C.HECO_args.data_path = "./results/"+_C.HECO.experiment_time+'/data/'#"../data/{}/results/".format(_C.HECO_args.dataset)+_C.HECO.experiment_time+'/data/' 93 | _C.HECO_args.ratio = [20, 40, 60] 94 | _C.HECO_args.gpu = 0 95 | _C.HECO_args.seed = 53 96 | _C.HECO_args.hidden_dim = 128 97 | _C.HECO_args.nb_epochs = 1000 98 | # The parameters of evaluation 99 | _C.HECO_args.eva_lr = 0.01 100 | _C.HECO_args.eva_wd = 0 101 | # The parameters of learning process 102 | _C.HECO_args.patience = 30 103 | _C.HECO_args.lr = 0.0005 104 | _C.HECO_args.l2_coef = 0 105 | # model-specific parameters 106 | _C.HECO_args.tau = 0.9 107 | _C.HECO_args.feat_drop = 0.4 108 | _C.HECO_args.attn_drop = 0.35 109 | _C.HECO_args.sample_rate = [6] 110 | _C.HECO_args.lam = 0.5 111 | 112 | _C.HECO_args.type_num = [21749, 4035, 6507, 3682] 113 | _C.HECO_args.nei_num = 1 114 | 115 | # -------------------------------------------------------------- 116 | # Config of INPUT 117 | # -------------------------------------------------------------- 118 | _C.HOMO_RANDOM = CN() 119 | 120 | # if use all the non-homogeneous regions as back ground, default 'all', else will only use 35 regions in each species 121 | _C.HOMO_RANDOM.random_field = 'all' # or all 122 | # config if plot all the cross species correlation heatmap, default false; Require large memory if True. 
123 | _C.HOMO_RANDOM.random_plot = False # config 124 | 125 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/configs/heco_config_all.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 16:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : heco_config.py 6 | 7 | from yacs.config import CfgNode as CN 8 | 9 | # -------------------------------------------------------------- 10 | # Config of model 11 | # -------------------------------------------------------------- 12 | _C = CN() 13 | 14 | _C.CAME = CN() 15 | _C.CAME.path_rawdata1 = '../../../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 16 | _C.CAME.path_rawdata2 = '../../../../CAME/brain_human_mouse/human_brain_region_88_sparse.h5ad' 17 | _C.CAME.ROOT = '../../../../CAME/analysis_results/Dense_Baron_mouse-Baron_human-10-24_11.37.58/' 18 | _C.CAME.figdir = '../../../../CAME/analysis_results/Dense_Baron_mouse-Baron_human-10-24_11.37.58/figs/' # 19 | _C.CAME.embedding_dim = 128 20 | 21 | _C.CAME.homo_region_file_path = '../../../../CAME/brain_human_mouse/MouseHumanMatches_H88M67.csv' 22 | _C.CAME.labels_dir = '../../../../CAME/brain_human_mouse/' 23 | 24 | _C.HECO = CN() 25 | # Could be pca or came 26 | _C.HECO.embedding_type = 'pca' 27 | _C.HECO.embedding_pca_dim = 30 28 | _C.HECO.DATA_PATH = './data/' 29 | _C.HECO.dataset = 'mouse_human' 30 | _C.HECO.result_save_path = './results/2022-11-04_22-37-26' 31 | _C.HECO.embeddings_file_path = _C.HECO.result_save_path + "/embeds/" 32 | 33 | _C.HECO.if_threshold = True 34 | _C.HECO.pruning_method = 'std' # top, std, quantile 35 | _C.HECO.pruning_std_times_sm = 3.5 36 | _C.HECO.pruning_std_times_vh = 3.2 37 | 38 | _C.HECO.sm_gene_top = 100 39 | _C.HECO.vh_gene_top = 20 40 | _C.HECO.sm_sample_top = 5 41 | _C.HECO.vh_sample_top = 5 42 | 43 | _C.HECO.target_sum = 1 # None 44 | 45 | _C.HECO.NODE_TYPE_NUM = 4 46 | _C.HECO.S = 72968 47 | _C.HECO.S_sample_rate = [0.2] 48 | _C.HECO.M = 2578 49 | _C.HECO.M_sample_rate = [5, 2] 50 | _C.HECO.H = 3326 51 | _C.HECO.H_sample_rate = [0.5, 0.5] 52 | _C.HECO.V = 3682 53 | _C.HECO.V_sample_rate = [2] 54 | 55 | _C.HECO.DEG_batch_key = None 56 | _C.HECO.DEG_n_top_genes = 2000 57 | 58 | _C.HECO.positive_sample_number = 5000 59 | 60 | 61 | _C.HECO.fig_format = 'png' 62 | # -------------------------------------------------------------- 63 | # Config of INPUT 64 | # -------------------------------------------------------------- 65 | _C.HOMO_RANDOM = CN() 66 | 67 | # if use all the non-homogeneous regions as back ground, default 'all', else will only use 35 regions in each species 68 | _C.HOMO_RANDOM.random_field = 'all' # or all 69 | # config if plot all the cross species correlation heatmap, default false; Require large memory if True. 
70 | _C.HOMO_RANDOM.random_plot = False # config 71 | 72 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | elif if_train == False: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_analysis_log.txt"), mode='w') 23 | elif if_train == None: 24 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S") + "_process_log.txt"), 25 | mode='w') 26 | fh.setLevel(logging.DEBUG) 27 | fh.setFormatter(formatter) 28 | logger.addHandler(fh) 29 | 30 | return logger -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/.Rhistory: -------------------------------------------------------------------------------- 1 | install.packages("sceasy") 2 | install.packages("r-sceasy") 3 | install.packages("sceasy") 4 | q() 5 | q 6 | q() 7 | devtools::install_github("cellgeni/sceasy") 8 | install.packages("devtools") 9 | devtools::install_github("cellgeni/sceasy") 10 | q() 11 | install.packages("anndata") 12 | if (!requireNamespace("BiocManager", quietly=TRUE)) { 13 | install.packages("BiocManager") 14 | } 15 | BiocManager::install("zellkonverter") 16 | BiocManager::install("basilisk") 17 | install.packages("basilisk") 18 | BiocManager::install("basilisk") 19 | if (!require("BiocManager", quietly = TRUE)) 20 | install.packages("BiocManager") 21 | BiocManager::install("basilisk") 22 | install.packages("anndata") 23 | library(anndata) 24 | adata_mouse_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 25 | adata_mouse_path_isocortex 26 | ad <- read_h5ad(adata_path) 27 | library(anndata) 28 | ad <- read_h5ad(adata_path) 29 | install.packages("SeuratDisk") 30 | if (!requireNamespace("remotes", quietly = TRUE)) { 31 | install.packages("remotes") 32 | } 33 | remotes::install_github("mojaveazure/seurat-disk") 34 | library('SeuratDisk') 35 | library('Seuratdisk') 36 | if (!requireNamespace("remotes", quietly = TRUE)) { 37 | install.packages("remotes") 38 | } 39 | remotes::install_github("mojaveazure/seurat-disk") 40 | reticulate::install_miniconda() 41 | anndata::install_anndata() 42 | install.packages("anndata") 43 | install.packages("magrittr") 44 | install.packages("dplyr") 45 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/genomic_findmarkers.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/22 20:38 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : genomic_findmarkers.r 5 | #@Description: 
This file is used to find marker genes with Seurat findMarker 6 | #source('./genomic_functions.R', local = TRUE) 7 | 8 | library(sceasy) 9 | library(anndata) 10 | library(Seurat) 11 | library(Matrix) 12 | 13 | library(reticulate) 14 | #use_condaenv('pad') 15 | #loompy <- reticulate::import('loompy') 16 | 17 | print("No error") 18 | 19 | # convert_scanpy_seurat 20 | convert_scanpy2seurat <- function(adata_path, save_path){ 21 | #ad <- zellkonverter::readH5AD(adata_path) 22 | #ad <- zellkonverter::readH5AD(adata_path) 23 | #ad <- read_h5ad(adata_path) 24 | # end with .rds file 25 | #print(ad) 26 | sceasy::convertFormat(adata_path, from="anndata", to="seurat", outFile=save_path) 27 | #if_success <- TRUE 28 | #return(if_success) 29 | } 30 | 31 | seurat_findmarker <- function(input_data, groupby) 32 | { 33 | adata_markers <- FindAllMarkers(input_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 34 | adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 35 | #return(NULL) 36 | } 37 | 38 | 39 | # isocortex 40 | adata_mouse_path_isocortex <- "D:/Research_programs/HeCo/BrainAlign/data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 41 | save_path <- "D:/Research_programs/HeCo/BrainAlign/data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/" 42 | mouse_data_path <- paste0(save_path, 'adata_mouse_path_isocortex.rds') 43 | print(mouse_data_path) 44 | adata <- read_h5ad(adata_mouse_path_isocortex) 45 | print(adata) 46 | 47 | #adata$X <- as(adata$X, "dgCMatrix") 48 | 49 | #print(adata$X) 50 | #seurat_object <- Seurat::Convert(adata) 51 | # Create a Seurat object 52 | seurat_object <- CreateSeuratObject(counts = adata$X) 53 | 54 | # Transfer feature names (genes) 55 | rownames(seurat_object) <- rownames(adata$X) 56 | 57 | # Transfer cell names and metadata 58 | colnames(seurat_object) <- colnames(adata$X) 59 | seurat_object$meta.data <- adata$obs 60 | # Transfer additional metadata columns (obs) 61 | seurat_object$meta.data$additional_metadata_column <- adata$obs$additional_metadata_column 62 | 63 | # Transfer additional metadata columns (var) 64 | seurat_object$var$additional_metadata_column <- adata$var$additional_metadata_column 65 | 66 | 67 | saveRDS(seurat_object, mouse_data_path) 68 | 69 | #convert_scanpy2seurat(adata_mouse_path_isocortex, mouse_data_path) #mouse_data_path 70 | mouse_isocortex_data <- readRDS(file = mouse_data_path) 71 | Idents(mouse_isocortex_dat) <- "region_name" 72 | seurat_findmarker(mouse_isocortex_data, groupby="region_name") 73 | 74 | print("Mouse finished") 75 | 76 | # adata_human_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 77 | # human_data_path <- paste0(save_path, 'adata_human_path_isocortex.rds') 78 | # convert_scanpy2seurat(adata_human_path_isocortex, paste(save_path, human_data_path)) 79 | # human_isocortex_data <- readRDS(file = human_data_path) 80 | # Idents(human_isocortex_dat) <- "region_name" 81 | # seurat_findmarker(human_isocortex_data, groupby="region_name") 82 | # 83 | # print("Human finished.") 84 | 85 | -------------------------------------------------------------------------------- 
/BrainAlign/brain_analysis/r_analysis/genomic_functions.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 11:09 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : genomic_functions.r 5 | #@Description: This file is used to ... 6 | #source('./genomic_functions.R', local = TRUE) 7 | 8 | library(Seurat) 9 | library(anndata) 10 | library(sceasy) 11 | 12 | # convert_scanpy_seurat 13 | if_success <- convert_scanpy2seurat(adata_path, save_path) 14 | { 15 | ad <- anndata::read_h5ad(adata_path) 16 | # end with .rds file 17 | sceasy::convertFormat(ad, from="anndata", to="seurat", outFile=save_path) 18 | if_success <- TRUE 19 | #return(if_success) 20 | } 21 | 22 | 23 | NULL <- seurat_findmarker(input_data, groupby) 24 | { 25 | adata_markers <- FindAllMarkers(input_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 26 | adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 27 | #return(NULL) 28 | } 29 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/includes.R: -------------------------------------------------------------------------------- 1 | path.bin <- 'D:/Research_programs/single_cell/molecular-atlas-master/bin' 2 | path.matrices <-'D:/Research_programs/single_cell/figures' 3 | 4 | load(paste(path.matrices ,'atlasspots.RData',sep='/')) 5 | load(paste(path.matrices , 'vivid-colors.RData', sep='/')) 6 | 7 | #Only loading if not existing, takes some time 8 | if(!exists('aligned.atlas')) 9 | load(paste(path.matrices ,'alignedAtlas.RData',sep='/')) 10 | 11 | if(!exists('atlas.stereo')) 12 | load(paste(path.matrices ,'atlasStereo.RData',sep='/')) 13 | 14 | source(paste(path.bin,'plotFunctions.R',sep='/')) 15 | source(paste(path.bin,'execFunctions.R',sep='/')) 16 | source(paste(path.bin,'araAtlasFunctions.R',sep='/')) 17 | source(paste(path.bin,'smoothingFunctions.R',sep='/')) 18 | source(paste(path.bin,'layerFunctions.R',sep='/')) 19 | source(paste(path.bin,'icFunctions.R',sep='/')) 20 | source(paste(path.bin,'tsneFunctions.R',sep='/')) 21 | source(paste(path.bin,'similarityIndexFunctions.R',sep='/')) 22 | source(paste(path.bin,'scRemapFunctions.R',sep='/')) 23 | source(paste(path.bin,'constantParameters.R',sep='/')) 24 | source(paste(path.bin,'allenAnnotationsFunctions.R',sep='/')) 25 | source(paste(path.bin,'meshCutsFunctions.R',sep='/')) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/test.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 11:58 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : test.r 5 | #@Description: This file is used to ... 6 | 7 | 8 | sumfunc <- function(x, y) 9 | { 10 | sum <- x+y 11 | return(sum) 12 | } 13 | 14 | print(sumfunc(1, 2)) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform2seurat.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 21:23 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : transform2seurat.r 5 | #@Description: This file is used to ... 
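# Overview (descriptive, inferred from the code below): this script reads the counts.mtx,
# counts_cellMeta.csv and counts_geneMeta.csv files exported by r_analysis/transform_adata.py
# into the R_isocortex_mouse / R_isocortex_human folders, rebuilds Seurat objects with
# CreateSeuratObject(t(counts)), sets region_name as the identity class, runs
# FindAllMarkers(only.pos = TRUE), and saves the objects as mouse_isocortex.rds /
# human_isocortex.rds.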
6 | 7 | 8 | library(Matrix) 9 | library(Seurat) 10 | 11 | 12 | 13 | adata_path_isocortex <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/" 14 | 15 | output_for_R_path <- "R_isocortex_mouse" 16 | 17 | save_dir<- paste0(adata_path_isocortex, output_for_R_path) 18 | 19 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 20 | dim(counts) 21 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 22 | head(cellMeta) 23 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 24 | dim(geneMeta) 25 | head(geneMeta) 26 | ### Set the rownames and colnames 27 | rownames(counts)<-cellMeta$Barcode 28 | colnames(counts)<-geneMeta$GeneName 29 | 30 | seo <- CreateSeuratObject(counts = t(counts), project = "min", min.cells = 2, min.features = 5) 31 | ### Set the meta data 32 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 33 | rownames(seo@meta.data)<-colnames(seo) 34 | ### Normalize the data 35 | #seo <- NormalizeData(seo) 36 | groupby <- "region_name" 37 | 38 | Idents(seo) <- "region_name" 39 | 40 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct = 0.25, logfc.threshold = 0.25 41 | #adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 42 | print(adata_markers) 43 | 44 | saveRDS(seo, file.path(save_dir,'mouse_isocortex.rds')) 45 | 46 | 47 | 48 | # ------------------------------------------------------------ 49 | # human---------------- 50 | output_for_R_path <- "R_isocortex_human" 51 | 52 | save_dir<- paste0(adata_path_isocortex, output_for_R_path) 53 | 54 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 55 | dim(counts) 56 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 57 | head(cellMeta) 58 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 59 | dim(geneMeta) 60 | head(geneMeta) 61 | ### Set the rownames and colnames 62 | rownames(counts)<-cellMeta$Barcode 63 | colnames(counts)<-geneMeta$GeneName 64 | 65 | seo <- CreateSeuratObject(counts = t(counts), project = "min", min.cells = 2, min.features = 5) 66 | ### Set the meta data 67 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 68 | rownames(seo@meta.data)<-colnames(seo) 69 | ### Normalize the data 70 | #seo <- NormalizeData(seo) 71 | groupby <- "region_name" 72 | 73 | Idents(seo) <- "region_name" 74 | 75 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct = 0.25, logfc.threshold = 0.25 76 | #adata_markers %>% group_by(groupby) %>% slice_max(n = 1, order_by = avg_log2FC) 77 | print(adata_markers) 78 | 79 | saveRDS(seo, file.path(save_dir,'human_isocortex.rds')) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform2seurat_cluster.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/7/23 21:23 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : transform2seurat.r 5 | #@Description: This file is used to ... 
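# Overview (descriptive, inferred from the code below): cluster-level counterpart of
# transform2seurat.R — it loads the R_TH_mouse / R_TH_human exports written by
# r_analysis/transform_adata_cluster.py, rebuilds Seurat objects, sets cluster_name_acronym
# as the identity class, runs FindAllMarkers, and saves mouse_th.rds / human_th.rds.
#
# A minimal sketch of the marker-filtering step that is commented out further below,
# assuming dplyr/magrittr are attached as they are in this script; note that the
# FindAllMarkers() result stores its grouping in a column named "cluster":
#   top_markers <- adata_markers %>%
#     dplyr::group_by(cluster) %>%
#     dplyr::slice_max(n = 1, order_by = avg_log2FC)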
6 | 7 | 8 | library(Matrix) 9 | library(Seurat) 10 | 11 | library(magrittr) # needs to be run every time you start R and want to use %>% 12 | library(dplyr) # alternatively, this also loads %>% 13 | 14 | 15 | adata_path_th <- "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/" 16 | 17 | output_for_R_path <- "R_TH_mouse" 18 | 19 | save_dir<- paste0(adata_path_th, output_for_R_path) 20 | 21 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 22 | dim(counts) 23 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 24 | head(cellMeta) 25 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 26 | dim(geneMeta) 27 | head(geneMeta) 28 | ### Set the rownames and colnames 29 | rownames(counts)<-cellMeta$Barcode 30 | colnames(counts)<-geneMeta$GeneName 31 | 32 | seo <- CreateSeuratObject(counts = t(counts)) 33 | ### Set the meta data 34 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 35 | rownames(seo@meta.data)<-colnames(seo) 36 | ### Normalize the data 37 | #seo <- NormalizeData(seo) 38 | groupby <- "cluster_name_acronym" 39 | 40 | Idents(seo) <- "cluster_name_acronym" 41 | 42 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.11) #, min.pct = 0.25, logfc.threshold = 0.25 43 | #adata_markers %>% group_by(cluster_name_acronym) %>% slice_max(n = 1, order_by = avg_log2FC) 44 | #adata_markers %>% group_by("cluster_name_acronym") %>% top_n(2, avg_logFC) 45 | 46 | print(adata_markers) 47 | 48 | saveRDS(seo, file.path(save_dir,"mouse_th.rds")) 49 | 50 | 51 | 52 | # ------------------------------------------------------------ 53 | # human---------------- 54 | print('----------------------------------human----------------------------------------------') 55 | output_for_R_path <- "R_TH_human" 56 | 57 | save_dir<- paste0(adata_path_th, output_for_R_path) 58 | 59 | counts<-readMM(paste0(save_dir,'/counts.mtx')) 60 | dim(counts) 61 | cellMeta<-read.csv(paste0(save_dir,'/counts_cellMeta.csv')) 62 | head(cellMeta) 63 | geneMeta<-read.csv(paste0(save_dir,'/counts_geneMeta.csv')) 64 | dim(geneMeta) 65 | head(geneMeta) 66 | ### Set the rownames and colnames 67 | rownames(counts)<-cellMeta$Barcode 68 | colnames(counts)<-geneMeta$GeneName 69 | 70 | seo <- CreateSeuratObject(counts = t(counts)) 71 | ### Set the meta data 72 | seo@meta.data<-cbind(cellMeta,seo@meta.data) 73 | rownames(seo@meta.data)<-colnames(seo) 74 | ### Normalize the data 75 | #seo <- NormalizeData(seo) 76 | groupby <- "cluster_name_acronym" 77 | 78 | Idents(seo) <- "cluster_name_acronym" 79 | 80 | 81 | adata_markers <- FindAllMarkers(seo, only.pos = TRUE, logfc.threshold=0.15) #, min.pct=0.1, return.thresh=0.05, thresh.use = 0.15, 82 | #adata_markers %>% group_by(cluster_name_acronym) %>% slice_max(n = 1, order_by = avg_log2FC) 83 | #adata_markers %>% group_by("cluster_name_acronym") %>% top_n(2, avg_logFC) 84 | print(adata_markers) 85 | 86 | saveRDS(seo, file.path(save_dir,"human_th.rds")) 87 | -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform_adata.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/7/23 17:57 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : transform_adata.py 6 | # @Description: This file is used to ... 
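# Overview (descriptive, inferred from the code below): this helper loads the mouse and human
# isocortex AnnData files (adata_mouse_exp_isocortex.h5ad / adata_human_exp_isocortex.h5ad),
# writes adata.X to counts.mtx via scipy.io.mmwrite, and exports the cell metadata (Barcode,
# region_name, cluster_name_acronym) and gene metadata (GeneName) as CSVs into the
# R_isocortex_mouse / R_isocortex_human folders; transform2seurat.R then reads these files
# (transposing the count matrix) to build Seurat objects.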
7 | 8 | import anndata 9 | import scanpy as sc 10 | from pathlib import Path 11 | from scipy import io 12 | import os 13 | 14 | if __name__ == '__main__': 15 | 16 | output_for_R_path = "R_isocortex_mouse" 17 | 18 | adata_mouse_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_mouse_exp_isocortex.h5ad" 19 | adata = sc.read_h5ad(adata_mouse_path_isocortex) 20 | ### Set the directory for saving files 21 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/' 22 | 23 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 24 | print(save_dir + output_for_R_path) 25 | 26 | io.mmwrite(save_dir + output_for_R_path+'/counts.mtx', adata.X) 27 | cell_meta = adata.obs.copy() 28 | cell_meta['Barcode'] = cell_meta.index 29 | #cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 30 | #cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 31 | cell_meta['region_name'] = adata.obs['region_name'] 32 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 33 | 34 | gene_meta = adata.var.copy() 35 | gene_meta['GeneName'] = gene_meta.index 36 | 37 | if not os.path.exists(save_dir + output_for_R_path): 38 | os.makedirs(save_dir + output_for_R_path) 39 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 40 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) 41 | 42 | 43 | #------------------------------------------------------------------------------ 44 | # human------------------------------------ 45 | output_for_R_path = "R_isocortex_human" 46 | 47 | adata_human_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/adata_human_exp_isocortex.h5ad" 48 | adata = sc.read_h5ad(adata_human_path_isocortex) 49 | ### Set the directory for saving files 50 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/2_experiment_spatial_isocortex/' 51 | 52 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 53 | print(save_dir + output_for_R_path) 54 | 55 | io.mmwrite(save_dir + output_for_R_path + '/counts.mtx', adata.X) 56 | cell_meta = adata.obs.copy() 57 | cell_meta['Barcode'] = cell_meta.index 58 | # cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 59 | # cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 60 | cell_meta['region_name'] = adata.obs['region_name'] 61 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 62 | 63 | gene_meta = adata.var.copy() 64 | gene_meta['GeneName'] = gene_meta.index 65 | 66 | if not os.path.exists(save_dir + output_for_R_path): 67 | os.makedirs(save_dir + output_for_R_path) 68 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 69 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_analysis/transform_adata_cluster.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/7/23 17:57 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : 
transform_adata.py 6 | # @Description: This file is used to ... 7 | 8 | import anndata 9 | import scanpy as sc 10 | from pathlib import Path 11 | from scipy import io 12 | import os 13 | 14 | if __name__ == '__main__': 15 | 16 | output_for_R_path = "R_TH_mouse" 17 | 18 | adata_mouse_path_th = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/adata_exp_TH_mouse.h5ad" 19 | adata = sc.read_h5ad(adata_mouse_path_th) 20 | ### Set the directory for saving files 21 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/' 22 | 23 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 24 | print(save_dir + output_for_R_path) 25 | 26 | io.mmwrite(save_dir + output_for_R_path+'/counts.mtx', adata.X) 27 | cell_meta = adata.obs.copy() 28 | cell_meta['Barcode'] = cell_meta.index 29 | #cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 30 | #cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 31 | cell_meta['region_name'] = adata.obs['region_name'] 32 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 33 | 34 | gene_meta = adata.var.copy() 35 | gene_meta['GeneName'] = gene_meta.index 36 | 37 | if not os.path.exists(save_dir + output_for_R_path): 38 | os.makedirs(save_dir + output_for_R_path) 39 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 40 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) 41 | 42 | 43 | #------------------------------------------------------------------------------ 44 | # human------------------------------------ 45 | output_for_R_path = "R_TH_human" 46 | 47 | adata_human_path_isocortex = "../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/adata_exp_TH_human.h5ad" 48 | adata = sc.read_h5ad(adata_human_path_isocortex) 49 | ### Set the directory for saving files 50 | save_dir = '../../data/srrsc_mouse_human_binary/results_20_1000genes_all_came_selfloop/2023-06-23_20-31-14/embeds/figs/4_spatial_analysis/3_experiment_spatial_clusters/' 51 | 52 | Path(save_dir + output_for_R_path).mkdir(parents=True, exist_ok=True) 53 | print(save_dir + output_for_R_path) 54 | 55 | io.mmwrite(save_dir + output_for_R_path + '/counts.mtx', adata.X) 56 | cell_meta = adata.obs.copy() 57 | cell_meta['Barcode'] = cell_meta.index 58 | # cell_meta['UMAP1'] = adata.obsm['X_umap'][:, 0] 59 | # cell_meta['UMAP2'] = adata.obsm['X_umap'][:, 1] 60 | cell_meta['region_name'] = adata.obs['region_name'] 61 | cell_meta['cluster_name_acronym'] = adata.obs['cluster_name_acronym'] 62 | 63 | gene_meta = adata.var.copy() 64 | gene_meta['GeneName'] = gene_meta.index 65 | 66 | if not os.path.exists(save_dir + output_for_R_path): 67 | os.makedirs(save_dir + output_for_R_path) 68 | cell_meta.to_csv(save_dir + output_for_R_path + '/counts_cellMeta.csv', index=None) 69 | gene_meta.to_csv(save_dir + output_for_R_path + '/counts_geneMeta.csv', index=None) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/r_gene_comparison/barplot_degs.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/3/11 20:17 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : barplot_degs.r 5 | #@Description: 
This file is used to ... 6 | 7 | # library 8 | library(tidyverse) 9 | library(viridis) 10 | 11 | # Create dataset 12 | data <- data.frame( 13 | individual=paste( "Mister ", seq(1,60), sep=""), 14 | group=c( rep('A', 60), rep('B', 30), rep('C', 14), rep('D', 6)) , 15 | value1=sample( seq(10,100), 60, replace=T), 16 | value2=sample( seq(10,100), 60, replace=T), 17 | value3=sample( seq(10,100), 60, replace=T) 18 | ) 19 | 20 | # Transform data in a tidy format (long format) 21 | data <- data %>% gather(key = "observation", value="value", -c(1,2)) 22 | 23 | # Set a number of 'empty bar' to add at the end of each group 24 | empty_bar <- 2 25 | nObsType <- nlevels(as.factor(data$observation)) 26 | to_add <- data.frame( matrix(NA, empty_bar*nlevels(data$group)*nObsType, ncol(data)) ) 27 | colnames(to_add) <- colnames(data) 28 | to_add$group <- rep(levels(data$group), each=empty_bar*nObsType ) 29 | data <- rbind(data, to_add) 30 | data <- data %>% arrange(group, individual) 31 | data$id <- rep( seq(1, nrow(data)/nObsType) , each=nObsType) 32 | 33 | # Get the name and the y position of each label 34 | label_data <- data %>% group_by(id, individual) %>% summarize(tot=sum(value)) 35 | number_of_bar <- nrow(label_data) 36 | angle <- 90 - 360 * (label_data$id-0.5) /number_of_bar # I substract 0.5 because the letter must have the angle of the center of the bars. Not extreme right(1) or extreme left (0) 37 | label_data$hjust <- ifelse( angle < -90, 1, 0) 38 | label_data$angle <- ifelse(angle < -90, angle+180, angle) 39 | 40 | # prepare a data frame for base lines 41 | base_data <- data %>% 42 | group_by(group) %>% 43 | summarize(start=min(id), end=max(id) - empty_bar) %>% 44 | rowwise() %>% 45 | mutate(title=mean(c(start, end))) 46 | 47 | # prepare a data frame for grid (scales) 48 | grid_data <- base_data 49 | grid_data$end <- grid_data$end[ c( nrow(grid_data), 1:nrow(grid_data)-1)] + 1 50 | grid_data$start <- grid_data$start - 1 51 | grid_data <- grid_data[-1,] 52 | 53 | # Make the plot 54 | p <- ggplot(data) + 55 | 56 | # Add the stacked bar 57 | geom_bar(aes(x=as.factor(id), y=value, fill=observation), stat="identity", alpha=0.5) + 58 | scale_fill_viridis(discrete=TRUE) + 59 | 60 | # Add a val=100/75/50/25 lines. I do it at the beginning to make sur barplots are OVER it. 
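# The five geom_segment() calls below draw the circular grid lines at
# y = 0, 50, 100, 150 and 200, and the annotate() call that follows prints the
# matching labels at the position of the last bar (x = max(data$id)).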
61 | geom_segment(data=grid_data, aes(x = end, y = 0, xend = start, yend = 0), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 62 | geom_segment(data=grid_data, aes(x = end, y = 50, xend = start, yend = 50), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 63 | geom_segment(data=grid_data, aes(x = end, y = 100, xend = start, yend = 100), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 64 | geom_segment(data=grid_data, aes(x = end, y = 150, xend = start, yend = 150), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 65 | geom_segment(data=grid_data, aes(x = end, y = 200, xend = start, yend = 200), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) + 66 | 67 | # Add text showing the value of each 100/75/50/25 lines 68 | ggplot2::annotate("text", x = rep(max(data$id),5), y = c(0, 50, 100, 150, 200), label = c("0", "50", "100", "150", "200") , color="grey", size=6 , angle=0, fontface="bold", hjust=1) + 69 | 70 | ylim(-150,max(label_data$tot, na.rm=T)) + 71 | theme_minimal() + 72 | theme( 73 | legend.position = "none", 74 | axis.text = element_blank(), 75 | axis.title = element_blank(), 76 | panel.grid = element_blank(), 77 | plot.margin = unit(rep(-1,4), "cm") 78 | ) + 79 | coord_polar() + 80 | 81 | # Add labels on top of each bar 82 | geom_text(data=label_data, aes(x=id, y=tot+10, label=individual, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=5, angle= label_data$angle, inherit.aes = FALSE ) + 83 | 84 | # Add base line information 85 | geom_segment(data=base_data, aes(x = start, y = -5, xend = end, yend = -5), colour = "black", alpha=0.8, size=0.6 , inherit.aes = FALSE ) + 86 | geom_text(data=base_data, aes(x = title, y = -18, label=group), hjust=c(1,1,0,0), colour = "black", alpha=0.8, size=4, fontface="bold", inherit.aes = FALSE) 87 | 88 | 89 | # Save at png 90 | ggsave(p, file="../r_gene_comparison/output.png", width=10, height=10) -------------------------------------------------------------------------------- /BrainAlign/brain_analysis/typehint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/4/2 20:51 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : typehint.py 6 | # @Description: This file is used to ... 7 | 8 | r""" 9 | Type hint definitions 10 | """ 11 | 12 | import numbers 13 | from typing import Any, Mapping, Optional, TypeVar, Union 14 | 15 | import anndata as ad 16 | import h5py 17 | import numpy as np 18 | import scipy.sparse 19 | 20 | Array = Union[np.ndarray, scipy.sparse.spmatrix] 21 | BackedArray = Union[h5py.Dataset, ad._core.sparse_dataset.SparseDataset] 22 | AnyArray = Union[Array, BackedArray] 23 | ArrayOrScalar = Union[np.ndarray, numbers.Number] 24 | Kws = Optional[Mapping[str, Any]] 25 | RandomState = Optional[Union[np.random.RandomState, int]] 26 | 27 | T = TypeVar("T") # Generic type var -------------------------------------------------------------------------------- /BrainAlign/came/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 
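* In this copy, the preprocessing defaults (_params_pre / get_preprocess_params)
  are commented out below, so only get_model_params and get_loss_params are
  defined; the params_pre line in the Examples below is kept for reference only.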
12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /BrainAlign/came/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | 
AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /BrainAlign/came/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /BrainAlign/came/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /BrainAlign/came/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /BrainAlign/came/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /BrainAlign/came/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 
3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | 
batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
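    Examples
    --------
    A minimal round trip (the file name is illustrative):

    >>> hidden = [{'cell': np.random.randn(10, 8), 'gene': np.random.randn(12, 8)}]
    >>> save_hidden_states(hidden, '_tmp_hidden.h5')
    >>> loaded = load_hidden_states('_tmp_hidden.h5')
    >>> sorted(loaded[0].keys())
    ['cell', 'gene']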
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /BrainAlign/came/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = metrics.f1_score(y_true, 
y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /BrainAlign/code/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/18 17:14 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/acm/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of ACM. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/aminer/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of AMiner. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/dblp/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings from DBLP. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/embeds/freebase/README.md: -------------------------------------------------------------------------------- 1 | This folder is to save embeddings of Freebase. 2 | -------------------------------------------------------------------------------- /BrainAlign/code/main_parallel.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch 3 | from utils import load_data, set_params, evaluate 4 | from module import HeCo 5 | import warnings 6 | import datetime 7 | import pickle as pkl 8 | import os 9 | import random 10 | 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | os.environ['CUDA_VISIBLE_DEVICES'] = '1,2' 15 | 16 | args = set_params() 17 | 18 | if torch.cuda.is_available(): 19 | device = torch.device("cuda") 20 | #torch.cuda.set_device(device) 21 | else: 22 | device = torch.device("cpu") 23 | 24 | ## name of intermediate document ## 25 | own_str = args.dataset 26 | 27 | ## random seed ## 28 | seed = args.seed 29 | numpy.random.seed(seed) 30 | random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | 34 | 35 | def train(): 36 | nei_index, feats, mps, pos, label, idx_train, idx_val, idx_test = \ 37 | load_data(args.dataset, args.ratio, args.type_num) 38 | nb_classes = label.shape[-1] 39 | feats_dim_list = [i.shape[1] for i in feats] 40 | P = int(len(mps)) 41 | print("seed ",args.seed) 42 | print("Dataset: ", args.dataset) 43 | print("The number of meta-paths: ", P) 44 | 45 | model = HeCo(args.hidden_dim, feats_dim_list, args.feat_drop, args.attn_drop, 46 | P, args.sample_rate, args.nei_num, args.tau, args.lam) 47 | model = torch.nn.DataParallel(model, device_ids=[0, 1]) 48 | 49 | optimiser = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2_coef) 50 | 51 | if torch.cuda.is_available(): 52 | print('Using CUDA') 53 | #model.cuda() 54 | model.to(device) 55 | feats = [feat.to(device) for feat in feats] 56 | mps = [mp.to(device) for mp in mps] 57 | pos = pos.to(device) 58 | label = label.to(device) 59 | idx_train = [i.to(device) for i in idx_train] 60 | idx_val = [i.to(device) for i in idx_val] 61 | idx_test = [i.to(device) for i in idx_test] 62 | 63 | cnt_wait = 0 64 | best = 1e9 65 | best_t = 0 66 | 67 | starttime = datetime.datetime.now() 68 
| for epoch in range(args.nb_epochs): 69 | model.train() 70 | optimiser.zero_grad() 71 | loss = model(feats, pos, mps, nei_index) 72 | print("loss ", loss.data.cpu()) 73 | if loss < best: 74 | best = loss 75 | best_t = epoch 76 | cnt_wait = 0 77 | torch.save(model.state_dict(), 'HeCo_'+own_str+'.pkl') 78 | else: 79 | cnt_wait += 1 80 | 81 | if cnt_wait == args.patience: 82 | print('Early stopping!') 83 | break 84 | loss.backward() 85 | optimiser.step() 86 | 87 | print('Loading {}th epoch'.format(best_t)) 88 | model.load_state_dict(torch.load('HeCo_'+own_str+'.pkl')) 89 | model.eval() 90 | os.remove('HeCo_'+own_str+'.pkl') 91 | embeds = model.get_embeds(feats, mps) 92 | for i in range(len(idx_train)): 93 | evaluate(embeds, args.ratio[i], idx_train[i], idx_val[i], idx_test[i], label, nb_classes, device, args.dataset, 94 | args.eva_lr, args.eva_wd) 95 | endtime = datetime.datetime.now() 96 | time = (endtime - starttime).seconds 97 | print("Total time: ", time, "s") 98 | 99 | if args.save_emb: 100 | f = open("./embeds/"+args.dataset+"/"+str(args.turn)+".pkl", "wb") 101 | pkl.dump(embeds.cpu().data.numpy(), f) 102 | f.close() 103 | 104 | 105 | if __name__ == '__main__': 106 | train() 107 | -------------------------------------------------------------------------------- /BrainAlign/code/module/__init__.py: -------------------------------------------------------------------------------- 1 | from .heco import HeCo 2 | -------------------------------------------------------------------------------- /BrainAlign/code/module/contrast.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Contrast(nn.Module): 6 | def __init__(self, hidden_dim, tau, lam): 7 | super(Contrast, self).__init__() 8 | self.proj = nn.Sequential( 9 | nn.Linear(hidden_dim, hidden_dim), 10 | nn.ELU(), 11 | nn.Linear(hidden_dim, hidden_dim) 12 | ) 13 | self.tau = tau 14 | self.lam = lam 15 | for model in self.proj: 16 | if isinstance(model, nn.Linear): 17 | nn.init.xavier_normal_(model.weight, gain=1.414) 18 | 19 | def sim(self, z1, z2): 20 | z1_norm = torch.norm(z1, dim=-1, keepdim=True) 21 | z2_norm = torch.norm(z2, dim=-1, keepdim=True) 22 | dot_numerator = torch.mm(z1, z2.t()) 23 | dot_denominator = torch.mm(z1_norm, z2_norm.t()) 24 | sim_matrix = torch.exp(dot_numerator / dot_denominator / self.tau) 25 | return sim_matrix 26 | 27 | def forward(self, z_mp, z_sc, pos): 28 | z_proj_mp = self.proj(z_mp) 29 | z_proj_sc = self.proj(z_sc) 30 | matrix_mp2sc = self.sim(z_proj_mp, z_proj_sc) 31 | matrix_sc2mp = matrix_mp2sc.t() 32 | 33 | matrix_mp2sc = matrix_mp2sc/(torch.sum(matrix_mp2sc, dim=1).view(-1, 1) + 1e-8) 34 | lori_mp = -torch.log(matrix_mp2sc.mul(pos.to_dense()).sum(dim=-1)).mean() 35 | 36 | matrix_sc2mp = matrix_sc2mp / (torch.sum(matrix_sc2mp, dim=1).view(-1, 1) + 1e-8) 37 | lori_sc = -torch.log(matrix_sc2mp.mul(pos.to_dense()).sum(dim=-1)).mean() 38 | return self.lam * lori_mp + (1 - self.lam) * lori_sc 39 | -------------------------------------------------------------------------------- /BrainAlign/code/module/heco.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from .mp_encoder import Mp_encoder 4 | from .sc_encoder import Sc_encoder 5 | from .contrast import Contrast 6 | 7 | 8 | class HeCo(nn.Module): 9 | def __init__(self, hidden_dim, feats_dim_list, feat_drop, attn_drop, P, sample_rate, 10 | nei_num, tau, lam): 11 | 
super(HeCo, self).__init__() 12 | self.hidden_dim = hidden_dim 13 | self.fc_list = nn.ModuleList([nn.Linear(feats_dim, hidden_dim, bias=True) 14 | for feats_dim in feats_dim_list]) 15 | for fc in self.fc_list: 16 | nn.init.xavier_normal_(fc.weight, gain=1.414) 17 | 18 | if feat_drop > 0: 19 | self.feat_drop = nn.Dropout(feat_drop) 20 | else: 21 | self.feat_drop = lambda x: x 22 | self.mp = Mp_encoder(P, hidden_dim, attn_drop) 23 | self.sc = Sc_encoder(hidden_dim, sample_rate, nei_num, attn_drop) 24 | self.contrast = Contrast(hidden_dim, tau, lam) 25 | 26 | def forward(self, feats, pos, mps, nei_index): # p a s 27 | h_all = [] 28 | for i in range(len(feats)): 29 | h_all.append(F.elu(self.feat_drop(self.fc_list[i](feats[i])))) 30 | z_mp = self.mp(h_all[0], mps) 31 | z_sc = self.sc(h_all, nei_index) 32 | loss = self.contrast(z_mp, z_sc, pos) 33 | return loss 34 | 35 | def get_embeds(self, feats, mps): 36 | z_mp = F.elu(self.fc_list[0](feats[0])) 37 | z_mp = self.mp(z_mp, mps) 38 | return z_mp.detach() 39 | 40 | ''' 41 | def get_embeds(self, feats, mps): 42 | z_mp = F.elu(self.fc_list[0](feats[0])) 43 | z_mp = self.mp(z_mp, mps) 44 | embeds_list = [z_mp] 45 | for i in range(len(feats)-1): 46 | embeds_list.append(F.elu(self.fc_list[i+1](feats[i+1]))) 47 | return [x.detach() for x in embeds_list] 48 | ''' 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /BrainAlign/code/module/mp_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class GCN(nn.Module): 6 | def __init__(self, in_ft, out_ft, bias=True): 7 | super(GCN, self).__init__() 8 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 9 | self.act = nn.PReLU() 10 | 11 | if bias: 12 | self.bias = nn.Parameter(torch.FloatTensor(out_ft)) 13 | self.bias.data.fill_(0.0) 14 | else: 15 | self.register_parameter('bias', None) 16 | 17 | for m in self.modules(): 18 | self.weights_init(m) 19 | 20 | def weights_init(self, m): 21 | if isinstance(m, nn.Linear): 22 | nn.init.xavier_normal_(m.weight, gain=1.414) 23 | if m.bias is not None: 24 | m.bias.data.fill_(0.0) 25 | 26 | def forward(self, seq, adj): 27 | seq_fts = self.fc(seq) 28 | out = torch.spmm(adj, seq_fts) 29 | if self.bias is not None: 30 | out += self.bias 31 | return self.act(out) 32 | 33 | 34 | class Attention(nn.Module): 35 | def __init__(self, hidden_dim, attn_drop): 36 | super(Attention, self).__init__() 37 | self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True) 38 | nn.init.xavier_normal_(self.fc.weight, gain=1.414) 39 | 40 | self.tanh = nn.Tanh() 41 | self.att = nn.Parameter(torch.empty(size=(1, hidden_dim)), requires_grad=True) 42 | nn.init.xavier_normal_(self.att.data, gain=1.414) 43 | 44 | self.softmax = nn.Softmax() 45 | if attn_drop: 46 | self.attn_drop = nn.Dropout(attn_drop) 47 | else: 48 | self.attn_drop = lambda x: x 49 | 50 | def forward(self, embeds): 51 | beta = [] 52 | attn_curr = self.attn_drop(self.att) 53 | for embed in embeds: 54 | sp = self.tanh(self.fc(embed)).mean(dim=0) 55 | beta.append(attn_curr.matmul(sp.t())) 56 | beta = torch.cat(beta, dim=-1).view(-1) 57 | beta = self.softmax(beta) 58 | print("mp ", beta.data.cpu().numpy()) # semantic attention 59 | z_mp = 0 60 | for i in range(len(embeds)): 61 | z_mp += embeds[i]*beta[i] 62 | return z_mp 63 | 64 | 65 | class Mp_encoder(nn.Module): 66 | def __init__(self, P, hidden_dim, attn_drop): 67 | super(Mp_encoder, self).__init__() 68 | self.P = P 69 | self.node_level = 
nn.ModuleList([GCN(hidden_dim, hidden_dim) for _ in range(P)]) 70 | self.att = Attention(hidden_dim, attn_drop) 71 | 72 | def forward(self, h, mps): 73 | embeds = [] 74 | for i in range(self.P): 75 | embeds.append(self.node_level[i](h, mps[i])) 76 | z_mp = self.att(embeds) 77 | return z_mp 78 | -------------------------------------------------------------------------------- /BrainAlign/code/module/sc_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class inter_att(nn.Module): 8 | def __init__(self, hidden_dim, attn_drop): 9 | super(inter_att, self).__init__() 10 | self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True) 11 | nn.init.xavier_normal_(self.fc.weight, gain=1.414) 12 | 13 | self.tanh = nn.Tanh() 14 | self.att = nn.Parameter(torch.empty(size=(1, hidden_dim)), requires_grad=True) 15 | nn.init.xavier_normal_(self.att.data, gain=1.414) 16 | 17 | self.softmax = nn.Softmax() 18 | if attn_drop: 19 | self.attn_drop = nn.Dropout(attn_drop) 20 | else: 21 | self.attn_drop = lambda x: x 22 | 23 | def forward(self, embeds): 24 | beta = [] 25 | attn_curr = self.attn_drop(self.att) 26 | for embed in embeds: 27 | sp = self.tanh(self.fc(embed)).mean(dim=0) 28 | beta.append(attn_curr.matmul(sp.t())) 29 | beta = torch.cat(beta, dim=-1).view(-1) 30 | beta = self.softmax(beta) 31 | print("sc ", beta.data.cpu().numpy()) # type-level attention 32 | z_mc = 0 33 | for i in range(len(embeds)): 34 | z_mc += embeds[i] * beta[i] 35 | return z_mc 36 | 37 | 38 | class intra_att(nn.Module): 39 | def __init__(self, hidden_dim, attn_drop): 40 | super(intra_att, self).__init__() 41 | self.att = nn.Parameter(torch.empty(size=(1, 2*hidden_dim)), requires_grad=True) 42 | nn.init.xavier_normal_(self.att.data, gain=1.414) 43 | if attn_drop: 44 | self.attn_drop = nn.Dropout(attn_drop) 45 | else: 46 | self.attn_drop = lambda x: x 47 | 48 | self.softmax = nn.Softmax(dim=1) 49 | self.leakyrelu = nn.LeakyReLU() 50 | 51 | def forward(self, nei, h, h_refer): 52 | #print('nei.shape', nei.shape) 53 | #print('h.shape', h.shape) 54 | nei_emb = F.embedding(nei, h) 55 | #print('nei_emb.shape', nei_emb.shape) 56 | #print('h_refer.shape', h_refer.shape) 57 | h_refer = torch.unsqueeze(h_refer, 1) 58 | h_refer = h_refer.expand_as(nei_emb) 59 | all_emb = torch.cat([h_refer, nei_emb], dim=-1) 60 | attn_curr = self.attn_drop(self.att) 61 | att = self.leakyrelu(all_emb.matmul(attn_curr.t())) 62 | att = self.softmax(att) 63 | nei_emb = (att*nei_emb).sum(dim=1) 64 | return nei_emb 65 | 66 | 67 | class Sc_encoder(nn.Module): 68 | def __init__(self, hidden_dim, sample_rate, nei_num, attn_drop): 69 | super(Sc_encoder, self).__init__() 70 | self.intra = nn.ModuleList([intra_att(hidden_dim, attn_drop) for _ in range(nei_num)]) 71 | self.inter = inter_att(hidden_dim, attn_drop) 72 | self.sample_rate = sample_rate 73 | self.nei_num = nei_num 74 | 75 | def forward(self, nei_h, nei_index): 76 | embeds = [] 77 | for i in range(self.nei_num): 78 | sele_nei = [] 79 | sample_num = self.sample_rate[i] 80 | for per_node_nei in nei_index[i]: 81 | if len(per_node_nei) >= sample_num: 82 | select_one = torch.tensor(np.random.choice(per_node_nei, sample_num, 83 | replace=False))[np.newaxis] 84 | else: 85 | select_one = torch.tensor(np.random.choice(per_node_nei, sample_num, 86 | replace=True))[np.newaxis] 87 | sele_nei.append(select_one) 88 | sele_nei = torch.cat(sele_nei, dim=0)#.cuda() 89 | 
#print('sele_nei.shape', sele_nei.shape) 90 | one_type_emb = F.elu(self.intra[i](sele_nei, nei_h[i + 1], nei_h[0])) 91 | embeds.append(one_type_emb) 92 | z_mc = self.inter(embeds) 93 | return z_mc 94 | -------------------------------------------------------------------------------- /BrainAlign/code/predict.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/16 9:11 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : predict.py 6 | import numpy 7 | import torch 8 | from utils import load_data, set_params, evaluate 9 | from module import HeCo 10 | import warnings 11 | import datetime 12 | import pickle as pkl 13 | import os 14 | import random 15 | from utils.logger import setup_logger 16 | warnings.filterwarnings('ignore') 17 | args = set_params() 18 | if torch.cuda.is_available(): 19 | device = torch.device("cuda:" + str(args.gpu)) 20 | torch.cuda.set_device(args.gpu) 21 | else: 22 | device = torch.device("cpu") 23 | 24 | ## name of intermediate document ## 25 | own_str = args.dataset 26 | 27 | ## random seed ## 28 | seed = args.seed 29 | numpy.random.seed(seed) 30 | random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | 34 | def predict(save_path): 35 | 36 | logger = setup_logger("Build logging...", save_path, if_train=False) 37 | 38 | logger.info('Configs {}\n'.format(args)) 39 | nei_index, feats, mps, pos, label, idx_train, idx_val, idx_test = \ 40 | load_data(args.dataset, args.ratio, args.type_num) 41 | nb_classes = label.shape[-1] 42 | logger.info('number of classes = {}'.format(nb_classes)) 43 | feats_dim_list = [i.shape[1] for i in feats] 44 | P = int(len(mps)) 45 | logger.info("seed {}".format(args.seed)) 46 | logger.info("Dataset: {}".format(args.dataset)) 47 | logger.info("The number of meta-paths: {}".format(P)) 48 | model = HeCo(args.hidden_dim, feats_dim_list, args.feat_drop, args.attn_drop, 49 | P, args.sample_rate, args.nei_num, args.tau, args.lam) 50 | if torch.cuda.is_available(): 51 | logger.info('Using CUDA') 52 | model.cuda() 53 | feats = [feat.cuda() for feat in feats] 54 | mps = [mp.cuda() for mp in mps] 55 | 56 | model.load_state_dict(torch.load(save_path + 'HeCo_' + own_str + '.pkl')) 57 | model.eval() 58 | # os.remove('HeCo_'+own_str+'.pkl') 59 | embeds = model.get_embeds(feats, mps) 60 | 61 | if args.save_emb: 62 | if not os.path.exists(args.save_path + "./embeds/" + args.dataset + "/"): 63 | os.makedirs(args.save_path + "./embeds/" + args.dataset + "/") 64 | f = open(args.save_path + "./embeds/" + args.dataset + "/" + str(args.turn) + ".pkl", "wb") 65 | pkl.dump(embeds.cpu().data.numpy(), f) 66 | f.close() 67 | 68 | if __name__ == '__main__': 69 | save_path = '../data/mouse_human_sagittal/results/2022-10-14_11-40-11/' 70 | predict(save_path) -------------------------------------------------------------------------------- /BrainAlign/code/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import evaluate 2 | from .load_data import load_data 3 | from .params import set_params 4 | from .logreg import LogReg 5 | -------------------------------------------------------------------------------- /BrainAlign/code/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from .logreg import LogReg 4 | import torch.nn as nn 5 | from sklearn.metrics import f1_score 6 | from 
torch.nn.functional import softmax 7 | from sklearn.metrics import roc_auc_score 8 | 9 | 10 | ################################################## 11 | # This section of code adapted from pcy1302/DMGI # 12 | ################################################## 13 | 14 | def evaluate(embeds, ratio, idx_train, idx_val, idx_test, label, nb_classes, device, dataset, lr, wd 15 | , isTest=True): 16 | hid_units = embeds.shape[1] 17 | xent = nn.CrossEntropyLoss() 18 | 19 | train_embs = embeds[idx_train] 20 | val_embs = embeds[idx_val] 21 | test_embs = embeds[idx_test] 22 | 23 | train_lbls = torch.argmax(label[idx_train], dim=-1) 24 | val_lbls = torch.argmax(label[idx_val], dim=-1) 25 | test_lbls = torch.argmax(label[idx_test], dim=-1) 26 | accs = [] 27 | micro_f1s = [] 28 | macro_f1s = [] 29 | macro_f1s_val = [] 30 | auc_score_list = [] 31 | 32 | for _ in range(50): 33 | log = LogReg(hid_units, nb_classes) 34 | opt = torch.optim.Adam(log.parameters(), lr=lr, weight_decay=wd) 35 | log.to(device) 36 | 37 | val_accs = [] 38 | test_accs = [] 39 | val_micro_f1s = [] 40 | test_micro_f1s = [] 41 | val_macro_f1s = [] 42 | test_macro_f1s = [] 43 | 44 | logits_list = [] 45 | for iter_ in range(200): 46 | # train 47 | log.train() 48 | opt.zero_grad() 49 | 50 | logits = log(train_embs) 51 | loss = xent(logits, train_lbls) 52 | 53 | loss.backward() 54 | opt.step() 55 | 56 | # val 57 | logits = log(val_embs) 58 | preds = torch.argmax(logits, dim=1) 59 | 60 | val_acc = torch.sum(preds == val_lbls).float() / val_lbls.shape[0] 61 | val_f1_macro = f1_score(val_lbls.cpu(), preds.cpu(), average='macro') 62 | val_f1_micro = f1_score(val_lbls.cpu(), preds.cpu(), average='micro') 63 | 64 | val_accs.append(val_acc.item()) 65 | val_macro_f1s.append(val_f1_macro) 66 | val_micro_f1s.append(val_f1_micro) 67 | 68 | # test 69 | logits = log(test_embs) 70 | preds = torch.argmax(logits, dim=1) 71 | 72 | test_acc = torch.sum(preds == test_lbls).float() / test_lbls.shape[0] 73 | test_f1_macro = f1_score(test_lbls.cpu(), preds.cpu(), average='macro') 74 | test_f1_micro = f1_score(test_lbls.cpu(), preds.cpu(), average='micro') 75 | 76 | test_accs.append(test_acc.item()) 77 | test_macro_f1s.append(test_f1_macro) 78 | test_micro_f1s.append(test_f1_micro) 79 | logits_list.append(logits) 80 | 81 | max_iter = val_accs.index(max(val_accs)) 82 | accs.append(test_accs[max_iter]) 83 | max_iter = val_macro_f1s.index(max(val_macro_f1s)) 84 | macro_f1s.append(test_macro_f1s[max_iter]) 85 | macro_f1s_val.append(val_macro_f1s[max_iter]) 86 | 87 | max_iter = val_micro_f1s.index(max(val_micro_f1s)) 88 | micro_f1s.append(test_micro_f1s[max_iter]) 89 | 90 | # auc 91 | best_logits = logits_list[max_iter] 92 | best_proba = softmax(best_logits, dim=1) 93 | auc_score_list.append(roc_auc_score(y_true=test_lbls.detach().cpu().numpy(), 94 | y_score=best_proba.detach().cpu().numpy(), 95 | multi_class='ovr' 96 | )) 97 | 98 | if isTest: 99 | print("\t[Classification] Macro-F1_mean: {:.4f} var: {:.4f} Micro-F1_mean: {:.4f} var: {:.4f} auc {:.4f}" 100 | .format(np.mean(macro_f1s), 101 | np.std(macro_f1s), 102 | np.mean(micro_f1s), 103 | np.std(micro_f1s), 104 | np.mean(auc_score_list), 105 | np.std(auc_score_list) 106 | ) 107 | ) 108 | else: 109 | return np.mean(macro_f1s_val), np.mean(macro_f1s) 110 | 111 | f = open("result_"+dataset+str(ratio)+".txt", "a") 112 | f.write(str(np.mean(macro_f1s))+"\t"+str(np.mean(micro_f1s))+"\t"+str(np.mean(auc_score_list))+"\n") 113 | f.close() 114 | -------------------------------------------------------------------------------- 
/BrainAlign/code/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | elif if_train == False: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_analysis_log.txt"), mode='w') 23 | elif if_train == None: 24 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S") + "_process_log.txt"), 25 | mode='w') 26 | fh.setLevel(logging.DEBUG) 27 | fh.setFormatter(formatter) 28 | logger.addHandler(fh) 29 | 30 | return logger -------------------------------------------------------------------------------- /BrainAlign/code/utils/logreg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LogReg(nn.Module): 6 | def __init__(self, ft_in, nb_classes): 7 | super(LogReg, self).__init__() 8 | self.fc = nn.Linear(ft_in, nb_classes) 9 | 10 | for m in self.modules(): 11 | self.weights_init(m) 12 | 13 | def weights_init(self, m): 14 | if isinstance(m, nn.Linear): 15 | torch.nn.init.xavier_uniform_(m.weight.data) 16 | if m.bias is not None: 17 | m.bias.data.fill_(0.0) 18 | 19 | def forward(self, seq): 20 | ret = self.fc(seq) 21 | return ret 22 | -------------------------------------------------------------------------------- /BrainAlign/data/SlideseqV2_mouse_macaque_hippocampus/Data/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "91f07b93-f89e-41c8-b310-799ee1869995", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Gene name Mouse gene name Mouse homology type\n", 14 | "0 ZNF692 Zfp692 ortholog_one2one\n", 15 | "1 ZNF672 Zfp672 ortholog_one2one\n", 16 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 17 | "3 NaN Bend2 ortholog_many2many\n", 18 | "4 LYPD8 Lypd8 ortholog_one2one\n", 19 | "... ... ... ...\n", 20 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 21 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 22 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 23 | "24964 ND6 mt-Nd6 ortholog_one2many\n", 24 | "24965 CYTB mt-Cytb ortholog_one2one\n", 25 | "\n", 26 | "[24966 rows x 3 columns]\n", 27 | " Gene name Mouse gene name Mouse homology type\n", 28 | "0 ZNF692 Zfp692 ortholog_one2one\n", 29 | "1 ZNF672 Zfp672 ortholog_one2one\n", 30 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 31 | "4 LYPD8 Lypd8 ortholog_one2one\n", 32 | "8 NaN Lypd9 ortholog_one2one\n", 33 | "... ... ... 
...\n", 34 | "24960 ND3 mt-Nd3 ortholog_one2one\n", 35 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 36 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 37 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 38 | "24965 CYTB mt-Cytb ortholog_one2one\n", 39 | "\n", 40 | "[16080 rows x 3 columns]\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import pandas as pd\n", 46 | "\n", 47 | "file_path = './Macaque_Mouse.tsv'\n", 48 | "df = pd.read_csv(file_path, sep='\\t')\n", 49 | "\n", 50 | "print(df)\n", 51 | "df.to_csv('./Macaque_Mouse_multi2multi.csv', index=False)\n", 52 | "\n", 53 | "df = df[df['Mouse homology type'].isin(['ortholog_one2one'])]\n", 54 | "print(df)\n", 55 | "\n", 56 | "df.to_csv('./Macaque_Mouse_one2one.csv', index=False)\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "d42fc7ae-1696-4ed4-a779-884f08b58ee0", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.18" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /BrainAlign/data/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/5/14 10:05 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py.py 6 | # @Description: This file is used to ... 
7 | -------------------------------------------------------------------------------- /BrainAlign/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold_array(X): 5 | ''' 6 | input: array row: sample, column: gene vector 7 | output: a binary matrix 8 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 9 | ''' 10 | return X > np.mean(X, axis=0) 11 | 12 | def threshold_quantile(X, quantile=0.8): 13 | ''' 14 | input: array row: sample, column: gene vector 15 | output: a binary matrix 16 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 17 | ''' 18 | return X > np.quantile(X, quantile, axis=0) 19 | 20 | 21 | def threshold_top(X, percent=1): 22 | ''' 23 | input: array row: sample, column: gene vector 24 | output: a binary matrix 25 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 26 | ''' 27 | #topk = int(round(X.shape[0] * percent)) 28 | topk = percent 29 | #print(topk) 30 | #topk_pos = X.shape[0] - topk 31 | X_sort = np.sort(X, axis=0) 32 | return X >= X_sort[-topk, :] 33 | 34 | 35 | def threshold_array_nonzero(X): 36 | ''' 37 | input: array row: sample, column: gene vector 38 | output: a binary matrix 39 | For each value in (i, j), the binary value = if(M_ij > avg(column_j)) 40 | ''' 41 | return X > 0 42 | 43 | 44 | 45 | if __name__ == '__main__': 46 | X = np.array([[1,2,3],[2,3,4], [2,3,4], [4,5,2], [7,26,10]]) 47 | print(X) 48 | print(threshold_top(X, percent=0.4)) 49 | #print(threshold_array(X)) -------------------------------------------------------------------------------- /BrainAlign/data/load_node_feature_mouse_human.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import pickle 4 | 5 | ''' 6 | The 1st kind of method to get initial embeddings: use original sample/voxel expression data. 7 | ''' 8 | 9 | def extract_expression_embedding(): 10 | return 0 11 | 12 | 13 | ''' 14 | The 2nd kind of method to get initial embeddings: use embeddings output by CAME. 
15 | ''' 16 | def extract_came_embedding(): 17 | 18 | return 0 19 | 20 | 21 | 22 | def init_embedding(method='CAME'): 23 | if method == 'CAME': 24 | extract_came_embedding() 25 | elif method == 'Expression': 26 | extract_expression_embedding() 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 | path_datapiar = '../../../CAME/brain_human_mouse/(\'Baron_mouse\', \'Baron_human\')-(06-19 16.19.17)/datapair_init.pickle' 32 | path_datapiar_file = open(path_datapiar, 'rb') 33 | datapair = pickle.load(path_datapiar_file) 34 | print(datapair) 35 | print(datapair['features'][0].shape) 36 | print(datapair['features'][1].shape) 37 | 38 | print(datapair['varnames_feat']) 39 | 40 | 41 | 42 | ''' 43 | nei = np.load('./dblp/nei_p.npy', allow_pickle=True) 44 | print(nei) 45 | print(nei.shape) 46 | print(nei[0].shape) 47 | for arr in nei: 48 | print(arr.shape) 49 | ''' 50 | 51 | ''' 52 | p_feat = sp.load_npz('./dblp/p_feat.npz') 53 | print(p_feat.shape) 54 | a_feat = sp.load_npz('./dblp/a_feat.npz') 55 | print(a_feat.shape) 56 | t_feat = np.load('./dblp/t_feat.npz') 57 | print(t_feat.shape) 58 | ''' 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /BrainAlign/data/mouse_macaque_hippocampus/Data/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "91f07b93-f89e-41c8-b310-799ee1869995", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Gene name Mouse gene name Mouse homology type\n", 14 | "0 ZNF692 Zfp692 ortholog_one2one\n", 15 | "1 ZNF672 Zfp672 ortholog_one2one\n", 16 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 17 | "3 NaN Bend2 ortholog_many2many\n", 18 | "4 LYPD8 Lypd8 ortholog_one2one\n", 19 | "... ... ... ...\n", 20 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 21 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 22 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 23 | "24964 ND6 mt-Nd6 ortholog_one2many\n", 24 | "24965 CYTB mt-Cytb ortholog_one2one\n", 25 | "\n", 26 | "[24966 rows x 3 columns]\n", 27 | " Gene name Mouse gene name Mouse homology type\n", 28 | "0 ZNF692 Zfp692 ortholog_one2one\n", 29 | "1 ZNF672 Zfp672 ortholog_one2one\n", 30 | "2 SH3BP5L Sh3bp5l ortholog_one2one\n", 31 | "4 LYPD8 Lypd8 ortholog_one2one\n", 32 | "8 NaN Lypd9 ortholog_one2one\n", 33 | "... ... ... 
...\n", 34 | "24960 ND3 mt-Nd3 ortholog_one2one\n", 35 | "24961 ND4L mt-Nd4l ortholog_one2one\n", 36 | "24962 ND4 mt-Nd4 ortholog_one2one\n", 37 | "24963 ND5 mt-Nd5 ortholog_one2one\n", 38 | "24965 CYTB mt-Cytb ortholog_one2one\n", 39 | "\n", 40 | "[16080 rows x 3 columns]\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import pandas as pd\n", 46 | "\n", 47 | "file_path = './Macaque_Mouse.tsv'\n", 48 | "df = pd.read_csv(file_path, sep='\\t')\n", 49 | "\n", 50 | "print(df)\n", 51 | "df.to_csv('./Macaque_Mouse_multi2multi.csv', index=False)\n", 52 | "\n", 53 | "df = df[df['Mouse homology type'].isin(['ortholog_one2one'])]\n", 54 | "print(df)\n", 55 | "\n", 56 | "df.to_csv('./Macaque_Mouse_one2one.csv', index=False)\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "d42fc7ae-1696-4ed4-a779-884f08b58ee0", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.18" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /BrainAlign/data/mp_gen.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | #################################################### 5 | # This tool is to generate meta-path based adjacency 6 | # matrix given original links. 7 | #################################################### 8 | 9 | pa = np.genfromtxt("./dblp/pa.txt") 10 | pc = np.genfromtxt("./dblp/pc.txt") 11 | pt = np.genfromtxt("./dblp/pt.txt") 12 | 13 | A = 4057 14 | P = 14328 15 | C = 20 16 | T = 7723 17 | 18 | pa_ = sp.coo_matrix((np.ones(pa.shape[0]),(pa[:,0], pa[:, 1])),shape=(P,A)).toarray() 19 | pc_ = sp.coo_matrix((np.ones(pc.shape[0]),(pc[:,0], pc[:, 1])),shape=(P,C)).toarray() 20 | pt_ = sp.coo_matrix((np.ones(pt.shape[0]),(pt[:,0], pt[:, 1])),shape=(P,T)).toarray() 21 | 22 | apa = np.matmul(pa_.T, pa_) > 0 23 | apa = sp.coo_matrix(apa) 24 | sp.save_npz("./dblp/apa.npz", apa) 25 | 26 | apc = np.matmul(pa_.T, pc_) > 0 27 | apcpa = np.matmul(apc, apc.T) > 0 28 | apcpa = sp.coo_matrix(apcpa) 29 | sp.save_npz("./dblp/apcpa.npz", apcpa) 30 | 31 | apt = np.matmul(pa_.T, pt_) > 0 32 | aptpa = np.matmul(apt, apt.T) > 0 33 | aptpa = sp.coo_matrix(aptpa) 34 | sp.save_npz("./dblp/aptpa.npz", aptpa) 35 | -------------------------------------------------------------------------------- /BrainAlign/data/mp_gen_mouse_human.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import pickle 4 | #################################################### 5 | # This tool is to generate meta-path based adjacency 6 | # matrix given original links. 
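# Meta-path adjacencies are built by chaining the bipartite link matrices loaded
# from the CAME datapair below: np.matmul(sm_, sm_.T) (saved as sms) links two mouse
# samples that express a common mouse gene, while smhvhms chains
# sample -> mouse gene -> homologous human gene -> human voxel and back, linking two
# samples that reach a common human voxel. The chained products, optionally binarised
# with "> 0" and saved as sparse .npz files, are kept in the commented-out blocks at
# the end of this script; mp_gen.py applies the same pattern to the DBLP data.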
7 | #################################################### 8 | 9 | 10 | path_datapiar = '../../../CAME/brain_mouse_human_no_threshold/datapair_init.pickle' 11 | path_datapiar_file = open(path_datapiar, 'rb') 12 | datapair = pickle.load(path_datapiar_file) 13 | print(datapair) 14 | print(len(datapair['varnames_node'][0])) 15 | np.save('./mouse_human/mouse_gene_names.npy', datapair['varnames_node'][0]) 16 | print(len(datapair['varnames_node'][1])) 17 | np.save('./mouse_human/human_gene_names.npy', datapair['varnames_node'][1]) 18 | 19 | 20 | 21 | S = 72968 22 | M = 2578 23 | H = 3326 24 | V = 3682 25 | 26 | sm_ = datapair['ov_adjs'][0].toarray() 27 | print(sm_) 28 | print('sm_', sm_.shape) 29 | 30 | vh_ = datapair['ov_adjs'][1].toarray() 31 | print('vh_', vh_.shape) 32 | mm_ = datapair['vv_adj'].toarray()[0:M, 0:M] 33 | print('mm_', mm_.shape) 34 | print('mm_ sum', np.sum(mm_)) 35 | hh_ = datapair['vv_adj'].toarray()[M:, M:] 36 | print('hh_', hh_.shape) 37 | print('hh_ sum', np.sum(hh_)) # == 0 38 | mh_ = datapair['vv_adj'].toarray()[0:M, M:] 39 | print('mh_', mh_.shape) 40 | #ss_ = datapair['oo_adjs'].toarray()[0:S, 0:S] 41 | #print('ss_', ss_.shape) 42 | #print('ss_ sum', np.sum(ss_)) # == 0 43 | vv_ = datapair['oo_adjs'].toarray()[S:, S:] 44 | print('vv_', vv_.shape) 45 | print(np.sum(vv_)) 46 | print('vv_ sum', np.sum(vv_)) 47 | sv_ = datapair['oo_adjs'].toarray()[0:S, S:] 48 | print('sv_', sv_.shape) 49 | 50 | ''' 51 | sms = np.matmul(sm_, sm_.T) # > 0 52 | print(sms) 53 | sms = sp.coo_matrix(sms) 54 | sp.save_npz("./mouse_human/sms.npz", sms) 55 | 56 | 57 | smh = np.matmul(sm_, mh_) #> 0 58 | smhv = np.matmul(smh, vh_.T) #> 0 59 | smhvhms = np.matmul(smhv, smhv.T) #> 0 60 | print(smhvhms) 61 | smhvhms = sp.coo_matrix(smhvhms) 62 | sp.save_npz("./mouse_human/smhvhms.npz", smhvhms) 63 | 64 | smh = np.matmul(sm_, mh_) #> 0 65 | smhv = np.matmul(smh, vh_.T) #> 0 66 | smhvv = np.matmul(smhv, vv_) #> 0 67 | smhvvhms = np.matmul(smhv, smhvv.T) #> 0 68 | print(smhvvhms) 69 | smhvvhms = sp.coo_matrix(smhvvhms) 70 | sp.save_npz("./mouse_human/smhvvhms.npz", smhvvhms) 71 | 72 | 73 | ''' 74 | 75 | 76 | 77 | ''' 78 | sms = sp.csr_matrix(sm_).dot( sp.csr_matrix(sm_.T) ).toarray() > 0 79 | sms = sp.csr_matrix(sms) 80 | sp.save_npz("./mouse_human/sms.npz", sms) 81 | 82 | smh = sp.csr_matrix(sm_).dot(sp.csr_matrix(mh_)) > 0 83 | smhv = smh.dot(sp.csr_matrix(vh_.T)) > 0 84 | smhvhms = smhv.dot(smh.T) > 0 85 | smhvhms = sp.coo_matrix(smhvhms) 86 | sp.save_npz("./mouse_human/smhvhms.npz", smhvhms) 87 | 88 | #smh = np.matmul(sm_, mh_) > 0 89 | #smhv = np.matmul(smh, vh_.T) > 0 90 | smhvvhms = smhv.dot(smhv.T) > 0 91 | smhvvhms = sp.coo_matrix(smhvvhms) 92 | sp.save_npz("./mouse_human/smhvvhms.npz", smhvvhms) 93 | ''' 94 | 95 | 96 | 97 | 98 | 99 | ''' 100 | pa = np.genfromtxt("./dblp/pa.txt") 101 | pc = np.genfromtxt("./dblp/pc.txt") 102 | pt = np.genfromtxt("./dblp/pt.txt") 103 | 104 | A = 4057 105 | P = 14328 106 | C = 20 107 | T = 7723 108 | 109 | pa_ = sp.coo_matrix((np.ones(pa.shape[0]),(pa[:,0], pa[:, 1])),shape=(P,A)).toarray() 110 | print(pa_.shape) 111 | pc_ = sp.coo_matrix((np.ones(pc.shape[0]),(pc[:,0], pc[:, 1])),shape=(P,C)).toarray() 112 | print(pc_.shape) 113 | pt_ = sp.coo_matrix((np.ones(pt.shape[0]),(pt[:,0], pt[:, 1])),shape=(P,T)).toarray() 114 | print(pt_.shape) 115 | 116 | 117 | apa = np.matmul(pa_.T, pa_) > 0 118 | apa = sp.coo_matrix(apa) 119 | sp.save_npz("./dblp/apa.npz", apa) 120 | 121 | apc = np.matmul(pa_.T, pc_) > 0 122 | apcpa = np.matmul(apc, apc.T) > 0 123 | apcpa = 
sp.coo_matrix(apcpa) 124 | sp.save_npz("./dblp/apcpa.npz", apcpa) 125 | 126 | apt = np.matmul(pa_.T, pt_) > 0 127 | aptpa = np.matmul(apt, apt.T) > 0 128 | aptpa = sp.coo_matrix(aptpa) 129 | sp.save_npz("./dblp/aptpa.npz", aptpa) 130 | ''' -------------------------------------------------------------------------------- /BrainAlign/data/neibor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | #################################################### 5 | # This tool is to collect neighbors, and reform them 6 | # as numpy.array style for futher usage. 7 | #################################################### 8 | 9 | # This is for DBLP 10 | pa = np.genfromtxt("./dblp/pa.txt") 11 | a_n = {} 12 | for i in pa: 13 | if i[1] not in a_n: 14 | a_n[int(i[1])]=[] 15 | a_n[int(i[1])].append(int(i[0])) 16 | else: 17 | a_n[int(i[1])].append(int(i[0])) 18 | 19 | keys = sorted(a_n.keys()) 20 | a_n = [a_n[i] for i in keys] 21 | a_n = np.array([np.array(i) for i in a_n]) 22 | np.save("nei_p.npy", a_n) 23 | print(a_n.shape) 24 | 25 | # give some basic statistics about neighbors 26 | l = [len(i) for i in a_n] 27 | print(max(l),min(l),np.mean(l)) 28 | 29 | 30 | 31 | 32 | # This is for ACM, Freebase, AMiner 33 | pa = np.genfromtxt("./aminer/pa.txt") 34 | p_n = {} 35 | for i in pa: 36 | if i[0] not in p_n: 37 | p_n[int(i[0])]=[] 38 | p_n[int(i[0])].append(int(i[1])) 39 | else: 40 | p_n[int(i[0])].append(int(i[1])) 41 | 42 | keys = sorted(p_n.keys()) 43 | p_n = [p_n[i] for i in keys] 44 | p_n = np.array([np.array(i) for i in p_n]) 45 | np.save("nei_a.npy", p_n) 46 | print(p_n.shape) 47 | # give some basic statistics about neighbors 48 | l = [len(i) for i in p_n] 49 | print(max(l),min(l),np.mean(l)) 50 | -------------------------------------------------------------------------------- /BrainAlign/data/pos.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from collections import Counter 4 | 5 | #################################################### 6 | # This tool is to generate positive set with a thre- 7 | # shold "pos_num". 8 | # dataset pos_num 9 | # acm 5 10 | # dblp 1000 11 | # aminer 15 12 | # freebase 80 13 | # 14 | # 15 | # Notice: The best pos_num of acm is 7 reported in 16 | # paper, but we find there is no much difference 17 | # between 5 and 7 in practice. 
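# Construction used below: each meta-path adjacency (pap, psp) is row-normalised,
# the normalised matrices are summed, and for every node the pos_num neighbours with
# the largest combined weight are kept as positives (a node with at most pos_num
# non-zero neighbours keeps all of them). Example with pos_num = 5: a paper whose
# combined row has 8 non-zero entries keeps its 5 strongest neighbours, while a
# paper with only 3 non-zero entries keeps all 3.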
18 | #################################################### 19 | 20 | pos_num = 5 21 | p = 4019 22 | pap = sp.load_npz("./acm/pap.npz") 23 | pap = pap / pap.sum(axis=-1).reshape(-1,1) 24 | print(pap) 25 | psp = sp.load_npz("./acm/psp.npz") 26 | psp = psp / psp.sum(axis=-1).reshape(-1,1) 27 | print(psp) 28 | all = (pap + psp).A.astype("float32") 29 | print(all) 30 | all_ = (all>0).sum(-1) 31 | print(all_.max(),all_.min(),all_.mean()) 32 | 33 | pos = np.zeros((p,p)) 34 | k=0 35 | for i in range(len(all)): 36 | one = all[i].nonzero()[0] 37 | if len(one) > pos_num: 38 | oo = np.argsort(-all[i, one]) 39 | sele = one[oo[:pos_num]] 40 | pos[i, sele] = 1 41 | k+=1 42 | else: 43 | pos[i, one] = 1 44 | pos = sp.coo_matrix(pos) 45 | print(pos) 46 | print(type(pos)) 47 | sp.save_npz("pos.npz", pos) 48 | -------------------------------------------------------------------------------- /BrainAlign/data/script_labels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | if __name__ == '__main__': 5 | ''' 6 | path = './acm/labels.npy' 7 | x = np.load(path).astype('int32') 8 | print(x) 9 | print(len(x)) 10 | print('max = ', max(x), 'min = ', min(x)) 11 | ''' 12 | database = 'aminer' 13 | 14 | path = './{}/train_20.npy'.format(database) 15 | x = np.load(path) 16 | print(x) 17 | print('train:', x.shape) 18 | path = './{}/train_40.npy'.format(database) 19 | x = np.load(path) 20 | #print(x) 21 | print('train:', x.shape) 22 | path = './{}/train_60.npy'.format(database) 23 | x = np.load(path) 24 | #print(x) 25 | print('train:', x.shape) 26 | 27 | path = './{}/test_20.npy'.format(database) 28 | x = np.load(path) 29 | #print(x) 30 | print('test:', x.shape) 31 | path = './{}/test_40.npy'.format(database) 32 | x = np.load(path) 33 | #print(x) 34 | print('test:', x.shape) 35 | path = './{}/test_60.npy'.format(database) 36 | x = np.load(path) 37 | #print(x) 38 | print('test:', x.shape) 39 | 40 | path = './{}/val_20.npy'.format(database) 41 | x = np.load(path) 42 | #print(x) 43 | print('val:', x.shape) 44 | path = './{}/val_40.npy'.format(database) 45 | x = np.load(path) 46 | #print(x) 47 | print('val:', x.shape) 48 | path = './{}/val_60.npy'.format(database) 49 | x = np.load(path) 50 | #print(x) 51 | print('val:', x.shape) 52 | 53 | database = 'freebase' 54 | 55 | path = './{}/train_20.npy'.format(database) 56 | x = np.load(path) 57 | print(x) 58 | print('train:', x.shape) 59 | path = './{}/train_40.npy'.format(database) 60 | x = np.load(path) 61 | # print(x) 62 | print('train:', x.shape) 63 | path = './{}/train_60.npy'.format(database) 64 | x = np.load(path) 65 | # print(x) 66 | print('train:', x.shape) 67 | 68 | path = './{}/test_20.npy'.format(database) 69 | x = np.load(path) 70 | # print(x) 71 | print('test:', x.shape) 72 | path = './{}/test_40.npy'.format(database) 73 | x = np.load(path) 74 | # print(x) 75 | print('test:', x.shape) 76 | path = './{}/test_60.npy'.format(database) 77 | x = np.load(path) 78 | # print(x) 79 | print('test:', x.shape) 80 | 81 | path = './{}/val_20.npy'.format(database) 82 | x = np.load(path) 83 | # print(x) 84 | print('val:', x.shape) 85 | path = './{}/val_40.npy'.format(database) 86 | x = np.load(path) 87 | # print(x) 88 | print('val:', x.shape) 89 | path = './{}/val_60.npy'.format(database) 90 | x = np.load(path) 91 | # print(x) 92 | print('val:', x.shape) -------------------------------------------------------------------------------- /BrainAlign/demo/subsample.py: 
-------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/2/1 18:13 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : subsample.py 6 | # @Description: This file is subsampling the origin data 7 | import scanpy as sc 8 | import numpy as np 9 | 10 | def obs_key_wise_subsampling(adata, obs_key, N): 11 | ''' 12 | Subsample each class to same cell numbers (N). Classes are given by obs_key pointing to categorical in adata.obs. 13 | ''' 14 | counts = adata.obs[obs_key].value_counts() 15 | # subsample indices per group defined by obs_key 16 | indices = [np.random.choice(adata.obs_names[adata.obs[obs_key]==group], size=N, replace=True).unique() for group in counts.index] 17 | selection = np.hstack(np.array(indices)) 18 | return adata[selection].copy() 19 | 20 | if __name__ == '__main__': 21 | mouse_h5ad_file = 'G:/backup/CAME/brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 22 | mouse_adata = sc.read_h5ad(mouse_h5ad_file) 23 | #mouse_adata = sc.pp.subsample(mouse_adata, fraction=0.1, copy=True) 24 | 25 | 26 | target_cells = 20 27 | 28 | adatas = [mouse_adata[mouse_adata.obs['region_name'].isin([clust])] for clust in mouse_adata.obs['region_name'].cat.categories] 29 | 30 | for dat in adatas: 31 | if dat.n_obs > target_cells: 32 | sc.pp.subsample(dat, fraction=0.1) 33 | 34 | adata_downsampled = adatas[0].concatenate(*adatas[1:]) 35 | 36 | print(adata_downsampled) 37 | print(adata_downsampled.obs['region_name'].value_counts()) 38 | 39 | adata_downsampled.write_h5ad("./mouse_2020sa_64regions_demo.h5ad") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Saul Goodenough 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /readme_figs/alldatasets/all_dataset_seurate_alignment_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/alldatasets/all_dataset_seurate_alignment_score.png -------------------------------------------------------------------------------- /readme_figs/alldatasets/all_dataset_umap_integration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/alldatasets/all_dataset_umap_integration.png -------------------------------------------------------------------------------- /readme_figs/subsampled/subsampled_seurate_alignment_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/subsampled/subsampled_seurate_alignment_score.png -------------------------------------------------------------------------------- /readme_figs/subsampled/subsampled_umap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/readme_figs/subsampled/subsampled_umap.png -------------------------------------------------------------------------------- /requirements_pip.txt: -------------------------------------------------------------------------------- 1 | adjustText==0.8 2 | anndata==0.8.0 3 | brotlipy==0.7.0 4 | click==8.1.3 5 | colorama==0.4.6 6 | colorcet==3.0.1 7 | colorlog==6.7.0 8 | colormap==1.0.4 9 | contourpy==1.0.7 10 | cycler==0.11.0 11 | dgl==1.0.1 12 | easydev==0.12.1 13 | fitter==1.5.2 14 | fonttools==4.38.0 15 | gseapy==1.0.4 16 | h5py==3.8.0 17 | igraph==0.10.4 18 | imbalanced-learn==0.10.1 19 | imblearn==0.0 20 | importlib-metadata==6.0.0 21 | importlib-resources==5.12.0 22 | joblib==1.2.0 23 | kaleido==0.2.1 24 | kiwisolver==1.4.4 25 | leidenalg==0.9.1 26 | llvmlite==0.39.1 27 | matplotlib==3.7.0 28 | matplotlib-venn==0.11.9 29 | mkl-fft==1.3.1 30 | mkl-service==2.4.0 31 | natsort==8.2.0 32 | networkx==3.0 33 | numba==0.56.4 34 | packaging==23.0 35 | pandas==1.5.3 36 | param==1.12.3 37 | patsy==0.5.3 38 | pexpect==4.8.0 39 | Pillow==9.4.0 40 | plotly==5.14.0 41 | psutil==5.9.4 42 | ptyprocess==0.7.0 43 | pyct==0.5.0 44 | pynndescent==0.5.8 45 | pyparsing==3.0.9 46 | python-dateutil==2.8.2 47 | python-igraph==0.10.4 48 | pytz==2022.7.1 49 | scanpy==1.9.2 50 | scikit-learn==1.2.1 51 | scipy==1.10.1 52 | seaborn==0.12.2 53 | session-info==1.0.0 54 | statannot==0.2.3 55 | statsmodels==0.13.5 56 | stdlib-list==0.8.0 57 | tenacity==8.2.2 58 | texttable==1.6.7 59 | threadpoolctl==3.1.0 60 | torch==1.13.1 61 | torchaudio==0.13.1 62 | torchvision==0.14.1 63 | tqdm==4.64.1 64 | treelib==1.6.4 65 | umap-learn==0.5.3 66 | webcolors==1.13 67 | xgboost==1.7.4 68 | yacs==0.1.8 69 | zipp==3.15.0 70 | ipywidgets==8.1.2 -------------------------------------------------------------------------------- /run_came/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/19 20:43 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | 
-------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/H_run_came-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "befa887d-0cc5-4d4d-a7b1-1a21dfb4e5f1", 7 | "metadata": { 8 | "scrolled": true 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "# -- coding: utf-8 --\n", 13 | "\n", 14 | "\n", 15 | "import warnings\n", 16 | "warnings.filterwarnings(\"ignore\")\n", 17 | "import sys\n", 18 | "sys.path.append('../')\n", 19 | "\n", 20 | "from analysis_utils import ttest_plot_utils\n", 21 | "from analysis_utils import homo_random_config as config\n", 22 | "import os\n", 23 | "\n", 24 | "\n", 25 | "if __name__ == '__main__':\n", 26 | "\n", 27 | " cfg = config._C\n", 28 | " #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device\n", 29 | " #cfg.CAME.n_top_genes = 1000\n", 30 | " cfg.CAME.visible_device = '0'\n", 31 | " n_top_genes_list = [2000]\n", 32 | "\n", 33 | " cfg.CAME.n_top_genes = n_top_genes_list[0]\n", 34 | " cfg.CAME.sparse = False\n", 35 | " cfg.CAME.do_normalize = [False, False]\n", 36 | " cfg.CAME.ROOT = '../analysis_results/macaque_mouse_hippocampus/'\n", 37 | " cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad'\n", 38 | " cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad'\n", 39 | "\n", 40 | " cfg.CAME.path_labels_2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv'\n", 41 | " cfg.CAME.path_labels_1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv'\n", 42 | "\n", 43 | " cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv'\n", 44 | " # ttest_plot_utils.run_came_homo_random(cfg)\n", 45 | "\n", 46 | " cfg.CAME.path_varmap = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque_Mouse_multi2multi.csv'\n", 47 | " cfg.CAME.path_varmap_1v1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque_Mouse_one2one.csv'\n", 48 | "\n", 49 | " cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1\n", 50 | " cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2\n", 51 | "\n", 52 | " ttest_plot_utils.run_came_homo_random(cfg)\n", 53 | "\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "d1071b72-043c-46aa-a9e0-541dc7a15ede", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "env_came", 68 | "language": "python", 69 | "name": "env_came" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.8.18" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 5 86 | } 87 | -------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/H_run_came-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | 
sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [True, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_macaque_hippocampus/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv' 34 | cfg.CAME.path_human_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | ttest_plot_utils.run_came_homo_random(cfg) 48 | 49 | -------------------------------------------------------------------------------- /run_came/analysis_script/.ipynb_checkpoints/run_came-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = 
'../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/H_run_came.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [True, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_macaque_hippocampus/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Mouse.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/Macaque.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_region_list.csv' 34 | cfg.CAME.path_human_labels = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/macaque_region_list.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../../BrainAlign/data/srrsc_mouse_macaque_hippocampus/Data/mouse_macaque_homo_region.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | ttest_plot_utils.run_came_homo_random(cfg) 48 | 49 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_mouse_region_tree.R: -------------------------------------------------------------------------------- 1 | #@Time : 2022/12/18 22:47 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : 
load_mouse_region_tree.r 5 | #@Description: This file is used to ... 6 | 7 | 8 | # Packages ------------------------------------------------------------------- 9 | 10 | suppressPackageStartupMessages(library(tidyverse)) 11 | suppressPackageStartupMessages(library(data.tree)) 12 | suppressPackageStartupMessages(library(rjson)) 13 | suppressPackageStartupMessages(library(optparse)) 14 | 15 | working_dir <- getwd() 16 | 17 | path_tree_tools <- '../analysis_utils/tree_tools.R' 18 | fileTree <- '../brain_mouse_2020sa/DSURQE_tree.json' 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_part_expression.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/1 18:38 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : load_part_expression.py 6 | import sys 7 | 8 | import pandas as pd 9 | 10 | sys.path.append('..') 11 | from analysis_utils import ttest_plot_utils 12 | from analysis_utils import homo_random_config as config 13 | import os 14 | import scanpy as sc 15 | 16 | if __name__ == "__main__": 17 | cfg = config._C 18 | 19 | mouse_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata1) 20 | mouse_region_list = set(list(pd.read_csv(cfg.PROCESS.path_mouse_labels)['region_name'])) 21 | print(mouse_region_list) 22 | mouse_h5ad_part = mouse_all_h5ad[mouse_all_h5ad.obs['region_name'].isin(mouse_region_list)] 23 | print(mouse_h5ad_part) 24 | mouse_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata1_part) 25 | 26 | human_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata2) 27 | human_region_list = set(list(pd.read_csv(cfg.PROCESS.path_human_labels)['region_name'])) 28 | print(human_region_list) 29 | human_h5ad_part = human_all_h5ad[human_all_h5ad.obs['region_name'].isin(human_region_list)] 30 | print(human_h5ad_part) 31 | human_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata2_part) 32 | -------------------------------------------------------------------------------- /run_came/analysis_script/load_part_expression_6regions.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/1 18:38 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : load_part_expression.py 6 | import sys 7 | 8 | import pandas as pd 9 | 10 | sys.path.append('..') 11 | from analysis_utils import ttest_plot_utils 12 | from analysis_utils import homo_random_config as config 13 | import os 14 | import scanpy as sc 15 | 16 | if __name__ == "__main__": 17 | cfg = config._C 18 | 19 | cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_6regions.csv' 20 | cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_6regions.csv' 21 | 22 | cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/6regions_mouse_brain_region_67_sagittal.h5ad' 23 | cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/6regions_human_brain_region_88_sparse_with3d.h5ad' 24 | 25 | mouse_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata1) 26 | mouse_region_list = set(list(pd.read_csv(cfg.PROCESS.path_mouse_labels)['region_name'])) 27 | print(mouse_region_list) 28 | mouse_h5ad_part = mouse_all_h5ad[mouse_all_h5ad.obs['region_name'].isin(mouse_region_list)] 29 | print(mouse_h5ad_part) 30 | mouse_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata1_part) 31 | 32 | human_all_h5ad = sc.read_h5ad(cfg.PROCESS.path_rawdata2) 33 | human_region_list = 
set(list(pd.read_csv(cfg.PROCESS.path_human_labels)['region_name'])) 34 | print(human_region_list) 35 | human_h5ad_part = human_all_h5ad[human_all_h5ad.obs['region_name'].isin(human_region_list)] 36 | print(human_h5ad_part) 37 | human_h5ad_part.write_h5ad(cfg.PROCESS.path_rawdata2_part) 38 | -------------------------------------------------------------------------------- /run_came/analysis_script/read_rhesus_2018s.R: -------------------------------------------------------------------------------- 1 | #@Time : 2023/1/20 14:28 2 | #@Author : Biao Zhang 3 | #@Email : littlebiao@outlook.com 4 | #@File : read_rhesus_2018s.R.r 5 | #@Description: This file is used to ... 6 | 7 | # packages 8 | 9 | 10 | # 11 | load('../brain_rhesus_2018s/Sestan.adultMonkeyNuclei.Psychencode.Rdata') 12 | ls() 13 | -------------------------------------------------------------------------------- /run_came/analysis_script/read_rhesus_2018s.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | The [R plugin](https://www.jetbrains.com/help/pycharm/r-plugin-support.html) for IntelliJ-based IDEs provides 7 | handy capabilities to work with the [R Markdown](https://www.jetbrains.com/help/pycharm/r-markdown.html) files. 8 | To [add](https://www.jetbrains.com/help/pycharm/r-markdown.html#add-code-chunk) a new R chunk, 9 | position the caret at any line or the code chunk, then click "+". 10 | 11 | The code chunk appears: 12 | ```{r} 13 | load('./brain_rhesus_2018s/Sestan.adultMonkeyNuclei.Psychencode.Rdata') 14 | 15 | ``` 16 | 17 | Type any R code in the chunk, for example: 18 | ```{r} 19 | mycars <- within(mtcars, { cyl <- ordered(cyl) }) 20 | mycars 21 | ``` 22 | 23 | Now, click the **Run** button on the chunk toolbar to [execute](https://www.jetbrains.com/help/pycharm/r-markdown.html#run-r-code) the chunk code. The result should be placed under the chunk. 24 | Click the **Knit and Open Document** to build and preview an output. 
25 | -------------------------------------------------------------------------------- /run_came/analysis_script/run_came.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../brain_mouse_2020sa/mouse_2020sa_64regions.h5ad' 31 | cfg.CAME.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = '../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/run_came_demo.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2024/02/01 11:30 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : run_came.py 6 | import sys 7 | sys.path.append('../') 8 | 9 | from analysis_utils import ttest_plot_utils 10 | from analysis_utils import homo_random_config as config 11 | import os 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | cfg = config._C 17 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #cfg.CAME.visible_device 18 | #cfg.CAME.n_top_genes = 1000 19 | cfg.CAME.visible_device = '-1' 20 | n_top_genes_list = [2000] 21 | #quantile_gene_list = [0.8] 22 | #quantile_sample_list = [0.9] 23 | #cfg.CAME.quantile_gene = quantile_gene_list[0] 24 | #cfg.CAME.quantile_sample = quantile_sample_list[0] 25 | #for n_top_genes in n_top_genes_list: 26 | cfg.CAME.n_top_genes = n_top_genes_list[0] 27 | cfg.CAME.sparse = False 28 | cfg.CAME.do_normalize = [False, True] 29 | cfg.CAME.ROOT = '../analysis_results/mouse_2020sa/' 30 | cfg.CAME.path_rawdata1 = '../../BrainAlign/demo/mouse_2020sa_64regions_demo.h5ad' 31 | 
cfg.CAME.path_rawdata2 = '../../BrainAlign/demo/human_brain_region_88_sparse_with3d.h5ad' 32 | 33 | cfg.CAME.path_mouse_labels = '../brain_mouse_2020sa/mouse_region_list_64.csv' 34 | cfg.CAME.path_human_labels = '../brain_human_mouse/human_88_label_origin.csv' 35 | 36 | cfg.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67_all.csv' 37 | # ttest_plot_utils.run_came_homo_random(cfg) 38 | 39 | cfg.PROCESS.path_rawdata1 = cfg.CAME.path_rawdata1 40 | cfg.PROCESS.path_rawdata2 = cfg.CAME.path_rawdata2 41 | 42 | #cfg.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 43 | #cfg.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 44 | 45 | #cfg.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 46 | #cfg.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 47 | 48 | ttest_plot_utils.run_came_homo_random(cfg) 49 | 50 | -------------------------------------------------------------------------------- /run_came/analysis_script/test_tree.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/12/19 21:16 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : test_tree.py 6 | # @Description: This file is used to ... 7 | import pandas as pd 8 | from treelib import Node, Tree 9 | 10 | tree = Tree() 11 | tree.create_node('root', 'root') 12 | 13 | def add_node(tree_dict, key_input): 14 | if tree_dict[key_input]['children'] == []: 15 | return 16 | elif isinstance(tree_dict[key_input]['children'], list): 17 | for key in tree_dict[key_input]['children']: 18 | tree.create_node(key, key, parent=key_input) 19 | else: 20 | for key in tree_dict[key_input]['children'].keys(): 21 | tree.create_node(key, key, parent=key_input) 22 | add_node(tree_dict[key_input]['children'], key) 23 | 24 | if __name__ == '__main__': 25 | 26 | 27 | dict_ = {"2": {'parent': "1"}, "1": {'parent': None}, "3": {'parent': "2"}} 28 | tree_dict = {"0": {'name':'n0','children': {'0-1':{'name':'n0-1', 'children':['n0-1-0']}}}, 29 | "1": {'name':'n1','children': []}, 30 | "2": {'name':'n2', 'children': {'n2-0':{'name':'n2-0', 'children':{'n2-0-0':{'name':'n2-0-0', 'children':[]}}}}}} 31 | added = set() 32 | #tree = Tree() 33 | for key in tree_dict.keys(): 34 | tree.create_node(key, key, parent='root') 35 | add_node(tree_dict, key) 36 | 37 | tree.show() 38 | print(tree.depth()) 39 | print(tree.subtree('0-1').depth()) 40 | #new_tree = tree.expand_tree(filter=lambda x:(tree.depth()-tree.subtree(x).depth())!=2) 41 | print([tree[node].tag for node in tree.subtree('0-1').expand_tree(mode=Tree.DEPTH)]) 42 | #new_tree.show() 43 | ''' 44 | region_69_df = pd.read_csv('../brain_mouse_2020sa/mouse_69_label_acronym.csv', sep=',') 45 | region_69_list = region_69_df['region_name'] 46 | 47 | region_new_df = pd.read_csv('../brain_mouse_2020sa/mouse_region_list.csv') 48 | region_list = region_new_df['region_name'] 49 | 50 | for region in region_69_list: 51 | if not region in set(region_list): 52 | print(region) 53 | ''' 54 | 55 | -------------------------------------------------------------------------------- /run_came/analysis_utils/.ipynb_checkpoints/homo_random_config-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 11:20 3 | # @Author : Biao Zhang 4 | # @Email : 
littlebiao@outlook.com 5 | # @File : homo_random_config.py 6 | 7 | 8 | from yacs.config import CfgNode as CN 9 | 10 | # -------------------------------------------------------------- 11 | # Config of model 12 | # -------------------------------------------------------------- 13 | _C = CN() 14 | 15 | _C.CAME = CN() 16 | _C.CAME.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad'#'../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 17 | _C.CAME.path_rawdata2 = '../brain_human_mouse/human_brain_region_88_sparse.h5ad' 18 | 19 | _C.CAME.path_labels_1 = '../brain_human_mouse/mouse_67_label.csv' 20 | _C.CAME.path_labels_2 = '../brain_human_mouse/human_88_label_origin.csv' 21 | 22 | _C.CAME.embedding_path = None 23 | 24 | _C.CAME.path_varmap = '../came/sample_data/gene_matches_mouse2human.csv' 25 | _C.CAME.path_varmap_1v1 = '../came/sample_data/gene_matches_1v1_mouse2human.csv' 26 | 27 | _C.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67.csv' 28 | 29 | _C.CAME.species_name_list = ['Mouse', 'Human'] 30 | _C.CAME.annotation_name = ['region_name', 'region_name'] 31 | 32 | _C.CAME.learning_label = ['region_name', 'region_name'] 33 | 34 | _C.CAME.n_top_genes = 5000 35 | _C.CAME.do_normalize = [True, True] 36 | 37 | _C.CAME.sparse = False 38 | _C.CAME.quantile_gene= 0.5 39 | _C.CAME.quantile_sample = 0.99 40 | 41 | _C.CAME.embedding_size = 128 42 | 43 | _C.CAME.preclustering_resolution = 3 44 | 45 | 46 | _C.TRAINING = CN() 47 | _C.TRAINING.n_epochs = 300 48 | 49 | # The training batch size 50 | # When the GPU memory is limited, set 4096 or more if possible. 51 | _C.TRAINING.batch_size = 2048 52 | # The number of epochs to skip for checkpoint backup 53 | _C.TRAINING.n_pass = 50 54 | # The number of top DEGs to take as the node-features of each cells. 55 | # You set it 70-100 for distant species pairs. 56 | _C.TRAINING.ntop_deg = 70 57 | 58 | # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. 
59 | _C.TRAINING.ntop_deg_nodes = 50 60 | 61 | 62 | _C.ANALYSIS = CN() 63 | _C.ANALYSIS.cut_ov = 0 64 | _C.ANALYSIS.umap_neighbor = 20 #30 65 | _C.ANALYSIS.mouse_umap_neighbor = 20 #40 66 | _C.ANALYSIS.human_umap_neighbor = 20 67 | 68 | 69 | _C.CAME.ROOT = '../analysis_results/' 70 | 71 | _C.CAME.visible_device = '0' 72 | 73 | 74 | _C.PROCESS = CN() 75 | _C.PROCESS.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 76 | _C.PROCESS.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 77 | 78 | _C.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 79 | _C.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 80 | 81 | _C.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 82 | _C.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 83 | 84 | 85 | # -------------------------------------------------------------- 86 | # Config of INPUT 87 | # -------------------------------------------------------------- 88 | _C.HOMO_RANDOM = CN() 89 | 90 | 91 | -------------------------------------------------------------------------------- /run_came/analysis_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 12:12 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : __init__.py 6 | 7 | 8 | -------------------------------------------------------------------------------- /run_came/analysis_utils/homo_random_config.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2022/10/15 11:20 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : homo_random_config.py 6 | 7 | 8 | from yacs.config import CfgNode as CN 9 | 10 | # -------------------------------------------------------------- 11 | # Config of model 12 | # -------------------------------------------------------------- 13 | _C = CN() 14 | 15 | _C.CAME = CN() 16 | _C.CAME.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad'#'../../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' 17 | _C.CAME.path_rawdata2 = '../brain_human_mouse/human_brain_region_88_sparse.h5ad' 18 | 19 | _C.CAME.path_labels_1 = '../brain_human_mouse/mouse_67_label.csv' 20 | _C.CAME.path_labels_2 = '../brain_human_mouse/human_88_label_origin.csv' 21 | 22 | _C.CAME.embedding_path = None 23 | 24 | _C.CAME.path_varmap = '../came/sample_data/gene_matches_mouse2human.csv' 25 | _C.CAME.path_varmap_1v1 = '../came/sample_data/gene_matches_1v1_mouse2human.csv' 26 | 27 | _C.CAME.human_mouse_homo_region = '../brain_human_mouse/MouseHumanMatches_H88M67.csv' 28 | 29 | _C.CAME.species_name_list = ['Mouse', 'Human'] 30 | _C.CAME.annotation_name = ['region_name', 'region_name'] 31 | 32 | _C.CAME.learning_label = ['region_name', 'region_name'] 33 | 34 | _C.CAME.n_top_genes = 5000 35 | _C.CAME.do_normalize = [True, True] 36 | 37 | _C.CAME.sparse = False 38 | _C.CAME.quantile_gene= 0.5 39 | _C.CAME.quantile_sample = 0.99 40 | 41 | _C.CAME.embedding_size = 128 42 | 43 | _C.CAME.preclustering_resolution = 3 44 | 45 | 46 | _C.TRAINING = CN() 47 | _C.TRAINING.n_epochs = 300 48 | 49 | # The training batch size 50 | # When the GPU memory is limited, set 4096 or more if possible. 
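# (presumably: switch to minibatch training when whole-graph training exceeds GPU
#  memory, preferring a batch size of 4096 or larger if it still fits)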
51 | _C.TRAINING.batch_size = 2048 52 | # The number of epochs to skip for checkpoint backup 53 | _C.TRAINING.n_pass = 50 54 | # The number of top DEGs to take as the node-features of each cells. 55 | # You set it 70-100 for distant species pairs. 56 | _C.TRAINING.ntop_deg = 70 57 | 58 | # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. 59 | _C.TRAINING.ntop_deg_nodes = 50 60 | 61 | 62 | _C.ANALYSIS = CN() 63 | _C.ANALYSIS.cut_ov = 0 64 | _C.ANALYSIS.umap_neighbor = 20 #30 65 | _C.ANALYSIS.mouse_umap_neighbor = 20 #40 66 | _C.ANALYSIS.human_umap_neighbor = 20 67 | 68 | 69 | _C.CAME.ROOT = '../analysis_results/' 70 | 71 | _C.CAME.visible_device = '0' 72 | 73 | 74 | _C.PROCESS = CN() 75 | _C.PROCESS.path_rawdata1 = '../../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' 76 | _C.PROCESS.path_rawdata2 = '../../Brain_ST_human_mouse/data/human_brain_region_88_sparse_with3d.h5ad' 77 | 78 | _C.PROCESS.path_mouse_labels = '../brain_human_mouse/mouse_67_label_10regions.csv' 79 | _C.PROCESS.path_human_labels = '../brain_human_mouse/human_88_label_10regions.csv' 80 | 81 | _C.PROCESS.path_rawdata1_part = '../../Brain_ST_human_mouse/data/10regions_mouse_brain_region_67_sagittal.h5ad' 82 | _C.PROCESS.path_rawdata2_part = '../../Brain_ST_human_mouse/data/10regions_human_brain_region_88_sparse_with3d.h5ad' 83 | 84 | 85 | # -------------------------------------------------------------- 86 | # Config of INPUT 87 | # -------------------------------------------------------------- 88 | _C.HOMO_RANDOM = CN() 89 | 90 | 91 | -------------------------------------------------------------------------------- /run_came/analysis_utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import os.path as osp 5 | import time 6 | def setup_logger(name, save_dir, if_train): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | 10 | ch = logging.StreamHandler(stream=sys.stdout) 11 | ch.setLevel(logging.DEBUG) 12 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 13 | ch.setFormatter(formatter) 14 | logger.addHandler(ch) 15 | 16 | if save_dir: 17 | if not osp.exists(save_dir): 18 | os.makedirs(save_dir) 19 | if if_train: 20 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_train_log.txt"), mode='w') 21 | else: 22 | fh = logging.FileHandler(os.path.join(save_dir, time.strftime("%Y-%m-%d-%H-%M-%S")+"_test_log.txt"), mode='w') 23 | fh.setLevel(logging.DEBUG) 24 | fh.setFormatter(formatter) 25 | logger.addHandler(fh) 26 | 27 | return logger -------------------------------------------------------------------------------- /run_came/brain_human_mouse/get_human_acronym_color.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | # @Time : 2023/3/6 19:55 3 | # @Author : Biao Zhang 4 | # @Email : littlebiao@outlook.com 5 | # @File : get_human_acronym_color.py 6 | # @Description: This file is used to ... 
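# attach, for every region_name in human_88_label_origin.csv, the acronym and the
# '#'-prefixed colour hex code looked up in human_query.csv, and save the result as
# human_88_labels.csv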
7 | 8 | import pandas as pd 9 | 10 | if __name__ == '__main__': 11 | human_structure_df = pd.read_csv('./human_query.csv') 12 | name_list = human_structure_df['name'] 13 | acronym_list = human_structure_df['acronym'] 14 | acronym_dict = {k:v for k,v in zip(name_list,acronym_list)} 15 | color_list = human_structure_df['color_hex_triplet'] 16 | color_dict = {k:v for k,v in zip(name_list,color_list)} 17 | 18 | 19 | human_88_label_df = pd.read_csv('human_88_label_origin.csv', index_col=0) 20 | 21 | region_name_list = human_88_label_df['region_name'] 22 | 23 | human_88_label_df['acronym'] = [acronym_dict[r] for r in region_name_list] 24 | human_88_label_df['color_hex_triplet'] = ['#'+color_dict[r] for r in region_name_list] 25 | 26 | human_88_label_df.to_csv('./human_88_labels.csv') 27 | 28 | -------------------------------------------------------------------------------- /run_came/brain_mouse_2020sa/human_gene_palette/2011-12-16203C-Supplementary_Table8.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanglabtools/BrainAlign/5535813172d73b96772768a741c34a71a480b8e8/run_came/brain_mouse_2020sa/human_gene_palette/2011-12-16203C-Supplementary_Table8.xls -------------------------------------------------------------------------------- /run_came/came/.ipynb_checkpoints/PARAMETERS-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | 
# return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 
12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | 
AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /run_came/came/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /run_came/came/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 
@CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | 
batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /run_came/came/utils/.ipynb_checkpoints/_get_example_data-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . 
import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /run_came/came/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /run_came/came/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = metrics.f1_score(y_true, 
y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /run_came/came_origin/.ipynb_checkpoints/PARAMETERS-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- /run_came/came_origin/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 
| load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came_origin/PARAMETERS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 22:13:17 2021 4 | 5 | @author: Xingyan Liu 6 | 7 | Parameter Settings 8 | 9 | Notes 10 | ----- 11 | * Do NOT change this file directly! 12 | 13 | Examples 14 | -------- 15 | >>> params_pre = PARAMETER.get_preprocess_params() 16 | >>> params_model = PARAMETER.get_model_params() 17 | >>> params_loss = PARAMETER.get_loss_params() 18 | 19 | """ 20 | import copy 21 | 22 | # _params_pre = dict( 23 | # remove_rare=False, # True for benchmarking; False for case study 24 | # min_samples=10, 25 | # ### 26 | # norm__rev=False, # False by default 27 | # norm__log_only=False, # False by default 28 | # ### 29 | # scale_within=True, # True by default 30 | # unit_var=True, # True by default 31 | # clip=not True, clip_range=(-3, 5), # False by default 32 | # ### 33 | # use_degs=True, 34 | # only_1v1homo=False, # False by default 35 | # target_sum='auto', # auto --> 1e4 36 | # with_single_vnodes=not True, 37 | # ) 38 | 39 | _params_model = dict( 40 | h_dim=128, 41 | num_hidden_layers=2, 42 | norm='right', 43 | dropout_feat=0.0, # no dropout for cell input features 44 | dropout=0.2, 45 | negative_slope=0.05, 46 | layernorm_ntypes=['cell', 'gene'], 47 | out_bias=True, 48 | rel_names_out=[('gene', 'expressed_by', 'cell'), 49 | ], 50 | share_hidden_weights=True, 51 | attn_out=True, 52 | kwdict_outgat=dict(n_heads=8, 53 | feat_drop=0.01, 54 | attn_drop=0.6, 55 | negative_slope=0.2, 56 | residual=False, 57 | attn_type='add', # 'add' is more robust than 'mul' 58 | heads_fuse='mean', 59 | ), 60 | share_layernorm=True, # ignored if no weights are shared 61 | residual=False, # performance un-tested 62 | ) 63 | 64 | _params_lossfunc = dict( 65 | smooth_eps=0.1, reduction='mean', 66 | beta=1., # balance factor for multi-label loss 67 | alpha=0, # for R-drop, setting it larger than zero 68 | ) 69 | 70 | 71 | def _get_parameter_dict(default={}, **kwds) -> dict: 72 | params = copy.deepcopy(default) 73 | if len(kwds) > 0: 74 | params.update(**kwds) 75 | return params 76 | 77 | 78 | # def get_preprocess_params(**kwds) -> dict: 79 | # return _get_parameter_dict(_params_pre, **kwds) 80 | 81 | 82 | def get_loss_params(**kwds) -> dict: 83 | return _get_parameter_dict(_params_lossfunc, **kwds) 84 | 85 | 86 | def get_model_params(kwdict_outgat={}, **kwds) -> dict: 87 | params = _get_parameter_dict(_params_model, **kwds) 88 | if len(kwdict_outgat) > 0: 89 | params['kwdict_outgat'].update(kwdict_outgat) 90 | return params 91 | 92 | -------------------------------------------------------------------------------- 
/run_came/came_origin/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author: Xingyan Liu 3 | 4 | from .utils import ( 5 | load_hidden_states, 6 | save_hidden_states, 7 | load_example_data 8 | ) 9 | from .utils import base 10 | from .utils.base import ( 11 | save_pickle, 12 | load_pickle, 13 | save_json_dict, 14 | load_json_dict, 15 | check_dirs, 16 | write_info, 17 | make_nowtime_tag, 18 | subsample_each_group, 19 | ) 20 | from .utils import preprocess as pp 21 | from .utils import plot as pl 22 | from .utils import analyze as ana 23 | from .utils.analyze import ( 24 | load_dpair_and_model, 25 | weight_linked_vars, 26 | make_abstracted_graph, 27 | ) 28 | from .utils.train import prepare4train, Trainer, SUBDIR_MODEL 29 | from .utils._base_trainer import get_checkpoint_list 30 | from .utils.evaluation import accuracy 31 | from .model import ( 32 | Predictor, 33 | detach2numpy, 34 | as_probabilities, 35 | predict_from_logits, 36 | predict, 37 | CGGCNet, 38 | CGCNet 39 | ) 40 | from .datapair import ( 41 | datapair_from_adatas, 42 | aligned_datapair_from_adatas, 43 | DataPair, 44 | AlignedDataPair, 45 | make_features, 46 | ) 47 | from .PARAMETERS import get_model_params, get_loss_params 48 | from . import pipeline 49 | from .pipeline import KET_CLUSTER, __test1__, __test2__ 50 | 51 | 52 | __version__ = "0.1.8" 53 | -------------------------------------------------------------------------------- /run_came/came_origin/datapair/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from .unaligned import datapair_from_adatas, DataPair, make_features 9 | from .aligned import aligned_datapair_from_adatas, AlignedDataPair 10 | 11 | -------------------------------------------------------------------------------- /run_came/came_origin/model/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came_origin/model/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def make_fanouts(etypes, etypes_each_layers, k_each_etype: Union[int, dict]): 19 | if isinstance(k_each_etype, int): 20 | k_each_etype = dict.fromkeys(etypes, k_each_etype) 21 | 22 | fanouts = [] 23 | for _etypes in etypes_each_layers: 24 | _fanout = dict.fromkeys(etypes, 0) 25 | _fanout.update({e: k_each_etype[e] for e in _etypes}) 26 | fanouts.append(_fanout) 27 | return fanouts 28 | 29 | 30 | def involved_nodes(g,) -> dict: 31 | """ collect all the involved nodes from the edges on g 32 | (a heterogeneous graph) 33 | 34 | Examples 35 | -------- 36 | 37 | >>> 
input_nodes, output_nodes, mfgs = next(iter(train_dataloader)) 38 | >>> g.subgraph(involved_nodes(mfgs[0])) 39 | 40 | """ 41 | from collections import defaultdict 42 | nodes = defaultdict(set) 43 | for stype, etype, dtype in g.canonical_etypes: 44 | src, dst = g.edges(etype=etype) 45 | nodes[stype].update(src.numpy()) 46 | nodes[dtype].update(dst.numpy()) 47 | 48 | nodes = {k: sorted(v) for k, v in nodes.items()} 49 | return nodes 50 | 51 | -------------------------------------------------------------------------------- /run_came/came_origin/model/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | from ._utils import * 9 | from ._predict import * 10 | from .loss import * 11 | from ._predict import * 12 | from .loss import * 13 | from .cggc import CGGCNet 14 | from .cgc import CGCNet 15 | -------------------------------------------------------------------------------- /run_came/came_origin/model/v0/_minibatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @CreateDate: 2021/07/15 4 | @Author: Qunlun Shen 5 | @File: _minibatch.py 6 | @Project: CAME 7 | """ 8 | from pathlib import Path 9 | from typing import Sequence, Union, Mapping, Optional 10 | import time 11 | import numpy as np 12 | import torch 13 | from torch import Tensor 14 | import dgl 15 | import tqdm 16 | 17 | 18 | def sub_graph(cell_ids, gene_ids, g): 19 | """ 20 | Making sub_graph for g with input cell_ids and gene_ids 21 | """ 22 | output_nodes_dict = {'cell': cell_ids, 'gene': gene_ids} 23 | g_subgraph = dgl.node_subgraph(g, output_nodes_dict) 24 | return g_subgraph 25 | 26 | 27 | def create_blocks(g, output_nodes, etype='expressed_by'): 28 | cell_ids = output_nodes.clone().detach() 29 | gene_ids = g.in_edges(cell_ids, etype=etype)[0] # genes expressed_by cells 30 | gene_ids = torch.unique(gene_ids) 31 | block = sub_graph(cell_ids, gene_ids, g) # graph for GAT 32 | return block 33 | 34 | 35 | def create_batch( 36 | sample_size=None, 37 | train_idx=None, 38 | test_idx=None, 39 | batch_size=None, 40 | labels=None, 41 | shuffle=True, 42 | label=True 43 | ): 44 | """ 45 | This function create batch idx, i.e. the cells IDs in a batch. 
46 | 47 | Parameters 48 | ---------- 49 | train_idx: 50 | the index for reference cells 51 | test_idx: 52 | the index for query cells 53 | batch_size: 54 | the number of cells in each batch 55 | labels: 56 | the labels for both Reference cells and Query cells 57 | 58 | Returns 59 | ------- 60 | train_labels 61 | the shuffled or non-shuffled labels for all reference cells 62 | test_labels 63 | the shuffled or non-shuffled labels for all query cells 64 | batch_list 65 | the list sores the batch of cell IDs 66 | all_idx 67 | the shuffled or non-shuffled index for all cells 68 | """ 69 | if label: 70 | batch_list = [] 71 | batch_labels = [] 72 | sample_size = len(train_idx) + len(test_idx) 73 | if shuffle: 74 | all_idx = torch.randperm(sample_size) 75 | shuffled_labels = labels[all_idx] 76 | train_labels = shuffled_labels[all_idx < len(train_idx)].clone().detach() 77 | test_labels = shuffled_labels[all_idx >= len(train_idx)].clone().detach() 78 | 79 | if batch_size >= sample_size: 80 | batch_list.append(all_idx) 81 | 82 | else: 83 | batch_num = int(len(all_idx) / batch_size) + 1 84 | for i in range(batch_num - 1): 85 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 86 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 87 | 88 | else: 89 | train_labels = labels[train_idx].clone().detach() 90 | test_labels = labels[test_idx].clone().detach() 91 | all_idx = torch.cat((train_idx, test_idx), 0) 92 | if batch_size >= sample_size: 93 | batch_list.append(all_idx) 94 | else: 95 | batch_num = int(len(all_idx) / batch_size) + 1 96 | for i in range(batch_num - 1): 97 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 98 | batch_labels.append(labels[batch_size * i: batch_size * (i + 1)]) 99 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 100 | 101 | return train_labels, test_labels, batch_list, all_idx 102 | 103 | else: 104 | batch_list = [] 105 | if shuffle: 106 | all_idx = torch.randperm(sample_size) 107 | 108 | if batch_size >= sample_size: 109 | batch_list.append(all_idx) 110 | else: 111 | batch_num = int(len(all_idx) / batch_size) + 1 112 | for i in range(batch_num - 1): 113 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 114 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 115 | 116 | else: 117 | all_idx = torch.arange(sample_size) 118 | if batch_size >= sample_size: 119 | batch_list.append(all_idx) 120 | else: 121 | batch_num = int(len(all_idx) / batch_size) + 1 122 | for i in range(batch_num - 1): 123 | batch_list.append(all_idx[batch_size * i: batch_size * (i + 1)]) 124 | batch_list.append(all_idx[batch_size * (batch_num - 1):]) 125 | 126 | return batch_list, all_idx, None, None 127 | 128 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 27 21:59:44 2020 4 | 5 | @author: Xingyan Liu 6 | """ 7 | from . 
import * 8 | from .base import ( 9 | save_pickle, 10 | load_pickle, 11 | check_dirs, 12 | write_info, 13 | make_nowtime_tag, 14 | subsample_each_group, 15 | ) 16 | from .evaluation import accuracy 17 | from .analyze import ( 18 | weight_linked_vars, 19 | make_abstracted_graph, 20 | ) 21 | from ._get_example_data import load_example_data 22 | from .downsample_counts import ( 23 | downsample_total_counts, 24 | downsample_counts_per_cell 25 | ) 26 | from ._io_h5py import load_hidden_states, save_hidden_states 27 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/_get_example_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @author: Xingyan Liu 4 | @file: _get_example_data.py 5 | @time: 2021-06-12 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | from typing import Sequence, Union, Dict, List, Optional # , Callable 11 | import numpy as np 12 | import pandas as pd 13 | import scanpy as sc 14 | from scipy import sparse 15 | import logging 16 | 17 | CAME_ROOT = Path(__file__).parents[1] 18 | 19 | 20 | def _extract_zip( 21 | fp_zip=CAME_ROOT / 'sample_data.zip', 22 | fp_unzip=CAME_ROOT / 'sample_data', 23 | ): 24 | import zipfile 25 | with zipfile.ZipFile(fp_zip) as zipf: 26 | zipf.extractall(fp_unzip) 27 | 28 | 29 | def load_example_data() -> Dict: 30 | """ Load example data, for a quick start with CAME. 31 | 32 | This pair of cross-species datasets contains the pancreatic scRNA-seq data 33 | of human ("Baron_human") and mouse ("Baron_human"), 34 | initially published with paper [1]. 35 | 36 | NOTE that "Baron_human" is a 20%-subsample from the original data. 37 | The resulting cell-typing accuracy may not be as good as one 38 | using full dataset as the reference. 39 | 40 | [1] Baron, M. et al. (2016) A Single-Cell Transcriptomic Map of the Human 41 | and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure. 42 | Cell Syst 3 (4), 346-360.e4. 
43 | 44 | Returns 45 | ------- 46 | dict: 47 | a dict with keys ['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class'] 48 | 49 | Examples 50 | -------- 51 | >>> example_data_dict = load_example_data() 52 | >>> print(example_data_dict.keys()) 53 | # Out[]: dict_keys(['adatas', 'varmap', 'varmap_1v1', 'dataset_names', 'key_class']) 54 | 55 | >>> adatas = example_data_dict['adatas'] 56 | >>> dsnames = example_data_dict['dataset_names'] # ('Baron_human', 'Baron_mouse') 57 | >>> df_varmap = example_data_dict['varmap'] 58 | >>> df_varmap_1v1 = example_data_dict['varmap_1v1'] 59 | >>> key_class1 = key_class2 = example_data_dict['key_class'] 60 | 61 | """ 62 | datadir = CAME_ROOT / 'sample_data' 63 | 64 | sp1, sp2 = ('human', 'mouse') 65 | dsnames = ('Baron_human', 'Baron_mouse') 66 | dsn1, dsn2 = dsnames 67 | fp1, fp2 = datadir / f'raw-{dsn1}.h5ad', datadir / f'raw-{dsn2}.h5ad' 68 | fp_varmap_1v1 = datadir / f'gene_matches_1v1_{sp1}2{sp2}.csv' 69 | fp_varmap = datadir / f'gene_matches_{sp1}2{sp2}.csv' 70 | 71 | if not (datadir.exists() and fp1.exists() and fp2.exists() and 72 | fp_varmap.exists() and fp_varmap_1v1.exists()): 73 | _extract_zip() 74 | 75 | df_varmap_1v1 = pd.read_csv(fp_varmap_1v1, ) 76 | df_varmap = pd.read_csv(fp_varmap, ) 77 | 78 | adata_raw1, adata_raw2 = sc.read_h5ad(fp1), sc.read_h5ad(fp2) 79 | 80 | key_class = 'cell_ontology_class' 81 | example_dict = { 82 | 'adatas': [adata_raw1, adata_raw2], 83 | 'varmap': df_varmap, 84 | 'varmap_1v1': df_varmap_1v1, 85 | 'dataset_names': dsnames, 86 | 'key_class': key_class, 87 | } 88 | logging.info(example_dict.keys()) 89 | logging.debug(example_dict) 90 | return example_dict 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.basicConfig( 95 | level=logging.DEBUG, 96 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 97 | '%(levelname)s\n %(message)s') 98 | d = load_example_data() 99 | print(d.keys()) 100 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/_io_h5py.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | @Author: Xingyan Liu 4 | @File: _tmp_h5py.py 5 | @Date: 2021-08-03 6 | @Project: CAME 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Union, Optional, List, Mapping 11 | import logging 12 | import numpy as np 13 | import h5py 14 | 15 | 16 | def save_hidden_states(data_list: list, path: Union[Path, str]): 17 | """ Save hidden states into .h5 file 18 | 19 | Parameters 20 | ---------- 21 | data_list 22 | a list of data matrix, or a list of dicts whose values are matrices 23 | path 24 | file-path ends with .h5, if not, '.h5' will be appended to it. 
25 | 26 | Returns 27 | ------- 28 | None 29 | """ 30 | if not str(path).endswith('.h5'): 31 | path = str(path) + '.h5' 32 | f = h5py.File(path, 'w') 33 | if isinstance(data_list[0], dict): 34 | for i, dct in enumerate(data_list): 35 | for key, _data in dct.items(): 36 | f.create_dataset(f'/layer{i}/{key}', data=_data) 37 | else: 38 | for i, _data in enumerate(data_list): 39 | f.create_dataset(f'/layer{i}', data=_data) 40 | 41 | f.close() 42 | 43 | 44 | def load_hidden_states(path) -> List[dict]: 45 | """ Load hidden states from .h5 file 46 | the data structure should be like 47 | [ 48 | 'layer0/cell', 'layer0/gene', 49 | 'layer1/cell', 'layer1/gene', 50 | 'layer2/cell', 'layer2/gene' 51 | ] 52 | 53 | Parameters 54 | ---------- 55 | path 56 | .h5 file path 57 | 58 | Returns 59 | ------- 60 | values: a list of dicts 61 | """ 62 | f = h5py.File(path, 'r') 63 | prefix = 'layer' 64 | keys = sorted(f.keys(), key=lambda x: int(x.strip(prefix))) 65 | # print(keys) 66 | values = [_unfold_to_dict(f[key]) for key in keys] 67 | return values 68 | 69 | 70 | def _unfold_to_dict(d: h5py.Group) -> dict: 71 | dct = {} 72 | for key, val in d.items(): 73 | if isinstance(val, h5py.Dataset): 74 | dct[key] = np.array(val) 75 | return dct 76 | 77 | 78 | def _visit(f: h5py.File): 79 | tree = [] 80 | 81 | def foo(_name, _obj): 82 | if isinstance(_obj, h5py.Dataset): 83 | tree.append(_name) 84 | f.visititems(foo) 85 | logging.info(f'tree={tree}') 86 | return tree 87 | 88 | 89 | def __test__(): 90 | n_cells = 100 91 | n_genes = 114 92 | n_dims = 64 93 | hidden_data = [ 94 | {'cell': np.random.randn(n_cells, n_dims), 95 | 'gene': np.random.randn(n_genes, n_dims)} 96 | for i in range(3) 97 | ] 98 | hidden_data.append({'cell': np.random.randn(n_cells, n_dims)}) 99 | 100 | # logging.debug(hidden_data) 101 | save_hidden_states(hidden_data, '_tmp_data') 102 | f1 = h5py.File('_tmp_data.h5', 'r') 103 | h_list = load_hidden_states('../../_tmp_data.h5') 104 | # logging.info(values) 105 | for k, d in zip(f1.keys(), h_list): 106 | print(f'{k}: {list(d.keys())}') 107 | 108 | 109 | if __name__ == '__main__': 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' 113 | '%(levelname)s\n %(message)s') 114 | __test__() 115 | -------------------------------------------------------------------------------- /run_came/came_origin/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 11 19:43:10 2021 4 | 5 | @author: Xingyan Liu 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import metrics 10 | import torch 11 | from torch import Tensor 12 | from typing import Sequence 13 | from ..model import detach2numpy 14 | 15 | 16 | def accuracy(logits: Tensor, labels: Tensor): 17 | labels = labels.to(logits.device) 18 | if len(logits.shape) >= 2: 19 | _, preds = torch.max(logits, dim=1) 20 | else: 21 | preds = logits 22 | if len(labels.shape) >= 2: 23 | _, labels = torch.max(labels, dim=1) 24 | else: 25 | labels = labels 26 | correct = torch.sum(preds == labels) 27 | return correct.item() * 1.0 / len(labels) 28 | 29 | 30 | def get_AMI(y_true, y_pred, **kwds): 31 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 32 | ami = metrics.adjusted_mutual_info_score(y_true, y_pred, **kwds) 33 | return ami 34 | 35 | 36 | def get_F1_score(y_true, y_pred, average='macro', **kwds): 37 | y_true, y_pred = list(map(detach2numpy, (y_true, y_pred))) 38 | f1 = 
metrics.f1_score(y_true, y_pred, average=average, **kwds) 39 | return f1 40 | 41 | 42 | -------------------------------------------------------------------------------- /run_came/heco_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold_array(X): 5 | ''' 6 | input: 2-D array; rows are samples, columns are gene vectors 7 | output: a binary matrix 8 | entry (i, j) is True if M_ij > mean(column_j) 9 | ''' 10 | return X > np.mean(X, axis=0) 11 | 12 | def threshold_quantile(X, quantile_gene=0.9, quantile_sample=0.95): 13 | ''' 14 | input: 2-D array; rows are samples, columns are gene vectors 15 | output: a filtered matrix of the same shape as X (not binary) 16 | M_ij is kept if it exceeds the quantile_gene quantile of column j or the quantile_sample quantile of row i; all other entries are set to zero 17 | ''' 18 | keep_mat_gene = X > np.quantile(X, quantile_gene, axis=0) 19 | keep_mat_sample = (X.T > np.quantile(X.T, quantile_sample, axis=0)).T 20 | keep_mat = keep_mat_sample + keep_mat_gene  # boolean addition acts as element-wise OR 21 | return X * keep_mat 22 | 23 | 24 | def threshold_top(X, percent=1): 25 | ''' 26 | input: 2-D array; rows are samples, columns are gene vectors 27 | output: a binary matrix 28 | entry (i, j) is True if M_ij is among the top `percent` values of column j; `percent` is a row count, not a fraction 29 | ''' 30 | #topk = int(round(X.shape[0] * percent)) 31 | topk = percent  # interpreted as the number of top rows kept per column 32 | #print(topk) 33 | #topk_pos = X.shape[0] - topk 34 | X_sort = np.sort(X, axis=0) 35 | return X >= X_sort[-topk, :] 36 | 37 | 38 | def threshold_array_nonzero(X): 39 | ''' 40 | input: 2-D array; rows are samples, columns are gene vectors 41 | output: a binary matrix 42 | entry (i, j) is True if M_ij > 0 43 | ''' 44 | return X > 0 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | X = np.array([[1,2,3],[2,3,4], [2,3,4], [4,5,2], [7,26,10]]) 50 | print(X) 51 | print(threshold_top(X, percent=2))  # percent must be an integer count; a fraction such as 0.4 would raise an indexing error 52 | #print(threshold_array(X)) --------------------------------------------------------------------------------
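The helpers in heco_utils.py implement column-wise (per-gene) thresholding of a sample-by-gene expression matrix: threshold_array, threshold_top and threshold_array_nonzero return boolean masks, while threshold_quantile keeps the original values and zeroes out the rest. Below is a minimal usage sketch, not part of the repository; the import path and the toy matrix are assumptions, so adjust the import to wherever heco_utils.py sits in your checkout.

import numpy as np
from heco_utils import threshold_array, threshold_top, threshold_quantile  # import path is an assumption

# Toy expression matrix: 4 samples (rows) x 3 genes (columns)
X = np.array([[1., 2., 3.],
              [2., 3., 4.],
              [4., 5., 2.],
              [7., 26., 10.]])

print(threshold_array(X))               # True where a value exceeds its column (gene) mean
print(threshold_top(X, percent=2))      # True for the top-2 values within each column
print(threshold_quantile(X, 0.5, 0.5))  # values above the per-gene/per-sample medians kept, others zeroed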