├── .Rbuildignore ├── .gitignore ├── CellDART.Rproj ├── CellDART ├── __init__.py ├── da_cellfraction.py ├── pred_cellf_celldart.py └── utils.py ├── CellDART_example_mousebrain_markers.ipynb ├── DESCRIPTION ├── NAMESPACE ├── R ├── CellDART_R.R └── Read_R_wrap.md ├── README.md ├── celldart_env.yaml ├── da_cellfraction.py ├── data └── datafile.md ├── main_counts.py ├── man └── pred_cellf_celldart.Rd ├── setup.py ├── utils.py └── vignettes ├── .gitignore └── introduction.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CellDART\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^doc$ 4 | ^Meta$ 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | inst/doc 3 | /doc/ 4 | /Meta/ 5 | -------------------------------------------------------------------------------- /CellDART.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | Encoding: UTF-8 9 | 10 | AutoAppendNewline: Yes 11 | StripTrailingWhitespace: Yes 12 | LineEndingConversion: Posix 13 | 14 | BuildType: Package 15 | PackageUseDevtools: Yes 16 | PackageInstallArgs: --no-multiarch --with-keep.source 17 | PackageRoxygenize: rd,collate,namespace 18 | -------------------------------------------------------------------------------- /CellDART/__init__.py: -------------------------------------------------------------------------------- 1 | #from .CellDART import da_cellfraction 2 | #from .CellDART import utils 3 | -------------------------------------------------------------------------------- /CellDART/da_cellfraction.py: -------------------------------------------------------------------------------- 1 | # Suppress tensorflow warnings 2 | import os 3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 4 | 5 | import numpy as np 6 | from tensorflow.compat.v1 import logging, disable_eager_execution 7 | logging.set_verbosity(logging.ERROR) 8 | # Remove compatibility issues 9 | disable_eager_execution() 10 | experimental_run_tf_function=False 11 | 12 | from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Dropout 13 | from tensorflow.keras.models import Model 14 | from tensorflow.keras.utils import to_categorical 15 | from tensorflow.keras import optimizers 16 | from sklearn.metrics import accuracy_score 17 | 18 | ### Build deep learning models for adversarial domain adaptation 19 | def build_models(inp_dim, emb_dim, n_cls_source, alpha=2, alpha_lr=10): 20 | inputs = Input(shape=(inp_dim,)) 21 | x4 = Dense(1024, activation='linear')(inputs) 22 | x4 = BatchNormalization()(x4) 23 | x4 = Activation("elu")(x4) 24 | x4 = Dense(emb_dim, activation='linear')(x4) 25 | x4 = BatchNormalization()(x4) 26 | x4 = Activation("elu")(x4) 27 | 28 | source_classifier = Dense(n_cls_source, activation='linear', name="mo1")(x4) 29 | source_classifier = Activation('softmax', name='mo')(source_classifier) 30 | 31 | domain_classifier = Dense(32, activation='linear', name="do4")(x4) 32 | domain_classifier = BatchNormalization(name="do5")(domain_classifier) 33 | domain_classifier = Activation("elu", name="do6")(domain_classifier) 34 | domain_classifier = Dropout(0.5)(domain_classifier) 35 | domain_classifier = Dense(2, activation='softmax', name="do")(domain_classifier) 36 | 37 | comb_model = Model(inputs=inputs, outputs=[source_classifier, domain_classifier]) 38 | comb_model.compile(optimizer="Adam", 39 | loss={'mo': 'kld', 'do': 'categorical_crossentropy'}, 40 | loss_weights={'mo': 1, 'do': alpha}, metrics=['accuracy'], ) 41 | 42 | source_classification_model = Model(inputs=inputs, outputs=[source_classifier]) 43 | source_classification_model.compile(optimizer=optimizers.Adam(learning_rate=0.001), 44 | loss={'mo': 'kld'}, metrics=['mae'], ) 45 | 46 | 47 | domain_classification_model = Model(inputs=inputs, outputs=[domain_classifier]) 48 | domain_classification_model.compile(optimizer=optimizers.Adam(learning_rate=alpha_lr*0.001), 49 | loss={'do': 'categorical_crossentropy'}, metrics=['accuracy']) 50 | 51 | 52 | embeddings_model = Model(inputs=inputs, outputs=[x4]) 53 | embeddings_model.compile(optimizer="Adam",loss = 'categorical_crossentropy', metrics=['accuracy']) 54 | 55 | 56 | return comb_model, source_classification_model, domain_classification_model, embeddings_model 57 | 58 | ### Batch data generator for the given data 59 | def batch_generator(data, batch_size): 60 | """Generate batches of data. 61 | Given a list of numpy data, it iterates over the list and returns batches of the same size 62 | This 63 | """ 64 | all_examples_indices = len(data[0]) 65 | while True: 66 | mini_batch_indices = np.random.choice(all_examples_indices, size=batch_size, replace=False) 67 | tbr = [k[mini_batch_indices] for k in data] 68 | yield tbr 69 | 70 | ### Train the neural network to predict cell composition in spatial data 71 | # Xs: numpy array for composite log-normaized count of pseudospots 72 | # ys: numpy array for fraction of cell types across the pseudospots 73 | # Xt: numpy array for log-normaized count of spatial spots 74 | 75 | # emb_dim: output size of dimensions for feature extractor (default = 64) 76 | # batch_size: minibatch size for pseudospots and spatial data during the training (default = 64) 77 | # enable_dann: whether to use domain adaptation process 78 | # n_iterations: iteration number for the adversarial learning (default = 3000) 79 | 80 | # alpha: loss weights of domain classifier to the source classifier (default = 0.6) 81 | # alpha_lr: learning rate for the domain classifier (alpha_lr*0.001, default = 5) 82 | 83 | # init_train: whether to perform pre-training process 84 | # init_train_epoch: iteration number for the pre-training process (default = 10) 85 | def train( Xs, ys, Xt, yt=None, 86 | emb_dim=2, 87 | batch_size = 64, 88 | enable_dann = True, 89 | n_iterations = 1000, 90 | alpha=2, 91 | alpha_lr=10, 92 | initial_train=True, 93 | initial_train_epochs=100): 94 | 95 | 96 | inp_dim = Xs.shape[1] 97 | ncls_source = ys.shape[1] 98 | 99 | model, source_classification_model, domain_classification_model, embeddings_model = \ 100 | build_models(inp_dim, emb_dim, ncls_source, alpha=alpha, alpha_lr = alpha_lr) 101 | 102 | if initial_train: 103 | source_classification_model.fit(Xs, ys, batch_size= batch_size, epochs=initial_train_epochs) 104 | print("initial_train_done") 105 | y_adversarial_1 = to_categorical(np.array(([1] * batch_size + [0] * batch_size))) 106 | 107 | sample_weights_class = np.array(([1] * batch_size + [0] * batch_size)) 108 | sample_weights_adversarial = np.ones((batch_size * 2,)) 109 | 110 | S_batches = batch_generator([Xs, ys], batch_size) 111 | T_batches = batch_generator([Xt, np.zeros(shape = (len(Xt),2))], batch_size) 112 | 113 | for i in range(n_iterations): 114 | # # print(y_class_dummy.shape, ys.shape) 115 | y_adversarial_2 = to_categorical(np.array(([0] * batch_size + [1] * batch_size))) 116 | 117 | X0, y0 = next(S_batches) 118 | X1, y1 = next(T_batches) 119 | 120 | 121 | X_adv = np.concatenate([X0, X1]) 122 | y_class = np.concatenate([y0, np.zeros_like(y0)]) 123 | 124 | adv_weights = [] 125 | for layer in model.layers: 126 | if (layer.name.startswith("do")): 127 | adv_weights.append(layer.get_weights()) 128 | 129 | if(enable_dann): 130 | # note - even though we save and append weights, the batchnorms moving means and variances 131 | # are not saved throught this mechanism 132 | model.train_on_batch(X_adv, [y_class, y_adversarial_1], 133 | sample_weight=[sample_weights_class, sample_weights_adversarial]) 134 | 135 | k = 0 136 | for layer in model.layers: 137 | if (layer.name.startswith("do")): 138 | layer.set_weights(adv_weights[k]) 139 | k += 1 140 | 141 | class_weights = [] 142 | 143 | 144 | for layer in model.layers: 145 | if (not layer.name.startswith("do")): 146 | class_weights.append(layer.get_weights()) 147 | 148 | domain_classification_model.train_on_batch(X_adv, [y_adversarial_2]) 149 | 150 | k = 0 151 | for layer in model.layers: 152 | if (not layer.name.startswith("do")): 153 | layer.set_weights(class_weights[k]) 154 | k += 1 155 | 156 | else: 157 | source_classification_model.train_on_batch(X0,y0) 158 | 159 | 160 | if yt is None: 161 | if ((i + 1) % 100 == 0): 162 | # print(i, stats) 163 | sourceloss, sourceacc = source_classification_model.evaluate(Xs, ys,verbose=0) 164 | domainloss,domainacc = domain_classification_model.evaluate(np.concatenate([Xs, Xt]), 165 | to_categorical(np.array(([1] * Xs.shape[0] + [0] * Xt.shape[0]))), 166 | verbose=0) 167 | print("Iteration %d, source loss = %.3f, discriminator acc = %.3f"%(i, sourceloss ,domainacc)) 168 | else: 169 | if ((i + 1) % 100 == 0): 170 | # print(i, stats) 171 | y_test_hat_t = source_classification_model.predict(Xt).argmax(1) 172 | y_test_hat_s = source_classification_model.predict(Xs).argmax(1) 173 | print("Iteration %d, source accuracy = %.3f, target accuracy = %.3f"%(i, accuracy_score(ys, y_test_hat_s), accuracy_score(yt, y_test_hat_t))) 174 | 175 | return embeddings_model, source_classification_model 176 | 177 | -------------------------------------------------------------------------------- /CellDART/pred_cellf_celldart.py: -------------------------------------------------------------------------------- 1 | def pred_cellf_celldart(adata_sp=None, adata_sc=None, count_from_raw=False, 2 | gpu=True, spdir=None, sp10x=True, spfilter=False, spfilgene=5, spfilspot=50, 3 | scdir=None, sc_10x_mtx=True, sc10x_h5=False, sctranspose=False, 4 | celltype='celltype', num_markers=20, seed_num=0, 5 | nmix=10, npseudo=20000, alpha=0.6, alpha_lr=5, batch_size=512, emb_dim=64, n_iterations=3000, init_train_epoch=10, 6 | outdir='./CellDART_output', return_anndata=True): 7 | ''' 8 | ## Function to implement CellDART in python 9 | adata_sp: spatial data (AnnData object) to be used in predicting cell fraction (default: None) 10 | -> If None, then provide spdir where spatial datasets are saved (formats are explained below) 11 | adata_sc: single-cell data (AnnData object) to be used in making pseudospots (default: None) 12 | -> If None, then provide scdir where single-cell datasets are saved (formats are explained below) 13 | count_from_raw: whether to extract count matrix frow .raw of AnnData 14 | -> non-normalized count matrix should be contained in the AnnData .raw file 15 | -> if False, then utilize the count matrices saved in adata_sp and adata_sc directly 16 | 17 | gpu: check whether to use gpu (True) or not (False) (default = True) 18 | 19 | spdir: file directory to find or save spatial data 20 | -> In case of utilizing already saved spatial data, otherwise, put None 21 | -> Visium data should be separated in different folders 22 | Example directory (spatial) 23 | -> two spatial datasets (10x visium format) 24 | ./Mouse_sp/first/filtered_feature_bc_matrix.h5, ./Mouse_sp/first/spatial/tissue_hires_image.png, ./Mouse_sp/first/spatial/tissue_lowres_image.png, 25 | ./Mouse_sp/first/spatial/scalefactors_json.json, ./Mouse_sp/first/spatial/tissue_positions_list.csv 26 | second dataset directory starts with ./Mouse_sp/second/.., others are same as above. 27 | 28 | sp10x: whether the spatial data is 10x Visium format (True) or not (False) (default: True) 29 | spfilter: check whether to filter the number of cells and genes in spatial data (True: run filter) 30 | spfilgene: keep genes that are expressed in at least 'spfilgene' number of cells (default = 5) 31 | spfilspot: keep spots with at least 'spfilcell' counts (default = 50) 32 | 33 | scdir: file directory to find or save single-cell data 34 | -> In case of utilizing already saved sc data, otherwise, put None 35 | -> each single-cell data should be separated in different folders 36 | -> each file formats should be among 10x format or others (.mtx.gz, .h5ad, h5, .csv, .tsv, or .txt) 37 | -> and metadata with corresponding barcode name as index should be included in metadata folder of each single-cell data 38 | -> metadata should be csv format 39 | Example directory (single-cell) 40 | -> two single cell dataset (10x mtx format) with metadata 41 | ./Mouse_sc/first/barcodes.tsv, ./Mouse_sc/first/genes.tsv, ./Mouse_sc/first/matrix.mtx, ./Mouse_sc/first/metadata/metadata.csv 42 | ./Mouse_sc/second/barcodes.tsv, ./Mouse_sc/second/genes.tsv, ./Mouse_sc/second/matrix.mtx, ./Mouse_sc/first/second/metadata.csv 43 | 44 | sc10x_mtx: check whether single-cell data is 10x genomics formatted mtx directory (True) or not (False) 45 | sc10x_h5: check whether single-cell data is 10x genomics formatted hdf5 file (True) or not (False) 46 | sctranspose: if sc10x_mtx and sc10x_h5 is F, check whether loaded matrix should be transposed (True) or not (False) 47 | 48 | celltype: column name for single-cell annotation data in .obs (default: 'celltype') 49 | num_markers: number of selected marker genes in each cell-type (default = 20) 50 | 51 | seed_num: seed to be used in random sampling (default = 0) 52 | 53 | nmix: sampling number of cells in pseudospot (default = 10) 54 | npseudo: a total number of pseudospots (default = 20000) 55 | 56 | alpha: loss weights of domain classifier to source classifier (default = 0.6) 57 | alpha_lr: learning rate for domain classifier (alpha_lr*0.001, default = 5) 58 | batch_size: minibatch size during the training (default = 512) 59 | emb_dim: output size of dimensions for feature extractor (default = 64) 60 | 61 | n_iterations: iteration number for the adversarial learning (default = 3000) 62 | init_train_epoch: iteration number of pre-train (default = 10) 63 | 64 | outdir: the directory to save output files (models and results) 65 | return_anndata: return spatial AnnData file with predicted cell fraction in .obs (default = True) 66 | ''' 67 | import os 68 | if gpu: 69 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 70 | os.environ["CUDA_VISIBLE_DEVICES"]= "0" # Use only gpu-0 71 | print('GPU is available and will be used') 72 | else: 73 | os.environ['CUDA_VISIBLE_DEVICES'] = "-1" # Use CPU 74 | print('CPU will be used') 75 | 76 | from warnings import simplefilter 77 | simplefilter(action='ignore', category=Warning) 78 | 79 | import scanpy as sc 80 | import pandas as pd 81 | import numpy as np 82 | 83 | from CellDART import utils 84 | from CellDART import da_cellfraction 85 | 86 | ## Change float variables into integer (during conversion from R to python) 87 | num_markers, seed_num, \ 88 | nmix, npseudo, batch_size, emb_dim, n_iterations, init_train_epoch = \ 89 | int(num_markers), int(seed_num), \ 90 | int(nmix), int(npseudo), int(batch_size), int(emb_dim), \ 91 | int(n_iterations), int(init_train_epoch) 92 | 93 | ## Create directory if it does not exist 94 | if not os.path.exists(outdir): 95 | os.makedirs(outdir) 96 | 97 | ## Load and preprocess spatial dataset 98 | if adata_sp is not None: 99 | if spdir is not None: 100 | raise ValueError("'spdir' should be None when 'adata_sp' is provided.") 101 | if count_from_raw: spatial_all = adata_sp.raw.to_adata().copy() 102 | else: spatial_all = adata_sp.copy() 103 | sc.pp.normalize_total(spatial_all, target_sum=1e4, inplace=True) 104 | print('Shape of the provided spatial data is',spatial_all.shape) 105 | else: 106 | if spdir is None: 107 | raise ValueError("'spdir' should be provided when 'adata_sp' is None") 108 | # Load and normalize spatial data 109 | sp_list = os.listdir(spdir) 110 | adata_sp = [] 111 | for i, sp_data in enumerate(sp_list): 112 | if sp10x: 113 | adata = sc.read_visium(os.path.join(spdir,sp_data)) 114 | else: 115 | sp = os.listdir(os.path.join(spdir,sp_data))[0] 116 | adata = sc.read(os.path.join(spdir,sp_data,sp)) 117 | adata.var_names_make_unique() 118 | if spfilter: 119 | sc.pp.filter_genes(adata, min_cells=spfilgene) 120 | sc.pp.filter_cells(adata, min_counts=spfilspot) 121 | sc.pp.normalize_total(adata, target_sum=1e4, inplace=True) 122 | adata_sp.append(adata) 123 | print('Shape of spatial data',i,'is',adata.shape) 124 | 125 | # Merge spatial data 126 | if len(adata_sp)==1: 127 | spatial_all = adata_sp[0] 128 | else: 129 | spatial_all = adata_sp[0].concatenate(adata_sp[1:], join='inner', 130 | uns_merge='unique') 131 | print('Shape of the merged spatial data is',spatial_all.shape) 132 | 133 | ## Load and preprocess single-cell dataset 134 | if adata_sc is not None: 135 | if scdir is not None: 136 | raise ValueError("'scdir' should be None when 'adata_sc' is provided.") 137 | if count_from_raw: single_all = adata_sc.raw.to_adata().copy() 138 | else: single_all = adata_sc.copy() 139 | 140 | # Check if the column for the cell type is included in .obs 141 | if celltype not in list(single_all.obs): 142 | raise ValueError('Column for cell type is not found') 143 | 144 | sc.pp.normalize_total(single_all, target_sum=1e4, inplace=True) 145 | print('Shape of the provided single-cell data is',single_all.shape) 146 | else: 147 | if scdir is None: 148 | raise ValueError("'scdir' should be provided when 'adata_sc' is None") 149 | # Load single cell data 150 | sc_list = os.listdir(scdir) 151 | if sc_10x_mtx: 152 | adata_sc = [sc.read_10x_mtx(os.path.join(scdir,y), cache=True) for y in sc_list] 153 | elif sc10x_h5: 154 | adata_sc = [sc.read_10x_h5(os.path.join(scdir,y)) for y in sc_list] 155 | else: 156 | if sctranspose: 157 | adata_sc = [sc.read(os.path.join(scdir,y,z), cache=True).T for y in sc_list \ 158 | for z in [i for i in os.listdir(os.path.join(scdir,y)) \ 159 | if i.endswith('mtx.gz') or i.endswith('h5ad') or i.endswith('h5') or \ 160 | i.endswith('csv') or i.endswith('tsv') or i.endswith('txt')]] 161 | else: 162 | adata_sc = [sc.read(os.path.join(scdir,y,z), cache=True) for y in sc_list \ 163 | for z in [i for i in os.listdir(os.path.join(scdir,y)) \ 164 | if i.endswith('mtx.gz') or i.endswith('h5ad') or i.endswith('h5') or \ 165 | i.endswith('csv') or i.endswith('tsv') or i.endswith('txt')]] 166 | 167 | # preprocess each of the dataset 168 | for i, adata in enumerate(adata_sc): 169 | adata.var_names_make_unique() 170 | sc.pp.normalize_total(adata, target_sum=1e4, inplace=True) 171 | sc_meta_list = os.listdir(os.path.join(scdir,sc_list[i],'metadata')) 172 | sc_meta_list = [i for i in sc_meta_list if i.endswith('.csv')] 173 | if len(sc_meta_list)==0: 174 | raise NotImplementedError('No csv format metadata in the folder') 175 | tmp = pd.read_csv(os.path.join(scdir,sc_list[i],'metadata',sc_meta_list[0]),index_col=0) 176 | 177 | if (set(tmp.index)<=set(adata.obs.index)): 178 | print('All barcode names in metadata are found') 179 | else: 180 | raise ValueError('Unidentified barcode names in metadata of '+sc_list[i]) 181 | if celltype not in list(tmp): 182 | raise ValueError('Column for cell type is not found') 183 | 184 | # subset the data to include only the barcodes in metadata 185 | adata = adata[adata.obs.index.isin(tmp.index)].copy() 186 | # rearrange the metadata index according to adata 187 | tmp = tmp.reindex(adata.obs.index) 188 | adata.obs = tmp 189 | adata_sc[i] = adata 190 | print('Shape of single cell data',i,'is',adata.shape) 191 | 192 | if len(adata_sc)==1: 193 | single_all = adata_sc[0] 194 | else: 195 | single_all = adata_sc[0].concatenate(adata_sc[1:], join='inner') 196 | 197 | print('Shape of merged single cell data is',single_all.shape) 198 | 199 | # save the normalized data in raw 200 | single_all.raw = single_all.copy() 201 | 202 | # log-transform the count matrix 203 | sc.pp.log1p(single_all) 204 | 205 | # Find marker genes for single cell data 206 | single_all.obs[celltype] = single_all.obs[celltype].astype('category', copy=False) 207 | sc.tl.rank_genes_groups(single_all, celltype, method='wilcoxon') 208 | genelists=single_all.uns['rank_genes_groups']['names'] 209 | df_genelists = pd.DataFrame.from_records(genelists) 210 | 211 | # Combining top marker genes representing each cell type 212 | res_genes = [] 213 | for column in df_genelists.head(num_markers): 214 | res_genes.extend(df_genelists.head(num_markers)[column].tolist()) 215 | res_genes_ = list(set(res_genes)) 216 | 217 | # Calculate intersecting genes 218 | inter_genes_comb = [val for val in res_genes_ if val in spatial_all.var.index] 219 | print('Total number of marker genes: ',len(inter_genes_comb)) 220 | 221 | # Generation of an array representing cell type number 222 | df_sc = single_all.obs 223 | lab_sc_sub = df_sc[celltype] 224 | sc_sub_dict = dict(zip(range(len(set(lab_sc_sub))), set(lab_sc_sub))) 225 | sc_sub_dict2 = dict((y,x) for x,y in sc_sub_dict.items()) 226 | lab_sc_num = [sc_sub_dict2[ii] for ii in lab_sc_sub] 227 | # Make an array for cell type numbers following the sequence of single cell barcodes 228 | lab_sc_num = np.asarray(lab_sc_num, dtype='int') 229 | 230 | # Call original normalized count (not log-normalized count) 231 | adata_final = single_all.raw.to_adata().copy() 232 | 233 | # Generate count matrix for single-cell data (mat_sc) 234 | adata_final = adata_final[:,inter_genes_comb] 235 | if isinstance(adata_final.X, np.ndarray): 236 | mat_sc = adata_final.X 237 | else: 238 | mat_sc = adata_final.X.toarray() 239 | 240 | # Raw file for merged spatial data 241 | spatial_raw = spatial_all.copy() 242 | 243 | # Generate count matrix for spatial data (mat_sp) 244 | spatial_all = spatial_all[:,inter_genes_comb] 245 | if isinstance(spatial_all.X, np.ndarray): 246 | mat_sp = spatial_all.X 247 | else: 248 | mat_sp = spatial_all.X.toarray() 249 | 250 | # Generate pseudospot: random mixture of cells 251 | sc_mix, lab_mix = utils.random_mix(mat_sc, lab_sc_num, nmix=nmix, n_samples=npseudo, seed=seed_num) 252 | 253 | # Log-normalize and scale the data 254 | def log_minmaxscale(arr): 255 | arrd = len(arr) 256 | arr = np.log1p(arr) 257 | e = 1e-8 # modified by adding e 258 | return (arr-np.reshape(np.min(arr,axis=1),(arrd,1)))/np.reshape((np.max(arr,axis=1)-np.min(arr,axis=1))+e,(arrd,1)) 259 | 260 | sc_mix_s = log_minmaxscale(sc_mix) 261 | mat_sp_s = log_minmaxscale(mat_sp) 262 | mat_sc_s = log_minmaxscale(mat_sc) 263 | 264 | print('Size of spatial, single-cell, pseudospot, and cell fraction data:', 265 | mat_sp_s.shape, mat_sc_s.shape, sc_mix_s.shape, lab_mix.shape) 266 | 267 | # Train the CellDART model 268 | embs, clssmodel = da_cellfraction.train(sc_mix_s, lab_mix, mat_sp_s, enable_dann = True, 269 | alpha=alpha, alpha_lr=alpha_lr, 270 | emb_dim = emb_dim, 271 | batch_size = batch_size, 272 | n_iterations = n_iterations, 273 | initial_train=True, 274 | initial_train_epochs=init_train_epoch) 275 | # Prediction of cell fraction in each spot 276 | pred_sp = pd.DataFrame(clssmodel.predict(mat_sp_s)) 277 | pred_sp.index = spatial_all.obs.index 278 | 279 | # Make directory for the model save 280 | if not os.path.exists(os.path.join(outdir,'model')): 281 | os.makedirs(os.path.join(outdir,'model')) 282 | 283 | # Save the cell fraction in .obs of spatial_raw file 284 | for visnum in range(len(sc_sub_dict)): 285 | spatial_raw.obs[str(sc_sub_dict[visnum])+'_cellf'] = pred_sp.iloc[pred_sp.index.isin(spatial_raw.obs.index),visnum] 286 | 287 | # Save cell fraction data 288 | df = spatial_raw.obs.filter(regex='_cellf', axis=1) 289 | df.to_csv(os.path.join(outdir,'cellfraction.csv'),header=True,index=True) 290 | print('Cell fraction data for was saved') 291 | 292 | # Save model files 293 | embs.save_weights(os.path.join(outdir,'model/embedder.h5')) 294 | clssmodel.save_weights(os.path.join(outdir,'model/classifier.h5')) 295 | 296 | # Save spatial anndata 297 | spatial_raw.write_h5ad(os.path.join(outdir,'model/sp_data.h5ad')) 298 | print('Spatial anndata was saved') 299 | 300 | print('Model and python data files were saved') 301 | 302 | if return_anndata: return(spatial_raw) 303 | else: return(df) 304 | -------------------------------------------------------------------------------- /CellDART/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | from tensorflow.keras.utils import to_categorical 6 | 7 | def random_mix(Xs, ys, nmix=5, n_samples=10000, seed=0): 8 | # Define empty lists 9 | Xs_new, ys_new =[], [] 10 | ys_ = to_categorical(ys) 11 | 12 | rstate = np.random.RandomState(seed) 13 | fraction_all = rstate.rand(n_samples, nmix) 14 | randindex_all = rstate.randint(len(Xs), size=(n_samples,nmix)) 15 | 16 | for i in range(n_samples): 17 | # fraction: random fraction across the "nmix" number of sampled cells 18 | fraction = fraction_all[i] 19 | fraction = fraction/np.sum(fraction) 20 | fraction = np.reshape(fraction, (nmix,1)) 21 | 22 | # Random selection of the single cell data by the index 23 | randindex = randindex_all[i] 24 | ymix = ys_[randindex] 25 | # Calculate the fraction of cell types in the cell mixture 26 | yy = np.sum(ymix*fraction, axis=0) 27 | # Calculate weighted gene expression of the cell mixture 28 | XX = np.asarray(Xs[randindex])*fraction 29 | XX_ = np.sum(XX, axis=0) 30 | 31 | # Add cell type fraction & composite gene expression in the list 32 | ys_new.append(yy) 33 | Xs_new.append(XX_) 34 | 35 | Xs_new = np.asarray(Xs_new) 36 | ys_new = np.asarray(ys_new) 37 | 38 | return Xs_new, ys_new -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CellDART 2 | Title: Cell type inference by domain adaptation of single-cell and spatial transcriptomic data 3 | Version: 0.1.1 4 | Authors@R: 5 | c(person(given = "Hongyoon", 6 | family = "Choi", 7 | role = c("aut"), 8 | comment = c(ORCID = "0000-0002-8895-2449")), 9 | person(given = "Sungwoo", 10 | family = "Bae", 11 | role = c("aut", "cre"), 12 | email = "bsungwoo@snu.ac.kr", 13 | comment = c(ORCID = "0000-0002-3484-3749"))) 14 | Description: CellDART estimates the spatial distribution of cells defined by single-cell level data using domain adaptation of neural networks. The neural network that predicts the cell proportion in a pseudospot, a virtual mixture of cells from single-cell data, is translated to decompose the cell types in each spatial barcoded region. 15 | License: `use_mit_license()` 16 | Encoding: UTF-8 17 | Roxygen: list(markdown = TRUE) 18 | RoxygenNote: 7.2.0 19 | Imports: 20 | Seurat (>= 4.0.5), 21 | dplyr (>= 1.0.7), 22 | reticulate (>= 1.22), 23 | rmarkdown (>= 2.14) 24 | Suggests: 25 | knitr 26 | VignetteBuilder: knitr 27 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(pred_cellf_celldart) 4 | -------------------------------------------------------------------------------- /R/CellDART_R.R: -------------------------------------------------------------------------------- 1 | #' R wrap function to implement CellDART 2 | #' @description Cell type inference by domain adaptation of single-cell and spatial transcriptomic data 3 | #' 4 | #' @param sp_data spatial data (Seurat object) to be used in predicting cell fraction: non-normalized raw data should be in 'counts' slot 5 | #' @param sc_data single-cell data (Seurat object) to be used in making pseudospots: non-normalized raw data should be in 'counts' slot 6 | #' 7 | #' @param outdir the directory to save output files (models and results) (default = '.') 8 | #' 9 | #' @param sp_subset whether to subset spatial data and calculate for specific spot cluster (default = FALSE) 10 | #' @param spot.cluster.name group name of the cluster used for subsetting spatial data (default: 'seurat_clusters') 11 | #' @param spot.cluster.of.interest name of each spot clusters to be used (default: NULL) 12 | #' @param metadata_celltype column name for single-cell annotation data in metadata (default: 'celltype') 13 | #' 14 | #' @param env.select select between using reticulate virtual environment or conda environment (default: "conda") 15 | #' @param python.install whether to automatically install python version 3.8.12 16 | #' 17 | #' @param python_path path for the python 3.8. (default: NULL) 18 | #' \itemize{ 19 | #' \item If NULL, python version 3.8.12 will be installed (valid for Linux) 20 | #' \item If "current", python interpreter associated with current virtual env (ex: r-reticulate) will be used. (version should be 3.8) 21 | #' } 22 | #' 23 | #' @param env.name name of the virtual or conda environment to use for CellDART analysis (default: 'CellDART') 24 | #' 25 | #' @param gpu check whether to use gpu (True) or not (False) (default = True) 26 | #' @param metadata_celltype column name for single-cell annotation data in metadata (default: 'celltype') 27 | #' @param num_markers number of selected marker genes in each cell-type (default = 20) 28 | #' @param seed_num seed to be used in random sampling (default = 0) 29 | #' @param nmix the number of cells sampled from single-cell data when making a pseudospot (default = 10) 30 | #' @param npseudo a total number of pseudospots (default = 20000) 31 | #' 32 | #' @param alpha loss weights of domain classifier to the source classifier (default = 0.6) 33 | #' @param alpha_lr learning rate for the domain classifier (alpha_lr*0.001, default = 5) 34 | #' @param emb_dim output size of dimensions for feature extractor (default = 64) 35 | #' 36 | #' @param batch_size minibatch size for pseudospots and spatial data during the training (default = 512) 37 | #' @param n_iterations iteration number for the adversarial learning (default = 3000) 38 | #' @param init_train_epoch iteration number for the pre-training process (default = 10) 39 | #' 40 | #' @return spatial data (Seurat object) with predicted cell fraction in metadata (meta.data) 41 | #' @examples 42 | #' Using conda environment (environment will be automatically installed in Linux distributions) 43 | #' If using Windows, then install conda environment first and then run the function below with python.install = F 44 | #' sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 45 | #' sp_subset=F, spot.cluster.name='seurat_clusters', 46 | #' spot.cluster.of.interest=NULL, 47 | #' env.select='conda',python.install=T, 48 | #' python_path=NULL, env.name='CellDART', 49 | #' gpu=TRUE, metadata_celltype='celltype', 50 | #' num_markers=20, seed_num=0, 51 | #' nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 52 | #' emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 53 | #' 54 | #' Using virtual environment (environment will be automatically installed in Linux distributions) 55 | #' Not recommended for Windows 56 | #' sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 57 | #' sp_subset=F, spot.cluster.name='seurat_clusters', 58 | #' spot.cluster.of.interest=NULL, 59 | #' env.select='virtual',python.install=T, 60 | #' python_path=NULL, env.name='CellDART', 61 | #' gpu=TRUE, metadata_celltype='celltype', 62 | #' num_markers=20, seed_num=0, 63 | #' nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 64 | #' emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 65 | #' @export 66 | pred_cellf_celldart <- function(sp_data, sc_data, outdir='.', 67 | sp_subset=FALSE, spot.cluster.name='seurat_clusters', 68 | spot.cluster.of.interest=NULL, 69 | env.select='conda', python.install=F, 70 | python_path=NULL, env.name='CellDART', 71 | gpu=TRUE, metadata_celltype='celltype', 72 | num_markers=20, seed_num=0, 73 | nmix=8, npseudo=20000, alpha=0.6, alpha_lr=5, 74 | emb_dim=64, batch_size=512, n_iterations=3000, 75 | init_train_epoch=10){ 76 | # Suppress warnings 77 | defaultW <- getOption("warn") 78 | options(warn = -1) 79 | 80 | if (python.install){ 81 | reticulate::install_python(version = '3.8.12') 82 | } 83 | 84 | # Select between using reticulate virtual environment or conda environment ("virtual" or "conda") 85 | if (env.select=="virtual"){ 86 | # Setting virtual environment with reticulate 87 | if (!(env.name %in% reticulate::virtualenv_list())){ 88 | ## Python dependencies use python version 3.8 89 | if (is.null(python_path)){ 90 | reticulate::virtualenv_create(envname = env.name, version = '3.8.12') 91 | } else if (python_path=="current") { 92 | reticulate::virtualenv_create(envname = env.name, python = NULL) 93 | } else { 94 | reticulate::virtualenv_create(envname = env.name, python = python_path) 95 | } 96 | # python_depend = c("scanpy==1.9.1","pandas","numpy", 97 | # "tensorflow==2.9.1","tensorflow-gpu==2.9.1") 98 | 99 | # Create virtual env and install dependencies 100 | reticulate::virtualenv_install(env.name, packages = 'pip', ignore_installed=T, 101 | pip_options = "git+https://github.com/mexchy1000/CellDART.git") 102 | reticulate::use_virtualenv(env.name, required = T) 103 | } 104 | # Apply virtual environment 105 | reticulate::use_virtualenv(env.name, required = T) 106 | } else if (env.select=="conda"){ 107 | if (!(env.name %in% reticulate::conda_list()[['name']])){ 108 | ## Python dependencies use python version 3.8 109 | if (is.null(python_path)){ 110 | reticulate::conda_create(envname = env.name, python_version = '3.8.12') 111 | } else if (python_path=="current") { 112 | reticulate::conda_create(envname = env.name, python = NULL) 113 | } else { 114 | reticulate::conda_create(envname = env.name, python = python_path) 115 | } 116 | 117 | # Create conda env and install dependencies 118 | reticulate::conda_install(env.name, ignore_installed=T, 119 | pip = TRUE, "git+https://github.com/mexchy1000/CellDART.git") 120 | } 121 | # Apply conda environment 122 | reticulate::use_condaenv(env.name, required = T) 123 | } else { 124 | stop("'env.select' should be either 'virtual' or 'conda'") 125 | } 126 | 127 | ## Import anndata 128 | ann <- reticulate::import('anndata', convert = FALSE) 129 | 130 | ## Import python function 131 | CellDART <- reticulate::import('CellDART', convert = FALSE) 132 | 133 | ## 1. Saving single-cell data in anndata format 134 | # Define count matrix 135 | sparse_mtx <- Seurat::GetAssayData(sc_data, slot = "counts", assay = "RNA") 136 | 137 | # Define obs and var (reference from sceasy library: https://github.com/cellgeni/sceasy) 138 | obs <- sc_data@meta.data 139 | if (!metadata_celltype %in% colnames(obs)){ 140 | stop("Column name for the cell annotation should be provided.") 141 | } else { 142 | obs <- obs[metadata_celltype] 143 | obs[[metadata_celltype]] <- factor(obs[[metadata_celltype]]) 144 | } 145 | var <- data.frame(matrix(nrow=dim(sc_data)[1],ncol=0, 146 | dimnames = list(rownames(sc_data),NULL))) 147 | var[['name']] <- rownames(var) 148 | 149 | adata_sc <- ann$AnnData( 150 | X = Matrix::t(sparse_mtx), 151 | obs = obs, 152 | var = var 153 | ) 154 | 155 | ## 2. Subsetting spatial data and save in anndata format 156 | if (sp_subset){ 157 | cluster_info <- sp_data[[spot.cluster.name]] 158 | Seurat::Idents(sp_data) <- spot.cluster.name 159 | } 160 | 161 | if (is.null(spot.cluster.of.interest)){ 162 | sp_data_sub <- sp_data 163 | } else if (sum(spot.cluster.of.interest%in%levels(cluster_info))==length(spot.cluster.of.interest)){ 164 | sp_data_sub <- subset(sp_data, idents=spot.cluster.of.interest) 165 | } else { 166 | stop("'spot.cluster.of.interest' should be among the levels of 'spot.cluster.name' provided") 167 | } 168 | 169 | # Define count matrix 170 | sparse_mtx <- Seurat::GetAssayData(sp_data_sub, slot = "counts", assay = "Spatial") 171 | 172 | # Define obs and var (reference from sceasy library) 173 | obs <- sp_data_sub@meta.data 174 | var <- data.frame(matrix(nrow=dim(sp_data_sub)[1],ncol=0, 175 | dimnames = list(rownames(sp_data_sub),NULL))) 176 | var[['name']] <- rownames(var) 177 | 178 | adata_sp <- ann$AnnData( 179 | X = Matrix::t(sparse_mtx), 180 | obs = obs, 181 | var = var 182 | ) 183 | 184 | # Assign the output directory for the models generated 185 | if (!file.exists(outdir)){ 186 | dir.create(file.path(outdir, 'results')) 187 | } 188 | out_dir <- file.path(getwd(), outdir, 'results') 189 | 190 | # Run CellDART 191 | try({ 192 | df <- CellDART$pred_cellf_celldart$pred_cellf_celldart(adata_sp=adata_sp, adata_sc=adata_sc, count_from_raw=FALSE, 193 | gpu=gpu, celltype=metadata_celltype, num_markers=num_markers, 194 | nmix=nmix, npseudo=npseudo, alpha=alpha, alpha_lr=alpha_lr, 195 | batch_size=batch_size, emb_dim=emb_dim, n_iterations=n_iterations, 196 | init_train_epoch=init_train_epoch, 197 | outdir=out_dir, return_anndata=FALSE) 198 | 199 | # Saving cell fraction data into the metadata of spatial Seurat object 200 | sp_data_sub <- Seurat::AddMetaData(sp_data_sub, reticulate::py_to_r(df)) 201 | }) 202 | 203 | options(warn = defaultW) 204 | return(sp_data_sub) 205 | } 206 | -------------------------------------------------------------------------------- /R/Read_R_wrap.md: -------------------------------------------------------------------------------- 1 | ## R wrap function for CellDART 2 | ```Plain Text 3 | devtools::install_github("mexchy1000/CellDART", build_vignettes = T, force = T) 4 | library(CellDART) 5 | help(pred_cellf_celldart) # Explanation for the parameters and short examples 6 | browseVignettes("CellDART") # Browse for the vignettes (/doc/introduction.html) 7 | ``` 8 | ### Installation in Linux distributions 9 | Virtual environment (env.select="virtual") or conda environment (env.select="conda") will be automatically installed while running function 'pred_cellf_celldart' 10 | ### Installation in Windows 11 | Install conda environment first and then run the function with env.select='conda' and python.install=F 12 | Example: Please refer to the '/vignettes/introduction.Rmd' file. 13 | 14 | ### Datasets 15 | #### 1. Description 16 | Example single-cell Seurat object file: sc_data.rds (GSE115746: mouse from ALS and VISp) 17 | Example spatial Seurat object file: sp_data.rds 18 | (10X Genomics Data Repository: V1_Mouse_Brain_Sagittal_Anterior, V1_Mouse_Brain_Sagittal_Posterior) 19 | 20 | #### 2. Download 21 | sc_data.rds and sp_data.rds can be downloaded from: 22 | https://drive.google.com/drive/folders/1zvn3rJ6vH-LYPNnuWIjqb8USSfp7YfIy?usp=sharing 23 | 24 | ### Potential error in reticulate::install_python 25 | "ModuleNotFoundError: No module named '_ctypes'" 26 | Then try on the below command (suggested from https://stackoverflow.com/questions/27022373) 27 | sudo apt-get -y update 28 | sudo apt-get -y upgrade 29 | sudo apt-get -y dist-upgrade 30 | sudo apt-get -y install build-essential python-dev python-setuptools python-pip python-smbus 31 | sudo apt-get -y install libncursesw5-dev libgdbm-dev libc6-dev 32 | sudo apt-get -y install zlib1g-dev libsqlite3-dev tk-dev 33 | sudo apt-get -y install libssl-dev openssl 34 | sudo apt-get -y install libffi-dev 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CellDART: Cell type inference by domain adaptation of single-cell and spatial transcriptomic data 2 | CellDART is a tool to estimate cell fraction of spatial transcriptomic spots using domain adaptation of deep neural networks. 3 | * Cite as: Bae S, Na KJ, Koh J, Lee DS, Choi H, Kim YT. CellDART: cell type inference by domain adaptation of single-cell and spatial transcriptomic data. Nucleic Acids Res. 2022;50(10):e57. 4 | ![figure1png](https://user-images.githubusercontent.com/14209383/114880774-528b8100-9e3d-11eb-9b60-41c9d0acd5fd.png) 5 | 6 | ## Optimal parameter choices (for brain) 7 | Number of total marker genes = 200 ~ 400 (or number of markers per cluster: 10 ~ 20) 8 | Number of pseudospots = 5 to 10 times the number of real spots (20,000~40,000 per Visium slide) 9 | Number of sampled cells in a pseudospot (virtual mixture of single-cell data) = 8 10 | Iteration number = 3,000 11 | Mini-batch size = 512 12 | Loss weights between source and domain classifier (alpha) = 0.6 13 | Learning rate = 0.001 * alpha_lr = 0.005 14 | 15 | ## Code Example 16 | python: CellDART_example_mousebrain_markers.ipynb 17 | -> example file that shows the whole process step by step 18 | -> the pred_cellf_celldart function is a compressed version of all the steps shown in the notebook file 19 | -> see the below example how to use the function 20 | R wrap: Please refer to the '/vignettes/introduction.Rmd' file 21 | 22 | ## Python function for CellDART (pred_cellf_celldart) 23 | ### Install conda environment and add jupyter kernel 24 | ```Plain Text 25 | conda create -n CellDART python=3.8 26 | conda activate CellDART 27 | pip install git+https://github.com/mexchy1000/CellDART.git 28 | python -m ipykernel install --user --name CellDART --display-name CellDART 29 | ``` 30 | ### Dependency (python) 31 | ```Plain Text 32 | python 3.8 33 | tensorflow 2.9.1 34 | numpy 1.20.3 35 | pandas 1.4.3 36 | scanpy 1.9.1 37 | jupyter 1.0.0 38 | ``` 39 | ### Function and parameters 40 | ```Plain Text 41 | from CellDART.pred_cellf_celldart import pred_cellf_celldart 42 | adata_sp = pred_cellf_celldart(adata_sp=adata_sp, adata_sc=adata_sc, count_from_raw = False, 43 |             gpu=True, celltype='celltype', num_markers=20, 44 |              nmix=8, npseudo=20000, alpha=0.6, alpha_lr=5, batch_size=512, 45 |              emb_dim=64, n_iterations=3000, init_train_epoch=10, 46 |              outdir='./CellDART_output', return_anndata=True) 47 | ``` 48 | **(1) adata_sp:** spatial data (AnnData object) with raw count matrix to be used in predicting cell fraction (default: None) 49 | **(2) adata_sc:** single-cell data (AnnData object) with raw count matrix to be used in making pseudospots (default: None) 50 | **(3) count_from_raw:** whether to extract count matrix frow .raw of AnnData (default: False) 51 | -> non-normalized raw count matrix should be contained in the AnnData .raw file 52 | -> if False, then utilize the count matrices saved in adata_sp and adata_sc directly 53 | **(4) gpu:** check whether to use gpu (True) or not (False) (default = True) 54 | **(5) celltype:** column name for single-cell annotation data in .obs (default: 'celltype') 55 | **(6) num_markers:** number of selected marker genes in each celltype (default = 20) 56 | **(7) nmix:** sampling number of cells in pseudospot (default = 10) 57 | **(8) npseudo:** a total number of pseudospots (default = 20,000) 58 | **(9) alpha:** loss weights of the domain classifier to the source classifier (default = 0.6) 59 | **(10) alpha_lr:** learning rate for the domain classifier (alpha_lr*0.001, default = 5) 60 | **(11) batch_size:** minibatch size for pseudospots and spatial data during the training (default = 512) 61 | **(12) n_iterations:** iteration number for the adversarial learning (default = 3,000) 62 | **(13) init_train_epoch:** iteration number for the pre-training process (default = 10) 63 | **(14) outdir:** the directory to save output files (models and results) 64 | **(15) return_anndata:** whether to return spatial AnnData file with predicted cell fraction in .obs (default: False) 65 | 66 | ## R wrap function for CellDART using reticulate 67 | ```Plain Text 68 | devtools::install_github("mexchy1000/CellDART", build_vignettes = T, force = T) 69 | library(CellDART) 70 | help(pred_cellf_celldart) # Explanation for the parameters and short examples 71 | browseVignettes("CellDART") # Browse for the vignettes (/vignettes/introduction.Rmd) 72 | ``` 73 | ### Function and additional parameters 74 | ```Plain Text 75 | # Using conda environment (environment will be automatically installed in Linux distributions) 76 | # If using Windows, then install conda environment first and then run the function below with python.install = F 77 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 78 | sp_subset=F, spot.cluster.name='seurat_clusters', 79 | spot.cluster.of.interest=NULL, 80 | env.select='conda',python.install=T, 81 | python_path=NULL, env.name='CellDART', 82 | gpu=TRUE, metadata_celltype='celltype', 83 | num_markers=20, seed_num=0, 84 | nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 85 | emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 86 | ``` 87 | ```Plain Text 88 | # Using virtual environment (environment will be automatically installed in Linux distributions) 89 | # Not recommended for Windows 90 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 91 | sp_subset=F, spot.cluster.name='seurat_clusters', 92 | spot.cluster.of.interest=NULL, 93 | env.select='virtual',python.install=T, 94 | python_path=NULL, env.name='CellDART', 95 | gpu=TRUE, metadata_celltype='celltype', 96 | num_markers=20, seed_num=0, 97 | nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 98 | emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 99 | ``` 100 | **(1) outdir:** the directory to save output files (models and results) (default = '.') 101 | **(2) sp_subset:** whether to subset spatial data and calculate for specific spot cluster (default = FALSE) 102 | **(3) spot.cluster.name:** group name of the cluster used for subsetting spatial data (default = 'seurat_clusters') 103 | **(4) spot.cluster.of.interest:** name of each spot clusters to be used (default = NULL) 104 | **(5) env.select:** select between using reticulate virtual environment or conda environment (default = 'conda') 105 | -> either of the selection will search the already installed environment 106 | -> if environment is not found, then it will automatically install the new environment 107 | **(6) python.install:** whether to automatically install python version 3.8.13 (default = F) 108 | -> For Windows, set python.install = F 109 | **(7) python_path:** path for the python 3.8.13 (default = NULL) 110 | **(8) env.name:** name of the virtual or conda environment to use for the analysis (default = 'CellDART') 111 | **(9) metadata_celltype:** column name for single-cell annotation data in metadata (default = 'celltype') 112 | 113 | ### Dependency (R wrapper) 114 | ```Plain Text 115 | Seurat 4.0.5 116 | dplyr 1.0.7 117 | sceasy 0.0.6 118 | reticulate 1.22 119 | ``` 120 | ### Installation in Linux distributions 121 | Virtual environment (env.select="virtual") or conda environment (env.select="conda") will be automatically installed while running function 'pred_cellf_celldart' 122 | Detailed explanation is in '/R/Read_R_wrap.md' file. 123 | ### Installation in Windows 124 | Install conda environment first and then run the function with env.select='conda' and python.install=F 125 | 126 | 127 | ## R shiny application for CellDART 128 | Shiny application for preprocessing and CellDART analysis. (inside 'shiny') 129 | Refer to the STquantool application: [STquantool](https://github.com/bsungwoo/STquantool) 130 | -------------------------------------------------------------------------------- /celldart_env.yaml: -------------------------------------------------------------------------------- 1 | name: CellDART 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - keras=2.9.0 8 | - leidenalg=0.8.10 9 | - matplotlib=3.5.2 10 | - numpy=1.20.3 11 | - pandas=1.4.3 12 | - python=3.8.13 13 | - python-igraph=0.9.11 14 | - scanpy=1.9.1 15 | - scikit-learn=1.1.1 16 | - scipy=1.8.1 17 | - seaborn=0.11.2 18 | - statsmodels=0.13.2 19 | - tensorflow=2.9.1 20 | - tensorflow-gpu=2.9.1 21 | - umap-learn=0.5.3 22 | prefix: C:\Users\USER\Anaconda3\envs\CellDART -------------------------------------------------------------------------------- /da_cellfraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.layers import Input, Dense, Activation, BatchNormalization, Dropout, Lambda 3 | from keras.models import Model 4 | from keras.utils import to_categorical 5 | from keras import losses 6 | from sklearn.datasets import make_blobs 7 | from sklearn.metrics import accuracy_score 8 | from keras import optimizers 9 | 10 | def build_models(inp_dim, emb_dim, n_cls_source, alpha=2, alpha_lr=10): 11 | inputs = Input(shape=(inp_dim,)) 12 | x4 = Dense(1024, activation='linear')(inputs) 13 | x4 = BatchNormalization()(x4) 14 | x4 = Activation("elu")(x4) 15 | x4 = Dense(emb_dim, activation='linear')(x4) 16 | x4 = BatchNormalization()(x4) 17 | x4 = Activation("elu")(x4) 18 | 19 | source_classifier = Dense(n_cls_source, activation='linear', name="mo1")(x4) 20 | source_classifier = Activation('softmax', name='mo')(source_classifier) 21 | 22 | domain_classifier = Dense(32, activation='linear', name="do4")(x4) 23 | domain_classifier = BatchNormalization(name="do5")(domain_classifier) 24 | domain_classifier = Activation("elu", name="do6")(domain_classifier) 25 | domain_classifier = Dropout(0.5)(domain_classifier) 26 | domain_classifier = Dense(2, activation='softmax', name="do")(domain_classifier) 27 | 28 | comb_model = Model(inputs=inputs, outputs=[source_classifier, domain_classifier]) 29 | comb_model.compile(optimizer="Adam", 30 | loss={'mo': 'kld', 'do': 'categorical_crossentropy'}, 31 | loss_weights={'mo': 1, 'do': alpha}, metrics=['accuracy'], ) 32 | 33 | source_classification_model = Model(inputs=inputs, outputs=[source_classifier]) 34 | source_classification_model.compile(optimizer=optimizers.adam(lr=0.001), 35 | loss={'mo': 'kld'}, metrics=['mae'], ) 36 | 37 | 38 | domain_classification_model = Model(inputs=inputs, outputs=[domain_classifier]) 39 | domain_classification_model.compile(optimizer=optimizers.adam(lr=alpha_lr*0.001), 40 | loss={'do': 'categorical_crossentropy'}, metrics=['accuracy']) 41 | 42 | 43 | embeddings_model = Model(inputs=inputs, outputs=[x4]) 44 | embeddings_model.compile(optimizer="Adam",loss = 'categorical_crossentropy', metrics=['accuracy']) 45 | 46 | 47 | return comb_model, source_classification_model, domain_classification_model, embeddings_model 48 | 49 | def batch_generator(data, batch_size): 50 | """Generate batches of data. 51 | Given a list of numpy data, it iterates over the list and returns batches of the same size 52 | This 53 | """ 54 | all_examples_indices = len(data[0]) 55 | while True: 56 | mini_batch_indices = np.random.choice(all_examples_indices, size=batch_size, replace=False) 57 | tbr = [k[mini_batch_indices] for k in data] 58 | yield tbr 59 | 60 | 61 | def train( Xs, ys, Xt, yt=None, 62 | emb_dim=2, 63 | batch_size = 64, 64 | enable_dann = True, 65 | n_iterations = 1000, 66 | alpha=2, 67 | alpha_lr=10, 68 | initial_train=True, 69 | initial_train_epochs=100): 70 | 71 | 72 | inp_dim = Xs.shape[1] 73 | ncls_source = ys.shape[1] 74 | 75 | model, source_classification_model, domain_classification_model, embeddings_model = \ 76 | build_models(inp_dim, emb_dim, ncls_source, alpha=alpha, alpha_lr = alpha_lr) 77 | 78 | if initial_train: 79 | source_classification_model.fit(Xs, ys, batch_size= batch_size, epochs=initial_train_epochs) 80 | print("initial_train_done") 81 | y_adversarial_1 = to_categorical(np.array(([1] * batch_size + [0] * batch_size))) 82 | 83 | sample_weights_class = np.array(([1] * batch_size + [0] * batch_size)) 84 | sample_weights_adversarial = np.ones((batch_size * 2,)) 85 | 86 | S_batches = batch_generator([Xs, ys], batch_size) 87 | T_batches = batch_generator([Xt, np.zeros(shape = (len(Xt),2))], batch_size) 88 | 89 | for i in range(n_iterations): 90 | # # print(y_class_dummy.shape, ys.shape) 91 | y_adversarial_2 = to_categorical(np.array(([0] * batch_size + [1] * batch_size))) 92 | 93 | X0, y0 = next(S_batches) 94 | X1, y1 = next(T_batches) 95 | 96 | 97 | X_adv = np.concatenate([X0, X1]) 98 | y_class = np.concatenate([y0, np.zeros_like(y0)]) 99 | 100 | adv_weights = [] 101 | for layer in model.layers: 102 | if (layer.name.startswith("do")): 103 | adv_weights.append(layer.get_weights()) 104 | 105 | if(enable_dann): 106 | # note - even though we save and append weights, the batchnorms moving means and variances 107 | # are not saved throught this mechanism 108 | model.train_on_batch(X_adv, [y_class, y_adversarial_1], 109 | sample_weight=[sample_weights_class, sample_weights_adversarial]) 110 | 111 | k = 0 112 | for layer in model.layers: 113 | if (layer.name.startswith("do")): 114 | layer.set_weights(adv_weights[k]) 115 | k += 1 116 | 117 | class_weights = [] 118 | 119 | 120 | for layer in model.layers: 121 | if (not layer.name.startswith("do")): 122 | class_weights.append(layer.get_weights()) 123 | 124 | domain_classification_model.train_on_batch(X_adv, [y_adversarial_2]) 125 | 126 | k = 0 127 | for layer in model.layers: 128 | if (not layer.name.startswith("do")): 129 | layer.set_weights(class_weights[k]) 130 | k += 1 131 | 132 | else: 133 | source_classification_model.train_on_batch(X0,y0) 134 | 135 | 136 | if yt is None: 137 | if ((i + 1) % 100 == 0): 138 | # print(i, stats) 139 | sourceloss, sourceacc = source_classification_model.evaluate(Xs, ys,verbose=0) 140 | domainloss,domainacc = domain_classification_model.evaluate(np.concatenate([Xs, Xt]), 141 | to_categorical(np.array(([1] * Xs.shape[0] + [0] * Xt.shape[0]))), 142 | verbose=0) 143 | print("Iteration %d, source loss = %.3f, discriminator acc = %.3f"%(i, sourceloss ,domainacc)) 144 | else: 145 | if ((i + 1) % 100 == 0): 146 | # print(i, stats) 147 | y_test_hat_t = source_classification_model.predict(Xt).argmax(1) 148 | y_test_hat_s = source_classification_model.predict(Xs).argmax(1) 149 | print("Iteration %d, source accuracy = %.3f, target accuracy = %.3f"%(i, accuracy_score(ys, y_test_hat_s), accuracy_score(yt, y_test_hat_t))) 150 | 151 | return embeddings_model, source_classification_model 152 | 153 | -------------------------------------------------------------------------------- /data/datafile.md: -------------------------------------------------------------------------------- 1 | Data file 2 | - Visium Data 3 | sc.datasets.visium_sge function will download Visium data (Please see scanpy) 4 | 5 | -Single cell data 6 | download from GEO(GSE115746) 7 | GSE115746_cells_exon_counts.csv 8 | GSE115746_complete_metadata_28706-cells.csv 9 | -------------------------------------------------------------------------------- /main_counts.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import pandas as pd 3 | import numpy as np 4 | import seaborn as sns 5 | import da_cellfraction 6 | from utils import random_mix 7 | from sklearn.manifold import TSNE 8 | 9 | sc.logging.print_versions() 10 | sc.set_figure_params(facecolor="white", figsize=(8, 8)) 11 | sc.settings.verbosity = 3 12 | 13 | adata_spatial_anterior = sc.datasets.visium_sge( 14 | sample_id="V1_Mouse_Brain_Sagittal_Anterior" 15 | ) 16 | adata_spatial_posterior = sc.datasets.visium_sge( 17 | sample_id="V1_Mouse_Brain_Sagittal_Posterior" 18 | ) 19 | 20 | #Normalize and log1P 21 | for adata in [ 22 | adata_spatial_anterior, 23 | adata_spatial_posterior, 24 | ]: 25 | sc.pp.normalize_total(adata, inplace=True) 26 | #sc.pp.log1p(adata) 27 | #sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True) 28 | 29 | 30 | ################## 31 | #Sc data GSE115746 32 | 33 | adata_cortex = sc.read_csv('../data/GSE115746_cells_exon_counts.csv').T 34 | adata_cortex_meta = pd.read_csv('../data/GSE115746_complete_metadata_28706-cells.csv', index_col=0) 35 | adata_cortex_meta_ = adata_cortex_meta.loc[adata_cortex.obs.index,] 36 | 37 | adata_cortex.obs = adata_cortex_meta_ 38 | 39 | adata_cortex.var_names_make_unique() 40 | 41 | adata_cortex.var['mt'] = adata_cortex.var_names.str.startswith('Mt-') # annotate the group of mitochondrial genes as 'mt' 42 | sc.pp.calculate_qc_metrics(adata_cortex, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) 43 | 44 | sc.pp.normalize_total(adata_cortex) 45 | #sc.pp.log1p(adata_cortex) 46 | #sc.pp.highly_variable_genes(adata_cortex,flavor="seurat", n_top_genes=2000, inplace=True) 47 | 48 | #PCA and clustering 49 | sc.tl.pca(adata_cortex, svd_solver='arpack') 50 | sc.pp.neighbors(adata_cortex, n_neighbors=10, n_pcs=40) 51 | sc.tl.umap(adata_cortex) 52 | sc.tl.leiden(adata_cortex, resolution = 0.5) 53 | sc.pl.umap(adata_cortex, color=['leiden','cell_subclass']) 54 | 55 | 56 | #Int Genes 57 | adata_spatial_anterior.var_names_make_unique() 58 | inter_genes = [val for val in adata_cortex.var.index if val in adata_spatial_anterior.var.index] 59 | adata_cortex = adata_cortex[:,inter_genes] 60 | 61 | adata_spatial_anterior = adata_spatial_anterior[:,inter_genes] 62 | 63 | ##### 64 | #To arrays# 65 | ########### 66 | mat_sc = adata_cortex.X 67 | mat_sp = adata_spatial_anterior.X.todense() 68 | 69 | df_sc = adata_cortex.obs 70 | lab_sc = np.asarray(df_sc.leiden, dtype='int') 71 | 72 | lab_sc_sub = df_sc.cell_subclass 73 | sc_sub_dict = dict(zip(range(len(set(lab_sc_sub))), set(lab_sc_sub))) 74 | sc_sub_dict2 = dict((y,x) for x,y in sc_sub_dict.items()) 75 | lab_sc_num = [sc_sub_dict2[ii] for ii in lab_sc_sub] 76 | lab_sc_num = np.asarray(lab_sc_num, dtype='int') 77 | 78 | sc_mix, lab_mix = random_mix(mat_sc, lab_sc_num, nmix=5, n_samples=2000) 79 | 80 | def log_minmaxscale(arr): 81 | arrd = len(arr) 82 | arr = np.log1p(arr) 83 | return (arr-np.reshape(np.min(arr,axis=1), (arrd,1)))/np.reshape((np.max(arr, axis=1)-np.min(arr,axis=1)),(arrd,1)) 84 | 85 | sc_mix_s = log_minmaxscale(sc_mix) 86 | mat_sp_s = log_minmaxscale(mat_sp) 87 | mat_sc_s = log_minmaxscale(mat_sc) 88 | 89 | embs, clssmodel = da_cellfraction.train(sc_mix_s, lab_mix, mat_sp_s, enable_dann = True, 90 | alpha=1, alpha_lr=10, emb_dim = 64, batch_size = 512, 91 | n_iterations = 2000, 92 | initial_train=True, 93 | initial_train_epochs=10) 94 | 95 | #Predicted Embedding 96 | z_sc = embs.predict(mat_sc_s) 97 | z_mix = embs.predict(sc_mix_s) 98 | z_sp = embs.predict(mat_sp_s) 99 | 100 | pred_mix = clssmodel.predict(sc_mix_s) 101 | 102 | z_mixsp = np.concatenate([z_mix, z_sp], axis=0) 103 | z_mixtsne = TSNE(n_components=2).fit_transform(z_mixsp) 104 | sns.scatterplot(x=z_mixtsne[:,0], y= z_mixtsne[:,1], 105 | hue = [0]*z_mix.shape[0]+[1]*z_sp.shape[0], alpha=0.1, size=1, 106 | linewidth=0) 107 | 108 | z_scsp= np.concatenate([z_sc,z_sp], axis=0) 109 | z_tsne = TSNE(n_components=2).fit_transform(z_scsp) 110 | 111 | sns.scatterplot(x=z_tsne[:,0], y= z_tsne[:,1], 112 | hue = [0]*z_sc.shape[0]+[1]*z_sp.shape[0], alpha=0.1, size=1, 113 | linewidth=0) 114 | 115 | pred_sc = clssmodel.predict(mat_sc_s) 116 | pred_sc_ = np.argmax(pred_sc, axis=1) 117 | 118 | pred_sp = clssmodel.predict(mat_sp_s) 119 | pred_sp_ = np.argmax(pred_sp, axis=1) 120 | 121 | sns.scatterplot(x=z_tsne[:z_sc.shape[0],0], 122 | y= z_tsne[:z_sc.shape[0],1], 123 | hue = [str(i) for i in lab_sc_sub.tolist()], 124 | palette = 'Set1', 125 | alpha=0.1, size=1, 126 | linewidth=0) 127 | 128 | sns.scatterplot(x=z_tsne[:z_sc.shape[0],0], 129 | y= z_tsne[:z_sc.shape[0],1], 130 | hue = [str(i) for i in pred_sc_.tolist()], 131 | palette = 'Set1', 132 | alpha=0.1, size=1, 133 | linewidth=0) 134 | 135 | sns.scatterplot(x=z_tsne[z_sc.shape[0]:,0], 136 | y= z_tsne[z_sc.shape[0]:,1], 137 | hue = [str(i) for i in pred_sp_.tolist()], 138 | palette = 'Set1', 139 | alpha=0.5, size=1, 140 | linewidth=0) 141 | 142 | #Score for specific cell types.. 143 | visnum=8 144 | adata_spatial_anterior.obs['Pred_label'] = pred_sp[:,visnum] 145 | sc.pl.spatial( 146 | adata_spatial_anterior, 147 | img_key="hires", 148 | color='Pred_label', 149 | palette='Set1', 150 | size=1.5, 151 | legend_loc=None, 152 | title = sc_sub_dict[visnum]) 153 | 154 | #Model Save 155 | embs.save_weights('./Model/embedder_200805.h5') 156 | clssmodel.save_weights('./Model/classifier_200805.h5') 157 | -------------------------------------------------------------------------------- /man/pred_cellf_celldart.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellDART_R.R 3 | \name{pred_cellf_celldart} 4 | \alias{pred_cellf_celldart} 5 | \title{R wrap function to implement CellDART} 6 | \usage{ 7 | pred_cellf_celldart( 8 | sp_data, 9 | sc_data, 10 | outdir = ".", 11 | sp_subset = FALSE, 12 | spot.cluster.name = "seurat_clusters", 13 | spot.cluster.of.interest = NULL, 14 | env.select = "conda", 15 | python.install = F, 16 | python_path = NULL, 17 | env.name = "CellDART", 18 | gpu = TRUE, 19 | metadata_celltype = "celltype", 20 | num_markers = 20, 21 | seed_num = 0, 22 | nmix = 8, 23 | npseudo = 20000, 24 | alpha = 0.6, 25 | alpha_lr = 5, 26 | emb_dim = 64, 27 | batch_size = 512, 28 | n_iterations = 3000, 29 | init_train_epoch = 10 30 | ) 31 | } 32 | \arguments{ 33 | \item{sp_data}{spatial data (Seurat object) to be used in predicting cell fraction: non-normalized raw data should be in 'counts' slot} 34 | 35 | \item{sc_data}{single-cell data (Seurat object) to be used in making pseudospots: non-normalized raw data should be in 'counts' slot} 36 | 37 | \item{outdir}{the directory to save output files (models and results) (default = '.')} 38 | 39 | \item{sp_subset}{whether to subset spatial data and calculate for specific spot cluster (default = FALSE)} 40 | 41 | \item{spot.cluster.name}{group name of the cluster used for subsetting spatial data (default: 'seurat_clusters')} 42 | 43 | \item{spot.cluster.of.interest}{name of each spot clusters to be used (default: NULL)} 44 | 45 | \item{env.select}{select between using reticulate virtual environment or conda environment (default: "conda")} 46 | 47 | \item{python.install}{whether to automatically install python version 3.8.12} 48 | 49 | \item{python_path}{path for the python 3.8. (default: NULL) 50 | \itemize{ 51 | \item If NULL, python version 3.8.12 will be installed (valid for Linux) 52 | \item If "current", python interpreter associated with current virtual env (ex: r-reticulate) will be used. (version should be 3.8) 53 | }} 54 | 55 | \item{env.name}{name of the virtual or conda environment to use for CellDART analysis (default: 'CellDART')} 56 | 57 | \item{gpu}{check whether to use gpu (True) or not (False) (default = True)} 58 | 59 | \item{metadata_celltype}{column name for single-cell annotation data in metadata (default: 'celltype')} 60 | 61 | \item{num_markers}{number of selected marker genes in each cell-type (default = 20)} 62 | 63 | \item{seed_num}{seed to be used in random sampling (default = 0)} 64 | 65 | \item{nmix}{the number of cells sampled from single-cell data when making a pseudospot (default = 10)} 66 | 67 | \item{npseudo}{a total number of pseudospots (default = 20000)} 68 | 69 | \item{alpha}{loss weights of domain classifier to the source classifier (default = 0.6)} 70 | 71 | \item{alpha_lr}{learning rate for the domain classifier (alpha_lr*0.001, default = 5)} 72 | 73 | \item{emb_dim}{output size of dimensions for feature extractor (default = 64)} 74 | 75 | \item{batch_size}{minibatch size for pseudospots and spatial data during the training (default = 512)} 76 | 77 | \item{n_iterations}{iteration number for the adversarial learning (default = 3000)} 78 | 79 | \item{init_train_epoch}{iteration number for the pre-training process (default = 10)} 80 | } 81 | \value{ 82 | spatial data (Seurat object) with predicted cell fraction in metadata (meta.data) 83 | } 84 | \description{ 85 | Cell type inference by domain adaptation of single-cell and spatial transcriptomic data 86 | } 87 | \examples{ 88 | Using conda environment (environment will be automatically installed in Linux distributions) 89 | If using Windows, then install conda environment first and then run the function below with python.install = F 90 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 91 | sp_subset=F, spot.cluster.name='seurat_clusters', 92 | spot.cluster.of.interest=NULL, 93 | env.select='conda',python.install=T, 94 | python_path=NULL, env.name='CellDART', 95 | gpu=TRUE, metadata_celltype='celltype', 96 | num_markers=20, seed_num=0, 97 | nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 98 | emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 99 | 100 | Using virtual environment (environment will be automatically installed in Linux distributions) 101 | Not recommended for Windows 102 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.', 103 | sp_subset=F, spot.cluster.name='seurat_clusters', 104 | spot.cluster.of.interest=NULL, 105 | env.select='virtual',python.install=T, 106 | python_path=NULL, env.name='CellDART', 107 | gpu=TRUE, metadata_celltype='celltype', 108 | num_markers=20, seed_num=0, 109 | nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5, 110 | emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 111 | } 112 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name = "CellDART", 5 | version = "0.1.3", 6 | description = "Cell type inference by domain adaptation of single-cell and spatial transcriptomic data", 7 | url = "https://github.com/mexchy1000/CellDART.git", 8 | author = "Hongyoon Choi, Sungwoo Bae", 9 | packages=find_packages(include=['CellDART', 'CellDART.*']), 10 | install_requires = ["tensorflow~=2.9.0","tensorflow-gpu~=2.9.0", 11 | "pandas","numpy", 12 | "scanpy","leidenalg","igraph", 13 | "jupyter","ply","pytest"] 14 | ) 15 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | from keras.utils import to_categorical 6 | 7 | def random_mix(Xs, ys, nmix=5, n_samples=10000): 8 | nclss=len(set(ys)) 9 | Xs_new, ys_new =[], [] 10 | ys_ = to_categorical(ys) 11 | for i in range(n_samples): 12 | yy = np.zeros(nclss) 13 | fraction = np.random.rand(nmix) 14 | fraction = fraction/np.sum(fraction) 15 | fraction = np.reshape(fraction, (nmix,1)) 16 | randindex = np.random.randint(len(Xs), size=nmix) 17 | ymix = ys_[randindex] 18 | yy = np.sum(ymix*np.reshape(fraction, (nmix,1)), axis=0) 19 | XX = Xs[randindex] * fraction 20 | XX_ = np.sum(XX, axis=0) 21 | ys_new.append(yy) 22 | Xs_new.append(XX_) 23 | Xs_new = np.asarray(Xs_new) 24 | ys_new = np.asarray(ys_new) 25 | return Xs_new, ys_new -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/introduction.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{introduction} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | # library(CellDART) 19 | ``` 20 | 21 | # 0. Install required packages 22 | ## Install CellDART 23 | ```{r} 24 | # if (!requireNamespace("CellDART", quietly = TRUE)) 25 | # devtools::install_github("mexchy1000/CellDART") 26 | ``` 27 | 28 | 29 | # 1. Example of using function "pred_cellf_celldart" 30 | ```{r} 31 | # library(SeuratObject) 32 | # 33 | # # Find the directory for active script file (file_path) 34 | # file_path <- rstudioapi::getSourceEditorContext()$path 35 | # file_path <- strsplit(file_path, split=.Platform$file.sep) 36 | # file_path <- paste(file_path[[1]][-length(file_path[[1]])], 37 | # collapse=.Platform$file.sep) 38 | # 39 | # # Set working directory 40 | # setwd(file_path) 41 | # 42 | # # Make output folder 43 | # output_folder_name <- 'CellDART_output' 44 | # if (!file.exists(output_folder_name)){ 45 | # dir.create(output_folder_name) 46 | # } 47 | ``` 48 | 49 | ## Load single-cell and spatial datasets 50 | ### Load single-cell dataset (RDS file with Seurat object): GSE115746 (mouse single cell: ALS and VISp) 51 | ```{r} 52 | # sc_data <- readRDS('sc_data.rds') 53 | ``` 54 | 55 | 56 | ## Load spatial dataset (RDS file with Seurat object): 10X genomics data repository 57 | ### V1_Mouse_Brain_Sagittal_Anterior & V1_Mouse_Brain_Sagittal_Posterior 58 | ```{r} 59 | # sp_data <- readRDS('sp_data.rds') 60 | ``` 61 | 62 | ## Check the size of spatial dataset 63 | ```{r} 64 | # dim(sp_data) 65 | ``` 66 | 67 | 68 | ## Set the number of pseudospots: 5 times the number of spatial spots 69 | ```{r} 70 | # npseudo <- 5*dim(sp_data)[2] 71 | ``` 72 | 73 | ## Perform CellDART analysis 74 | ### Explanation of the function 75 | ```{r} 76 | help(CellDART) 77 | ``` 78 | 79 | 80 | ### Using conda environment (environment will be automatically installed in Linux distributions) 81 | #### If using Windows, then install conda environment first and then run the function below with python.install = F 82 | 83 | ```{r} 84 | # sp_data_cellf <- CellDART::pred_cellf_celldart(sp_data, sc_data, outdir = file.path(output_folder_name), 85 | # sp_subset=F, spot.cluster.name='seurat_clusters', 86 | # spot.cluster.of.interest=NULL, 87 | # env.select='conda',python.install=T, 88 | # python_path=NULL, env.name='CellDART', 89 | # gpu=TRUE, metadata_celltype='cell_subclass', 90 | # num_markers=20, seed_num=0, 91 | # nmix=8, npseudo=npseudo, alpha=0.6,alpha_lr=5, 92 | # emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 93 | ``` 94 | 95 | 96 | ### Using virtual environment (environment will be automatically installed in Linux distributions) 97 | #### Not recommended for Windows 98 | ```{r} 99 | # sp_data_cellf <- CellDART::pred_cellf_celldart(sp_data, sc_data, outdir = file.path(output_folder_name), 100 | # sp_subset=F, spot.cluster.name='seurat_clusters', 101 | # spot.cluster.of.interest=NULL, 102 | # env.select='virtual',python.install=T, 103 | # python_path=NULL, env.name='CellDART', 104 | # gpu=TRUE, metadata_celltype='cell_subclass', 105 | # num_markers=20, seed_num=0, 106 | # nmix=8, npseudo=npseudo, alpha=0.6,alpha_lr=5, 107 | # emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10) 108 | ``` 109 | 110 | 111 | ### Save seurat object with cell fraction 112 | ```{r} 113 | # saveRDS(sp_data_cellf, file.path(output_folder_name, 'sp_data_cellf.rds')) 114 | ``` 115 | 116 | # 2. Visualization of spatial cell fraction 117 | ### Remove '_cellf' from the column names cell fraction metadata 118 | ```{r} 119 | # cellf.data <- sp_data_cellf@meta.data 120 | # cellf.data.colname <- sapply(colnames(cellf.data), function(x){ 121 | # if (grepl('_cellf',x)){return(strsplit(x, split='_cellf')[[1]][1])} 122 | # else {return(x)} 123 | # }) 124 | # sp_data_cellf.mod <- sp_data_cellf 125 | # colnames(sp_data_cellf.mod@meta.data) <- cellf.data.colname 126 | ``` 127 | 128 | ### Visualize the layer-specific excitatory neuons 129 | ```{r} 130 | # cell_types <- c("L2.3.IT","L4","L5.IT","L5.PT","L6b","L6.CT","L6.IT") 131 | # p <- Seurat::SpatialFeaturePlot(sp_data_cellf.mod, features = cell_types, 132 | # ncol = 4, alpha=0.6, combine = FALSE) 133 | # patchwork::wrap_plots(p, ncol = 8) 134 | ``` 135 | --------------------------------------------------------------------------------