├── .Rbuildignore
├── .gitignore
├── CellDART.Rproj
├── CellDART
    ├── __init__.py
    ├── da_cellfraction.py
    ├── pred_cellf_celldart.py
    └── utils.py
├── CellDART_example_mousebrain_markers.ipynb
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── CellDART_R.R
    └── Read_R_wrap.md
├── README.md
├── celldart_env.yaml
├── da_cellfraction.py
├── data
    └── datafile.md
├── main_counts.py
├── man
    └── pred_cellf_celldart.Rd
├── setup.py
├── utils.py
└── vignettes
    ├── .gitignore
    └── introduction.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^CellDART\.Rproj$
2 | ^\.Rproj\.user$
3 | ^doc$
4 | ^Meta$
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | inst/doc
3 | /doc/
4 | /Meta/
5 | 


--------------------------------------------------------------------------------
/CellDART.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | Encoding: UTF-8
 9 | 
10 | AutoAppendNewline: Yes
11 | StripTrailingWhitespace: Yes
12 | LineEndingConversion: Posix
13 | 
14 | BuildType: Package
15 | PackageUseDevtools: Yes
16 | PackageInstallArgs: --no-multiarch --with-keep.source
17 | PackageRoxygenize: rd,collate,namespace
18 | 


--------------------------------------------------------------------------------
/CellDART/__init__.py:
--------------------------------------------------------------------------------
1 | #from .CellDART import da_cellfraction
2 | #from .CellDART import utils
3 | 


--------------------------------------------------------------------------------
/CellDART/da_cellfraction.py:
--------------------------------------------------------------------------------
  1 | # Suppress tensorflow warnings
  2 | import os
  3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  4 | 
  5 | import numpy as np
  6 | from tensorflow.compat.v1 import logging, disable_eager_execution
  7 | logging.set_verbosity(logging.ERROR)
  8 | # Remove compatibility issues
  9 | disable_eager_execution()
 10 | experimental_run_tf_function=False
 11 | 
 12 | from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Dropout
 13 | from tensorflow.keras.models import Model
 14 | from tensorflow.keras.utils import to_categorical
 15 | from tensorflow.keras import optimizers
 16 | from sklearn.metrics import accuracy_score
 17 | 
 18 | ### Build deep learning models for adversarial domain adaptation
 19 | def build_models(inp_dim, emb_dim, n_cls_source, alpha=2, alpha_lr=10):
 20 |     inputs = Input(shape=(inp_dim,)) 
 21 |     x4 = Dense(1024, activation='linear')(inputs)
 22 |     x4 = BatchNormalization()(x4)
 23 |     x4 = Activation("elu")(x4)  
 24 |     x4 = Dense(emb_dim, activation='linear')(x4)
 25 |     x4 = BatchNormalization()(x4)
 26 |     x4 = Activation("elu")(x4)      
 27 | 
 28 |     source_classifier = Dense(n_cls_source, activation='linear', name="mo1")(x4)  
 29 |     source_classifier = Activation('softmax', name='mo')(source_classifier)
 30 | 
 31 |     domain_classifier = Dense(32, activation='linear', name="do4")(x4)
 32 |     domain_classifier = BatchNormalization(name="do5")(domain_classifier)
 33 |     domain_classifier = Activation("elu", name="do6")(domain_classifier)
 34 |     domain_classifier = Dropout(0.5)(domain_classifier)
 35 |     domain_classifier = Dense(2, activation='softmax', name="do")(domain_classifier)
 36 | 
 37 |     comb_model = Model(inputs=inputs, outputs=[source_classifier, domain_classifier])
 38 |     comb_model.compile(optimizer="Adam",
 39 |               loss={'mo': 'kld', 'do': 'categorical_crossentropy'},
 40 |               loss_weights={'mo': 1, 'do': alpha}, metrics=['accuracy'], )
 41 | 
 42 |     source_classification_model = Model(inputs=inputs, outputs=[source_classifier])
 43 |     source_classification_model.compile(optimizer=optimizers.Adam(learning_rate=0.001),
 44 |               loss={'mo': 'kld'}, metrics=['mae'], )
 45 | 
 46 | 
 47 |     domain_classification_model = Model(inputs=inputs, outputs=[domain_classifier])
 48 |     domain_classification_model.compile(optimizer=optimizers.Adam(learning_rate=alpha_lr*0.001),
 49 |                   loss={'do': 'categorical_crossentropy'}, metrics=['accuracy'])
 50 |     
 51 |     
 52 |     embeddings_model = Model(inputs=inputs, outputs=[x4])
 53 |     embeddings_model.compile(optimizer="Adam",loss = 'categorical_crossentropy', metrics=['accuracy'])
 54 |                         
 55 |                         
 56 |     return comb_model, source_classification_model, domain_classification_model, embeddings_model
 57 | 
 58 | ### Batch data generator for the given data
 59 | def batch_generator(data, batch_size):
 60 |     """Generate batches of data.
 61 |     Given a list of numpy data, it iterates over the list and returns batches of the same size
 62 |     This
 63 |     """
 64 |     all_examples_indices = len(data[0])
 65 |     while True:
 66 |         mini_batch_indices = np.random.choice(all_examples_indices, size=batch_size, replace=False)
 67 |         tbr = [k[mini_batch_indices] for k in data]
 68 |         yield tbr
 69 |         
 70 | ### Train the neural network to predict cell composition in spatial data
 71 | # Xs: numpy array for composite log-normaized count of pseudospots
 72 | # ys: numpy array for fraction of cell types across the pseudospots
 73 | # Xt: numpy array for log-normaized count of spatial spots
 74 | 
 75 | # emb_dim: output size of dimensions for feature extractor (default = 64)
 76 | # batch_size: minibatch size for pseudospots and spatial data during the training (default = 64)
 77 | # enable_dann: whether to use domain adaptation process
 78 | # n_iterations: iteration number for the adversarial learning (default = 3000)
 79 | 
 80 | # alpha: loss weights of domain classifier to the source classifier (default = 0.6)
 81 | # alpha_lr: learning rate for the domain classifier (alpha_lr*0.001, default = 5)
 82 | 
 83 | # init_train: whether to perform pre-training process
 84 | # init_train_epoch: iteration number for the pre-training process (default = 10)
 85 | def train( Xs, ys, Xt, yt=None, 
 86 |           emb_dim=2,
 87 |           batch_size = 64, 
 88 |           enable_dann = True, 
 89 |           n_iterations = 1000,
 90 |           alpha=2,
 91 |           alpha_lr=10,
 92 |           initial_train=True,
 93 |           initial_train_epochs=100):
 94 |     
 95 |     
 96 |     inp_dim = Xs.shape[1]
 97 |     ncls_source = ys.shape[1]
 98 |     
 99 |     model, source_classification_model, domain_classification_model, embeddings_model = \
100 |           build_models(inp_dim, emb_dim, ncls_source, alpha=alpha, alpha_lr = alpha_lr)
101 |           
102 |     if initial_train:
103 |         source_classification_model.fit(Xs, ys, batch_size= batch_size, epochs=initial_train_epochs)
104 |         print("initial_train_done")
105 |     y_adversarial_1 = to_categorical(np.array(([1] * batch_size + [0] * batch_size)))
106 |     
107 |     sample_weights_class = np.array(([1] * batch_size + [0] * batch_size))
108 |     sample_weights_adversarial = np.ones((batch_size * 2,))
109 | 
110 |     S_batches = batch_generator([Xs, ys], batch_size)
111 |     T_batches = batch_generator([Xt, np.zeros(shape = (len(Xt),2))], batch_size)
112 |     
113 |     for i in range(n_iterations):
114 |         # # print(y_class_dummy.shape, ys.shape)
115 |         y_adversarial_2 = to_categorical(np.array(([0] * batch_size + [1] * batch_size)))
116 | 
117 |         X0, y0 = next(S_batches)
118 |         X1, y1 = next(T_batches)
119 | 
120 | 
121 |         X_adv = np.concatenate([X0, X1])
122 |         y_class = np.concatenate([y0, np.zeros_like(y0)])
123 | 
124 |         adv_weights = []
125 |         for layer in model.layers:
126 |             if (layer.name.startswith("do")):
127 |                 adv_weights.append(layer.get_weights())
128 | 
129 |         if(enable_dann):
130 |             # note - even though we save and append weights, the batchnorms moving means and variances
131 |             # are not saved throught this mechanism 
132 |             model.train_on_batch(X_adv, [y_class, y_adversarial_1],
133 |                                      sample_weight=[sample_weights_class, sample_weights_adversarial])
134 |             
135 |             k = 0
136 |             for layer in model.layers:
137 |                 if (layer.name.startswith("do")):
138 |                     layer.set_weights(adv_weights[k])
139 |                     k += 1
140 | 
141 |             class_weights = []
142 |             
143 |         
144 |             for layer in model.layers:
145 |                 if (not layer.name.startswith("do")):
146 |                     class_weights.append(layer.get_weights())
147 |             
148 |             domain_classification_model.train_on_batch(X_adv, [y_adversarial_2])
149 | 
150 |             k = 0
151 |             for layer in model.layers:
152 |                 if (not layer.name.startswith("do")):
153 |                     layer.set_weights(class_weights[k])
154 |                     k += 1
155 | 
156 |         else:
157 |             source_classification_model.train_on_batch(X0,y0)
158 |             
159 |         
160 |         if yt is None:
161 |             if ((i + 1) % 100 == 0):
162 |                 # print(i, stats)
163 |                 sourceloss, sourceacc = source_classification_model.evaluate(Xs, ys,verbose=0)
164 |                 domainloss,domainacc  = domain_classification_model.evaluate(np.concatenate([Xs, Xt]),
165 |                                                                      to_categorical(np.array(([1] * Xs.shape[0] + [0] * Xt.shape[0]))),
166 |                                                                      verbose=0)
167 |                 print("Iteration %d, source loss =  %.3f, discriminator acc = %.3f"%(i, sourceloss ,domainacc))
168 |         else:
169 |             if ((i + 1) % 100 == 0):
170 |                 # print(i, stats)
171 |                 y_test_hat_t = source_classification_model.predict(Xt).argmax(1)
172 |                 y_test_hat_s = source_classification_model.predict(Xs).argmax(1)
173 |                 print("Iteration %d, source accuracy =  %.3f, target accuracy = %.3f"%(i, accuracy_score(ys, y_test_hat_s), accuracy_score(yt, y_test_hat_t)))
174 |                 
175 |     return embeddings_model, source_classification_model 
176 | 
177 | 


--------------------------------------------------------------------------------
/CellDART/pred_cellf_celldart.py:
--------------------------------------------------------------------------------
  1 | def pred_cellf_celldart(adata_sp=None, adata_sc=None, count_from_raw=False, 
  2 |                         gpu=True, spdir=None, sp10x=True, spfilter=False, spfilgene=5, spfilspot=50, 
  3 |                         scdir=None, sc_10x_mtx=True, sc10x_h5=False, sctranspose=False, 
  4 |                         celltype='celltype', num_markers=20, seed_num=0, 
  5 |                         nmix=10, npseudo=20000, alpha=0.6, alpha_lr=5, batch_size=512, emb_dim=64, n_iterations=3000, init_train_epoch=10, 
  6 |                         outdir='./CellDART_output', return_anndata=True):
  7 |     '''
  8 |     ## Function to implement CellDART in python
  9 |     adata_sp: spatial data (AnnData object) to be used in predicting cell fraction (default: None)
 10 |         -> If None, then provide spdir where spatial datasets are saved (formats are explained below)
 11 |     adata_sc: single-cell data (AnnData object) to be used in making pseudospots (default: None)
 12 |         -> If None, then provide scdir where single-cell datasets are saved (formats are explained below)
 13 |     count_from_raw: whether to extract count matrix frow .raw of AnnData
 14 |         -> non-normalized count matrix should be contained in the AnnData .raw file
 15 |         -> if False, then utilize the count matrices saved in adata_sp and adata_sc directly
 16 | 
 17 |     gpu: check whether to use gpu (True) or not (False) (default = True)
 18 | 
 19 |     spdir: file directory to find or save spatial data
 20 |         -> In case of utilizing already saved spatial data, otherwise, put None
 21 |         -> Visium data should be separated in different folders
 22 |       Example directory (spatial)
 23 |         -> two spatial datasets (10x visium format)
 24 |         ./Mouse_sp/first/filtered_feature_bc_matrix.h5, ./Mouse_sp/first/spatial/tissue_hires_image.png, ./Mouse_sp/first/spatial/tissue_lowres_image.png,
 25 |         ./Mouse_sp/first/spatial/scalefactors_json.json, ./Mouse_sp/first/spatial/tissue_positions_list.csv
 26 |         second dataset directory starts with ./Mouse_sp/second/.., others are same as above.
 27 | 
 28 |     sp10x: whether the spatial data is 10x Visium format (True) or not (False) (default: True)
 29 |     spfilter: check whether to filter the number of cells and genes in spatial data (True: run filter)
 30 |     spfilgene: keep genes that are expressed in at least 'spfilgene' number of cells (default = 5)
 31 |     spfilspot: keep spots with at least 'spfilcell' counts (default = 50)
 32 | 
 33 |     scdir: file directory to find or save single-cell data
 34 |         -> In case of utilizing already saved sc data, otherwise, put None
 35 |         -> each single-cell data should be separated in different folders 
 36 |         -> each file formats should be among 10x format or others (.mtx.gz, .h5ad, h5, .csv, .tsv, or .txt)
 37 |         -> and metadata with corresponding barcode name as index should be included in metadata folder of each single-cell data
 38 |         -> metadata should be csv format
 39 |       Example directory (single-cell)
 40 |         -> two single cell dataset (10x mtx format) with metadata
 41 |         ./Mouse_sc/first/barcodes.tsv, ./Mouse_sc/first/genes.tsv, ./Mouse_sc/first/matrix.mtx, ./Mouse_sc/first/metadata/metadata.csv
 42 |         ./Mouse_sc/second/barcodes.tsv, ./Mouse_sc/second/genes.tsv, ./Mouse_sc/second/matrix.mtx, ./Mouse_sc/first/second/metadata.csv
 43 | 
 44 |     sc10x_mtx: check whether single-cell data is 10x genomics formatted mtx directory (True) or not (False)
 45 |     sc10x_h5: check whether single-cell data is 10x genomics formatted hdf5 file (True) or not (False)
 46 |     sctranspose: if sc10x_mtx and sc10x_h5 is F, check whether loaded matrix should be transposed (True) or not (False)
 47 | 
 48 |     celltype: column name for single-cell annotation data in .obs (default: 'celltype')
 49 |     num_markers: number of selected marker genes in each cell-type (default = 20)
 50 | 
 51 |     seed_num: seed to be used in random sampling (default = 0)
 52 | 
 53 |     nmix: sampling number of cells in pseudospot (default = 10)
 54 |     npseudo: a total number of pseudospots (default = 20000)
 55 | 
 56 |     alpha: loss weights of domain classifier to source classifier (default = 0.6)
 57 |     alpha_lr: learning rate for domain classifier (alpha_lr*0.001, default = 5)
 58 |     batch_size: minibatch size during the training (default = 512)
 59 |     emb_dim: output size of dimensions for feature extractor (default = 64)
 60 |     
 61 |     n_iterations: iteration number for the adversarial learning (default = 3000)
 62 |     init_train_epoch: iteration number of pre-train (default = 10)
 63 | 
 64 |     outdir: the directory to save output files (models and results)
 65 |     return_anndata: return spatial AnnData file with predicted cell fraction in .obs (default = True)
 66 |     '''
 67 |     import os
 68 |     if gpu:
 69 |         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 70 |         os.environ["CUDA_VISIBLE_DEVICES"]= "0" # Use only gpu-0
 71 |         print('GPU is available and will be used')
 72 |     else:
 73 |         os.environ['CUDA_VISIBLE_DEVICES'] = "-1" # Use CPU
 74 |         print('CPU will be used')
 75 |     
 76 |     from warnings import simplefilter 
 77 |     simplefilter(action='ignore', category=Warning)
 78 | 
 79 |     import scanpy as sc
 80 |     import pandas as pd
 81 |     import numpy as np
 82 | 
 83 |     from CellDART import utils
 84 |     from CellDART import da_cellfraction
 85 | 
 86 |     ## Change float variables into integer (during conversion from R to python)
 87 |     num_markers, seed_num, \
 88 |     nmix, npseudo, batch_size, emb_dim, n_iterations, init_train_epoch = \
 89 |         int(num_markers), int(seed_num), \
 90 |         int(nmix), int(npseudo), int(batch_size), int(emb_dim), \
 91 |         int(n_iterations), int(init_train_epoch)
 92 |     
 93 |     ## Create directory if it does not exist
 94 |     if not os.path.exists(outdir):
 95 |         os.makedirs(outdir)
 96 |     
 97 |     ## Load and preprocess spatial dataset
 98 |     if adata_sp is not None:
 99 |         if spdir is not None: 
100 |             raise ValueError("'spdir' should be None when 'adata_sp' is provided.")
101 |         if count_from_raw: spatial_all = adata_sp.raw.to_adata().copy()
102 |         else: spatial_all = adata_sp.copy()     
103 |         sc.pp.normalize_total(spatial_all, target_sum=1e4, inplace=True)
104 |         print('Shape of the provided spatial data is',spatial_all.shape)
105 |     else:
106 |         if spdir is None:
107 |             raise ValueError("'spdir' should be provided when 'adata_sp' is None")
108 |         # Load and normalize spatial data
109 |         sp_list = os.listdir(spdir)
110 |         adata_sp = []
111 |         for i, sp_data in enumerate(sp_list):
112 |             if sp10x:
113 |                 adata = sc.read_visium(os.path.join(spdir,sp_data))
114 |             else:
115 |                 sp = os.listdir(os.path.join(spdir,sp_data))[0]
116 |                 adata = sc.read(os.path.join(spdir,sp_data,sp))
117 |             adata.var_names_make_unique()
118 |             if spfilter:
119 |                 sc.pp.filter_genes(adata, min_cells=spfilgene)
120 |                 sc.pp.filter_cells(adata, min_counts=spfilspot)
121 |             sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
122 |             adata_sp.append(adata)
123 |             print('Shape of spatial data',i,'is',adata.shape)
124 |             
125 |         # Merge spatial data
126 |         if len(adata_sp)==1:
127 |             spatial_all = adata_sp[0]
128 |         else:
129 |             spatial_all = adata_sp[0].concatenate(adata_sp[1:], join='inner',
130 |                                                     uns_merge='unique')
131 |         print('Shape of the merged spatial data is',spatial_all.shape)
132 | 
133 |     ## Load and preprocess single-cell dataset
134 |     if adata_sc is not None:
135 |         if scdir is not None: 
136 |             raise ValueError("'scdir' should be None when 'adata_sc' is provided.")
137 |         if count_from_raw: single_all = adata_sc.raw.to_adata().copy()
138 |         else: single_all = adata_sc.copy()
139 | 
140 |         # Check if the column for the cell type is included in .obs
141 |         if celltype not in list(single_all.obs):
142 |             raise ValueError('Column for cell type is not found')
143 |         
144 |         sc.pp.normalize_total(single_all, target_sum=1e4, inplace=True)
145 |         print('Shape of the provided single-cell data is',single_all.shape)
146 |     else:
147 |         if scdir is None:
148 |             raise ValueError("'scdir' should be provided when 'adata_sc' is None")
149 |         # Load single cell data
150 |         sc_list = os.listdir(scdir)
151 |         if sc_10x_mtx:
152 |             adata_sc = [sc.read_10x_mtx(os.path.join(scdir,y), cache=True) for y in sc_list]
153 |         elif sc10x_h5:
154 |             adata_sc = [sc.read_10x_h5(os.path.join(scdir,y)) for y in sc_list]
155 |         else:
156 |             if sctranspose:
157 |                 adata_sc = [sc.read(os.path.join(scdir,y,z), cache=True).T for y in sc_list \
158 |                         for z in [i for i in os.listdir(os.path.join(scdir,y)) \
159 |                         if i.endswith('mtx.gz') or i.endswith('h5ad') or i.endswith('h5') or \
160 |                             i.endswith('csv') or i.endswith('tsv') or i.endswith('txt')]]
161 |             else: 
162 |                 adata_sc = [sc.read(os.path.join(scdir,y,z), cache=True) for y in sc_list \
163 |                         for z in [i for i in os.listdir(os.path.join(scdir,y)) \
164 |                         if i.endswith('mtx.gz') or i.endswith('h5ad') or i.endswith('h5') or \
165 |                             i.endswith('csv') or i.endswith('tsv') or i.endswith('txt')]]
166 |                 
167 |         # preprocess each of the dataset
168 |         for i, adata in enumerate(adata_sc):
169 |             adata.var_names_make_unique()
170 |             sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
171 |             sc_meta_list = os.listdir(os.path.join(scdir,sc_list[i],'metadata'))
172 |             sc_meta_list = [i for i in sc_meta_list if i.endswith('.csv')]
173 |             if len(sc_meta_list)==0:
174 |                 raise NotImplementedError('No csv format metadata in the folder')
175 |             tmp = pd.read_csv(os.path.join(scdir,sc_list[i],'metadata',sc_meta_list[0]),index_col=0)
176 |             
177 |             if (set(tmp.index)<=set(adata.obs.index)): 
178 |                 print('All barcode names in metadata are found')
179 |             else:
180 |                 raise ValueError('Unidentified barcode names in metadata of '+sc_list[i])
181 |             if celltype not in list(tmp):
182 |                 raise ValueError('Column for cell type is not found')
183 |             
184 |             # subset the data to include only the barcodes in metadata
185 |             adata = adata[adata.obs.index.isin(tmp.index)].copy()
186 |             # rearrange the metadata index according to adata
187 |             tmp = tmp.reindex(adata.obs.index)
188 |             adata.obs = tmp
189 |             adata_sc[i] = adata
190 |             print('Shape of single cell data',i,'is',adata.shape)
191 |         
192 |         if len(adata_sc)==1:
193 |             single_all = adata_sc[0]
194 |         else:
195 |             single_all = adata_sc[0].concatenate(adata_sc[1:], join='inner')
196 | 
197 |         print('Shape of merged single cell data is',single_all.shape)
198 |     
199 |     # save the normalized data in raw
200 |     single_all.raw = single_all.copy()
201 |     
202 |     # log-transform the count matrix
203 |     sc.pp.log1p(single_all)
204 |     
205 |     # Find marker genes for single cell data
206 |     single_all.obs[celltype] = single_all.obs[celltype].astype('category', copy=False)
207 |     sc.tl.rank_genes_groups(single_all, celltype, method='wilcoxon')
208 |     genelists=single_all.uns['rank_genes_groups']['names']
209 |     df_genelists = pd.DataFrame.from_records(genelists)
210 |     
211 |     # Combining top marker genes representing each cell type
212 |     res_genes = []
213 |     for column in df_genelists.head(num_markers): 
214 |         res_genes.extend(df_genelists.head(num_markers)[column].tolist())
215 |     res_genes_ = list(set(res_genes))
216 |    
217 |     # Calculate intersecting genes
218 |     inter_genes_comb = [val for val in res_genes_ if val in spatial_all.var.index]
219 |     print('Total number of marker genes: ',len(inter_genes_comb))
220 | 
221 |     # Generation of an array representing cell type number
222 |     df_sc = single_all.obs
223 |     lab_sc_sub = df_sc[celltype]
224 |     sc_sub_dict = dict(zip(range(len(set(lab_sc_sub))), set(lab_sc_sub)))
225 |     sc_sub_dict2 = dict((y,x) for x,y in sc_sub_dict.items())
226 |     lab_sc_num = [sc_sub_dict2[ii] for ii in lab_sc_sub]
227 |     # Make an array for cell type numbers following the sequence of single cell barcodes
228 |     lab_sc_num = np.asarray(lab_sc_num, dtype='int')
229 |     
230 |     # Call original normalized count (not log-normalized count)
231 |     adata_final = single_all.raw.to_adata().copy()
232 | 
233 |     # Generate count matrix for single-cell data (mat_sc)
234 |     adata_final = adata_final[:,inter_genes_comb]
235 |     if isinstance(adata_final.X, np.ndarray):
236 |         mat_sc = adata_final.X
237 |     else:
238 |         mat_sc = adata_final.X.toarray()
239 | 
240 |     # Raw file for merged spatial data
241 |     spatial_raw = spatial_all.copy()
242 |     
243 |     # Generate count matrix for spatial data (mat_sp)
244 |     spatial_all = spatial_all[:,inter_genes_comb]
245 |     if isinstance(spatial_all.X, np.ndarray):
246 |         mat_sp = spatial_all.X
247 |     else: 
248 |         mat_sp = spatial_all.X.toarray()
249 |     
250 |     # Generate pseudospot: random mixture of cells
251 |     sc_mix, lab_mix = utils.random_mix(mat_sc, lab_sc_num, nmix=nmix, n_samples=npseudo, seed=seed_num)
252 |     
253 |     # Log-normalize and scale the data 
254 |     def log_minmaxscale(arr):
255 |         arrd = len(arr)
256 |         arr = np.log1p(arr)
257 |         e = 1e-8 # modified by adding e
258 |         return (arr-np.reshape(np.min(arr,axis=1),(arrd,1)))/np.reshape((np.max(arr,axis=1)-np.min(arr,axis=1))+e,(arrd,1))
259 | 
260 |     sc_mix_s = log_minmaxscale(sc_mix)
261 |     mat_sp_s = log_minmaxscale(mat_sp)
262 |     mat_sc_s = log_minmaxscale(mat_sc)
263 | 
264 |     print('Size of spatial, single-cell, pseudospot, and cell fraction data:',
265 |         mat_sp_s.shape, mat_sc_s.shape, sc_mix_s.shape, lab_mix.shape)
266 | 
267 |     # Train the CellDART model
268 |     embs, clssmodel = da_cellfraction.train(sc_mix_s, lab_mix, mat_sp_s, enable_dann = True,
269 |                                             alpha=alpha, alpha_lr=alpha_lr,
270 |                                             emb_dim = emb_dim, 
271 |                                             batch_size = batch_size,
272 |                                             n_iterations = n_iterations,
273 |                                             initial_train=True,
274 |                                             initial_train_epochs=init_train_epoch)
275 |     # Prediction of cell fraction in each spot
276 |     pred_sp = pd.DataFrame(clssmodel.predict(mat_sp_s))
277 |     pred_sp.index = spatial_all.obs.index
278 | 
279 |     # Make directory for the model save
280 |     if not os.path.exists(os.path.join(outdir,'model')):
281 |         os.makedirs(os.path.join(outdir,'model'))
282 | 
283 |     # Save the cell fraction in .obs of spatial_raw file
284 |     for visnum in range(len(sc_sub_dict)):
285 |         spatial_raw.obs[str(sc_sub_dict[visnum])+'_cellf'] = pred_sp.iloc[pred_sp.index.isin(spatial_raw.obs.index),visnum]
286 | 
287 |     # Save cell fraction data
288 |     df = spatial_raw.obs.filter(regex='_cellf', axis=1)
289 |     df.to_csv(os.path.join(outdir,'cellfraction.csv'),header=True,index=True)
290 |     print('Cell fraction data for was saved')
291 | 
292 |     # Save model files
293 |     embs.save_weights(os.path.join(outdir,'model/embedder.h5'))
294 |     clssmodel.save_weights(os.path.join(outdir,'model/classifier.h5'))
295 | 
296 |     # Save spatial anndata
297 |     spatial_raw.write_h5ad(os.path.join(outdir,'model/sp_data.h5ad'))
298 |     print('Spatial anndata was saved')
299 | 
300 |     print('Model and python data files were saved')
301 | 
302 |     if return_anndata: return(spatial_raw)
303 |     else: return(df)
304 | 


--------------------------------------------------------------------------------
/CellDART/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import numpy as np
 5 | from tensorflow.keras.utils import to_categorical
 6 | 
 7 | def random_mix(Xs, ys, nmix=5, n_samples=10000, seed=0):
 8 |     # Define empty lists
 9 |     Xs_new, ys_new =[], []
10 |     ys_ = to_categorical(ys)
11 | 
12 |     rstate = np.random.RandomState(seed)
13 |     fraction_all = rstate.rand(n_samples, nmix)
14 |     randindex_all = rstate.randint(len(Xs), size=(n_samples,nmix))
15 | 
16 |     for i in range(n_samples):
17 |         # fraction: random fraction across the "nmix" number of sampled cells
18 |         fraction = fraction_all[i]
19 |         fraction = fraction/np.sum(fraction)
20 |         fraction = np.reshape(fraction, (nmix,1))
21 |         
22 |         # Random selection of the single cell data by the index
23 |         randindex = randindex_all[i]
24 |         ymix = ys_[randindex]
25 |         # Calculate the fraction of cell types in the cell mixture
26 |         yy = np.sum(ymix*fraction, axis=0)
27 |         # Calculate weighted gene expression of the cell mixture
28 |         XX = np.asarray(Xs[randindex])*fraction
29 |         XX_ = np.sum(XX, axis=0)
30 |         
31 |         # Add cell type fraction & composite gene expression in the list
32 |         ys_new.append(yy)
33 |         Xs_new.append(XX_)
34 | 
35 |     Xs_new = np.asarray(Xs_new)
36 |     ys_new = np.asarray(ys_new)
37 | 
38 |     return Xs_new, ys_new


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CellDART
 2 | Title: Cell type inference by domain adaptation of single-cell and spatial transcriptomic data
 3 | Version: 0.1.1
 4 | Authors@R: 
 5 |     c(person(given = "Hongyoon",
 6 |            family = "Choi",
 7 |            role = c("aut"),
 8 |            comment = c(ORCID = "0000-0002-8895-2449")),
 9 |     person(given = "Sungwoo",
10 |            family = "Bae",
11 |            role = c("aut", "cre"),
12 |            email = "bsungwoo@snu.ac.kr",
13 |            comment = c(ORCID = "0000-0002-3484-3749")))
14 | Description: CellDART estimates the spatial distribution of cells defined by single-cell level data using domain adaptation of neural networks. The neural network that predicts the cell proportion in a pseudospot, a virtual mixture of cells from single-cell data, is translated to decompose the cell types in each spatial barcoded region.
15 | License: `use_mit_license()`
16 | Encoding: UTF-8
17 | Roxygen: list(markdown = TRUE)
18 | RoxygenNote: 7.2.0
19 | Imports:
20 |       Seurat (>= 4.0.5),
21 |       dplyr (>= 1.0.7),
22 |       reticulate (>= 1.22),
23 |       rmarkdown (>= 2.14)
24 | Suggests: 
25 |     knitr
26 | VignetteBuilder: knitr    
27 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(pred_cellf_celldart)
4 | 


--------------------------------------------------------------------------------
/R/CellDART_R.R:
--------------------------------------------------------------------------------
  1 | #' R wrap function to implement CellDART
  2 | #' @description Cell type inference by domain adaptation of single-cell and spatial transcriptomic data
  3 | #'
  4 | #' @param sp_data spatial data (Seurat object) to be used in predicting cell fraction: non-normalized raw data should be in 'counts' slot of the 'Spatial' assay
  5 | #' @param sc_data single-cell data (Seurat object) to be used in making pseudospots: non-normalized raw data should be in 'counts' slot of the 'RNA' assay
  6 | #'
  7 | #' @param outdir the directory to save output files (models and results); its 'results' subdirectory is created if missing and passed to CellDART (default = '.')
  8 | #'
  9 | #' @param sp_subset whether to subset spatial data and calculate for specific spot cluster (default = FALSE)
 10 | #' @param spot.cluster.name group name of the cluster used for subsetting spatial data (default: 'seurat_clusters')
 11 | #' @param spot.cluster.of.interest name of each spot clusters to be used; applied only when sp_subset = TRUE (default: NULL)
 12 | #' @param metadata_celltype column name for single-cell annotation data in metadata (default: 'celltype')
 13 | #'
 14 | #' @param env.select select between using reticulate virtual environment ("virtual") or conda environment ("conda") (default: "conda")
 15 | #' @param python.install whether to automatically install python version 3.8.12 (default = FALSE)
 16 | #'
 17 | #' @param python_path path for the python 3.8. (default: NULL)
 18 | #' \itemize{
 19 | #'   \item If NULL, python version 3.8.12 will be installed (valid for Linux)
 20 | #'   \item If "current", python interpreter associated with current virtual env (ex: r-reticulate) will be used. (version should be 3.8)
 21 | #' }
 22 | #'
 23 | #' @param env.name name of the virtual or conda environment to use for CellDART analysis (default: 'CellDART')
 24 | #'
 25 | #' @param gpu check whether to use gpu (TRUE) or not (FALSE) (default = TRUE)
 26 | #' @param num_markers number of selected marker genes in each cell-type (default = 20)
 27 | #' @param seed_num seed to be used in random sampling (default = 0); note that this wrapper currently does not pass it on to the python function
 28 | #' @param nmix the number of cells sampled from single-cell data when making a pseudospot (default = 8)
 29 | #' @param npseudo a total number of pseudospots (default = 20000)
 30 | #'
 31 | #' @param alpha loss weights of domain classifier to the source classifier (default = 0.6)
 32 | #' @param alpha_lr learning rate for the domain classifier (alpha_lr*0.001, default = 5)
 33 | #' @param emb_dim output size of dimensions for feature extractor (default = 64)
 34 | #'
 35 | #' @param batch_size minibatch size for pseudospots and spatial data during the training (default = 512)
 36 | #' @param n_iterations iteration number for the adversarial learning (default = 3000)
 37 | #' @param init_train_epoch iteration number for the pre-training process (default = 10)
 38 | #'
 39 | #' @return spatial data (Seurat object) with predicted cell fraction in metadata (meta.data)
 40 | #' @examples
 41 | #' \dontrun{
 42 | #' # Using conda environment (environment will be automatically installed in Linux distributions)
 43 | #' # If using Windows, then install conda environment first and then run the function below with python.install = FALSE
 44 | #' sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
 45 | #'                                      sp_subset=FALSE, spot.cluster.name='seurat_clusters',
 46 | #'                                      spot.cluster.of.interest=NULL,
 47 | #'                                      env.select='conda',python.install=TRUE,
 48 | #'                                      python_path=NULL, env.name='CellDART',
 49 | #'                                      gpu=TRUE, metadata_celltype='celltype',
 50 | #'                                      num_markers=20, seed_num=0,
 51 | #'                                      nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
 52 | #'                                      emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 53 | #'
 54 | #' # Using virtual environment (environment will be automatically installed in Linux distributions)
 55 | #' # Not recommended for Windows
 56 | #' sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
 57 | #'                                      sp_subset=FALSE, spot.cluster.name='seurat_clusters',
 58 | #'                                      spot.cluster.of.interest=NULL,
 59 | #'                                      env.select='virtual',python.install=TRUE,
 60 | #'                                      python_path=NULL, env.name='CellDART',
 61 | #'                                      gpu=TRUE, metadata_celltype='celltype',
 62 | #'                                      num_markers=20, seed_num=0,
 63 | #'                                      nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
 64 | #'                                      emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 65 | #' }
 66 | #' @export
 67 | pred_cellf_celldart <- function(sp_data, sc_data, outdir='.',
 68 |                                 sp_subset=FALSE, spot.cluster.name='seurat_clusters',
 69 |                                 spot.cluster.of.interest=NULL,
 70 |                                 env.select='conda', python.install=FALSE,
 71 |                                 python_path=NULL, env.name='CellDART',
 72 |                                 gpu=TRUE, metadata_celltype='celltype',
 73 |                                 num_markers=20, seed_num=0,
 74 |                                 nmix=8, npseudo=20000, alpha=0.6, alpha_lr=5,
 75 |                                 emb_dim=64, batch_size=512, n_iterations=3000,
 76 |                                 init_train_epoch=10){
 77 |   # Suppress warnings while the function runs; on.exit() restores the caller's
 78 |   # setting even when an error aborts the function early
 79 |   defaultW <- getOption("warn")
 80 |   options(warn = -1)
 81 |   on.exit(options(warn = defaultW), add = TRUE)
 82 | 
 83 |   if (python.install){
 84 |     reticulate::install_python(version = '3.8.12')
 85 |   }
 86 | 
 87 |   # Select between using reticulate virtual environment or conda environment ("virtual" or "conda")
 88 |   if (env.select=="virtual"){
 89 |     # Setting virtual environment with reticulate
 90 |     if (!(env.name %in% reticulate::virtualenv_list())){
 91 |       ## Python dependencies use python version 3.8
 92 |       if (is.null(python_path)){
 93 |         reticulate::virtualenv_create(envname = env.name, version = '3.8.12')
 94 |       } else if (python_path=="current") {
 95 |         reticulate::virtualenv_create(envname = env.name, python = NULL)
 96 |       } else {
 97 |         reticulate::virtualenv_create(envname = env.name, python = python_path)
 98 |       }
 99 | 
100 |       # Install CellDART from GitHub into the newly created virtual environment
101 |       reticulate::virtualenv_install(env.name, packages = 'pip', ignore_installed=TRUE,
102 |                                      pip_options = "git+https://github.com/mexchy1000/CellDART.git")
103 |     }
104 |     # Apply virtual environment
105 |     reticulate::use_virtualenv(env.name, required = TRUE)
106 |   } else if (env.select=="conda"){
107 |     if (!(env.name %in% reticulate::conda_list()[['name']])){
108 |       ## Python dependencies use python version 3.8
109 |       if (is.null(python_path)){
110 |         reticulate::conda_create(envname = env.name, python_version = '3.8.12')
111 |       } else if (python_path=="current") {
112 |         reticulate::conda_create(envname = env.name, python = NULL)
113 |       } else {
114 |         reticulate::conda_create(envname = env.name, python = python_path)
115 |       }
116 | 
117 |       # Create conda env and install dependencies
118 |       reticulate::conda_install(env.name, ignore_installed=TRUE,
119 |                                 pip = TRUE, "git+https://github.com/mexchy1000/CellDART.git")
120 |     }
121 |     # Apply conda environment
122 |     reticulate::use_condaenv(env.name, required = TRUE)
123 |   } else {
124 |     stop("'env.select' should be either 'virtual' or 'conda'")
125 |   }
126 | 
127 |   ## Import anndata
128 |   ann <- reticulate::import('anndata', convert = FALSE)
129 | 
130 |   ## Import python function
131 |   CellDART <- reticulate::import('CellDART', convert = FALSE)
132 | 
133 |   ## 1. Saving single-cell data in anndata format
134 |   # Define count matrix
135 |   sparse_mtx <- Seurat::GetAssayData(sc_data, slot = "counts", assay = "RNA")
136 | 
137 |   # Define obs and var (reference from sceasy library: https://github.com/cellgeni/sceasy)
138 |   obs <- sc_data@meta.data
139 |   if (!metadata_celltype %in% colnames(obs)){
140 |     stop("Column name for the cell annotation should be provided.")
141 |   }
142 |   obs <- obs[metadata_celltype]
143 |   obs[[metadata_celltype]] <- factor(obs[[metadata_celltype]])
144 |   var <- data.frame(matrix(nrow=dim(sc_data)[1],ncol=0,
145 |                            dimnames = list(rownames(sc_data),NULL)))
146 |   var[['name']] <- rownames(var)
147 | 
148 |   adata_sc <- ann$AnnData(X = Matrix::t(sparse_mtx), obs = obs, var = var)
149 | 
150 |   ## 2. Subsetting spatial data and save in anndata format
151 |   if (sp_subset){
152 |     if (!spot.cluster.name %in% colnames(sp_data@meta.data)){
153 |       stop("'spot.cluster.name' should be a column name in the metadata of 'sp_data'")
154 |     }
155 |     Seurat::Idents(sp_data) <- spot.cluster.name
156 |   }
157 |   sp_data_sub <- sp_data
158 |   if (sp_subset && !is.null(spot.cluster.of.interest)){
159 |     # Read the levels from the metadata column itself: 'sp_data[[name]]' would give a
160 |     # data.frame, for which levels() is NULL, so no cluster name could ever match
161 |     cluster_info <- factor(sp_data@meta.data[[spot.cluster.name]])
162 |     if (!all(spot.cluster.of.interest %in% levels(cluster_info))){
163 |       stop("'spot.cluster.of.interest' should be among the levels of 'spot.cluster.name' provided")
164 |     }
165 |     sp_data_sub <- subset(sp_data, idents=spot.cluster.of.interest)
166 |   }
167 | 
168 |   # Define count matrix
169 |   sparse_mtx <- Seurat::GetAssayData(sp_data_sub, slot = "counts", assay = "Spatial")
170 | 
171 |   # Define obs and var (reference from sceasy library)
172 |   obs <- sp_data_sub@meta.data
173 |   var <- data.frame(matrix(nrow=dim(sp_data_sub)[1],ncol=0,
174 |                            dimnames = list(rownames(sp_data_sub),NULL)))
175 |   var[['name']] <- rownames(var)
176 | 
177 |   adata_sp <- ann$AnnData(X = Matrix::t(sparse_mtx), obs = obs, var = var)
178 | 
179 |   # Assign the output directory for the models generated: create '<outdir>/results'
180 |   # (with parents) if missing, then resolve it so absolute 'outdir' values also work
181 |   out_dir <- file.path(outdir, 'results')
182 |   if (!dir.exists(out_dir)){
183 |     dir.create(out_dir, recursive = TRUE)
184 |   }
185 |   out_dir <- normalizePath(out_dir, winslash = "/")
186 | 
187 |   # Run CellDART (count-like arguments are cast with as.integer(), since plain R numerics reach python as floats)
188 |   run_status <- try({
189 |     df <- CellDART$pred_cellf_celldart$pred_cellf_celldart(adata_sp=adata_sp, adata_sc=adata_sc, count_from_raw=FALSE,
190 |                                                            gpu=gpu, celltype=metadata_celltype, num_markers=as.integer(num_markers),
191 |                                                            nmix=as.integer(nmix), npseudo=as.integer(npseudo), alpha=alpha, alpha_lr=alpha_lr,
192 |                                                            batch_size=as.integer(batch_size), emb_dim=as.integer(emb_dim), n_iterations=as.integer(n_iterations),
193 |                                                            init_train_epoch=as.integer(init_train_epoch),
194 |                                                            outdir=out_dir, return_anndata=FALSE)
195 | 
196 |     # Saving cell fraction data into the metadata of spatial Seurat object
197 |     sp_data_sub <- Seurat::AddMetaData(sp_data_sub, reticulate::py_to_r(df))
198 |   })
199 |   if (inherits(run_status, "try-error")){
200 |     # Keep the best-effort return value, but say why the cell fractions are missing
201 |     message("CellDART prediction failed: returning the spatial data without predicted cell fraction")
202 |   }
203 | 
204 |   return(sp_data_sub)
205 | }
206 | 


--------------------------------------------------------------------------------
/R/Read_R_wrap.md:
--------------------------------------------------------------------------------
 1 | ## R wrap function for CellDART
 2 |   ```Plain Text
 3 |   devtools::install_github("mexchy1000/CellDART", build_vignettes = T, force = T)  
 4 |   library(CellDART)  
 5 |   help(pred_cellf_celldart)  # Explanation for the parameters and short examples  
 6 |   browseVignettes("CellDART")  # Browse for the vignettes (/doc/introduction.html)
 7 |   ```
 8 |   ### Installation in Linux distributions  
 9 |   Virtual environment (env.select="virtual") or conda environment (env.select="conda") will be automatically installed while running function 'pred_cellf_celldart'  
10 |   ### Installation in Windows  
11 |   Install conda environment first and then run the function with env.select='conda' and python.install=F   
12 |   Example: Please refer to the '/vignettes/introduction.Rmd' file.  
13 | 
14 | ### Datasets
15 |   #### 1. Description  
16 |   Example single-cell Seurat object file: sc_data.rds (GSE115746: mouse from ALM and VISp)  
17 |   Example spatial Seurat object file: sp_data.rds  
18 |   (10X Genomics Data Repository: V1_Mouse_Brain_Sagittal_Anterior, V1_Mouse_Brain_Sagittal_Posterior)  
19 |   
20 |   #### 2. Download  
21 |   sc_data.rds and sp_data.rds can be downloaded from:  
22 |   https://drive.google.com/drive/folders/1zvn3rJ6vH-LYPNnuWIjqb8USSfp7YfIy?usp=sharing
23 |   
24 | ### Potential error in reticulate::install_python
25 |   If the error "ModuleNotFoundError: No module named '_ctypes'" occurs,  
26 |   try the commands below (suggested in https://stackoverflow.com/questions/27022373):  
27 |   sudo apt-get -y update  
28 |   sudo apt-get -y upgrade  
29 |   sudo apt-get -y dist-upgrade  
30 |   sudo apt-get -y install build-essential python-dev python-setuptools python-pip python-smbus  
31 |   sudo apt-get -y install libncursesw5-dev libgdbm-dev libc6-dev  
32 |   sudo apt-get -y install zlib1g-dev libsqlite3-dev tk-dev  
33 |   sudo apt-get -y install libssl-dev openssl  
34 |   sudo apt-get -y install libffi-dev  
35 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CellDART: Cell type inference by domain adaptation of single-cell and spatial transcriptomic data
  2 | CellDART is a tool to estimate cell fraction of spatial transcriptomic spots using domain adaptation of deep neural networks.
  3 | * Cite as: Bae S, Na KJ, Koh J, Lee DS, Choi H, Kim YT. CellDART: cell type inference by domain adaptation of single-cell and spatial transcriptomic data. Nucleic Acids Res. 2022;50(10):e57.  
  4 | ![figure1png](https://user-images.githubusercontent.com/14209383/114880774-528b8100-9e3d-11eb-9b60-41c9d0acd5fd.png)
  5 | 
  6 | ## Optimal parameter choices (for brain)
  7 |   Number of total marker genes = 200 ~ 400 (or number of markers per cluster: 10 ~ 20)  
  8 |   Number of pseudospots = 5 to 10 times the number of real spots (20,000~40,000 per Visium slide)  
  9 |   Number of sampled cells in a pseudospot (virtual mixture of single-cell data) = 8  
 10 |   Iteration number = 3,000  
 11 |   Mini-batch size = 512  
 12 |   Loss weights between source and domain classifier (alpha) = 0.6  
 13 |   Learning rate = 0.001 * alpha_lr = 0.005  
 14 | 
 15 | ## Code Example  
 16 | python: CellDART_example_mousebrain_markers.ipynb  
 17 | -> example file that shows the whole process step by step  
 18 | -> the pred_cellf_celldart function is a compressed version of all the steps shown in the notebook file  
 19 | -> see the below example how to use the function  
 20 | R wrap: Please refer to the '/vignettes/introduction.Rmd' file  
 21 | 
 22 | ## Python function for CellDART (pred_cellf_celldart)  
 23 | ### Install conda environment and add jupyter kernel  
 24 | ```Plain Text
 25 |   conda create -n CellDART python=3.8  
 26 |   conda activate CellDART  
 27 |   pip install git+https://github.com/mexchy1000/CellDART.git  
 28 |   python -m ipykernel install --user --name CellDART --display-name CellDART  
 29 | ```
 30 | ### Dependency (python)  
 31 | ```Plain Text
 32 | python 3.8
 33 | tensorflow 2.9.1
 34 | numpy 1.20.3
 35 | pandas 1.4.3
 36 | scanpy 1.9.1
 37 | jupyter 1.0.0
 38 | ```
 39 | ### Function and parameters
 40 | ```Plain Text
 41 | from CellDART.pred_cellf_celldart import pred_cellf_celldart  
 42 | adata_sp = pred_cellf_celldart(adata_sp=adata_sp, adata_sc=adata_sc, count_from_raw = False,  
 43 |                                gpu=True, celltype='celltype', num_markers=20,  
 44 |                                nmix=8, npseudo=20000, alpha=0.6, alpha_lr=5, batch_size=512,  
 45 |                                emb_dim=64, n_iterations=3000, init_train_epoch=10,  
 46 |                                outdir='./CellDART_output', return_anndata=True)
 47 | ```        
 48 | **(1) adata_sp:** spatial data (AnnData object) with raw count matrix to be used in predicting cell fraction (default: None)  
 49 | **(2) adata_sc:** single-cell data (AnnData object) with raw count matrix to be used in making pseudospots (default: None)  
 50 | **(3) count_from_raw:** whether to extract count matrix from .raw of AnnData (default: False)  
 51 | -> non-normalized raw count matrix should be contained in the AnnData .raw file  
 52 | -> if False, then utilize the count matrices saved in adata_sp and adata_sc directly  
 53 | **(4) gpu:** check whether to use gpu (True) or not (False) (default = True)  
 54 | **(5) celltype:** column name for single-cell annotation data in .obs (default: 'celltype')  
 55 | **(6) num_markers:** number of selected marker genes in each celltype (default = 20)   
 56 | **(7) nmix:** sampling number of cells in pseudospot (default = 10)  
 57 | **(8) npseudo:** a total number of pseudospots (default = 20,000)  
 58 | **(9) alpha:** loss weights of the domain classifier to the source classifier (default = 0.6)  
 59 | **(10) alpha_lr:** learning rate for the domain classifier (alpha_lr*0.001, default = 5)  
 60 | **(11) batch_size:** minibatch size for pseudospots and spatial data during the training (default = 512)  
 61 | **(12) n_iterations:** iteration number for the adversarial learning (default = 3,000)  
 62 | **(13) init_train_epoch:** iteration number for the pre-training process (default = 10)  
 63 | **(14) outdir:** the directory to save output files (models and results)  
 64 | **(15) return_anndata:** whether to return spatial AnnData file with predicted cell fraction in .obs (default: False)  
 65 | 
 66 | ## R wrap function for CellDART using reticulate  
 67 |   ```Plain Text
 68 |   devtools::install_github("mexchy1000/CellDART", build_vignettes = T, force = T)  
 69 |   library(CellDART)  
 70 |   help(pred_cellf_celldart)  # Explanation for the parameters and short examples  
 71 |   browseVignettes("CellDART")  # Browse for the vignettes (/vignettes/introduction.Rmd)
 72 |   ```
 73 |   ### Function and additional parameters
 74 |   ```Plain Text
 75 |   # Using conda environment (environment will be automatically installed in Linux distributions)
 76 |   # If using Windows, then install conda environment first and then run the function below with python.install = F
 77 |   sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
 78 |                                        sp_subset=F, spot.cluster.name='seurat_clusters',
 79 |                                        spot.cluster.of.interest=NULL,
 80 |                                        env.select='conda',python.install=T,
 81 |                                        python_path=NULL, env.name='CellDART',
 82 |                                        gpu=TRUE, metadata_celltype='celltype',
 83 |                                        num_markers=20, seed_num=0,
 84 |                                        nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
 85 |                                        emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 86 |   ```
 87 |   ```Plain Text
 88 |   # Using virtual environment (environment will be automatically installed in Linux distributions)
 89 |   # Not recommended for Windows
 90 |   sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
 91 |                                        sp_subset=F, spot.cluster.name='seurat_clusters',
 92 |                                        spot.cluster.of.interest=NULL,
 93 |                                        env.select='virtual',python.install=T,
 94 |                                        python_path=NULL, env.name='CellDART',
 95 |                                        gpu=TRUE, metadata_celltype='celltype',
 96 |                                        num_markers=20, seed_num=0,
 97 |                                        nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
 98 |                                        emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 99 |   ```
100 |   **(1) outdir:** the directory to save output files (models and results) (default = '.')  
101 |   **(2) sp_subset:** whether to subset spatial data and calculate for specific spot cluster (default = FALSE)  
102 |   **(3) spot.cluster.name:** group name of the cluster used for subsetting spatial data (default = 'seurat_clusters')  
103 |   **(4) spot.cluster.of.interest:** name of each spot clusters to be used (default = NULL)  
104 |   **(5) env.select:** select between using reticulate virtual environment or conda environment (default = 'conda')  
105 |   -> either of the selection will search the already installed environment  
106 |   -> if environment is not found, then it will automatically install the new environment  
107 |   **(6) python.install:** whether to automatically install python version 3.8.12 (default = F)  
108 |   -> For Windows, set python.install = F  
109 |   **(7) python_path:** path for the python 3.8 (default = NULL)  
110 |   **(8) env.name:** name of the virtual or conda environment to use for the analysis (default = 'CellDART')  
111 |   **(9) metadata_celltype:** column name for single-cell annotation data in metadata (default = 'celltype')  
112 | 
113 |   ### Dependency (R wrapper)
114 |   ```Plain Text
115 |   Seurat 4.0.5  
116 |   dplyr 1.0.7  
117 |   sceasy 0.0.6  
118 |   reticulate 1.22  
119 |   ```
120 |   ### Installation in Linux distributions  
121 |   Virtual environment (env.select="virtual") or conda environment (env.select="conda") will be automatically installed while running function 'pred_cellf_celldart'  
122 |   Detailed explanation is in '/R/Read_R_wrap.md' file.  
123 |   ### Installation in Windows  
124 |   Install conda environment first and then run the function with env.select='conda' and python.install=F   
125 |   
126 |   
127 | ## R shiny application for CellDART  
128 | Shiny application for preprocessing and CellDART analysis. (inside 'shiny')  
129 | Refer to the STquantool application: [STquantool](https://github.com/bsungwoo/STquantool)  
130 | 


--------------------------------------------------------------------------------
/celldart_env.yaml:
--------------------------------------------------------------------------------
 1 | name: CellDART
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - defaults
 6 | dependencies:
 7 |   - keras=2.9.0
 8 |   - leidenalg=0.8.10
 9 |   - matplotlib=3.5.2
10 |   - numpy=1.20.3
11 |   - pandas=1.4.3
12 |   - python=3.8.13
13 |   - python-igraph=0.9.11
14 |   - scanpy=1.9.1
15 |   - scikit-learn=1.1.1
16 |   - scipy=1.8.1
17 |   - seaborn=0.11.2
18 |   - statsmodels=0.13.2
19 |   - tensorflow=2.9.1
20 |   - tensorflow-gpu=2.9.1
21 |   - umap-learn=0.5.3
22 | prefix: C:\Users\USER\Anaconda3\envs\CellDART


--------------------------------------------------------------------------------
/da_cellfraction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from keras.layers import Input, Dense, Activation, BatchNormalization, Dropout, Lambda
  3 | from keras.models import Model
  4 | from keras.utils import to_categorical
  5 | from keras import losses
  6 | from sklearn.datasets import make_blobs
  7 | from sklearn.metrics import accuracy_score
  8 | from keras import  optimizers
  9 | 
 10 | def build_models(inp_dim, emb_dim, n_cls_source, alpha=2, alpha_lr=10):
 11 |     inputs = Input(shape=(inp_dim,)) 
 12 |     x4 = Dense(1024, activation='linear')(inputs)
 13 |     x4 = BatchNormalization()(x4)
 14 |     x4 = Activation("elu")(x4)  
 15 |     x4 = Dense(emb_dim, activation='linear')(x4)
 16 |     x4 = BatchNormalization()(x4)
 17 |     x4 = Activation("elu")(x4)      
 18 | 
 19 |     source_classifier = Dense(n_cls_source, activation='linear', name="mo1")(x4)  
 20 |     source_classifier = Activation('softmax', name='mo')(source_classifier)
 21 | 
 22 |     domain_classifier = Dense(32, activation='linear', name="do4")(x4)
 23 |     domain_classifier = BatchNormalization(name="do5")(domain_classifier)
 24 |     domain_classifier = Activation("elu", name="do6")(domain_classifier)
 25 |     domain_classifier = Dropout(0.5)(domain_classifier)
 26 |     domain_classifier = Dense(2, activation='softmax', name="do")(domain_classifier)
 27 | 
 28 |     comb_model = Model(inputs=inputs, outputs=[source_classifier, domain_classifier])
 29 |     comb_model.compile(optimizer="Adam",
 30 |               loss={'mo': 'kld', 'do': 'categorical_crossentropy'},
 31 |               loss_weights={'mo': 1, 'do': alpha}, metrics=['accuracy'], )
 32 | 
 33 |     source_classification_model = Model(inputs=inputs, outputs=[source_classifier])
 34 |     source_classification_model.compile(optimizer=optimizers.adam(lr=0.001),
 35 |               loss={'mo': 'kld'}, metrics=['mae'], )
 36 | 
 37 | 
 38 |     domain_classification_model = Model(inputs=inputs, outputs=[domain_classifier])
 39 |     domain_classification_model.compile(optimizer=optimizers.adam(lr=alpha_lr*0.001),
 40 |                   loss={'do': 'categorical_crossentropy'}, metrics=['accuracy'])
 41 |     
 42 |     
 43 |     embeddings_model = Model(inputs=inputs, outputs=[x4])
 44 |     embeddings_model.compile(optimizer="Adam",loss = 'categorical_crossentropy', metrics=['accuracy'])
 45 |                         
 46 |                         
 47 |     return comb_model, source_classification_model, domain_classification_model, embeddings_model
 48 | 
 49 | def batch_generator(data, batch_size):
 50 |     """Generate batches of data.
 51 |     Given a list of numpy data, it iterates over the list and returns batches of the same size
 52 |     This
 53 |     """
 54 |     all_examples_indices = len(data[0])
 55 |     while True:
 56 |         mini_batch_indices = np.random.choice(all_examples_indices, size=batch_size, replace=False)
 57 |         tbr = [k[mini_batch_indices] for k in data]
 58 |         yield tbr
 59 |         
 60 | 
 61 | def train( Xs, ys, Xt, yt=None, 
 62 |           emb_dim=2,
 63 |           batch_size = 64, 
 64 |           enable_dann = True, 
 65 |           n_iterations = 1000,
 66 |           alpha=2,
 67 |           alpha_lr=10,
 68 |           initial_train=True,
 69 |           initial_train_epochs=100):
 70 |     
 71 |     
 72 |     inp_dim = Xs.shape[1]
 73 |     ncls_source = ys.shape[1]
 74 |     
 75 |     model, source_classification_model, domain_classification_model, embeddings_model = \
 76 |           build_models(inp_dim, emb_dim, ncls_source, alpha=alpha, alpha_lr = alpha_lr)
 77 |           
 78 |     if initial_train:
 79 |         source_classification_model.fit(Xs, ys, batch_size= batch_size, epochs=initial_train_epochs)
 80 |         print("initial_train_done")
 81 |     y_adversarial_1 = to_categorical(np.array(([1] * batch_size + [0] * batch_size)))
 82 |     
 83 |     sample_weights_class = np.array(([1] * batch_size + [0] * batch_size))
 84 |     sample_weights_adversarial = np.ones((batch_size * 2,))
 85 | 
 86 |     S_batches = batch_generator([Xs, ys], batch_size)
 87 |     T_batches = batch_generator([Xt, np.zeros(shape = (len(Xt),2))], batch_size)
 88 |     
 89 |     for i in range(n_iterations):
 90 |         # # print(y_class_dummy.shape, ys.shape)
 91 |         y_adversarial_2 = to_categorical(np.array(([0] * batch_size + [1] * batch_size)))
 92 | 
 93 |         X0, y0 = next(S_batches)
 94 |         X1, y1 = next(T_batches)
 95 | 
 96 | 
 97 |         X_adv = np.concatenate([X0, X1])
 98 |         y_class = np.concatenate([y0, np.zeros_like(y0)])
 99 | 
100 |         adv_weights = []
101 |         for layer in model.layers:
102 |             if (layer.name.startswith("do")):
103 |                 adv_weights.append(layer.get_weights())
104 | 
105 |         if(enable_dann):
106 |             # note - even though we save and append weights, the batchnorms moving means and variances
107 |             # are not saved throught this mechanism 
108 |             model.train_on_batch(X_adv, [y_class, y_adversarial_1],
109 |                                      sample_weight=[sample_weights_class, sample_weights_adversarial])
110 |             
111 |             k = 0
112 |             for layer in model.layers:
113 |                 if (layer.name.startswith("do")):
114 |                     layer.set_weights(adv_weights[k])
115 |                     k += 1
116 | 
117 |             class_weights = []
118 |             
119 |         
120 |             for layer in model.layers:
121 |                 if (not layer.name.startswith("do")):
122 |                     class_weights.append(layer.get_weights())
123 |             
124 |             domain_classification_model.train_on_batch(X_adv, [y_adversarial_2])
125 | 
126 |             k = 0
127 |             for layer in model.layers:
128 |                 if (not layer.name.startswith("do")):
129 |                     layer.set_weights(class_weights[k])
130 |                     k += 1
131 | 
132 |         else:
133 |             source_classification_model.train_on_batch(X0,y0)
134 |             
135 |         
136 |         if yt is None:
137 |             if ((i + 1) % 100 == 0):
138 |                 # print(i, stats)
139 |                 sourceloss, sourceacc = source_classification_model.evaluate(Xs, ys,verbose=0)
140 |                 domainloss,domainacc  = domain_classification_model.evaluate(np.concatenate([Xs, Xt]),
141 |                                                                      to_categorical(np.array(([1] * Xs.shape[0] + [0] * Xt.shape[0]))),
142 |                                                                      verbose=0)
143 |                 print("Iteration %d, source loss =  %.3f, discriminator acc = %.3f"%(i, sourceloss ,domainacc))
144 |         else:
145 |             if ((i + 1) % 100 == 0):
146 |                 # print(i, stats)
147 |                 y_test_hat_t = source_classification_model.predict(Xt).argmax(1)
148 |                 y_test_hat_s = source_classification_model.predict(Xs).argmax(1)
149 |                 print("Iteration %d, source accuracy =  %.3f, target accuracy = %.3f"%(i, accuracy_score(ys, y_test_hat_s), accuracy_score(yt, y_test_hat_t)))
150 |                 
151 |     return embeddings_model, source_classification_model 
152 | 
153 | 


--------------------------------------------------------------------------------
/data/datafile.md:
--------------------------------------------------------------------------------
1 | Data file
2 | - Visium Data
3 | sc.datasets.visium_sge function will download Visium data (Please see scanpy)
4 | 
5 | - Single cell data
6 | Download from GEO (GSE115746)
7 | GSE115746_cells_exon_counts.csv
8 | GSE115746_complete_metadata_28706-cells.csv
9 | 


--------------------------------------------------------------------------------
/main_counts.py:
--------------------------------------------------------------------------------
  1 | import scanpy as sc
  2 | import pandas as pd
  3 | import numpy as np
  4 | import seaborn as sns
  5 | import da_cellfraction
  6 | from utils import random_mix
  7 | from sklearn.manifold import TSNE
  8 | 
  9 | sc.logging.print_versions()
 10 | sc.set_figure_params(facecolor="white", figsize=(8, 8))
 11 | sc.settings.verbosity = 3
 12 | # Load the anterior and posterior mouse-brain Visium sections through scanpy's dataset helper
 13 | adata_spatial_anterior = sc.datasets.visium_sge(
 14 |     sample_id="V1_Mouse_Brain_Sagittal_Anterior"
 15 | )
 16 | adata_spatial_posterior = sc.datasets.visium_sge(
 17 |     sample_id="V1_Mouse_Brain_Sagittal_Posterior"
 18 | )
 19 | 
 20 | # Normalize total counts per spot; the log1p and highly-variable-gene steps are left disabled
 21 | for adata in [
 22 |     adata_spatial_anterior,
 23 |     adata_spatial_posterior,
 24 | ]:
 25 |     sc.pp.normalize_total(adata, inplace=True)
 26 |     #sc.pp.log1p(adata)
 27 |     #sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True)
 28 |     
 29 | 
 30 | ##################
 31 | # Single-cell reference data (GSE115746): the count CSV is read and transposed with .T,
 32 | # then the metadata rows are selected in the order of adata_cortex.obs and attached as .obs
 33 | adata_cortex = sc.read_csv('../data/GSE115746_cells_exon_counts.csv').T
 34 | adata_cortex_meta = pd.read_csv('../data/GSE115746_complete_metadata_28706-cells.csv', index_col=0)
 35 | adata_cortex_meta_ = adata_cortex_meta.loc[adata_cortex.obs.index,]
 36 | 
 37 | adata_cortex.obs = adata_cortex_meta_
 38 | 
 39 | adata_cortex.var_names_make_unique()  
 40 | # NOTE(review): mouse mitochondrial gene symbols are often lowercase 'mt-' - confirm that the 'Mt-' prefix matches
 41 | adata_cortex.var['mt'] = adata_cortex.var_names.str.startswith('Mt-')  # annotate the group of mitochondrial genes as 'mt'
 42 | sc.pp.calculate_qc_metrics(adata_cortex, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
 43 | # Normalize total counts per cell; log1p and highly-variable-gene selection are left disabled
 44 | sc.pp.normalize_total(adata_cortex)
 45 | #sc.pp.log1p(adata_cortex)
 46 | #sc.pp.highly_variable_genes(adata_cortex,flavor="seurat", n_top_genes=2000, inplace=True)
 47 | 
 48 | # PCA, neighbor graph, UMAP and Leiden clustering, then a UMAP plot colored by 'leiden' and 'cell_subclass'
 49 | sc.tl.pca(adata_cortex, svd_solver='arpack')
 50 | sc.pp.neighbors(adata_cortex, n_neighbors=10, n_pcs=40)
 51 | sc.tl.umap(adata_cortex)
 52 | sc.tl.leiden(adata_cortex, resolution = 0.5)
 53 | sc.pl.umap(adata_cortex, color=['leiden','cell_subclass'])
 54 | 
 55 | 
 56 | # Keep only the genes present in both the single-cell and the anterior spatial data (single-cell gene order)
 57 | adata_spatial_anterior.var_names_make_unique() 
 58 | inter_genes = [val for val in adata_cortex.var.index if val in adata_spatial_anterior.var.index]
 59 | adata_cortex = adata_cortex[:,inter_genes]
 60 | 
 61 | adata_spatial_anterior = adata_spatial_anterior[:,inter_genes]
 62 | 
 63 | #####
 64 | #To arrays#
 65 | ###########
 66 | mat_sc = adata_cortex.X
 67 | mat_sp = adata_spatial_anterior.X.todense()
 68 | # Leiden cluster ids of the single cells as an integer array
 69 | df_sc = adata_cortex.obs
 70 | lab_sc = np.asarray(df_sc.leiden, dtype='int')
 71 | # Integer-encode the cell_subclass names. NOTE(review): codes follow set() iteration order, which may differ between runs
 72 | lab_sc_sub = df_sc.cell_subclass
 73 | sc_sub_dict = dict(zip(range(len(set(lab_sc_sub))), set(lab_sc_sub)))
 74 | sc_sub_dict2 = dict((y,x) for x,y in sc_sub_dict.items())
 75 | lab_sc_num = [sc_sub_dict2[ii] for ii in lab_sc_sub]
 76 | lab_sc_num = np.asarray(lab_sc_num, dtype='int')
 77 | # random_mix is imported from utils; presumably it builds n_samples mixtures of nmix cells each - confirm
 78 | sc_mix, lab_mix = random_mix(mat_sc, lab_sc_num, nmix=5, n_samples=2000)
 79 | 
 80 | def log_minmaxscale(arr):
 81 |     arrd = len(arr)
 82 |     arr = np.log1p(arr)
 83 |     return (arr-np.reshape(np.min(arr,axis=1), (arrd,1)))/np.reshape((np.max(arr, axis=1)-np.min(arr,axis=1)),(arrd,1))
 84 | 
 85 | sc_mix_s = log_minmaxscale(sc_mix)
 86 | mat_sp_s = log_minmaxscale(mat_sp)
 87 | mat_sc_s = log_minmaxscale(mat_sc)
 88 | 
 89 | embs, clssmodel = da_cellfraction.train(sc_mix_s, lab_mix, mat_sp_s, enable_dann = True,
 90 |                                  alpha=1, alpha_lr=10, emb_dim = 64, batch_size = 512,
 91 |                                  n_iterations = 2000,
 92 |                                   initial_train=True,
 93 |                                   initial_train_epochs=10)
 94 | 
 95 | #Predicted Embedding
 96 | z_sc = embs.predict(mat_sc_s)
 97 | z_mix = embs.predict(sc_mix_s)
 98 | z_sp = embs.predict(mat_sp_s)
 99 | 
100 | pred_mix = clssmodel.predict(sc_mix_s)
101 | 
# Styling shared by the scatter plots below.
# NOTE(review): in seaborn, `size` is a semantic grouping argument; if a fixed
# marker size was intended, the matplotlib keyword is `s` - confirm.
overlap_style = dict(alpha=0.1, size=1, linewidth=0)
n_mix, n_sc, n_sp = z_mix.shape[0], z_sc.shape[0], z_sp.shape[0]

# t-SNE of pseudo-spot + spatial embeddings, coloured by domain
# (0 = pseudo-spot, 1 = spatial spot).
z_mixsp = np.concatenate([z_mix, z_sp], axis=0)
z_mixtsne = TSNE(n_components=2).fit_transform(z_mixsp)
sns.scatterplot(x=z_mixtsne[:, 0], y=z_mixtsne[:, 1],
                hue=[0] * n_mix + [1] * n_sp,
                **overlap_style)

# t-SNE of single-cell + spatial embeddings, coloured by domain
# (0 = single cell, 1 = spatial spot).
z_scsp = np.concatenate([z_sc, z_sp], axis=0)
z_tsne = TSNE(n_components=2).fit_transform(z_scsp)

sns.scatterplot(x=z_tsne[:, 0], y=z_tsne[:, 1],
                hue=[0] * n_sc + [1] * n_sp,
                **overlap_style)

# Classifier outputs and their arg-max class for single cells and spots.
pred_sc = clssmodel.predict(mat_sc_s)
pred_sc_ = pred_sc.argmax(axis=1)

pred_sp = clssmodel.predict(mat_sp_s)
pred_sp_ = pred_sp.argmax(axis=1)

# The joint t-SNE stacks single cells first, then spatial spots.
tsne_sc = z_tsne[:n_sc]
tsne_sp = z_tsne[n_sc:]

# Single cells coloured by their annotated subclass.
sns.scatterplot(x=tsne_sc[:, 0], y=tsne_sc[:, 1],
                hue=list(map(str, lab_sc_sub.tolist())),
                palette='Set1',
                **overlap_style)

# Single cells coloured by the predicted class index.
sns.scatterplot(x=tsne_sc[:, 0], y=tsne_sc[:, 1],
                hue=list(map(str, pred_sc_.tolist())),
                palette='Set1',
                **overlap_style)

# Spatial spots coloured by the predicted class index (more opaque points).
sns.scatterplot(x=tsne_sp[:, 0], y=tsne_sp[:, 1],
                hue=list(map(str, pred_sp_.tolist())),
                palette='Set1',
                **dict(overlap_style, alpha=0.5))
141 | 
#Score for specific cell types..
# Column `visnum` of the classifier output is shown on the tissue image and
# titled with the cell-type name that sc_sub_dict maps that index to.
visnum=8
adata_spatial_anterior.obs['Pred_label'] = pred_sp[:,visnum]
sc.pl.spatial(
        adata_spatial_anterior,
        img_key="hires",
        color='Pred_label',
        palette='Set1',
        size=1.5,
        legend_loc=None,
        title = sc_sub_dict[visnum])

#Model Save
# The HDF5 writer does not create missing parent directories, so make sure
# ./Model exists before saving; otherwise the weights of a finished training
# run are lost to an error at the very last step.
import os
os.makedirs('./Model', exist_ok=True)
embs.save_weights('./Model/embedder_200805.h5')
clssmodel.save_weights('./Model/classifier_200805.h5')
157 | 


--------------------------------------------------------------------------------
/man/pred_cellf_celldart.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/CellDART_R.R
  3 | \name{pred_cellf_celldart}
  4 | \alias{pred_cellf_celldart}
  5 | \title{R wrap function to implement CellDART}
  6 | \usage{
  7 | pred_cellf_celldart(
  8 |   sp_data,
  9 |   sc_data,
 10 |   outdir = ".",
 11 |   sp_subset = FALSE,
 12 |   spot.cluster.name = "seurat_clusters",
 13 |   spot.cluster.of.interest = NULL,
 14 |   env.select = "conda",
 15 |   python.install = F,
 16 |   python_path = NULL,
 17 |   env.name = "CellDART",
 18 |   gpu = TRUE,
 19 |   metadata_celltype = "celltype",
 20 |   num_markers = 20,
 21 |   seed_num = 0,
 22 |   nmix = 8,
 23 |   npseudo = 20000,
 24 |   alpha = 0.6,
 25 |   alpha_lr = 5,
 26 |   emb_dim = 64,
 27 |   batch_size = 512,
 28 |   n_iterations = 3000,
 29 |   init_train_epoch = 10
 30 | )
 31 | }
 32 | \arguments{
 33 | \item{sp_data}{spatial data (Seurat object) to be used in predicting cell fraction: non-normalized raw data should be in 'counts' slot}
 34 | 
 35 | \item{sc_data}{single-cell data (Seurat object) to be used in making pseudospots: non-normalized raw data should be in 'counts' slot}
 36 | 
 37 | \item{outdir}{the directory to save output files (models and results) (default = '.')}
 38 | 
 39 | \item{sp_subset}{whether to subset spatial data and calculate for specific spot cluster (default = FALSE)}
 40 | 
 41 | \item{spot.cluster.name}{group name of the cluster used for subsetting spatial data (default: 'seurat_clusters')}
 42 | 
 43 | \item{spot.cluster.of.interest}{name of each spot clusters to be used (default: NULL)}
 44 | 
 45 | \item{env.select}{select between using reticulate virtual environment or conda environment (default: "conda")}
 46 | 
 47 | \item{python.install}{whether to automatically install python version 3.8.12}
 48 | 
 49 | \item{python_path}{path for the python 3.8. (default: NULL)
 50 | \itemize{
 51 | \item If NULL, python version 3.8.12 will be installed (valid for Linux)
 52 | \item If "current", python interpreter associated with current virtual env (ex: r-reticulate) will be used. (version should be 3.8)
 53 | }}
 54 | 
 55 | \item{env.name}{name of the virtual or conda environment to use for CellDART analysis (default: 'CellDART')}
 56 | 
 57 | \item{gpu}{whether to use GPU (TRUE) or not (FALSE) (default = TRUE)}
 58 | 
 59 | \item{metadata_celltype}{column name for single-cell annotation data in metadata (default: 'celltype')}
 60 | 
 61 | \item{num_markers}{number of selected marker genes in each cell-type (default = 20)}
 62 | 
 63 | \item{seed_num}{seed to be used in random sampling (default = 0)}
 64 | 
 65 | \item{nmix}{the number of cells sampled from single-cell data when making a pseudospot (default = 8)}
 66 | 
 67 | \item{npseudo}{a total number of pseudospots (default = 20000)}
 68 | 
 69 | \item{alpha}{loss weights of domain classifier to the source classifier (default = 0.6)}
 70 | 
 71 | \item{alpha_lr}{learning rate for the domain classifier (alpha_lr*0.001, default = 5)}
 72 | 
 73 | \item{emb_dim}{output size of dimensions for feature extractor (default = 64)}
 74 | 
 75 | \item{batch_size}{minibatch size for pseudospots and spatial data during the training (default = 512)}
 76 | 
 77 | \item{n_iterations}{iteration number for the adversarial learning (default = 3000)}
 78 | 
 79 | \item{init_train_epoch}{iteration number for the pre-training process (default = 10)}
 80 | }
 81 | \value{
 82 | spatial data (Seurat object) with predicted cell fraction in metadata (meta.data)
 83 | }
 84 | \description{
 85 | Cell type inference by domain adaptation of single-cell and spatial transcriptomic data
 86 | }
 87 | \examples{
 88 | # Using conda environment (environment will be automatically installed in Linux distributions)
 89 | # If using Windows, install the conda environment first and then run the function below with python.install = F
 90 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
 91 |                                      sp_subset=F, spot.cluster.name='seurat_clusters',
 92 |                                      spot.cluster.of.interest=NULL,
 93 |                                      env.select='conda',python.install=T,
 94 |                                      python_path=NULL, env.name='CellDART',
 95 |                                      gpu=TRUE, metadata_celltype='celltype',
 96 |                                      num_markers=20, seed_num=0,
 97 |                                      nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
 98 |                                      emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 99 | 
100 | # Using virtual environment (environment will be automatically installed in Linux distributions)
101 | # Not recommended for Windows
102 | sp_data_cellf <- pred_cellf_celldart(sp_data, sc_data, outdir = '.',
103 |                                      sp_subset=F, spot.cluster.name='seurat_clusters',
104 |                                      spot.cluster.of.interest=NULL,
105 |                                      env.select='virtual',python.install=T,
106 |                                      python_path=NULL, env.name='CellDART',
107 |                                      gpu=TRUE, metadata_celltype='celltype',
108 |                                      num_markers=20, seed_num=0,
109 |                                      nmix=8, npseudo=20000, alpha=0.6,alpha_lr=5,
110 |                                      emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
111 | }
112 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name = "CellDART",
 5 |     version = "0.1.3",
 6 |     description = "Cell type inference by domain adaptation of single-cell and spatial transcriptomic data",
 7 |     url = "https://github.com/mexchy1000/CellDART.git",
 8 |     author = "Hongyoon Choi, Sungwoo Bae",
 9 |     packages=find_packages(include=['CellDART', 'CellDART.*']),
10 |     install_requires = ["tensorflow~=2.9.0","tensorflow-gpu~=2.9.0",
11 |                         "pandas","numpy",
12 |                         "scanpy","leidenalg","igraph",
13 |                         "jupyter","ply","pytest"]
14 | )
15 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import numpy as np
 5 | from keras.utils import to_categorical
 6 | 
 7 | def random_mix(Xs, ys, nmix=5, n_samples=10000):
 8 |     nclss=len(set(ys))
 9 |     Xs_new, ys_new =[], []
10 |     ys_ = to_categorical(ys)
11 |     for i in range(n_samples):
12 |         yy = np.zeros(nclss)
13 |         fraction = np.random.rand(nmix)
14 |         fraction = fraction/np.sum(fraction)
15 |         fraction = np.reshape(fraction, (nmix,1))
16 |         randindex = np.random.randint(len(Xs), size=nmix)
17 |         ymix = ys_[randindex]
18 |         yy = np.sum(ymix*np.reshape(fraction, (nmix,1)), axis=0)
19 |         XX = Xs[randindex] * fraction
20 |         XX_ = np.sum(XX, axis=0)
21 |         ys_new.append(yy)
22 |         Xs_new.append(XX_)
23 |     Xs_new = np.asarray(Xs_new)
24 |     ys_new = np.asarray(ys_new)
25 |     return Xs_new, ys_new


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/introduction.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{introduction}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = TRUE,
 13 |   comment = "#>"
 14 | )
 15 | ```
 16 | 
 17 | ```{r setup}
 18 | # library(CellDART)
 19 | ```
 20 | 
 21 | # 0. Install required packages
 22 | ## Install CellDART
 23 | ```{r}
 24 | # if (!requireNamespace("CellDART", quietly = TRUE))
 25 | #   devtools::install_github("mexchy1000/CellDART")
 26 | ```
 27 | 
 28 | 
 29 | # 1. Example of using function "pred_cellf_celldart"
 30 | ```{r}
 31 | # library(SeuratObject)
 32 | # 
 33 | # # Find the directory for active script file (file_path)
 34 | # file_path <- rstudioapi::getSourceEditorContext()$path
 35 | # file_path <- strsplit(file_path, split=.Platform$file.sep)
 36 | # file_path <- paste(file_path[[1]][-length(file_path[[1]])],
 37 | #                     collapse=.Platform$file.sep)
 38 | # 
 39 | # # Set working directory
 40 | # setwd(file_path)
 41 | # 
 42 | # # Make output folder
 43 | # output_folder_name <- 'CellDART_output'
 44 | # if (!file.exists(output_folder_name)){
 45 | #   dir.create(output_folder_name)
 46 | # }
 47 | ```
 48 | 
 49 | ## Load single-cell and spatial datasets
 50 | ### Load single-cell dataset (RDS file with Seurat object): GSE115746 (mouse single cell: ALM and VISp)
 51 | ```{r}
 52 | # sc_data <- readRDS('sc_data.rds')
 53 | ```
 54 | 
 55 | 
 56 | ## Load spatial dataset (RDS file with Seurat object): 10X genomics data repository
 57 | ### V1_Mouse_Brain_Sagittal_Anterior & V1_Mouse_Brain_Sagittal_Posterior
 58 | ```{r}
 59 | # sp_data <- readRDS('sp_data.rds')
 60 | ```
 61 | 
 62 | ## Check the size of spatial dataset
 63 | ```{r}
 64 | # dim(sp_data)
 65 | ```
 66 | 
 67 | 
 68 | ## Set the number of pseudospots: 5 times the number of spatial spots
 69 | ```{r}
 70 | # npseudo <- 5*dim(sp_data)[2]
 71 | ```
 72 | 
 73 | ## Perform CellDART analysis
 74 | ### Explanation of the function
 75 | ```{r}
 76 | help(CellDART)
 77 | ```
 78 | 
 79 | 
 80 | ### Using conda environment (environment will be automatically installed in Linux distributions)
 81 | #### If using Windows, then install conda environment first and then run the function below with python.install = F
 82 | 
 83 | ```{r}
 84 | # sp_data_cellf <- CellDART::pred_cellf_celldart(sp_data, sc_data, outdir = file.path(output_folder_name),
 85 | #                                                sp_subset=F, spot.cluster.name='seurat_clusters',
 86 | #                                                spot.cluster.of.interest=NULL,
 87 | #                                                env.select='conda',python.install=T,
 88 | #                                                python_path=NULL, env.name='CellDART',
 89 | #                                                gpu=TRUE, metadata_celltype='cell_subclass',
 90 | #                                                num_markers=20, seed_num=0,
 91 | #                                                nmix=8, npseudo=npseudo, alpha=0.6,alpha_lr=5,
 92 | #                                                emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
 93 | ```
 94 | 
 95 | 
 96 | ### Using virtual environment (environment will be automatically installed in Linux distributions)
 97 | #### Not recommended for Windows
 98 | ```{r}
 99 | # sp_data_cellf <- CellDART::pred_cellf_celldart(sp_data, sc_data, outdir = file.path(output_folder_name),
100 | #                                                sp_subset=F, spot.cluster.name='seurat_clusters',
101 | #                                                spot.cluster.of.interest=NULL,
102 | #                                                env.select='virtual',python.install=T,
103 | #                                                python_path=NULL, env.name='CellDART',
104 | #                                                gpu=TRUE, metadata_celltype='cell_subclass',
105 | #                                                num_markers=20, seed_num=0,
106 | #                                                nmix=8, npseudo=npseudo, alpha=0.6,alpha_lr=5,
107 | #                                                emb_dim=64,batch_size=512,n_iterations=3000, init_train_epoch=10)
108 | ```
109 | 
110 | 
111 | ### Save seurat object with cell fraction
112 | ```{r}
113 | # saveRDS(sp_data_cellf, file.path(output_folder_name, 'sp_data_cellf.rds'))
114 | ```
115 | 
116 | # 2. Visualization of spatial cell fraction
117 | ### Remove '_cellf' from the column names of the cell fraction metadata
118 | ```{r}
119 | # cellf.data <- sp_data_cellf@meta.data
120 | # cellf.data.colname <- sapply(colnames(cellf.data), function(x){
121 | #   if (grepl('_cellf',x)){return(strsplit(x, split='_cellf')[[1]][1])}
122 | #   else {return(x)}
123 | # })
124 | # sp_data_cellf.mod <- sp_data_cellf
125 | # colnames(sp_data_cellf.mod@meta.data) <- cellf.data.colname
126 | ```
127 | 
128 | ### Visualize the layer-specific excitatory neurons
129 | ```{r}
130 | # cell_types <- c("L2.3.IT","L4","L5.IT","L5.PT","L6b","L6.CT","L6.IT")
131 | # p <- Seurat::SpatialFeaturePlot(sp_data_cellf.mod, features = cell_types, 
132 | #                                 ncol = 4, alpha=0.6, combine = FALSE)
133 | # patchwork::wrap_plots(p, ncol = 8)
134 | ```
135 | 


--------------------------------------------------------------------------------